A modern general-purpose parsing library for Python that can parse any context-free grammar efficiently
—
Token representation and lexical analysis including Token class for lexical units, lexer configuration, and specialized handling for indentation-sensitive languages.
The Token class represents lexical units produced by the lexer, inheriting from str while adding metadata.
class Token(str):
    """
    String with meta-information representing a lexical token.

    Inherits from str so it can be used anywhere a string is expected,
    while additionally carrying the position metadata recorded by the lexer.
    """
    def __new__(cls, type_: str, value: str, start_pos: Optional[int] = None,
                line: Optional[int] = None, column: Optional[int] = None,
                end_line: Optional[int] = None,
                end_column: Optional[int] = None, end_pos: Optional[int] = None,
                pos_in_stream: Optional[int] = None) -> 'Token':
        """
        Create a new token instance.

        Parameters:
        - type_: Token type name (terminal name from grammar)
        - value: Token string value
        - start_pos: Starting position in input text
        - line: Line number (1-based)
        - column: Column number (1-based)
        - end_line: Ending line number
        - end_column: Ending column number
        - end_pos: Ending position in input text
        - pos_in_stream: Position in token stream

        Returns:
        Token: New token instance
        """
    def update(self, type_: Optional[str] = None, value: Optional[str] = None) -> 'Token':
        """
        Create an updated copy of the token with a new type or value.
        The original token is left unchanged.

        Parameters:
        - type_: New token type (optional; kept if omitted)
        - value: New token value (optional; kept if omitted)

        Returns:
        Token: Updated token copy
        """
    @classmethod
    def new_borrow_pos(cls, type_: str, value: str, borrow_t: 'Token') -> 'Token':
        """
        Create a token borrowing position information from another token.

        Parameters:
        - type_: Token type name
        - value: Token string value
        - borrow_t: Token to borrow position fields from

        Returns:
        Token: New token with borrowed position
        """
    # Attributes
    type: str        # Token type name
    value: str       # Token string value (same as str content)
    start_pos: int   # Start position in input
    line: int        # Line number (1-based)
    column: int      # Column number (1-based)
    end_line: int    # End line number
    end_column: int  # End column number
    end_pos: int     # End position in input
    pos_in_stream: int # Position in token stream

Post-lexer processor for handling Python-style indentation with INDENT/DEDENT tokens.
class Indenter:
    """
    PostLex processor for Python-like indentation handling.

    Converts whitespace at line beginnings into INDENT/DEDENT tokens.
    The handle_OPEN_PAREN/handle_CLOSE_PAREN hooks exist so that
    indentation inside bracketed expressions can be treated specially.
    """
    def __init__(self, tab_len: int = 8):
        """
        Initialize the indenter.

        Parameters:
        - tab_len: Number of spaces one tab character counts as
        """
    def process(self, stream: Iterator[Token]) -> Iterator[Token]:
        """
        Process a token stream, converting indentation to INDENT/DEDENT tokens.

        Parameters:
        - stream: Input token stream

        Returns:
        Iterator[Token]: Stream with indentation tokens inserted
        """
    def handle_NL(self, token: Token) -> Iterator[Token]:
        """
        Handle newline tokens for indentation tracking.

        Parameters:
        - token: Newline token

        Returns:
        Iterator[Token]: Processed tokens
        """
    def handle_OPEN_PAREN(self, token: Token) -> Iterator[Token]:
        """
        Handle opening parenthesis tokens.

        Parameters:
        - token: Opening parenthesis token

        Returns:
        Iterator[Token]: Processed tokens
        """
    def handle_CLOSE_PAREN(self, token: Token) -> Iterator[Token]:
        """
        Handle closing parenthesis tokens.

        Parameters:
        - token: Closing parenthesis token

        Returns:
        Iterator[Token]: Processed tokens
        """
    # Configuration attributes (override in subclasses to match the grammar)
    always_accept: Tuple[str, ...] = ('NL', 'COMMENT')  # Token types always passed through
    NL_type: str = 'NL'                       # Newline token type
    OPEN_PAREN_types: Tuple[str, ...] = ()    # Opening-bracket token types
    CLOSE_PAREN_types: Tuple[str, ...] = ()   # Closing-bracket token types
    INDENT_type: str = 'INDENT'               # Emitted indent token type
    DEDENT_type: str = 'DEDENT'               # Emitted dedent token type
    tab_len: int # Tab length in spaces

Pre-configured indenter for Python syntax.
class PythonIndenter(Indenter):
    """
    Indenter pre-configured for Python language syntax.
    """
    NL_type = 'NEWLINE'                             # Python grammar's newline terminal
    OPEN_PAREN_types = ('LPAR', 'LSQB', 'LBRACE')   # ( [ {
    CLOSE_PAREN_types = ('RPAR', 'RSQB', 'RBRACE')  # ) ] }
    INDENT_type = 'INDENT'
    DEDENT_type = 'DEDENT'
    tab_len = 8

Exception raised for invalid dedentation patterns.
class DedentError(LarkError):
    """
    Raised when a dedent does not match any previous indentation level.
    """

Configuration classes for lexer behavior and terminal definitions.
class LexerConf:
    """
    Configuration object for lexer components.
    """
    def __init__(self, terminals: List, ignore: Optional[List] = None,
                 g_regex_flags: int = 0, use_bytes: bool = False,
                 lexer_type: Optional[str] = None, callbacks: Optional[Dict] = None):
        """
        Initialize lexer configuration.

        Parameters:
        - terminals: List of terminal definitions
        - ignore: List of terminal names to skip in the output stream
        - g_regex_flags: Global regex flags applied to terminal patterns
        - use_bytes: Whether to lex bytes instead of str
        - lexer_type: Type of lexer to use
        - callbacks: Mapping of terminal name to token callback
        """
    # Attributes
    terminals: List     # Terminal definitions
    ignore: List        # Ignored terminals
    g_regex_flags: int  # Global regex flags
    use_bytes: bool     # Use bytes input
    lexer_type: str     # Lexer type
    callbacks: Dict # Token callbacks

Classes representing terminal symbol definitions in grammars.
class TerminalDef:
    """
    Definition of a terminal symbol in the grammar.
    """
    def __init__(self, name: str, pattern, options: Optional[List] = None):
        """
        Initialize a terminal definition.

        Parameters:
        - name: Terminal name
        - pattern: Pattern object or string used to match the terminal
        - options: List of terminal options
        """
    # Attributes
    name: str        # Terminal name
    pattern: Pattern # Pattern for matching
    options: List # Terminal options

Core lexer implementations for tokenizing input text.
class Lexer:
    """
    Abstract base lexer class.
    """
    def lex(self, text: str, dont_ignore: bool = False) -> Iterator[Token]:
        """
        Tokenize input text.

        Parameters:
        - text: Input text to tokenize
        - dont_ignore: If True, include normally ignored tokens in the output

        Returns:
        Iterator[Token]: Token stream
        """
class TraditionalLexer(Lexer):
    """
    Traditional regex-based lexer implementation.
    """
class LexerThread:
    """
    Lexer state holder for incremental tokenization.
    """
    def lex(self, stream: Iterator, newline_types: Set[str],
            ignore_types: Set[str]) -> Iterator[Token]:
        """
        Perform lexical analysis on a character stream.

        Parameters:
        - stream: Character stream
        - newline_types: Set of token types that advance the line counter
        - ignore_types: Set of token types to drop from the output

        Returns:
        Iterator[Token]: Token stream
        """

from lark import Lark, Token
parser = Lark(grammar)
# Get tokens without parsing: parser.lex runs only the lexer stage.
tokens = list(parser.lex("x = 42"))
for token in tokens:
    print(f"Type: {token.type}, Value: '{token.value}', Line: {token.line}")
# Tokens subclass str, so they print and compare like plain strings.
token = Token('IDENTIFIER', 'variable_name', line=1, column=5)
print(f"Token as string: {token}") # Prints: variable_name
print(f"Token type: {token.type}") # Prints: IDENTIFIER

from lark import Token
# Create a token carrying full position information.
token = Token(
    type_='NUMBER',
    value='123',
    start_pos=10,
    line=2,
    column=5,
    end_line=2,
    end_column=8,
    end_pos=13
)
# update() returns a modified copy; the original token is unchanged.
new_token = token.update(type_='INTEGER')
print(f"Updated type: {new_token.type}")
# new_borrow_pos() copies the position fields from an existing token.
borrowed = Token.new_borrow_pos('IDENTIFIER', 'x', token)
print(f"Borrowed position - Line: {borrowed.line}, Column: {borrowed.column}")

from lark import Lark
from lark.indenter import PythonIndenter
# Grammar for a Python-like, indentation-sensitive syntax.
# NOTE: the original example referenced an undefined rule `expr` in
# if_stmt (lark rejects undefined symbols), so NAME is used instead.
python_grammar = """
?start: suite
suite: NEWLINE INDENT stmt+ DEDENT
stmt: simple_stmt NEWLINE
    | compound_stmt
simple_stmt: expr_stmt
compound_stmt: if_stmt
if_stmt: "if" NAME ":" suite
expr_stmt: NAME "=" NUMBER
%import common.NAME
%import common.NUMBER
%import common.NEWLINE
%import common.WS
%ignore WS
"""
# Use the Python indenter to turn leading whitespace into INDENT/DEDENT
# tokens; postlex processors require the contextual 'lalr' parser.
parser = Lark(
    python_grammar,
    postlex=PythonIndenter(),
    parser='lalr'
)
# Parse indented code. The if-body must actually be indented, otherwise
# no INDENT token is produced and `suite` cannot match.
code = '''
if x:
    y = 1
    z = 2
'''
tree = parser.parse(code)
print(tree.pretty())

from lark.indenter import Indenter
class CustomIndenter(Indenter):
    """Custom indenter for a specific syntax: override the class-level
    configuration attributes to match the grammar's terminal names."""
    NL_type = 'NEWLINE'
    OPEN_PAREN_types = ('LPAREN', 'LBRACE')
    CLOSE_PAREN_types = ('RPAREN', 'RBRACE')
    INDENT_type = 'INDENT'
    DEDENT_type = 'DEDENT'
    tab_len = 4 # 4 spaces per indent level

# Use the custom indenter as the parser's postlex stage
parser = Lark(grammar, postlex=CustomIndenter())

from lark import Lark, Token
def uppercase_identifiers(token):
    """Return a copy of IDENTIFIER tokens uppercased, keeping position info."""
    if token.type == 'IDENTIFIER':
        return Token(token.type, token.value.upper(),
                     line=token.line, column=token.column)
    return token

def log_numbers(token):
    """Log every NUMBER token; the token itself is passed through unchanged."""
    if token.type == 'NUMBER':
        print(f"Found number: {token.value} at line {token.line}")
    return token

# Callbacks run during lexing on each token of the named type; a callback
# may return a replacement Token or the original.
parser = Lark(
    grammar,
    lexer_callbacks={
        'IDENTIFIER': uppercase_identifiers,
        'NUMBER': log_numbers
    }
)
result = parser.parse("x = 123")

from lark import Lark
from lark.common import LexerConf
import re

# Configure the lexer explicitly: ignore whitespace/comments and apply
# global regex flags to every terminal pattern.
lexer_conf = LexerConf(
    terminals=terminal_list,
    ignore=['WS', 'COMMENT'],
    g_regex_flags=re.IGNORECASE | re.MULTILINE,
    use_bytes=False,
    lexer_type='standard'
)
parser = Lark(grammar, lexer='standard')

from lark import Lark
# Enable position tracking so rule nodes carry meta line/column info.
parser = Lark(grammar, propagate_positions=True)
tree = parser.parse(text)

# Walk the tree and report positions for rules and tokens.
def print_positions(tree):
    """Recursively print position info for every rule node and token in *tree*."""
    # A node whose `meta` is present and populated carries rule-level positions.
    if getattr(tree, 'meta', None):
        print(f"Rule '{tree.data}' at line {tree.meta.line}, "
              f"column {tree.meta.column}")
    for node in tree.children:
        # Tokens lack a `children` attribute; subtrees have one.
        if not hasattr(node, 'children'):
            print(f"Token '{node.type}': '{node.value}' at "
                  f"line {node.line}, column {node.column}")
        else:
            print_positions(node)
print_positions(tree)

from lark import Lark, Token
class TokenProcessor:
    """Stateful token processing: a bound method used as a lexer callback
    keeps its state across all tokens of one lexing run."""
    def __init__(self):
        self.line_count = 0  # running count of NEWLINE tokens seen

    def process_newlines(self, token):
        """Count NEWLINE tokens and tag each one with its running number."""
        if token.type == 'NEWLINE':
            self.line_count += 1
            # Replace the token value with an annotated marker.
            return Token(token.type, f"\\n#{self.line_count}",
                         line=token.line, column=token.column)
        return token

processor = TokenProcessor()
parser = Lark(
    grammar,
    lexer_callbacks={'NEWLINE': processor.process_newlines}
)
)

Install with Tessl CLI:
npx tessl i tessl/pypi-lark-parser