CtrlK
Blog · Docs · Log in · Get started
Tessl Logo

tessl/pypi-lark-parser

A modern general-purpose parsing library for Python that can parse any context-free grammar efficiently

Pending
Overview
Eval results
Files

docs/tokens-lexing.md

Tokens and Lexing

Token representation and lexical analysis including Token class for lexical units, lexer configuration, and specialized handling for indentation-sensitive languages.

Capabilities

Token Representation

The Token class represents lexical units produced by the lexer, inheriting from str while adding metadata.

class Token(str):
    """
    String with meta-information representing a lexical token.
    Inherits from str so it can be used anywhere a string is expected.
    """
    
    def __new__(cls, type_: str, value: str, start_pos: Optional[int] = None, 
               line: Optional[int] = None, column: Optional[int] = None, end_line: Optional[int] = None,
               end_column: Optional[int] = None, end_pos: Optional[int] = None, 
               pos_in_stream: Optional[int] = None) -> 'Token':
        """
        Create new token instance.
        
        Parameters:
        - type_: Token type name (terminal name from grammar)
        - value: Token string value
        - start_pos: Starting position in input text
        - line: Line number (1-based)
        - column: Column number (1-based)  
        - end_line: Ending line number
        - end_column: Ending column number
        - end_pos: Ending position in input text
        - pos_in_stream: Position in token stream
        
        Returns:
        Token: New token instance
        """
    
    def update(self, type_: Optional[str] = None, value: Optional[str] = None) -> 'Token':
        """
        Create updated copy of token with new type or value.
        
        Fields not supplied keep their current value; the original token
        is not modified (str is immutable).
        
        Parameters:
        - type_: New token type (optional)
        - value: New token value (optional)
        
        Returns:
        Token: Updated token copy
        """
    
    @classmethod
    def new_borrow_pos(cls, type_: str, value: str, borrow_t: 'Token') -> 'Token':
        """
        Create token borrowing position information from another token.
        
        Parameters:
        - type_: Token type name
        - value: Token string value
        - borrow_t: Token to borrow position from
        
        Returns:
        Token: New token with borrowed position
        """
    
    # Attributes — position fields are Optional because the constructor
    # accepts None for all of them (e.g. hand-built tokens).
    type: str                     # Token type name
    value: str                    # Token string value (same as str content)
    start_pos: Optional[int]      # Start position in input
    line: Optional[int]           # Line number (1-based)
    column: Optional[int]         # Column number (1-based)
    end_line: Optional[int]       # End line number
    end_column: Optional[int]     # End column number
    end_pos: Optional[int]        # End position in input
    pos_in_stream: Optional[int]  # Position in token stream

Indentation Handling

Post-lexer processor for handling Python-style indentation with INDENT/DEDENT tokens.

class Indenter:
    """
    PostLex processor for Python-like indentation handling.
    Converts whitespace at line beginnings into INDENT/DEDENT tokens.
    
    Subclasses configure behavior by overriding the class attributes
    listed at the bottom (NL_type, INDENT_type, etc.) to match the
    terminal names used in their grammar.
    """
    
    def __init__(self, tab_len: int = 8):
        """
        Initialize indenter.
        
        Parameters:
        - tab_len: Number of spaces equivalent to one tab
        """
    
    def process(self, stream: Iterator[Token]) -> Iterator[Token]:
        """
        Process token stream, converting indentation to INDENT/DEDENT tokens.
        
        Parameters:
        - stream: Input token stream
        
        Returns:
        Iterator[Token]: Stream with indentation tokens
        """
    
    def handle_NL(self, token: Token) -> Iterator[Token]:
        """
        Handle newline tokens for indentation tracking.
        
        Parameters:
        - token: Newline token
        
        Returns:
        Iterator[Token]: Processed tokens
        """
    
    def handle_OPEN_PAREN(self, token: Token) -> Iterator[Token]:
        """
        Handle opening parenthesis tokens.
        
        Parameters:
        - token: Opening parenthesis token
        
        Returns:
        Iterator[Token]: Processed tokens
        """
    
    def handle_CLOSE_PAREN(self, token: Token) -> Iterator[Token]:
        """
        Handle closing parenthesis tokens.
        
        Parameters:
        - token: Closing parenthesis token
        
        Returns:
        Iterator[Token]: Processed tokens
        """
    
    # Configuration attributes — override in subclasses to match grammar
    # terminal names (see PythonIndenter for a concrete configuration).
    always_accept: Tuple[str, ...] = ('NL', 'COMMENT')  # Always accepted tokens
    NL_type: str = 'NL'                                 # Newline token type
    OPEN_PAREN_types: Tuple[str, ...] = ()             # Open paren types
    CLOSE_PAREN_types: Tuple[str, ...] = ()            # Close paren types
    INDENT_type: str = 'INDENT'                         # Indent token type
    DEDENT_type: str = 'DEDENT'                         # Dedent token type
    tab_len: int                                        # Tab length in spaces

Python-Specific Indenter

Pre-configured indenter for Python syntax.

class PythonIndenter(Indenter):
    """
    Indenter configured for Python language syntax.
    
    Overrides the Indenter configuration attributes with the terminal
    names used by Python-style grammars.
    """
    
    NL_type = 'NEWLINE'                              # Python grammars use NEWLINE, not NL
    OPEN_PAREN_types = ('LPAR', 'LSQB', 'LBRACE')    # ( [ { suppress indentation tracking
    CLOSE_PAREN_types = ('RPAR', 'RSQB', 'RBRACE')   # ) ] } re-enable it
    INDENT_type = 'INDENT'
    DEDENT_type = 'DEDENT'
    tab_len = 8                                      # one tab counts as 8 spaces

Indentation Errors

Exception raised for invalid dedentation patterns.

class DedentError(LarkError):
    """
    Raised when dedentation doesn't match any previous indentation level.
    
    Example: a line indented by 3 spaces following levels of 0 and 4.
    """

Lexer Configuration

Configuration classes for lexer behavior and terminal definitions.

class LexerConf:
    """
    Configuration object for lexer components.
    """
    
    def __init__(self, terminals: List, ignore: Optional[List] = None, 
                 g_regex_flags: int = 0, use_bytes: bool = False,
                 lexer_type: Optional[str] = None, callbacks: Optional[Dict] = None):
        """
        Initialize lexer configuration.
        
        Parameters:
        - terminals: List of terminal definitions
        - ignore: List of terminals to ignore
        - g_regex_flags: Global regex flags
        - use_bytes: Whether to use bytes instead of str
        - lexer_type: Type of lexer to use
        - callbacks: Token callbacks dictionary
        """
    
    # Stored configuration; optional constructor arguments may be None.
    terminals: List         # Terminal definitions
    ignore: List           # Ignored terminals
    g_regex_flags: int     # Global regex flags
    use_bytes: bool        # Use bytes input
    lexer_type: Optional[str]        # Lexer type
    callbacks: Optional[Dict]        # Token callbacks

Terminal Definitions

Classes representing terminal symbol definitions in grammars.

class TerminalDef:
    """
    Definition of a terminal symbol in the grammar.
    """
    
    def __init__(self, name: str, pattern, options: Optional[List] = None):
        """
        Initialize terminal definition.
        
        Parameters:
        - name: Terminal name
        - pattern: Pattern object or string
        - options: List of terminal options
        """
    
    name: str              # Terminal name
    pattern: Pattern       # Pattern for matching
    options: Optional[List]          # Terminal options (None when not supplied)

Lexer Classes

Core lexer implementations for tokenizing input text.

class Lexer:
    """
    Abstract base lexer class.
    
    Concrete implementations (e.g. TraditionalLexer) provide the actual
    tokenization strategy.
    """
    
    def lex(self, text: str, dont_ignore: bool = False) -> Iterator[Token]:
        """
        Tokenize input text.
        
        Parameters:
        - text: Input text to tokenize
        - dont_ignore: Include normally ignored tokens
        
        Returns:
        Iterator[Token]: Token stream
        """

class TraditionalLexer(Lexer):
    """
    Traditional regex-based lexer implementation.
    
    Matches terminals independently of parser state (contrast with a
    contextual lexer, which consults the parser's expected terminals).
    """

class LexerThread:
    """
    Lexer state for incremental tokenization.
    """
    
    def lex(self, stream: Iterator, newline_types: Set[str], 
           ignore_types: Set[str]) -> Iterator[Token]:
        """
        Perform lexical analysis on character stream.
        
        Parameters:
        - stream: Character stream
        - newline_types: Set of newline token types
        - ignore_types: Set of token types to ignore
        
        Returns:
        Iterator[Token]: Token stream
        """

Usage Examples

Basic Token Usage

from lark import Lark, Token

# `grammar` is assumed to be defined earlier in the session.
parser = Lark(grammar)

# Get tokens without parsing.
# NOTE(review): Lark.lex() requires a standard (non-contextual) lexer —
# confirm the parser configuration supports it.
tokens = list(parser.lex("x = 42"))
for token in tokens:
    print(f"Type: {token.type}, Value: '{token.value}', Line: {token.line}")

# Tokens are strings (Token subclasses str), so str(token) is the value.
token = Token('IDENTIFIER', 'variable_name', line=1, column=5)
print(f"Token as string: {token}")  # Prints: variable_name
print(f"Token type: {token.type}")   # Prints: IDENTIFIER

Creating Custom Tokens

from lark import Token

# Create token with full position information; all position fields are
# optional and default to None.
token = Token(
    type_='NUMBER',
    value='123',
    start_pos=10,
    line=2,
    column=5,
    end_line=2,
    end_column=8,
    end_pos=13
)

# update() returns a modified copy — the original token is unchanged.
new_token = token.update(type_='INTEGER')
print(f"Updated type: {new_token.type}")

# new_borrow_pos() copies position metadata from an existing token.
borrowed = Token.new_borrow_pos('IDENTIFIER', 'x', token)
print(f"Borrowed position - Line: {borrowed.line}, Column: {borrowed.column}")

Python Indentation Handling

from lark import Lark
from lark.indenter import PythonIndenter

# Grammar for Python-like syntax.
#
# Fixes relative to the naive version:
# - INDENT/DEDENT are generated by the post-lexer, never matched from the
#   input text, so they must be introduced with %declare.
# - `expr` was referenced by if_stmt but never defined.
# - %ignore must use WS_INLINE, not WS: ignoring all whitespace would
#   swallow the newlines the indenter needs to track indentation.
# - Top-level statements are unindented, so start cannot be `suite`
#   (suite requires an INDENT); it accepts a plain statement list instead.
python_grammar = """
    ?start: NEWLINE* stmt+

    suite: NEWLINE INDENT stmt+ DEDENT

    stmt: simple_stmt NEWLINE
        | compound_stmt

    simple_stmt: expr_stmt
    compound_stmt: if_stmt

    if_stmt: "if" expr ":" suite

    expr: NAME | NUMBER

    expr_stmt: NAME "=" NUMBER

    %import common.NAME
    %import common.NUMBER
    %import common.NEWLINE
    %import common.WS_INLINE
    %ignore WS_INLINE
    %declare INDENT DEDENT
"""

# Use the Python indenter as a post-lexer; the LALR parser is required
# for postlex processing.
parser = Lark(
    python_grammar,
    postlex=PythonIndenter(),
    parser='lalr'
)

# Parse indented code
code = '''
if x:
    y = 1
    z = 2
'''

tree = parser.parse(code)
print(tree.pretty())

Custom Indenter

from lark.indenter import Indenter

class CustomIndenter(Indenter):
    """Custom indenter for specific syntax.
    
    Each class attribute names a terminal from the target grammar; the
    indenter suppresses INDENT/DEDENT generation inside open parens.
    """
    
    NL_type = 'NEWLINE'
    OPEN_PAREN_types = ('LPAREN', 'LBRACE')
    CLOSE_PAREN_types = ('RPAREN', 'RBRACE')
    INDENT_type = 'INDENT'
    DEDENT_type = 'DEDENT'
    tab_len = 4  # 4 spaces per indent level

# Use custom indenter as the post-lexer; `grammar` is assumed defined earlier.
parser = Lark(grammar, postlex=CustomIndenter())

Token Callbacks

from lark import Lark, Token

def uppercase_identifiers(token):
    """Convert identifier tokens to uppercase, preserving position info."""
    if token.type == 'IDENTIFIER':
        # Build a replacement token rather than mutating the original.
        return Token(token.type, token.value.upper(), 
                    line=token.line, column=token.column)
    return token

def log_numbers(token):
    """Log all number tokens; the token passes through unchanged."""
    if token.type == 'NUMBER':
        print(f"Found number: {token.value} at line {token.line}")
    return token

# Apply callbacks during lexing — each callback receives every token of
# its terminal type and must return a token.
parser = Lark(
    grammar,
    lexer_callbacks={
        'IDENTIFIER': uppercase_identifiers,
        'NUMBER': log_numbers
    }
)

result = parser.parse("x = 123")

Lexer Configuration

from lark import Lark
from lark.common import LexerConf
import re

# Configure lexer with specific options.
# `terminal_list` is assumed to be defined earlier.
lexer_conf = LexerConf(
    terminals=terminal_list,
    ignore=['WS', 'COMMENT'],
    g_regex_flags=re.IGNORECASE | re.MULTILINE,
    use_bytes=False,
    lexer_type='standard'
)

# NOTE(review): lexer_conf is built above but not passed to Lark here —
# confirm how the configuration object is meant to be supplied.
parser = Lark(grammar, lexer='standard')

Position Tracking

from lark import Lark

# Enable position tracking so rule nodes carry line/column metadata.
parser = Lark(grammar, propagate_positions=True)
tree = parser.parse(text)

# Recursively walk the tree, printing positions of rules and tokens.
def print_positions(tree):
    # Tree nodes expose positions through .meta when propagation is on.
    if hasattr(tree, 'meta') and tree.meta:
        print(f"Rule '{tree.data}' at line {tree.meta.line}, "
              f"column {tree.meta.column}")
    
    for child in tree.children:
        if hasattr(child, 'children'):  # It's a Tree
            print_positions(child)
        else:  # It's a Token
            print(f"Token '{child.type}': '{child.value}' at "
                  f"line {child.line}, column {child.column}")

print_positions(tree)

Advanced Token Processing

from lark import Lark, Token

class TokenProcessor:
    """Advanced token processing with state.
    
    Demonstrates a stateful callback: the instance counts newlines across
    the whole lex run.
    """
    
    def __init__(self):
        # Running count of NEWLINE tokens seen so far.
        self.line_count = 0
        
    def process_newlines(self, token):
        if token.type == 'NEWLINE':
            self.line_count += 1
            # Add line number to token value.
            # NOTE(review): "\\n" is a literal backslash-n, not a newline —
            # confirm whether this escaping is intended or a rendering artifact.
            return Token(token.type, f"\\n#{self.line_count}", 
                        line=token.line, column=token.column)
        return token

# Bind the callback to the NEWLINE terminal; `grammar` is assumed defined.
processor = TokenProcessor()
parser = Lark(
    grammar,
    lexer_callbacks={'NEWLINE': processor.process_newlines}
)

Install with Tessl CLI

npx tessl i tessl/pypi-lark-parser

docs

core-parsing.md

exceptions.md

index.md

tokens-lexing.md

tree-processing.md

utilities.md

tile.json