CtrlK
Blog · Docs · Log in · Get started
Tessl Logo

tessl/pypi-lark-parser

A modern general-purpose parsing library for Python that can parse any context-free grammar efficiently

Pending
Overview
Eval results
Files

docs/core-parsing.md

Core Parsing

Main parsing functionality providing the primary interface for creating parsers, configuring parsing behavior, and parsing text according to grammar definitions.

Capabilities

Main Parser Interface

The Lark class serves as the primary interface for the parsing library, coordinating grammar loading, lexer configuration, and parse tree generation.

class Lark:
    def __init__(self, grammar: str, **options):
        """
        Initialize parser with grammar and options.
        
        Parameters:
        - grammar: EBNF grammar string or file path
        - **options: Configuration options (see LarkOptions)
        """
    
    def parse(self, text: str, start: str = None, on_error=None) -> Tree:
        """
        Parse text according to grammar.
        
        Parameters:
        - text: Input text to parse
        - start: Starting rule (overrides grammar start)
        - on_error: Error callback function
        
        Returns:
        Tree: Parse tree root
        """
    
    def parse_interactive(self, text: str = None, start: str = None):
        """
        Start interactive parsing session for error recovery.
        
        Parameters:
        - text: Input text (optional for incremental parsing)
        - start: Starting rule
        
        Returns:
        InteractiveParser: Interactive parser instance
        """
    
    def lex(self, text: str, dont_ignore: bool = False) -> Iterator[Token]:
        """
        Tokenize text without parsing.
        
        Parameters:
        - text: Input text to tokenize
        - dont_ignore: Include normally ignored tokens
        
        Returns:
        Iterator[Token]: Token stream
        """
    
    def get_terminal(self, name: str):
        """
        Get terminal definition by name.
        
        Parameters:
        - name: Terminal name
        
        Returns:
        Terminal definition
        """
    
    def save(self, f):
        """
        Save parser instance to file for caching.
        
        Parameters:
        - f: File object to write to
        """
    
    @classmethod
    def load(cls, f):
        """
        Load parser instance from file.
        
        Parameters:
        - f: File object to read from
        
        Returns:
        Lark: Loaded parser instance
        """
    
    @classmethod
    def open(cls, grammar_filename: str, rel_to: str = None, **options):
        """
        Create parser from grammar file.
        
        Parameters:
        - grammar_filename: Path to grammar file
        - rel_to: Base path for relative imports
        - **options: Parser options
        
        Returns:
        Lark: Parser instance
        """
    
    @classmethod
    def open_from_package(cls, package: str, grammar_path: str, 
                         search_paths: Tuple[str, ...] = ("",), **options):
        """
        Load grammar from Python package.
        
        Parameters:
        - package: Package name
        - grammar_path: Path within package
        - search_paths: Search paths for imports
        - **options: Parser options
        
        Returns:
        Lark: Parser instance
        """
    
    # Properties
    source_path: Optional[str]      # Grammar source file path
    source_grammar: str             # Original grammar string
    grammar: Grammar                # Compiled grammar object
    options: LarkOptions           # Parser configuration
    terminals: List[TerminalDef]   # Terminal definitions
    rules: List[Rule]              # Grammar rules

Parser Configuration

Configuration options controlling parsing behavior, algorithm selection, and feature enablement.

class LarkOptions:
    """
    Configuration options for Lark parser.
    """
    
    # General Options
    start: Union[str, List[str]]           # Start symbol(s)
    debug: bool                            # Enable debug output
    transformer: Optional[Transformer]     # Auto-apply transformer
    propagate_positions: Union[bool, Callable]  # Position propagation
    maybe_placeholders: bool               # [] operator behavior
    cache: Union[bool, str]               # Cache grammar analysis
    regex: bool                           # Use regex module
    g_regex_flags: int                    # Global regex flags
    keep_all_tokens: bool                 # Keep punctuation tokens
    tree_class: type                      # Custom tree class
    
    # Algorithm Options
    parser: str                           # "earley", "lalr", "cyk"
    lexer: str                           # Lexer type
    ambiguity: str                       # Ambiguity handling
    
    # Lexer types:
    # - "auto": Choose based on parser
    # - "standard": Standard lexer
    # - "contextual": Context-sensitive (LALR only)
    # - "dynamic": Flexible (Earley only)
    # - "dynamic_complete": All tokenization variants
    
    # Ambiguity handling (Earley only):
    # - "resolve": Automatic resolution
    # - "explicit": Wrap in _ambig nodes
    # - "forest": Return shared packed parse forest
    
    # Domain Specific Options
    postlex: Optional[PostLex]            # Lexer post-processing
    priority: str                         # Priority evaluation
    lexer_callbacks: Dict[str, Callable]  # Token callbacks
    use_bytes: bool                       # Accept bytes input
    edit_terminals: Optional[Callable]    # Terminal editing callback

Interactive Parsing

Step-by-step parsing with error recovery and incremental input processing.

class InteractiveParser:
    """
    Interactive parser for step-by-step parsing and error recovery.
    Provides advanced control over parsing and error handling with LALR.
    """
    
    def feed_token(self, token: Token):
        """
        Feed parser with a token and advance to next state.
        
        Parameters:
        - token: Token instance to process
        
        Note: token must be an instance of Token class
        """
    
    def exhaust_lexer(self) -> None:
        """
        Feed remaining lexer state into interactive parser.
        Modifies instance in place, does not feed '$END' token.
        """
    
    def feed_eof(self, last_token: Token = None):
        """
        Feed '$END' token to parser.
        
        Parameters:
        - last_token: Token to borrow position from (optional)
        """
    
    def accepts(self) -> Set[str]:
        """
        Get set of token types that will advance parser to valid state.
        
        Returns:
        Set[str]: Set of acceptable token type names
        """
    
    def choices(self) -> Dict[str, Any]:
        """
        Get dictionary of token types matched to parser actions.
        Only returns token types accepted by current state.
        
        Returns:
        Dict[str, Any]: Token types and their actions
        """
    
    def resume_parse(self):
        """
        Resume automated parsing from current state.
        
        Returns:
        Parse result from current position
        """
    
    def copy(self) -> 'InteractiveParser':
        """
        Create new interactive parser with separate state.
        
        Returns:
        InteractiveParser: Independent copy
        """
    
    def as_immutable(self) -> 'ImmutableInteractiveParser':
        """
        Convert to immutable interactive parser.
        
        Returns:
        ImmutableInteractiveParser: Immutable version
        """
    
    def pretty(self) -> str:
        """
        Print parser choices in readable format.
        
        Returns:
        str: Formatted choices and stack information
        """

class ImmutableInteractiveParser(InteractiveParser):
    """
    Immutable version of InteractiveParser.
    Operations create new instances instead of modifying in-place.
    """
    
    result: Any  # Parse result when parsing completes
    
    def feed_token(self, token: Token) -> 'ImmutableInteractiveParser':
        """
        Feed token and return new parser instance with updated state.
        
        Parameters:
        - token: Token to process
        
        Returns:
        ImmutableInteractiveParser: New parser instance
        """
    
    def exhaust_lexer(self) -> 'ImmutableInteractiveParser':
        """
        Feed remaining lexer state and return new parser instance.
        
        Returns:
        ImmutableInteractiveParser: New parser instance
        """
    
    def as_mutable(self) -> InteractiveParser:
        """
        Convert to mutable InteractiveParser.
        
        Returns:
        InteractiveParser: Mutable version
        """

Post-Lexer Processing

Abstract base class for lexer post-processing, such as indentation handling.

class PostLex:
    """
    Abstract base class for lexer post-processing.
    """
    
    def process(self, stream: Iterator[Token]) -> Iterator[Token]:
        """
        Process token stream after lexing.
        
        Parameters:
        - stream: Input token stream
        
        Returns:
        Iterator[Token]: Processed token stream
        """
    
    always_accept: Tuple[str, ...]  # Token types to always accept

Grammar Loading

Functions and classes for loading and processing grammar definitions.

class FromPackageLoader:
    """
    Loader for grammars stored in Python packages.
    """
    
    def __init__(self, package_root: str = ""):
        """
        Initialize package loader.
        
        Parameters:
        - package_root: Root package path
        """
    
    def __call__(self, base_path: str, grammar_path: str) -> Tuple[str, str]:
        """
        Load grammar from package.
        
        Parameters:
        - base_path: Base import path
        - grammar_path: Grammar file path
        
        Returns:
        Tuple[str, str]: (grammar_text, full_path)
        """

Usage Examples

Basic Grammar Definition

from lark import Lark

# Simple arithmetic grammar
grammar = """
    ?start: sum

    ?sum: product
        | sum "+" product   -> add
        | sum "-" product   -> sub

    ?product: atom
        | product "*" atom  -> mul
        | product "/" atom  -> div

    ?atom: NUMBER           -> number
         | "-" atom         -> neg
         | "(" sum ")"

    %import common.NUMBER
    %import common.WS_INLINE
    %ignore WS_INLINE
"""

parser = Lark(grammar)
result = parser.parse("3 + 4 * 2")
print(result.pretty())

Parser Configuration

from lark import Lark

# Configure parser with specific options
parser = Lark(
    grammar,
    parser='lalr',          # Use LALR parser
    lexer='standard',       # Standard lexer
    start='expression',     # Custom start rule
    debug=True,            # Enable debug output
    keep_all_tokens=True,  # Keep all tokens
    propagate_positions=True  # Track positions
)

Grammar from File

from lark import Lark

# Load grammar from file
parser = Lark.open('my_grammar.lark', rel_to=__file__)

# Load from package
parser = Lark.open_from_package(
    'my_package.grammars', 
    'grammar.lark',
    search_paths=('common',)
)

Interactive Parsing

from lark import Lark, UnexpectedToken

# Interactive parsing requires the LALR parser (see InteractiveParser above)
parser = Lark(grammar, parser='lalr')
interactive = parser.parse_interactive()

# Feed tokens incrementally
for token in parser.lex("1 + 2"):
    try:
        interactive.feed_token(token)
    except UnexpectedToken:
        # Handle error, possibly recover
        acceptable = interactive.accepts()
        print(f"Expected one of: {acceptable}")

Caching for Performance

from lark import Lark

# Cache to temporary file
parser = Lark(grammar, cache=True)

# Cache to specific file
parser = Lark(grammar, cache='my_grammar.cache')

# Manual save/load (use context managers so file handles are closed)
with open('parser.cache', 'wb') as f:
    parser.save(f)
with open('parser.cache', 'rb') as f:
    cached_parser = Lark.load(f)

Install with Tessl CLI

npx tessl i tessl/pypi-lark-parser

docs

core-parsing.md

exceptions.md

index.md

tokens-lexing.md

tree-processing.md

utilities.md

tile.json