CtrlK
Blog · Docs · Log in · Get started
Tessl Logo

tessl/maven-org-antlr--antlr-runtime

Java runtime library for ANTLR v3 - a framework for constructing recognizers, interpreters, compilers, and translators from grammatical descriptions.

Pending
Overview
Eval results
Files

docs/parsing.md

Parsing

Core parsing infrastructure including parser base classes, rule return scopes, and recognizer functionality. The parsing system processes token streams to build parse trees and abstract syntax trees.

Capabilities

Parser Base Class

Concrete parser class for processing token streams with error recovery and backtracking support.

/**
 * A parser for TokenStreams. "parser grammars" result in a subclass of this.
 */
public class Parser extends BaseRecognizer {
    public TokenStream input;
    
    public Parser(TokenStream input);
    public Parser(TokenStream input, RecognizerSharedState state);
    
    public void reset();
    
    /** Set the token stream and reset the parser */
    public void setTokenStream(TokenStream input);
    public TokenStream getTokenStream();
    
    protected Object getCurrentInputSymbol(IntStream input);
    protected Object getMissingSymbol(IntStream input, RecognitionException e, 
                                     int expectedTokenType, BitSet follow);
    
    public void traceIn(String ruleName, int ruleIndex);
    public void traceOut(String ruleName, int ruleIndex);
}

Usage Examples:

import org.antlr.runtime.*;

// Create parser with token stream
MyLexer lexer = new MyLexer(new ANTLRStringStream("x = 42;"));
CommonTokenStream tokens = new CommonTokenStream(lexer);
MyParser parser = new MyParser(tokens);

// Parse starting from rule
try {
    MyParser.program_return result = parser.program();
    System.out.println("Parse successful");
    
    // Get parse tree if available
    Object tree = result.getTree();
    if (tree != null) {
        System.out.println("Tree: " + tree.toString());
    }
} catch (RecognitionException e) {
    System.err.println("Parse error: " + e.getMessage());
}

// Reset parser for reuse
parser.reset();
parser.setTokenStream(new CommonTokenStream(new MyLexer(new ANTLRStringStream("y = 10;"))));

Base Recognizer

Abstract base class providing common functionality for all ANTLR recognizers.

/**
 * A generic recognizer that can handle recognizers generated from
 * lexer, parser, and tree grammars. This is all the parsing
 * support code essentially; most of it is error recovery stuff and
 * backtracking.
 */
public abstract class BaseRecognizer {
    public static final int MEMO_RULE_FAILED = -2;
    public static final int MEMO_RULE_UNKNOWN = -1;
    public static final int INITIAL_FOLLOW_STACK_SIZE = 100;
    public static final int DEFAULT_TOKEN_CHANNEL = Token.DEFAULT_CHANNEL;
    public static final int HIDDEN = Token.HIDDEN_CHANNEL;
    public static final String NEXT_TOKEN_RULE_NAME = "nextToken";
    
    protected RecognizerSharedState state;
    
    public BaseRecognizer();
    public BaseRecognizer(RecognizerSharedState state);
    
    /** reset the parser's state.  Subclasses must rewind the input stream. */
    public void reset();
    
    /** Match current input symbol against ttype.  Attempt
     *  single token insertion or deletion error recovery.  If
     *  that fails, throw MismatchedTokenException.
     */
    public Object match(IntStream input, int ttype, BitSet follow) throws RecognitionException;
    
    /** Match the wildcard: in a symbol */
    public void matchAny(IntStream input);
    
    public boolean mismatchIsUnwantedToken(IntStream input, int ttype);
    public boolean mismatchIsMissingToken(IntStream input, BitSet follow);
    
    /** Report a recognition problem.
     *  This method sets errorRecovery to indicate the parser is recovering
     *  not parsing.  Once in recovery mode, no errors are generated.
     *  To get out of recovery mode, the parser must successfully match
     *  a token (after a resync).  So it will go:
     *
     *      1. error occurs
     *      2. enter recovery mode, report error
     *      3. consume until token found in resynch set
     *      4. try to resume parsing
     *      5. next match() will reset errorRecovery mode
     */
    public void reportError(RecognitionException e);
    
    public void displayRecognitionError(String[] tokenNames, RecognitionException e);
    
    /** What error message should be generated for the various exception types? */
    public String getErrorMessage(RecognitionException e, String[] tokenNames);
    
    /** How should a token be displayed in an error message? The default
     *  is to display just the text, but during development you might
     *  want to have lots of information spit out.  Override in that case
     *  to use t.toString() (which, for CommonToken, dumps everything about
     *  the token). This is better than forcing you to override a method in
     *  your token objects because you don't have to go modify your lexer
     *  so that it creates a new Java type.
     */
    public String getTokenErrorDisplay(Token t);
    
    /** Override this method to change where error messages go */
    public void emitErrorMessage(String msg);
    
    /** Recover from an error found on the input stream.  This is for NoViableAlt and
     *  mismatched symbol exceptions.  If you enable single token insertion and deletion,
     *  this will usually not handle mismatched symbol exceptions but there could be
     *  a mismatched token that the match() routine could not recover from.
     */
    public void recover(IntStream input, RecognitionException re);
    
    /** A hook to listen in on the token consumption during error recovery.
     *  The DebugParser subclasses this to fire events to the listener.
     */
    public void beginResync();
    public void endResync();
    
    /** Compute the error recovery set for the current rule.  During
     *  rule invocation, the parser pushes the set of tokens that can
     *  follow that rule reference on the stack; this amounts to
     *  computing FIRST of what follows the rule reference in the
     *  enclosing rule. This local follow set only includes tokens
     *  from within the rule; i.e., the FIRST computation done on
     *  FOLLOW(r) for r not in context.
     */
    protected BitSet computeErrorRecoverySet();
    
    /** Compute the context-sensitive FOLLOW set for current rule.
     *  This is set of token types that can follow a specific rule
     *  reference given a specific call chain.  You get the set of
     *  viable tokens that can possibly come next (lookahead depth 1)
     *  given the current call chain.  Contrast this with the
     *  definition of plain FOLLOW for rule r:
     *
     *   FOLLOW(r) = set of (local) tokens that can possibly follow
     *               references to r in *any* sentential form
     *               (r-->(alpha) beta, r->(alpha) gamma, ...)
     *
     *  Note that the parser pushes just the local FOLLOW sets and upon
     *  error, we want to combine these below all scope until we reach
     *  a recovery rule (typically the top-level rule).
     */
    protected BitSet computeContextSensitiveRuleFOLLOW();
    
    protected BitSet combineFollows(boolean exact);
    
    /** Attempt to recover from a single missing or extra token.
     *
     *  EXTRA TOKEN
     *
     *  LA(1) is not what we are looking for.  If LA(2) has the right token,
     *  however, then assume LA(1) is some extra token and delete it.
     *  LA(2) must be part of the follow set of the missing token.
     *  This can only happen if you are trying to match two tokens and
     *  the first one is wrong.  To match two tokens, either we backtrack
     *  (which is slow) or match the second token after seeing the first
     *  token is wrong. If we match second token, we have to delete the
     *  first token. This is hard to do so typically we backtrack or
     *  just do the default recovery which is to panic and skip until
     *  we find something we know about.
     *
     *  MISSING TOKEN
     *
     *  If current token is consistent with what could come after missing
     *  token, then it is ok to "insert" the missing token.  LA(1) is
     *  what we want to match next and if LA(1) is consistent with what
     *  could come after the expected token, then we assume the expected token
     *  is missing and we insert it.  For example, Input "i=(3;" is missing the
     *  ')'.  At ')' we have LA(1)==';' which is consistent with
     *  what could come after ')', so we insert the missing ')'.
     */
    protected Object recoverFromMismatchedToken(IntStream input, int ttype, BitSet follow) 
        throws RecognitionException;
    
    /** Not currently used */
    public Object recoverFromMismatchedSet(IntStream input, RecognitionException e, BitSet follow) 
        throws RecognitionException;
    
    protected Object getCurrentInputSymbol(IntStream input);
    
    /** Conjure up a missing token during error recovery.
     *  The recognizer attempts to recover from single missing
     *  symbols. But, actions might refer to that missing symbol.
     *  For example, x=ID {f($x);}. The f($x) refers to the missing ID.
     *  If you just return User.INVALID_TOKEN_TYPE, then f($x) will try to invoke
     *  your action with a null argument. It's better to conjure one up
     *  as follows:
     *
     *      ID(CommonToken(ID, "ID"));
     *
     *  where ID is the token type of the missing token. getMissingSymbol
     *  makes no assumptions about which Token object you use for
     *  the missing symbol.
     *
     *  This method is used primarily for error recovery not on the
     *  critical path.
     */
    protected Object getMissingSymbol(IntStream input, RecognitionException e, 
                                     int expectedTokenType, BitSet follow);
    
    public void pushFollow(BitSet fset);
    protected void popFollow();
    
    /** Return List<String> of the rules in your parser instance
     *  leading up to a call to the current rule.  You could override if
     *  you want more details such as the file/line info of where
     *  in the parser java code a rule is invoked.
     *
     *  This is very useful for error messages.
     */
    public List<String> getRuleInvocationStack();
    public List<String> getRuleInvocationStack(Throwable e);
    
    public int getBacktrackingLevel();
    
    /** Used to print out token names like ID during debugging and
     *  error reporting.  The generated parsers implement a method
     *  that overrides this to point to their String[] tokenNames.
     */
    public String[] getTokenNames();
    
    /** For debugging and other purposes, might want the grammar name.
     *  Have ANTLR generate an implementation for this method.
     */
    public String getGrammarFileName();
    
    public String getSourceName();
    
    /** A convenience method for use most often with template rewrites.
     *  Convert a List<Token> to List<String>
     */
    public List<String> toStrings(List<Token> tokens);
    
    /** Given a rule number and a start token index number, return
     *  MEMO_RULE_UNKNOWN if the rule has not parsed input starting from
     *  start index.  If this rule has parsed input starting from the
     *  start index before, then return where the rule stopped parsing.
     *  It returns the index of the last token matched by the rule.
     *
     *  For now we use a hashtable and just the slow Object-based one.
     *  Later, we can make a special one for ints and also one that
     *  tosses out entries after we commit past input position i.
     */
    public Integer getRuleMemoization(int ruleIndex, int ruleStartIndex);
    
    /** Has this rule already parsed input at the current index in the
     *  input stream?  Return the stop token index or MEMO_RULE_UNKNOWN.
     *  If we attempted but failed to parse properly before, return
     *  MEMO_RULE_FAILED.
     *
     *  This method has a side-effect: if we have seen this input for
     *  this rule before, then seek ahead to 1 past the stop token matched
     *  for this rule last time.
     */
    public boolean alreadyParsedRule(IntStream input, int ruleIndex);
    
    /** Record whether or not this rule parsed the input at this position
     *  successfully.  Use a standard java hashtable for now.
     */
    public void memoize(IntStream input, int ruleIndex, int ruleStartIndex);
}

Usage Examples:

// Error handling in parser
public class MyParser extends Parser {
    public void displayRecognitionError(String[] tokenNames, RecognitionException e) {
        String hdr = getErrorHeader(e);
        String msg = getErrorMessage(e, tokenNames);
        System.err.println(hdr + " " + msg);
    }
    
    public String getErrorMessage(RecognitionException e, String[] tokenNames) {
        // Custom error message formatting
        return "Syntax error at line " + e.line + ": " + super.getErrorMessage(e, tokenNames);
    }
}

// Using follow sets for error recovery
BitSet follow = new BitSet();
follow.add(MyParser.SEMICOLON);
follow.add(MyParser.RBRACE);
Token recovered = (Token)recoverFromMismatchedToken(input, MyParser.IDENTIFIER, follow);

Rule Return Scopes

Return value containers for parser rules supporting tokens, trees, and custom attributes.

/**
 * Return value holder for rule invocations
 */
public class RuleReturnScope {
    /** Return the start token or tree */
    public Object getStart();
    public void setStart(Object start);
    
    /** Return the stop token or tree */
    public Object getStop();
    public void setStop(Object stop);
    
    /** Has a value potentially if output=AST; */
    public Object getTree();
    public void setTree(Object tree);
}

/**
 * Parser rule return scope
 */
public class ParserRuleReturnScope extends RuleReturnScope {
    /** First token matched by this rule */
    public Token start;
    /** Last token matched by this rule */
    public Token stop;
    
    public Object getStart();
    public Object getStop();
}

Usage Examples:

// Generated parser rule return type
public static class expression_return extends ParserRuleReturnScope {
    public CommonTree tree;
    public Object getTree() { return tree; }
};

// Using rule return in parser
public final expression_return expression() throws RecognitionException {
    expression_return retval = new expression_return();
    retval.start = input.LT(1);
    
    try {
        // Rule implementation...
        
        retval.stop = input.LT(-1);
        // retval.tree = ...
        
    } catch (RecognitionException re) {
        reportError(re);
        recover(input, re);
    }
    
    return retval;
}

// Using returned values
MyParser.expression_return result = parser.expression();
Token startToken = result.start;
Token stopToken = result.stop;
CommonTree tree = (CommonTree) result.getTree();

Recognizer Shared State

Shared state object for managing parser state across rule invocations.

/**
 * Shared state between recognizer instances
 */
public class RecognizerSharedState {
    /** Track the set of token types that can follow any rule invocation.
     *  Stack grows upwards.
     */
    public BitSet[] following;
    public int _fsp = -1;
    
    /** This is true when we see an error and before having successfully
     *  matched a token.  Prevents generation of more than one error message
     *  per error.
     */
    public boolean errorRecovery = false;
    
    /** The index into the input stream where the last error occurred.
     *  This is used to prevent infinite loops where an error is found
     *  but no token is consumed during recovery...another error is found,
     *  ad nauseam.  This is a failsafe mechanism to guarantee that at least
     *  one token/tree node is consumed for two errors.
     */
    public int lastErrorIndex = -1;
    
    /** In lieu of a return value, this indicates that a rule or token
     *  has failed to match.  Reset to false upon valid token match.
     */
    public boolean failed = false;
    
    /** Did the recognizer encounter a syntax error?  Track how many. */
    public int syntaxErrors = 0;
    
    /** If 0, no backtracking is going on.  Safe to exec actions etc...
     *  If >0 then it's the level of backtracking.
     */
    public int backtracking = 0;
    
    /** An array[size num rules] of Map<Integer,Integer> that tracks
     *  the stop token index for each rule.  ruleMemo[ruleIndex] is
     *  the memoization table for ruleIndex.  For key ruleStartIndex, you
     *  get back the stop token for associated rule or MEMO_RULE_FAILED.
     *
     *  This is only used if rule memoization is on.
     */
    public Map<Integer, Integer>[] ruleMemo;
    
    // LEXER FIELDS (must be in same state object to avoid casting
    // constantly in generated code and Lexer object)
    
    /** The goal of all lexer rules/methods is to create a token object.
     *  This is an instance variable as multiple rules may collaborate to
     *  create a single token.  nextToken will return this object after
     *  matching lexer rule(s).  If you subclass to allow multiple token
     *  emissions, then set this to the last token to be matched or
     *  something nonnull so that the auto token emit mechanism will not
     *  emit another token.
     */
    public Token token;
    
    /** What character index in the stream did the current token start at? */
    public int tokenStartCharIndex = -1;
    
    /** The line on which the first character of the token resides */
    public int tokenStartLine;
    
    /** The character position of first character within the line */
    public int tokenStartCharPositionInLine;
    
    /** The channel number for the current token */
    public int channel;
    
    /** The token type for the current token */
    public int type;
    
    /** You can set the text for the current token to override what is in
     *  the input char buffer.  Use setText() or can set this instance var.
     */
    public String text;
}

Types

Parser State Management

public class RecognizerSharedState {
    // Following stack for error recovery
    public BitSet[] following;
    public int _fsp = -1;
    
    // Error recovery state
    public boolean errorRecovery = false;
    public int lastErrorIndex = -1;
    public boolean failed = false;
    public int syntaxErrors = 0;
    
    // Backtracking state
    public int backtracking = 0;
    
    // Memoization tables
    public Map<Integer, Integer>[] ruleMemo;
    
    // Token creation state (for lexers)
    public Token token;
    public int tokenStartCharIndex = -1;
    public int tokenStartLine;
    public int tokenStartCharPositionInLine;
    public int channel;
    public int type;
    public String text;
}

Common Patterns

Custom Error Recovery

public class MyParser extends Parser {
    @Override
    public void recover(IntStream input, RecognitionException re) {
        // Custom recovery logic
        if (re instanceof MismatchedTokenException) {
            MismatchedTokenException mte = (MismatchedTokenException) re;
            // Try to find a recovery token
            while (input.LA(1) != Token.EOF && 
                   input.LA(1) != SEMICOLON && 
                   input.LA(1) != RBRACE) {
                input.consume();
            }
        } else {
            super.recover(input, re);
        }
    }
}

Rule Memoization

// In generated parser with memoization
public final expression_return expression() throws RecognitionException {
    int expression_StartIndex = input.index();
    if (state.backtracking > 0 && alreadyParsedRule(input, 5)) {
        return null; // Already tried this rule at this position
    }
    
    expression_return retval = new expression_return();
    // ... rule implementation ...
    
    if (state.backtracking > 0) {
        memoize(input, 5, expression_StartIndex);
    }
    }
    
    return retval;
}

Backtracking Support

// Generated code with backtracking
int alt1 = 2;
state.backtracking++;
try {
    alt1 = dfa1.predict(input);
} catch (RecognitionException re) {
    state.failed = true;
    throw re;
} finally {
    state.backtracking--;
}

if (state.failed) return retval;

switch (alt1) {
    case 1 :
        // First alternative
        break;
    case 2 :
        // Second alternative  
        break;
}

Follow Set Management

// Push follow set before rule call
pushFollow(FOLLOW_expression_in_assignment123);
expression();
state._fsp--;

// Error recovery with follow sets
BitSet follow = computeErrorRecoverySet();
Token recovered = (Token)recoverFromMismatchedToken(input, IDENTIFIER, follow);

Install with Tessl CLI

npx tessl i tessl/maven-org-antlr--antlr-runtime

docs

character-streams.md

debug-support.md

error-handling.md

index.md

lexical-analysis.md

parsing.md

token-streams.md

tree-construction.md

tile.json