Java runtime library for ANTLR v3 - a framework for constructing recognizers, interpreters, compilers, and translators from grammatical descriptions.
—
Base classes and interfaces for implementing lexers, token sources, and deterministic finite automata. The lexical analysis system transforms character streams into token streams for parsing.
Abstract base class for all ANTLR-generated lexers providing tokenization functionality.
/**
 * A lexer is a recognizer that draws input symbols from a character stream.
 * Lexer grammars result in a subclass of this object. A Lexer object
 * uses simplified match() and error recovery mechanisms in the interest
 * of speed.
 */
public abstract class Lexer extends BaseRecognizer implements TokenSource {
/** Default channel for tokens the parser consumes (alias of Token.DEFAULT_CHANNEL). */
public static final int DEFAULT_TOKEN_CHANNEL = Token.DEFAULT_CHANNEL;
/** Channel for tokens the parser ignores (alias of Token.HIDDEN_CHANNEL). */
public static final int HIDDEN = Token.HIDDEN_CHANNEL;
/** Sentinel value signaling end of file. */
public static final int EOF = -1;
/** Where is the lexer drawing characters from? */
protected CharStream input;
public Lexer();
public Lexer(CharStream input);
public Lexer(CharStream input, RecognizerSharedState state);
/** Reset lexer state so the instance can be reused. */
public void reset();
/** Return a token from this source; i.e., match a token on the char stream. */
public abstract Token nextToken();
/** Instruct the lexer to skip creating a token for current lexer rule and look for another token */
public void skip();
/** This is the lexer entry point that sets instance var 'token' */
public void mTokens() throws RecognitionException;
/** Set the char stream and reset the lexer */
public void setCharStream(CharStream input);
public CharStream getCharStream();
/** Name of the input source (presumably delegated to the char stream -- confirm). */
public String getSourceName();
/** Currently does not support multiple emits per nextToken invocation
 * for efficiency reasons. Subclass and override this method and
 * nextToken (to push tokens into a list and pull from that list rather
 * than a single variable as this implementation does).
 */
public void emit(Token token);
/** The standard method called to automatically emit a token at the
 * outermost lexical rule. The token object should point into the
 * char buffer start..stop. If there is a text override in 'text',
 * use that to set the token's text. Override this method to emit
 * custom Token objects.
 */
public Token emit();
/** Match the exact string s against the input; throw on mismatch. */
public void match(String s) throws MismatchedTokenException;
/** Match the single character c against the input; throw on mismatch. */
public void match(int c) throws MismatchedTokenException;
/** Match any single character (wildcard). */
public void matchAny() throws MismatchedTokenException;
/** Match one character in the inclusive range a..b; throw on mismatch. */
public void matchRange(int a, int b) throws MismatchedTokenException;
/** Current line number in the input. */
public int getLine();
/** Current character position within the line. */
public int getCharPositionInLine();
/** What is the index of the current character of lookahead? */
public int getCharIndex();
/** Return the text matched so far for the current token or any text override. */
public String getText();
/** Override the text for the token currently being matched. */
public void setText(String text);
public void reportError(RecognitionException e);
public String getErrorMessage(RecognitionException e, String[] tokenNames);
/** Render character c in a human-readable form for error messages. */
public String getCharErrorDisplay(int c);
/** Lexers can normally match any char in its vocabulary after matching
 * a token, so do the easy thing and just kill a character and hope
 * it all works out. You can instead use the rule invocation stack
 * to do sophisticated error recovery if you are in a fragment rule.
 */
public void recover(RecognitionException re);
public void traceIn(String ruleName, int ruleIndex);
public void traceOut(String ruleName, int ruleIndex);
}Usage Examples:
import org.antlr.runtime.*;
// Create lexer with string input
CharStream input = new ANTLRStringStream("hello world 123");
MyLexer lexer = new MyLexer(input);
// Get tokens one by one
Token token;
while ((token = lexer.nextToken()).getType() != Token.EOF) {
System.out.println("Token: " + token.getText() +
" Type: " + token.getType() +
" Line: " + token.getLine());
}
// Set custom text for token
// Inside lexer rule:
// setText("custom text");
// emit();Standard token implementation containing all token information.
/**
 * Default Token implementation
 */
public class CommonToken implements Token {
/** Token type as assigned by the grammar. */
protected int type;
/** Line on which this token occurred. */
protected int line;
protected int charPositionInLine = -1; // set to invalid position
/** Channel this token lives on; only DEFAULT_CHANNEL reaches the parser. */
protected int channel = DEFAULT_CHANNEL;
/** Index of this token within the token stream; -1 until assigned. */
protected int index = -1;
/** Char stream this token was created from, if any (see the 5-arg constructor). */
protected CharStream input;
/** Explicit text override; when null, text is presumably derived from input start..stop -- confirm. */
protected String text;
/** Start char index into the input stream (see the 5-arg constructor). */
protected int start;
/** Stop char index into the input stream (see the 5-arg constructor). */
protected int stop;
public CommonToken(int type);
/** Copy constructor (see usage example: new CommonToken(identifier)). */
public CommonToken(Token oldToken);
public CommonToken(int type, String text);
public CommonToken(CharStream input, int type, int channel, int start, int stop);
public int getType();
public void setType(int type);
public int getLine();
public void setLine(int line);
public String getText();
public void setText(String text);
public int getCharPositionInLine();
public void setCharPositionInLine(int charPositionInLine);
public int getChannel();
public void setChannel(int channel);
public int getTokenIndex();
public void setTokenIndex(int index);
public CharStream getInputStream();
public void setInputStream(CharStream input);
public String toString();
}Usage Examples:
import org.antlr.runtime.*;
// Create tokens manually
Token identifier = new CommonToken(MyLexer.IDENTIFIER, "variable");
identifier.setLine(1);
identifier.setCharPositionInLine(0);
// Create from existing token
Token copy = new CommonToken(identifier);
// Create with all parameters
CharStream input = new ANTLRStringStream("test");
Token detailed = new CommonToken(input, MyLexer.STRING, Token.DEFAULT_CHANNEL, 0, 3);
System.out.println("Token text: " + detailed.getText());
System.out.println("Token type: " + detailed.getType());/**
* Alternative token implementation
*/
public class ClassicToken implements Token {
/** Token text is stored directly (unlike CommonToken, which can derive it from the char stream). */
protected String text;
/** Token type as assigned by the grammar. */
protected int type;
/** Line on which this token occurred. */
protected int line;
/** Character position within the line. */
protected int charPositionInLine;
/** Channel this token lives on; only DEFAULT_CHANNEL reaches the parser. */
protected int channel = DEFAULT_CHANNEL;
/** Index of this token within the token stream. */
protected int index;
/** Char stream this token was created from, if any. */
protected CharStream input;
public ClassicToken(int type);
public ClassicToken(int type, String text);
public ClassicToken(int type, String text, int channel);
public String getText();
public void setText(String text);
public int getType();
public void setType(int type);
public int getLine();
public void setLine(int line);
public int getCharPositionInLine();
public void setCharPositionInLine(int charPositionInLine);
public int getChannel();
public void setChannel(int channel);
public int getTokenIndex();
public void setTokenIndex(int index);
public CharStream getInputStream();
public void setInputStream(CharStream input);
public String toString();
}DFA implementation for efficient lexical recognition with prediction caching.
/**
 * Deterministic finite automaton for recognition
 */
public class DFA {
/** Marker meaning "not yet computed/assigned". */
public static final int UNINITIALIZED = -1;
// Packed transition tables, one row per DFA state; generated code unpacks
// them from encoded strings via unpackEncodedString*() (see usage example).
short[][] eot;
short[][] eof;
char[][] min;
char[][] max;
short[][] accept;
short[][] special;
short[][] transition;
/** Which grammar decision this DFA implements (set by generated subclasses -- see usage example). */
int decisionNumber;
/** Owning recognizer; assigned by generated subclasses (see usage example). */
BaseRecognizer recognizer;
public DFA();
public DFA(short[][] eot, short[][] eof, char[][] min, char[][] max,
short[][] accept, short[][] special, short[][] transition,
int decisionNumber);
/** From the input stream, predict what alternative will succeed
 * using this DFA (representing the covering regular approximation
 * to the underlying CFL). Return an alternative number 1..n.
 * Throw an exception upon error.
 */
public int predict(IntStream input) throws RecognitionException;
protected void error(NoViableAltException nvae);
/** Decode a compressed table string into a short[] table row set. */
public static short[] unpackEncodedString(String encodedString);
/** Decode a compressed table string into a char[] (unsigned 16-bit values). */
public static char[] unpackEncodedStringToUnsignedChars(String encodedString);
/** A hook to allow subclasses to do something useful with the DFA tables */
protected int specialStateTransition(int s, IntStream _input) throws NoViableAltException;
public String getDescription();
}Usage Examples:
// DFA is typically used internally by generated lexers
// Users don't usually create DFA objects directly
// In generated lexer code:
// public class MyLexer extends Lexer {
// static final String DFA1_eotS = "...";
// static final String DFA1_eofS = "...";
// ...
// static final short[] DFA1_eot = unpackEncodedString(DFA1_eotS);
// ...
// class DFA1 extends DFA {
// public DFA1(BaseRecognizer recognizer) {
// this.recognizer = recognizer;
// this.decisionNumber = 1;
// this.eot = DFA1_eot;
// ...
// }
// }
// }Interface for objects that produce tokens.
/**
 * Source that produces Token objects
 */
public interface TokenSource {
/** Return the next token from the character stream.
 * NOTE(review): the Lexer usage example above loops until a token whose
 * type is Token.EOF is returned, which suggests an EOF token (rather than
 * null) marks end of input -- confirm against the implementation.
 */
public Token nextToken();
/** Where are you getting tokens from? Normally the implementation will simply
 * ask the lexer's input stream.
 */
public String getSourceName();
}Efficient set operations for token types and character classes.
/**
 * Set of integers for efficient FOLLOW set representation
 */
public class BitSet implements Cloneable {
public static final int BITS = 64; // number of bits / long
public static final int LOG_BITS = 6; // 2^6 == 64
/** The set elements, packed 64 bits per long word. */
protected long[] bits;
public BitSet();
/** Wrap an existing packed long[] (whether it is copied is not shown here -- confirm). */
public BitSet(long[] bits_);
/** Construct a set containing the given elements. */
public BitSet(List<Integer> items);
/** Construct an empty set sized for nbits elements. */
public BitSet(int nbits);
/** Factory: set containing the single element el. */
public static BitSet of(int el);
public static BitSet of(int a, int b);
public static BitSet of(int a, int b, int c);
public static BitSet of(int a, int b, int c, int d);
/** Union of this set and a (see usage example: keywords.or(operators)). */
public BitSet or(BitSet a);
/** Add element el to this set. */
public void add(int el);
/** Intersection of this set and a. */
public BitSet and(BitSet a);
/** Membership test (see usage example: keywords.member(tokenType)). */
public boolean member(int el);
/** Remove element el from this set. */
public void remove(int el);
/** True if the set is empty (inferred from name -- confirm). */
public boolean isNil();
public int numBits();
public int lengthInLongWords();
public int size();
public boolean equals(Object other);
public Object clone();
/** Elements of the set as an int array (see usage example). */
public int[] toArray();
/** The underlying packed long[] representation. */
public long[] toPackedArray();
public String toString();
/** Render members using tokenNames for readability (see usage example). */
public String toString(String[] tokenNames);
}Usage Examples:
import org.antlr.runtime.BitSet;
// Create bit sets for token types
BitSet keywords = BitSet.of(MyLexer.IF, MyLexer.WHILE, MyLexer.FOR);
BitSet operators = BitSet.of(MyLexer.PLUS, MyLexer.MINUS, MyLexer.MULT);
// Combine sets
BitSet combined = keywords.or(operators);
// Test membership
boolean isKeyword = keywords.member(tokenType);
// Convert to array
int[] types = combined.toArray();
// Display with token names
String[] tokenNames = {"IF", "WHILE", "FOR", "PLUS", "MINUS", "MULT"};
System.out.println(combined.toString(tokenNames));public interface Token {
/** imaginary tree navigation type; traverse "get child" link */
public static final int DOWN = 2;
/** imaginary tree navigation type; finish with a child list */
public static final int UP = 3;
/** First token type available to user grammars; smaller values are reserved. */
public static final int MIN_TOKEN_TYPE = UP+1;
/** End-of-file token type; same value as CharStream.EOF. */
public static final int EOF = CharStream.EOF;
/** Type for tokens that carry no valid type. */
public static final int INVALID_TOKEN_TYPE = 0;
/** Channel the parser reads tokens from. */
public static final int DEFAULT_CHANNEL = 0;
/** Anything on different channel than DEFAULT_CHANNEL is not parsed by parser. */
public static final int HIDDEN_CHANNEL = 99;
/** Shared sentinel token with INVALID_TOKEN_TYPE. */
public static final Token INVALID_TOKEN = new CommonToken(INVALID_TOKEN_TYPE);
/** Distinct sentinel instance; presumably signals "skip this token" (cf. Lexer.skip()) -- confirm. */
public static final Token SKIP_TOKEN = new CommonToken(INVALID_TOKEN_TYPE);
}public class MyLexer extends Lexer {
public Token emit() {
// Build the token over the characters matched since the rule started
// (state.tokenStartCharIndex) up to the char before the current lookahead index.
CommonToken t = new CommonToken(input, state.type, state.channel,
state.tokenStartCharIndex, getCharIndex()-1);
t.setLine(state.tokenStartLine);
t.setText(state.text); // text override; may be null -- CommonToken presumably falls back to the stream text, confirm
t.setCharPositionInLine(state.tokenStartCharPositionInLine);
emit(t); // hand the finished token to the single-token emit slot
return t;
}
}// In lexer rule to skip whitespace
public void mWHITESPACE() throws RecognitionException {
// ... match whitespace characters ...
skip(); // Don't create token, get next one
}// In lexer rule to send comments to hidden channel
public void mCOMMENT() throws RecognitionException {
// ... match comment ...
// Hidden-channel tokens are not parsed by the parser (see Token.HIDDEN_CHANNEL).
state.channel = HIDDEN; // Send to hidden channel
}public void recover(RecognitionException re) {
// Simple: skip the current character and try again
// Guard prevents consuming past end of input, which would loop forever.
if (input.LA(1) != CharStream.EOF) {
input.consume();
}
}// In lexer rule to modify token text
public void mSTRING_LITERAL() throws RecognitionException {
// ... match string including quotes ...
// Remove quotes from token text
// NOTE(review): assumes the matched text is at least 2 chars (both quotes present).
String s = getText();
setText(s.substring(1, s.length()-1)); // Remove first and last char
}Install with Tessl CLI
npx tessl i tessl/maven-org-antlr--antlr-runtime