Tessl Tile for maven/org.antlr/antlr-runtime@3.5.0

or run

npx @tessl/cli init

Version

Tile

Overview

Evals

Files

docs

character-streams.md debug-support.md error-handling.md index.md lexical-analysis.md parsing.md token-streams.md tree-construction.md

lexical-analysis.mddocs/

0
# Lexical Analysis
1

2
Base classes and interfaces for implementing lexers, token sources, and deterministic finite automata. The lexical analysis system transforms character streams into token streams for parsing.
3

4
## Capabilities
5

6
### Lexer Base Class
7

8
Abstract base class for all ANTLR-generated lexers providing tokenization functionality.
9

10
```java { .api }
11
/**
12
 * A lexer is a recognizer that draws input symbols from a character stream.
13
 * Lexer grammars result in a subclass of this object. A Lexer object
14
 * uses simplified match() and error recovery mechanisms in the interest
15
 * of speed.
16
 */
17
public abstract class Lexer extends BaseRecognizer implements TokenSource {
18
    public static final int DEFAULT_TOKEN_CHANNEL = Token.DEFAULT_CHANNEL;
19
    public static final int HIDDEN = Token.HIDDEN_CHANNEL;
20
    public static final int EOF = -1;
21
    
22
    /** Where is the lexer drawing characters from? */
23
    protected CharStream input;
24
    
25
    public Lexer();
26
    public Lexer(CharStream input);
27
    public Lexer(CharStream input, RecognizerSharedState state);
28
    
29
    public void reset();
30
    
31
    /** Return a token from this source; i.e., match a token on the char stream. */
32
    public abstract Token nextToken();
33
    
34
    /** Instruct the lexer to skip creating a token for the current lexer rule and look for another token */
35
    public void skip();
36
    
37
    /** This is the lexer entry point that sets instance var 'token' */
38
    public void mTokens() throws RecognitionException;
39
    
40
    /** Set the char stream and reset the lexer */
41
    public void setCharStream(CharStream input);
42
    public CharStream getCharStream();
43
    
44
    public String getSourceName();
45
    
46
    /** Currently does not support multiple emits per nextToken invocation
47
     *  for efficiency reasons.  Subclass and override this method and
48
     *  nextToken (to push tokens into a list and pull from that list rather
49
     *  than a single variable as this implementation does).
50
     */
51
    public void emit(Token token);
52
    
53
    /** The standard method called to automatically emit a token at the
54
     *  outermost lexical rule.  The token object should point into the
55
     *  char buffer start..stop.  If there is a text override in 'text',
56
     *  use that to set the token's text.  Override this method to emit
57
     *  custom Token objects.
58
     */
59
    public Token emit();
60
    
61
    public void match(String s) throws MismatchedTokenException;
62
    public void match(int c) throws MismatchedTokenException;
63
    public void matchAny() throws MismatchedTokenException;
64
    public void matchRange(int a, int b) throws MismatchedTokenException;
65
    
66
    public int getLine();
67
    public int getCharPositionInLine();
68
    
69
    /** What is the index of the current character of lookahead? */
70
    public int getCharIndex();
71
    
72
    /** Return the text matched so far for the current token or any text override. */
73
    public String getText();
74
    public void setText(String text);
75
    
76
    public void reportError(RecognitionException e);
77
    public String getErrorMessage(RecognitionException e, String[] tokenNames);
78
    public String getCharErrorDisplay(int c);
79
    
80
    /** Lexers can normally match any char in its vocabulary after matching
81
     *  a token, so do the easy thing and just kill a character and hope
82
     *  it all works out.  You can instead use the rule invocation stack
83
     *  to do sophisticated error recovery if you are in a fragment rule.
84
     */
85
    public void recover(RecognitionException re);
86
    
87
    public void traceIn(String ruleName, int ruleIndex);
88
    public void traceOut(String ruleName, int ruleIndex);
89
}
90
```
91

92
**Usage Examples:**
93

94
```java
95
import org.antlr.runtime.*;
96

97
// Create lexer with string input
98
CharStream input = new ANTLRStringStream("hello world 123");
99
MyLexer lexer = new MyLexer(input);
100

101
// Get tokens one by one
102
Token token;
103
while ((token = lexer.nextToken()).getType() != Token.EOF) {
104
    System.out.println("Token: " + token.getText() + 
105
                      " Type: " + token.getType() + 
106
                      " Line: " + token.getLine());
107
}
108

109
// Set custom text for token
110
// Inside lexer rule:
111
// setText("custom text");
112
// emit();
113
```
114

115
### Token Implementation
116

117
Standard token implementation containing all token information.
118

119
```java { .api }
120
/**
121
 * Default Token implementation
122
 */
123
public class CommonToken implements Token {
124
    protected int type;
125
    protected int line;
126
    protected int charPositionInLine = -1; // set to invalid position
127
    protected int channel = DEFAULT_CHANNEL;
128
    protected int index = -1;
129
    protected CharStream input;
130
    protected String text;
131
    protected int start;
132
    protected int stop;
133
    
134
    public CommonToken(int type);
135
    public CommonToken(Token oldToken);
136
    public CommonToken(int type, String text);
137
    public CommonToken(CharStream input, int type, int channel, int start, int stop);
138
    
139
    public int getType();
140
    public void setType(int type);
141
    public int getLine();
142
    public void setLine(int line);
143
    public String getText();
144
    public void setText(String text);
145
    public int getCharPositionInLine();
146
    public void setCharPositionInLine(int charPositionInLine);
147
    public int getChannel();
148
    public void setChannel(int channel);
149
    public int getTokenIndex();
150
    public void setTokenIndex(int index);
151
    public CharStream getInputStream();
152
    public void setInputStream(CharStream input);
153
    
154
    public String toString();
155
}
156
```
157

158
**Usage Examples:**
159

160
```java
161
import org.antlr.runtime.*;
162

163
// Create tokens manually
164
Token identifier = new CommonToken(MyLexer.IDENTIFIER, "variable");
165
identifier.setLine(1);
166
identifier.setCharPositionInLine(0);
167

168
// Create from existing token
169
Token copy = new CommonToken(identifier);
170

171
// Create with all parameters
172
CharStream input = new ANTLRStringStream("test");
173
Token detailed = new CommonToken(input, MyLexer.STRING, Token.DEFAULT_CHANNEL, 0, 3);
174

175
System.out.println("Token text: " + detailed.getText());
176
System.out.println("Token type: " + detailed.getType());
177
```
178

179
### Alternative Token Implementation
180

181
```java { .api }
182
/**
183
 * Alternative token implementation
184
 */
185
public class ClassicToken implements Token {
186
    protected String text;
187
    protected int type;
188
    protected int line;
189
    protected int charPositionInLine;
190
    protected int channel = DEFAULT_CHANNEL;
191
    protected int index;
192
    protected CharStream input;
193
    
194
    public ClassicToken(int type);
195
    public ClassicToken(int type, String text);
196
    public ClassicToken(int type, String text, int channel);
197
    
198
    public String getText();
199
    public void setText(String text);
200
    public int getType();
201
    public void setType(int type);
202
    public int getLine();  
203
    public void setLine(int line);
204
    public int getCharPositionInLine();
205
    public void setCharPositionInLine(int charPositionInLine);
206
    public int getChannel();
207
    public void setChannel(int channel);
208
    public int getTokenIndex();
209
    public void setTokenIndex(int index);
210
    public CharStream getInputStream();
211
    public void setInputStream(CharStream input);
212
    
213
    public String toString();
214
}
215
```
216

217
### Deterministic Finite Automaton
218

219
DFA implementation for efficient lexical recognition with prediction caching.
220

221
```java { .api }
222
/**
223
 * Deterministic finite automaton for recognition
224
 */
225
public class DFA {
226
    public static final int UNINITIALIZED = -1;
227
    
228
    short[][] eot;
229
    short[][] eof; 
230
    char[][] min;
231
    char[][] max;
232
    short[][] accept;
233
    short[][] special;
234
    short[][] transition;
235
    
236
    int decisionNumber;
237
    BaseRecognizer recognizer;
238
    
239
    public DFA();
240
    public DFA(short[][] eot, short[][] eof, char[][] min, char[][] max, 
241
               short[][] accept, short[][] special, short[][] transition, 
242
               int decisionNumber);
243
    
244
    /** From the input stream, predict what alternative will succeed
245
     *  using this DFA (representing the covering regular approximation
246
     *  to the underlying CFL).  Return an alternative number 1..n.  
247
     *  Throw an exception upon error.
248
     */
249
    public int predict(IntStream input) throws RecognitionException;
250
    
251
    protected void error(NoViableAltException nvae);
252
    
253
    public static short[] unpackEncodedString(String encodedString);
254
    public static char[] unpackEncodedStringToUnsignedChars(String encodedString);
255
    
256
    /** A hook to allow subclasses to do something useful with the DFA tables */
257
    protected int specialStateTransition(int s, IntStream _input) throws NoViableAltException;
258
    
259
    public String getDescription();
260
}
261
```
262

263
**Usage Examples:**
264

265
```java
266
// DFA is typically used internally by generated lexers
267
// Users don't usually create DFA objects directly
268

269
// In generated lexer code:
270
// public class MyLexer extends Lexer {
271
//     static final String DFA1_eotS = "...";
272
//     static final String DFA1_eofS = "...";
273
//     ...
274
//     static final short[] DFA1_eot = unpackEncodedString(DFA1_eotS);
275
//     ...
276
//     class DFA1 extends DFA {
277
//         public DFA1(BaseRecognizer recognizer) {
278
//             this.recognizer = recognizer;
279
//             this.decisionNumber = 1;
280
//             this.eot = DFA1_eot;
281
//             ...
282
//         }
283
//     }
284
// }
285
```
286

287
### Token Source Interface
288

289
Interface for objects that produce tokens.
290

291
```java { .api }
292
/**
293
 * Source that produces Token objects
294
 */
295
public interface TokenSource {
296
    /** Return the next token from the character stream.
297
     *  Return a token of type Token.EOF (not null) at end of input.
298
     */
299
    public Token nextToken();
300
    
301
    /** Where are you getting tokens from? Normally the implementation will simply
302
     *  ask the lexer's input stream.
303
     */
304
    public String getSourceName();
305
}
306
```
307

308
### Bit Set Utility
309

310
Efficient set operations for token types and character classes.
311

312
```java { .api }
313
/**
314
 * Set of integers for efficient FOLLOW set representation
315
 */
316
public class BitSet implements Cloneable {
317
    public static final int BITS = 64;    // number of bits / long
318
    public static final int LOG_BITS = 6; // 2^6 == 64
319
    
320
    protected long[] bits;
321
    
322
    public BitSet();
323
    public BitSet(long[] bits_);
324
    public BitSet(List<Integer> items);
325
    public BitSet(int nbits);
326
    
327
    public static BitSet of(int el);
328
    public static BitSet of(int a, int b);
329
    public static BitSet of(int a, int b, int c);
330
    public static BitSet of(int a, int b, int c, int d);
331
    
332
    public BitSet or(BitSet a);
333
    public void add(int el);
334
    public BitSet and(BitSet a);
335
    public boolean member(int el);
336
    public void remove(int el);
337
    public boolean isNil();
338
    public int numBits();
339
    public int lengthInLongWords();
340
    public int size();
341
    public boolean equals(Object other);
342
    public Object clone();
343
    public int[] toArray();
344
    public long[] toPackedArray();
345
    public String toString();
346
    public String toString(String[] tokenNames);
347
}
348
```
349

350
**Usage Examples:**
351

352
```java
353
import org.antlr.runtime.BitSet;
354

355
// Create bit sets for token types
356
BitSet keywords = BitSet.of(MyLexer.IF, MyLexer.WHILE, MyLexer.FOR);
357
BitSet operators = BitSet.of(MyLexer.PLUS, MyLexer.MINUS, MyLexer.MULT);
358

359
// Combine sets
360
BitSet combined = keywords.or(operators);
361

362
// Test membership
363
boolean isKeyword = keywords.member(tokenType);
364

365
// Convert to array
366
int[] types = combined.toArray();
367

368
// Display with token names
369
String[] tokenNames = {"IF", "WHILE", "FOR", "PLUS", "MINUS", "MULT"};
370
System.out.println(combined.toString(tokenNames));
371
```
372

373
## Types
374

375
### Token Constants
376

377
```java { .api }
378
public interface Token {
379
    /** imaginary tree navigation type; traverse "get child" link */
380
    public static final int DOWN = 2;
381
    /** imaginary tree navigation type; finish with a child list */
382
    public static final int UP = 3;
383
    public static final int MIN_TOKEN_TYPE = UP+1;
384
    public static final int EOF = CharStream.EOF;
385
    public static final int INVALID_TOKEN_TYPE = 0;
386
    public static final int DEFAULT_CHANNEL = 0;
387
    /** Anything on a different channel than DEFAULT_CHANNEL is not parsed by the parser. */
388
    public static final int HIDDEN_CHANNEL = 99;
389
    
390
    public static final Token INVALID_TOKEN = new CommonToken(INVALID_TOKEN_TYPE);
391
    public static final Token SKIP_TOKEN = new CommonToken(INVALID_TOKEN_TYPE);
392
}
393
```
394

395
## Common Patterns
396

397
### Custom Token Creation
398

399
```java
400
public class MyLexer extends Lexer {
401
    public Token emit() {
402
        CommonToken t = new CommonToken(input, state.type, state.channel, 
403
                                       state.tokenStartCharIndex, getCharIndex()-1);
404
        t.setLine(state.tokenStartLine);
405
        t.setText(state.text);
406
        t.setCharPositionInLine(state.tokenStartCharPositionInLine);
407
        emit(t);
408
        return t;
409
    }
410
}
411
```
412

413
### Skip Tokens
414

415
```java
416
// In lexer rule to skip whitespace
417
public void mWHITESPACE() throws RecognitionException {
418
    // ... match whitespace characters ...
419
    skip(); // Don't create token, get next one
420
}
421
```
422

423
### Channel-based Token Routing
424

425
```java
426
// In lexer rule to send comments to hidden channel
427
public void mCOMMENT() throws RecognitionException {
428
    // ... match comment ...
429
    state.channel = HIDDEN; // Send to hidden channel
430
}
431
```
432

433
### Error Recovery in Lexer
434

435
```java
436
public void recover(RecognitionException re) {
437
    // Simple: skip the current character and try again
438
    if (input.LA(1) != CharStream.EOF) {
439
        input.consume();
440
    }
441
}
442
```
443

444
### Token Text Override
445

446
```java
447
// In lexer rule to modify token text
448
public void mSTRING_LITERAL() throws RecognitionException {
449
    // ... match string including quotes ...
450
    
451
    // Remove quotes from token text
452
    String s = getText();
453
    setText(s.substring(1, s.length()-1)); // Remove first and last char
454
}
455
```

Version

Tile

Files

lexical-analysis.md docs/

lexical-analysis.mddocs/