0
# Lexical Analysis
1
2
Base classes and interfaces for implementing lexers, token sources, and deterministic finite automata. The lexical analysis system transforms character streams into token streams for parsing.
3
4
## Capabilities
5
6
### Lexer Base Class
7
8
Abstract base class for all ANTLR-generated lexers providing tokenization functionality.
9
10
```java { .api }
11
/**
12
* A lexer is a recognizer that draws input symbols from a character stream.
13
* Lexer grammars result in a subclass of this object. A Lexer object
14
* uses simplified match() and error recovery mechanisms in the interest
15
* of speed.
16
*/
17
public abstract class Lexer extends BaseRecognizer implements TokenSource {
18
public static final int DEFAULT_TOKEN_CHANNEL = Token.DEFAULT_CHANNEL;
19
public static final int HIDDEN = Token.HIDDEN_CHANNEL;
20
public static final int EOF = -1;
21
22
/** Where is the lexer drawing characters from? */
23
protected CharStream input;
24
25
public Lexer();
26
public Lexer(CharStream input);
27
public Lexer(CharStream input, RecognizerSharedState state);
28
29
public void reset();
30
31
/** Return a token from this source; i.e., match a token on the char stream. */
32
public abstract Token nextToken();
33
34
/** Instruct the lexer to skip creating a token for the current lexer rule and look for another token. */
35
public void skip();
36
37
/** This is the lexer entry point that sets instance var 'token' */
38
public void mTokens() throws RecognitionException;
39
40
/** Set the char stream and reset the lexer */
41
public void setCharStream(CharStream input);
42
public CharStream getCharStream();
43
44
public String getSourceName();
45
46
/** Currently does not support multiple emits per nextToken invocation
47
* for efficiency reasons. Subclass and override this method and
48
* nextToken (to push tokens into a list and pull from that list rather
49
* than a single variable as this implementation does).
50
*/
51
public void emit(Token token);
52
53
/** The standard method called to automatically emit a token at the
54
* outermost lexical rule. The token object should point into the
55
* char buffer start..stop. If there is a text override in 'text',
56
* use that to set the token's text. Override this method to emit
57
* custom Token objects.
58
*/
59
public Token emit();
60
61
public void match(String s) throws MismatchedTokenException;
62
public void match(int c) throws MismatchedTokenException;
63
public void matchAny() throws MismatchedTokenException;
64
public void matchRange(int a, int b) throws MismatchedTokenException;
65
66
public int getLine();
67
public int getCharPositionInLine();
68
69
/** What is the index of the current character of lookahead? */
70
public int getCharIndex();
71
72
/** Return the text matched so far for the current token or any text override. */
73
public String getText();
74
public void setText(String text);
75
76
public void reportError(RecognitionException e);
77
public String getErrorMessage(RecognitionException e, String[] tokenNames);
78
public String getCharErrorDisplay(int c);
79
80
/** Lexers can normally match any char in its vocabulary after matching
81
* a token, so do the easy thing and just kill a character and hope
82
* it all works out. You can instead use the rule invocation stack
83
* to do sophisticated error recovery if you are in a fragment rule.
84
*/
85
public void recover(RecognitionException re);
86
87
public void traceIn(String ruleName, int ruleIndex);
88
public void traceOut(String ruleName, int ruleIndex);
89
}
90
```
91
92
**Usage Examples:**
93
94
```java
95
import org.antlr.runtime.*;
96
97
// Create lexer with string input
98
CharStream input = new ANTLRStringStream("hello world 123");
99
MyLexer lexer = new MyLexer(input);
100
101
// Get tokens one by one
102
Token token;
103
while ((token = lexer.nextToken()).getType() != Token.EOF) {
104
System.out.println("Token: " + token.getText() +
105
" Type: " + token.getType() +
106
" Line: " + token.getLine());
107
}
108
109
// Set custom text for token
110
// Inside lexer rule:
111
// setText("custom text");
112
// emit();
113
```
114
115
### Token Implementation
116
117
Standard token implementation containing all token information.
118
119
```java { .api }
120
/**
121
* Default Token implementation
122
*/
123
public class CommonToken implements Token {
124
protected int type;
125
protected int line;
126
protected int charPositionInLine = -1; // set to invalid position
127
protected int channel = DEFAULT_CHANNEL;
128
protected int index = -1;
129
protected CharStream input;
130
protected String text;
131
protected int start;
132
protected int stop;
133
134
public CommonToken(int type);
135
public CommonToken(Token oldToken);
136
public CommonToken(int type, String text);
137
public CommonToken(CharStream input, int type, int channel, int start, int stop);
138
139
public int getType();
140
public void setType(int type);
141
public int getLine();
142
public void setLine(int line);
143
public String getText();
144
public void setText(String text);
145
public int getCharPositionInLine();
146
public void setCharPositionInLine(int charPositionInLine);
147
public int getChannel();
148
public void setChannel(int channel);
149
public int getTokenIndex();
150
public void setTokenIndex(int index);
151
public CharStream getInputStream();
152
public void setInputStream(CharStream input);
153
154
public String toString();
155
}
156
```
157
158
**Usage Examples:**
159
160
```java
161
import org.antlr.runtime.*;
162
163
// Create tokens manually
164
Token identifier = new CommonToken(MyLexer.IDENTIFIER, "variable");
165
identifier.setLine(1);
166
identifier.setCharPositionInLine(0);
167
168
// Create from existing token
169
Token copy = new CommonToken(identifier);
170
171
// Create with all parameters
172
CharStream input = new ANTLRStringStream("test");
173
Token detailed = new CommonToken(input, MyLexer.STRING, Token.DEFAULT_CHANNEL, 0, 3);
174
175
System.out.println("Token text: " + detailed.getText());
176
System.out.println("Token type: " + detailed.getType());
177
```
178
179
### Alternative Token Implementation
180
181
```java { .api }
182
/**
183
* Alternative token implementation
184
*/
185
public class ClassicToken implements Token {
186
protected String text;
187
protected int type;
188
protected int line;
189
protected int charPositionInLine;
190
protected int channel = DEFAULT_CHANNEL;
191
protected int index;
192
protected CharStream input;
193
194
public ClassicToken(int type);
195
public ClassicToken(int type, String text);
196
public ClassicToken(int type, String text, int channel);
197
198
public String getText();
199
public void setText(String text);
200
public int getType();
201
public void setType(int type);
202
public int getLine();
203
public void setLine(int line);
204
public int getCharPositionInLine();
205
public void setCharPositionInLine(int charPositionInLine);
206
public int getChannel();
207
public void setChannel(int channel);
208
public int getTokenIndex();
209
public void setTokenIndex(int index);
210
public CharStream getInputStream();
211
public void setInputStream(CharStream input);
212
213
public String toString();
214
}
215
```
216
217
### Deterministic Finite Automaton
218
219
DFA implementation for efficient lexical recognition with prediction caching.
220
221
```java { .api }
222
/**
223
* Deterministic finite automaton for recognition
224
*/
225
public class DFA {
226
public static final int UNINITIALIZED = -1;
227
228
short[][] eot;
229
short[][] eof;
230
char[][] min;
231
char[][] max;
232
short[][] accept;
233
short[][] special;
234
short[][] transition;
235
236
int decisionNumber;
237
BaseRecognizer recognizer;
238
239
public DFA();
240
public DFA(short[][] eot, short[][] eof, char[][] min, char[][] max,
241
short[][] accept, short[][] special, short[][] transition,
242
int decisionNumber);
243
244
/** From the input stream, predict what alternative will succeed
245
* using this DFA (representing the covering regular approximation
246
* to the underlying CFL). Return an alternative number 1..n.
247
* Throw an exception upon error.
248
*/
249
public int predict(IntStream input) throws RecognitionException;
250
251
protected void error(NoViableAltException nvae);
252
253
public static short[] unpackEncodedString(String encodedString);
254
public static char[] unpackEncodedStringToUnsignedChars(String encodedString);
255
256
/** A hook to allow subclasses to do something useful with the DFA tables */
257
protected int specialStateTransition(int s, IntStream _input) throws NoViableAltException;
258
259
public String getDescription();
260
}
261
```
262
263
**Usage Examples:**
264
265
```java
266
// DFA is typically used internally by generated lexers
267
// Users don't usually create DFA objects directly
268
269
// In generated lexer code:
270
// public class MyLexer extends Lexer {
271
// static final String DFA1_eotS = "...";
272
// static final String DFA1_eofS = "...";
273
// ...
274
// static final short[] DFA1_eot = unpackEncodedString(DFA1_eotS);
275
// ...
276
// class DFA1 extends DFA {
277
// public DFA1(BaseRecognizer recognizer) {
278
// this.recognizer = recognizer;
279
// this.decisionNumber = 1;
280
// this.eot = DFA1_eot;
281
// ...
282
// }
283
// }
284
// }
285
```
286
287
### Token Source Interface
288
289
Interface for objects that produce tokens.
290
291
```java { .api }
292
/**
293
* Source that produces Token objects
294
*/
295
public interface TokenSource {
296
/** Return the next token from the character stream.
297
* At end of input, return a token whose type is Token.EOF (never null).
298
*/
299
public Token nextToken();
300
301
/** Where are you getting tokens from? Normally the implementation will simply
302
* ask the lexer's input stream.
303
*/
304
public String getSourceName();
305
}
306
```
307
308
### Bit Set Utility
309
310
Efficient set operations for token types and character classes.
311
312
```java { .api }
313
/**
314
* Set of integers for efficient FOLLOW set representation
315
*/
316
public class BitSet implements Cloneable {
317
public static final int BITS = 64; // number of bits / long
318
public static final int LOG_BITS = 6; // 2^6 == 64
319
320
protected long[] bits;
321
322
public BitSet();
323
public BitSet(long[] bits_);
324
public BitSet(List<Integer> items);
325
public BitSet(int nbits);
326
327
public static BitSet of(int el);
328
public static BitSet of(int a, int b);
329
public static BitSet of(int a, int b, int c);
330
public static BitSet of(int a, int b, int c, int d);
331
332
public BitSet or(BitSet a);
333
public void add(int el);
334
public BitSet and(BitSet a);
335
public boolean member(int el);
336
public void remove(int el);
337
public boolean isNil();
338
public int numBits();
339
public int lengthInLongWords();
340
public int size();
341
public boolean equals(Object other);
342
public Object clone();
343
public int[] toArray();
344
public long[] toPackedArray();
345
public String toString();
346
public String toString(String[] tokenNames);
347
}
348
```
349
350
**Usage Examples:**
351
352
```java
353
import org.antlr.runtime.BitSet;
354
355
// Create bit sets for token types
356
BitSet keywords = BitSet.of(MyLexer.IF, MyLexer.WHILE, MyLexer.FOR);
357
BitSet operators = BitSet.of(MyLexer.PLUS, MyLexer.MINUS, MyLexer.MULT);
358
359
// Combine sets
360
BitSet combined = keywords.or(operators);
361
362
// Test membership
363
boolean isKeyword = keywords.member(tokenType);
364
365
// Convert to array
366
int[] types = combined.toArray();
367
368
// Display with token names
369
String[] tokenNames = {"IF", "WHILE", "FOR", "PLUS", "MINUS", "MULT"};
370
System.out.println(combined.toString(tokenNames));
371
```
372
373
## Types
374
375
### Token Constants
376
377
```java { .api }
378
public interface Token {
379
/** imaginary tree navigation type; traverse "get child" link */
380
public static final int DOWN = 2;
381
/** imaginary tree navigation type; finish with a child list */
382
public static final int UP = 3;
383
public static final int MIN_TOKEN_TYPE = UP+1;
384
public static final int EOF = CharStream.EOF;
385
public static final int INVALID_TOKEN_TYPE = 0;
386
public static final int DEFAULT_CHANNEL = 0;
387
/** Anything on a channel other than DEFAULT_CHANNEL is not parsed by the parser. */
388
public static final int HIDDEN_CHANNEL = 99;
389
390
public static final Token INVALID_TOKEN = new CommonToken(INVALID_TOKEN_TYPE);
391
public static final Token SKIP_TOKEN = new CommonToken(INVALID_TOKEN_TYPE);
392
}
393
```
394
395
## Common Patterns
396
397
### Custom Token Creation
398
399
```java
400
public class MyLexer extends Lexer {
401
public Token emit() {
402
CommonToken t = new CommonToken(input, state.type, state.channel,
403
state.tokenStartCharIndex, getCharIndex()-1);
404
t.setLine(state.tokenStartLine);
405
t.setText(state.text);
406
t.setCharPositionInLine(state.tokenStartCharPositionInLine);
407
emit(t);
408
return t;
409
}
410
}
411
```
412
413
### Skip Tokens
414
415
```java
416
// In lexer rule to skip whitespace
417
public void mWHITESPACE() throws RecognitionException {
418
// ... match whitespace characters ...
419
skip(); // Don't create token, get next one
420
}
421
```
422
423
### Channel-based Token Routing
424
425
```java
426
// In lexer rule to send comments to hidden channel
427
public void mCOMMENT() throws RecognitionException {
428
// ... match comment ...
429
state.channel = HIDDEN; // Send to hidden channel
430
}
431
```
432
433
### Error Recovery in Lexer
434
435
```java
436
public void recover(RecognitionException re) {
437
// Simple: skip the current character and try again
438
if (input.LA(1) != CharStream.EOF) {
439
input.consume();
440
}
441
}
442
```
443
444
### Token Text Override
445
446
```java
447
// In lexer rule to modify token text
448
public void mSTRING_LITERAL() throws RecognitionException {
449
// ... match string including quotes ...
450
451
// Remove quotes from token text
452
String s = getText();
453
setText(s.substring(1, s.length()-1)); // Remove first and last char
454
}
455
```