0
# Parsing
1
2
Core parsing infrastructure including parser base classes, rule return scopes, and recognizer functionality. The parsing system processes token streams to build parse trees and abstract syntax trees.
3
4
## Capabilities
5
6
### Parser Base Class
7
8
Concrete parser class for processing token streams with error recovery and backtracking support.
9
10
```java { .api }
11
/**
12
* A parser for TokenStreams. "parser grammars" result in a subclass of this.
13
*/
14
public class Parser extends BaseRecognizer {
15
public TokenStream input;
16
17
public Parser(TokenStream input);
18
public Parser(TokenStream input, RecognizerSharedState state);
19
20
public void reset();
21
public void setTokenStream(TokenStream input);
22
public TokenStream getTokenStream();
23
24
protected Object getCurrentInputSymbol(IntStream input);
25
protected Object getMissingSymbol(IntStream input, RecognitionException e,
26
int expectedTokenType, BitSet follow);
27
28
/** Set the token stream and reset the parser */
29
public void setTokenStream(TokenStream input);
30
31
public void traceIn(String ruleName, int ruleIndex);
32
public void traceOut(String ruleName, int ruleIndex);
33
}
34
```
35
36
**Usage Examples:**
37
38
```java
39
import org.antlr.runtime.*;
40
41
// Create parser with token stream
42
MyLexer lexer = new MyLexer(new ANTLRStringStream("x = 42;"));
43
CommonTokenStream tokens = new CommonTokenStream(lexer);
44
MyParser parser = new MyParser(tokens);
45
46
// Parse starting from rule
47
try {
48
MyParser.program_return result = parser.program();
49
System.out.println("Parse successful");
50
51
// Get parse tree if available
52
Object tree = result.getTree();
53
if (tree != null) {
54
System.out.println("Tree: " + tree.toString());
55
}
56
} catch (RecognitionException e) {
57
System.err.println("Parse error: " + e.getMessage());
58
}
59
60
// Reset parser for reuse
61
parser.reset();
62
parser.setTokenStream(new CommonTokenStream(new MyLexer(new ANTLRStringStream("y = 10;"))));
63
```
64
65
### Base Recognizer
66
67
Abstract base class providing common functionality for all ANTLR recognizers.
68
69
```java { .api }
70
/**
71
* A generic recognizer that can handle recognizers generated from
72
* lexer, parser, and tree grammars. This is all the parsing
73
* support code essentially; most of it is error recovery stuff and
74
* backtracking.
75
*/
76
public abstract class BaseRecognizer {
77
public static final int MEMO_RULE_FAILED = -2;
78
public static final int MEMO_RULE_UNKNOWN = -1;
79
public static final int INITIAL_FOLLOW_STACK_SIZE = 100;
80
public static final int DEFAULT_TOKEN_CHANNEL = Token.DEFAULT_CHANNEL;
81
public static final int HIDDEN = Token.HIDDEN_CHANNEL;
82
public static final String NEXT_TOKEN_RULE_NAME = "nextToken";
83
84
protected RecognizerSharedState state;
85
86
public BaseRecognizer();
87
public BaseRecognizer(RecognizerSharedState state);
88
89
/** reset the parser's state. Subclasses must rewinds the input stream. */
90
public void reset();
91
92
/** Match current input symbol against ttype. Attempt
93
* single token insertion or deletion error recovery. If
94
* that fails, throw MismatchedTokenException.
95
*/
96
public Object match(IntStream input, int ttype, BitSet follow) throws RecognitionException;
97
98
/** Match the wildcard: in a symbol */
99
public void matchAny(IntStream input);
100
101
public boolean mismatchIsUnwantedToken(IntStream input, int ttype);
102
public boolean mismatchIsMissingToken(IntStream input, BitSet follow);
103
104
/** Report a recognition problem.
105
* This method sets errorRecovery to indicate the parser is recovering
106
* not parsing. Once in recovery mode, no errors are generated.
107
* To get out of recovery mode, the parser must successfully match
108
* a token (after a resync). So it will go:
109
*
110
* 1. error occurs
111
* 2. enter recovery mode, report error
112
* 3. consume until token found in resynch set
113
* 4. try to resume parsing
114
* 5. next match() will reset errorRecovery mode
115
*/
116
public void reportError(RecognitionException e);
117
118
public void displayRecognitionError(String[] tokenNames, RecognitionException e);
119
120
/** What error message should be generated for the various exception types? */
121
public String getErrorMessage(RecognitionException e, String[] tokenNames);
122
123
/** How should a token be displayed in an error message? The default
124
* is to display just the text, but during development you might
125
* want to have lots of information spit out. Override in that case
126
* to use t.toString() (which, for CommonToken, dumps everything about
127
* the token). This is better than forcing you to override a method in
128
* your token objects because you don't have to go modify your lexer
129
* so that it creates a new Java type.
130
*/
131
public String getTokenErrorDisplay(Token t);
132
133
/** Override this method to change where error messages go */
134
public void emitErrorMessage(String msg);
135
136
/** Recover from an error found on the input stream. This is for NoViableAlt and
137
* mismatched symbol exceptions. If you enable single token insertion and deletion,
138
* this will usually not handle mismatched symbol exceptions but there could be
139
* a mismatched token that the match() routine could not recover from.
140
*/
141
public void recover(IntStream input, RecognitionException re);
142
143
/** A hook to listen in on the token consumption during error recovery.
144
* The DebugParser subclasses this to fire events to the listenter.
145
*/
146
public void beginResync();
147
public void endResync();
148
149
/** Compute the error recovery set for the current rule. During
150
* rule invocation, the parser pushes the set of tokens that can
151
* follow that rule reference on the stack; this amounts to
152
* computing FIRST of what follows the rule reference in the
153
* enclosing rule. This local follow set only includes tokens
154
* from within the rule; i.e., the FIRST computation done on
155
* FOLLOW(r) for r not in context.
156
*/
157
protected BitSet computeErrorRecoverySet();
158
159
/** Compute the context-sensitive FOLLOW set for current rule.
160
* This is set of token types that can follow a specific rule
161
* reference given a specific call chain. You get the set of
162
* viable tokens that can possibly come next (lookahead depth 1)
163
* given the current call chain. Contrast this with the
164
* definition of plain FOLLOW for rule r:
165
*
166
* FOLLOW(r) = set of (local) tokens that can possibly follow
167
* references to r in *any* sentential form
168
* (r-->(alpha) beta, r->(alpha) gamma, ...)
169
*
170
* Note that the parser pushes just the local FOLLOW sets and upon
171
* error, we want to combine these below all scope until we reach
172
* a recovery rule (typically the top-level rule).
173
*/
174
protected BitSet computeContextSensitiveRuleFOLLOW();
175
176
protected BitSet combineFollows(boolean exact);
177
178
/** Attempt to recover from a single missing or extra token.
179
*
180
* EXTRA TOKEN
181
*
182
* LA(1) is not what we are looking for. If LA(2) has the right token,
183
* however, then assume LA(1) is some extra token and delete it.
184
* LA(2) must be part of the follow set of the missing token.
185
* This can only happen if you are trying to match two tokens and
186
* the first one is wrong. To match two tokens, either we backtrack
187
* (which is slow) or match the second token after seeing the first
188
* token is wrong. If we match second token, we have to delete the
189
* first token. This is hard to do so typically we backtrack or
190
* just do the default recovery which is to panic and skip until
191
* we find something we know about.
192
*
193
* MISSING TOKEN
194
*
195
* If current token is consistent with what could come after missing
196
* token, then it is ok to "insert" the missing token. LA(1) is
197
* what we want to match next and if LA(1) is consistent with what
198
* could come after the expected token, then we assume the expected token
199
* is missing and we insert it. For example, Input "i=(3;" is missing the
200
* ')'. At ')' we have LA(1)==';' which is consistent with
201
* what could come after ')' so we generate().
202
*/
203
protected Object recoverFromMismatchedToken(IntStream input, int ttype, BitSet follow)
204
throws RecognitionException;
205
206
/** Not currently used */
207
public Object recoverFromMismatchedSet(IntStream input, RecognitionException e, BitSet follow)
208
throws RecognitionException;
209
210
protected Object getCurrentInputSymbol(IntStream input);
211
protected Object getMissingSymbol(IntStream input, RecognitionException e,
212
int expectedTokenType, BitSet follow);
213
214
/** Conjure up a missing token during error recovery.
215
* The recognizer attempts to recover from single missing
216
* symbols. But, actions might refer to that missing symbol.
217
* For example, x=ID {f($x);}. The f($x) refers to the missing ID.
218
* If you just return User.INVALID_TOKEN_TYPE, then f($x) will try to invoke
219
* you action with a null argument. It's better to conjure one up
220
* as follows:
221
*
222
* ID(CommonToken(ID, "ID"));
223
*
224
* where ID is the token type of the missing token. getMissingSymbol
225
* makes no assumptions about which Token object you use for
226
* the missing symbol.
227
*
228
* This method is used primarily for error recovery not on the
229
* critical path.
230
*/
231
protected Token getMissingSymbol(IntStream input, RecognitionException e,
232
int expectedTokenType, BitSet follow);
233
234
public void pushFollow(BitSet fset);
235
protected void popFollow();
236
237
/** Return List<String> of the rules in your parser instance
238
* leading up to a call to the current rule. You could override if
239
* you want more details such as the file/line info of where
240
* in the parser java code a rule is invoked.
241
*
242
* This is very useful for error messages.
243
*/
244
public List<String> getRuleInvocationStack();
245
public List<String> getRuleInvocationStack(Throwable e);
246
247
public String getBacktrackingLevel();
248
249
/** Used to print out token names like ID during debugging and
250
* error reporting. The generated parsers implement a method
251
* that overrides this to point to their String[] tokenNames.
252
*/
253
public String[] getTokenNames();
254
255
/** For debugging and other purposes, might want the grammar name.
256
* Have ANTLR generate an implementation for this method.
257
*/
258
public String getGrammarFileName();
259
260
public String getSourceName();
261
262
/** A convenience method for use most often with template rewrites.
263
* Convert a List<Token> to List<String>
264
*/
265
public List<String> toStrings(List<Token> tokens);
266
267
/** Given a rule number and a start token index number, return
268
* MEMO_RULE_UNKNOWN if the rule has not parsed input starting from
269
* start index. If this rule has parsed input starting from the
270
* start index before, then return where the rule stopped parsing.
271
* It returns the index of the last token matched by the rule.
272
*
273
* For now we use a hashtable and just the slow Object-based one.
274
* Later, we can make a special one for ints and also one that
275
* tosses out entries after we commit past input position i.
276
*/
277
public Integer getRuleMemoization(int ruleIndex, int ruleStartIndex);
278
279
/** Has this rule already parsed input at the current index in the
280
* input stream? Return the stop token index or MEMO_RULE_UNKNOWN.
281
* If we attempted but failed to parse properly before, return
282
* MEMO_RULE_FAILED.
283
*
284
* This method has a side-effect: if we have seen this input for
285
* this rule before, then seek ahead to 1 past the stop token matched
286
* for this rule last time.
287
*/
288
public boolean alreadyParsedRule(IntStream input, int ruleIndex);
289
290
/** Record whether or not this rule parsed the input at this position
291
* successfully. Use a standard java hashtable for now.
292
*/
293
public void memoize(IntStream input, int ruleIndex, int ruleStartIndex);
294
}
295
```
296
297
**Usage Examples:**
298
299
```java
300
// Error handling in parser
301
public class MyParser extends Parser {
302
public void displayRecognitionError(String[] tokenNames, RecognitionException e) {
303
String hdr = getErrorHeader(e);
304
String msg = getErrorMessage(e, tokenNames);
305
System.err.println(hdr + " " + msg);
306
}
307
308
public String getErrorMessage(RecognitionException e, String[] tokenNames) {
309
// Custom error message formatting
310
return "Syntax error at line " + e.line + ": " + super.getErrorMessage(e, tokenNames);
311
}
312
}
313
314
// Using follow sets for error recovery
315
BitSet follow = new BitSet();
316
follow.add(MyParser.SEMICOLON);
317
follow.add(MyParser.RBRACE);
318
Token recovered = (Token)recoverFromMismatchedToken(input, MyParser.IDENTIFIER, follow);
319
```
320
321
### Rule Return Scopes
322
323
Return value containers for parser rules supporting tokens, trees, and custom attributes.
324
325
```java { .api }
326
/**
327
* Return value holder for rule invocations
328
*/
329
public class RuleReturnScope {
330
/** Return the start token or tree */
331
public Object getStart();
332
public void setStart(Object start);
333
334
/** Return the stop token or tree */
335
public Object getStop();
336
public void setStop(Object stop);
337
338
/** Has a value potentially if output=AST; */
339
public Object getTree();
340
public void setTree(Object tree);
341
}
342
343
/**
344
* Parser rule return scope
345
*/
346
public class ParserRuleReturnScope extends RuleReturnScope {
347
/** First token matched by this rule */
348
public Token start;
349
/** Last token matched by this rule */
350
public Token stop;
351
352
public Object getStart();
353
public Object getStop();
354
}
355
```
356
357
**Usage Examples:**
358
359
```java
360
// Generated parser rule return type
361
public static class expression_return extends ParserRuleReturnScope {
362
public CommonTree tree;
363
public Object getTree() { return tree; }
364
};
365
366
// Using rule return in parser
367
public final expression_return expression() throws RecognitionException {
368
expression_return retval = new expression_return();
369
retval.start = input.LT(1);
370
371
try {
372
// Rule implementation...
373
374
retval.stop = input.LT(-1);
375
// retval.tree = ...
376
377
} catch (RecognitionException re) {
378
reportError(re);
379
recover(input, re);
380
}
381
382
return retval;
383
}
384
385
// Using returned values
386
MyParser.expression_return result = parser.expression();
387
Token startToken = result.start;
388
Token stopToken = result.stop;
389
CommonTree tree = (CommonTree) result.getTree();
390
```
391
392
### Recognizer Shared State
393
394
Shared state object for managing parser state across rule invocations.
395
396
```java { .api }
397
/**
398
* Shared state between recognizer instances
399
*/
400
public class RecognizerSharedState {
401
/** Track the set of token types that can follow any rule invocation.
402
* Stack grows upwards.
403
*/
404
public BitSet[] following;
405
public int _fsp = -1;
406
407
/** This is true when we see an error and before having successfully
408
* matched a token. Prevents generation of more than one error message
409
* per error.
410
*/
411
public boolean errorRecovery = false;
412
413
/** The index into the input stream where the last error occurred.
414
* This is used to prevent infinite loops where an error is found
415
* but no token is consumed during recovery...another error is found,
416
* ad nauseam. This is a failsafe mechanism to guarantee that at least
417
* one token/tree node is consumed for two errors.
418
*/
419
public int lastErrorIndex = -1;
420
421
/** In lieu of a return value, this indicates that a rule or token
422
* has failed to match. Reset to false upon valid token match.
423
*/
424
public boolean failed = false;
425
426
/** Did the recognizer encounter a syntax error? Track how many. */
427
public int syntaxErrors = 0;
428
429
/** If 0, no backtracking is going on. Safe to exec actions etc...
430
* If >0 then it's the level of backtracking.
431
*/
432
public int backtracking = 0;
433
434
/** An array[size num rules] of Map<Integer,Integer> that tracks
435
* the stop token index for each rule. ruleMemo[ruleIndex] is
436
* the memoization table for ruleIndex. For key ruleStartIndex, you
437
* get back the stop token for associated rule or MEMO_RULE_FAILED.
438
*
439
* This is only used if rule memoization is on.
440
*/
441
public Map<Integer, Integer>[] ruleMemo;
442
443
// LEXER FIELDS (must be in same state object to avoid casting
444
// constantly in generated code and Lexer object)
445
446
/** The goal of all lexer rules/methods is to create a token object.
447
* This is an instance variable as multiple rules may collaborate to
448
* create a single token. nextToken will return this object after
449
* matching lexer rule(s). If you subclass to allow multiple token
450
* emissions, then set this to the last token to be matched or
451
* something nonnull so that the auto token emit mechanism will not
452
* emit another token.
453
*/
454
public Token token;
455
456
/** What character index in the stream did the current token start at? */
457
public int tokenStartCharIndex = -1;
458
459
/** The line on which the first character of the token resides */
460
public int tokenStartLine;
461
462
/** The character position of first character within the line */
463
public int tokenStartCharPositionInLine;
464
465
/** The channel number for the current token */
466
public int channel;
467
468
/** The token type for the current token */
469
public int type;
470
471
/** You can set the text for the current token to override what is in
472
* the input char buffer. Use setText() or can set this instance var.
473
*/
474
public String text;
475
}
476
```
477
478
## Types
479
480
### Parser State Management
481
482
```java { .api }
483
public class RecognizerSharedState {
484
// Following stack for error recovery
485
public BitSet[] following;
486
public int _fsp = -1;
487
488
// Error recovery state
489
public boolean errorRecovery = false;
490
public int lastErrorIndex = -1;
491
public boolean failed = false;
492
public int syntaxErrors = 0;
493
494
// Backtracking state
495
public int backtracking = 0;
496
497
// Memoization tables
498
public Map<Integer, Integer>[] ruleMemo;
499
500
// Token creation state (for lexers)
501
public Token token;
502
public int tokenStartCharIndex = -1;
503
public int tokenStartLine;
504
public int tokenStartCharPositionInLine;
505
public int channel;
506
public int type;
507
public String text;
508
}
509
```
510
511
## Common Patterns
512
513
### Custom Error Recovery
514
515
```java
516
public class MyParser extends Parser {
517
@Override
518
public void recover(IntStream input, RecognitionException re) {
519
// Custom recovery logic
520
if (re instanceof MismatchedTokenException) {
521
MismatchedTokenException mte = (MismatchedTokenException) re;
522
// Try to find a recovery token
523
while (input.LA(1) != Token.EOF &&
524
input.LA(1) != SEMICOLON &&
525
input.LA(1) != RBRACE) {
526
input.consume();
527
}
528
} else {
529
super.recover(input, re);
530
}
531
}
532
}
533
```
534
535
### Rule Memoization
536
537
```java
538
// In generated parser with memoization
539
public final expression_return expression() throws RecognitionException {
540
if (state.backtracking > 0 && alreadyParsedRule(input, 5)) {
541
return null; // Already tried this rule
542
}
543
544
expression_return retval = new expression_return();
545
// ... rule implementation ...
546
547
if (state.backtracking > 0) {
548
memoize(input, 5, retval.start);
549
}
550
551
return retval;
552
}
553
```
554
555
### Backtracking Support
556
557
```java
558
// Generated code with backtracking
559
int alt1 = 2;
560
state.backtracking++;
561
try {
562
alt1 = dfa1.predict(input);
563
} catch (RecognitionException re) {
564
state.failed = true;
565
throw re;
566
} finally {
567
state.backtracking--;
568
}
569
570
if (state.failed) return retval;
571
572
switch (alt1) {
573
case 1 :
574
// First alternative
575
break;
576
case 2 :
577
// Second alternative
578
break;
579
}
580
```
581
582
### Follow Set Management
583
584
```java
585
// Push follow set before rule call
586
pushFollow(FOLLOW_expression_in_assignment123);
587
expression();
588
state._fsp--;
589
590
// Error recovery with follow sets
591
BitSet follow = computeErrorRecoverySet();
592
Token recovered = (Token)recoverFromMismatchedToken(input, IDENTIFIER, follow);
593
```