or run

npx @tessl/cli init
Log in

Version

Tile

Overview

Evals

Files

Files

docs

character-streams.mddebug-support.mderror-handling.mdindex.mdlexical-analysis.mdparsing.mdtoken-streams.mdtree-construction.md

parsing.mddocs/

0

# Parsing

1

2

Core parsing infrastructure including parser base classes, rule return scopes, and recognizer functionality. The parsing system processes token streams to build parse trees and abstract syntax trees.

3

4

## Capabilities

5

6

### Parser Base Class

7

8

Concrete parser class for processing token streams with error recovery and backtracking support.

9

10

```java { .api }

11

/**

12

* A parser for TokenStreams. "parser grammars" result in a subclass of this.

13

*/

14

public class Parser extends BaseRecognizer {

15

public TokenStream input;

16

17

public Parser(TokenStream input);

18

public Parser(TokenStream input, RecognizerSharedState state);

19

20

public void reset();

21

public void setTokenStream(TokenStream input);

22

public TokenStream getTokenStream();

23

24

protected Object getCurrentInputSymbol(IntStream input);

25

protected Object getMissingSymbol(IntStream input, RecognitionException e,

26

int expectedTokenType, BitSet follow);

27

28

/** Set the token stream and reset the parser */

29

public void setTokenStream(TokenStream input);

30

31

public void traceIn(String ruleName, int ruleIndex);

32

public void traceOut(String ruleName, int ruleIndex);

33

}

34

```

35

36

**Usage Examples:**

37

38

```java

39

import org.antlr.runtime.*;

40

41

// Create parser with token stream

42

MyLexer lexer = new MyLexer(new ANTLRStringStream("x = 42;"));

43

CommonTokenStream tokens = new CommonTokenStream(lexer);

44

MyParser parser = new MyParser(tokens);

45

46

// Parse starting from rule

47

try {

48

MyParser.program_return result = parser.program();

49

System.out.println("Parse successful");

50

51

// Get parse tree if available

52

Object tree = result.getTree();

53

if (tree != null) {

54

System.out.println("Tree: " + tree.toString());

55

}

56

} catch (RecognitionException e) {

57

System.err.println("Parse error: " + e.getMessage());

58

}

59

60

// Reset parser for reuse

61

parser.reset();

62

parser.setTokenStream(new CommonTokenStream(new MyLexer(new ANTLRStringStream("y = 10;"))));

63

```

64

65

### Base Recognizer

66

67

Abstract base class providing common functionality for all ANTLR recognizers.

68

69

```java { .api }

70

/**

71

* A generic recognizer that can handle recognizers generated from

72

* lexer, parser, and tree grammars. This is all the parsing

73

* support code essentially; most of it is error recovery stuff and

74

* backtracking.

75

*/

76

public abstract class BaseRecognizer {

77

public static final int MEMO_RULE_FAILED = -2;

78

public static final int MEMO_RULE_UNKNOWN = -1;

79

public static final int INITIAL_FOLLOW_STACK_SIZE = 100;

80

public static final int DEFAULT_TOKEN_CHANNEL = Token.DEFAULT_CHANNEL;

81

public static final int HIDDEN = Token.HIDDEN_CHANNEL;

82

public static final String NEXT_TOKEN_RULE_NAME = "nextToken";

83

84

protected RecognizerSharedState state;

85

86

public BaseRecognizer();

87

public BaseRecognizer(RecognizerSharedState state);

88

89

/** reset the parser's state. Subclasses must rewinds the input stream. */

90

public void reset();

91

92

/** Match current input symbol against ttype. Attempt

93

* single token insertion or deletion error recovery. If

94

* that fails, throw MismatchedTokenException.

95

*/

96

public Object match(IntStream input, int ttype, BitSet follow) throws RecognitionException;

97

98

/** Match the wildcard: in a symbol */

99

public void matchAny(IntStream input);

100

101

public boolean mismatchIsUnwantedToken(IntStream input, int ttype);

102

public boolean mismatchIsMissingToken(IntStream input, BitSet follow);

103

104

/** Report a recognition problem.

105

* This method sets errorRecovery to indicate the parser is recovering

106

* not parsing. Once in recovery mode, no errors are generated.

107

* To get out of recovery mode, the parser must successfully match

108

* a token (after a resync). So it will go:

109

*

110

* 1. error occurs

111

* 2. enter recovery mode, report error

112

* 3. consume until token found in resynch set

113

* 4. try to resume parsing

114

* 5. next match() will reset errorRecovery mode

115

*/

116

public void reportError(RecognitionException e);

117

118

public void displayRecognitionError(String[] tokenNames, RecognitionException e);

119

120

/** What error message should be generated for the various exception types? */

121

public String getErrorMessage(RecognitionException e, String[] tokenNames);

122

123

/** How should a token be displayed in an error message? The default

124

* is to display just the text, but during development you might

125

* want to have lots of information spit out. Override in that case

126

* to use t.toString() (which, for CommonToken, dumps everything about

127

* the token). This is better than forcing you to override a method in

128

* your token objects because you don't have to go modify your lexer

129

* so that it creates a new Java type.

130

*/

131

public String getTokenErrorDisplay(Token t);

132

133

/** Override this method to change where error messages go */

134

public void emitErrorMessage(String msg);

135

136

/** Recover from an error found on the input stream. This is for NoViableAlt and

137

* mismatched symbol exceptions. If you enable single token insertion and deletion,

138

* this will usually not handle mismatched symbol exceptions but there could be

139

* a mismatched token that the match() routine could not recover from.

140

*/

141

public void recover(IntStream input, RecognitionException re);

142

143

/** A hook to listen in on the token consumption during error recovery.

144

* The DebugParser subclasses this to fire events to the listenter.

145

*/

146

public void beginResync();

147

public void endResync();

148

149

/** Compute the error recovery set for the current rule. During

150

* rule invocation, the parser pushes the set of tokens that can

151

* follow that rule reference on the stack; this amounts to

152

* computing FIRST of what follows the rule reference in the

153

* enclosing rule. This local follow set only includes tokens

154

* from within the rule; i.e., the FIRST computation done on

155

* FOLLOW(r) for r not in context.

156

*/

157

protected BitSet computeErrorRecoverySet();

158

159

/** Compute the context-sensitive FOLLOW set for current rule.

160

* This is set of token types that can follow a specific rule

161

* reference given a specific call chain. You get the set of

162

* viable tokens that can possibly come next (lookahead depth 1)

163

* given the current call chain. Contrast this with the

164

* definition of plain FOLLOW for rule r:

165

*

166

* FOLLOW(r) = set of (local) tokens that can possibly follow

167

* references to r in *any* sentential form

168

* (r-->(alpha) beta, r->(alpha) gamma, ...)

169

*

170

* Note that the parser pushes just the local FOLLOW sets and upon

171

* error, we want to combine these below all scope until we reach

172

* a recovery rule (typically the top-level rule).

173

*/

174

protected BitSet computeContextSensitiveRuleFOLLOW();

175

176

protected BitSet combineFollows(boolean exact);

177

178

/** Attempt to recover from a single missing or extra token.

179

*

180

* EXTRA TOKEN

181

*

182

* LA(1) is not what we are looking for. If LA(2) has the right token,

183

* however, then assume LA(1) is some extra token and delete it.

184

* LA(2) must be part of the follow set of the missing token.

185

* This can only happen if you are trying to match two tokens and

186

* the first one is wrong. To match two tokens, either we backtrack

187

* (which is slow) or match the second token after seeing the first

188

* token is wrong. If we match second token, we have to delete the

189

* first token. This is hard to do so typically we backtrack or

190

* just do the default recovery which is to panic and skip until

191

* we find something we know about.

192

*

193

* MISSING TOKEN

194

*

195

* If current token is consistent with what could come after missing

196

* token, then it is ok to "insert" the missing token. LA(1) is

197

* what we want to match next and if LA(1) is consistent with what

198

* could come after the expected token, then we assume the expected token

199

* is missing and we insert it. For example, Input "i=(3;" is missing the

200

* ')'. At ')' we have LA(1)==';' which is consistent with

201

* what could come after ')' so we generate().

202

*/

203

protected Object recoverFromMismatchedToken(IntStream input, int ttype, BitSet follow)

204

throws RecognitionException;

205

206

/** Not currently used */

207

public Object recoverFromMismatchedSet(IntStream input, RecognitionException e, BitSet follow)

208

throws RecognitionException;

209

210

protected Object getCurrentInputSymbol(IntStream input);

211

protected Object getMissingSymbol(IntStream input, RecognitionException e,

212

int expectedTokenType, BitSet follow);

213

214

/** Conjure up a missing token during error recovery.

215

* The recognizer attempts to recover from single missing

216

* symbols. But, actions might refer to that missing symbol.

217

* For example, x=ID {f($x);}. The f($x) refers to the missing ID.

218

* If you just return User.INVALID_TOKEN_TYPE, then f($x) will try to invoke

219

* you action with a null argument. It's better to conjure one up

220

* as follows:

221

*

222

* ID(CommonToken(ID, "ID"));

223

*

224

* where ID is the token type of the missing token. getMissingSymbol

225

* makes no assumptions about which Token object you use for

226

* the missing symbol.

227

*

228

* This method is used primarily for error recovery not on the

229

* critical path.

230

*/

231

protected Token getMissingSymbol(IntStream input, RecognitionException e,

232

int expectedTokenType, BitSet follow);

233

234

public void pushFollow(BitSet fset);

235

protected void popFollow();

236

237

/** Return List<String> of the rules in your parser instance

238

* leading up to a call to the current rule. You could override if

239

* you want more details such as the file/line info of where

240

* in the parser java code a rule is invoked.

241

*

242

* This is very useful for error messages.

243

*/

244

public List<String> getRuleInvocationStack();

245

public List<String> getRuleInvocationStack(Throwable e);

246

247

public String getBacktrackingLevel();

248

249

/** Used to print out token names like ID during debugging and

250

* error reporting. The generated parsers implement a method

251

* that overrides this to point to their String[] tokenNames.

252

*/

253

public String[] getTokenNames();

254

255

/** For debugging and other purposes, might want the grammar name.

256

* Have ANTLR generate an implementation for this method.

257

*/

258

public String getGrammarFileName();

259

260

public String getSourceName();

261

262

/** A convenience method for use most often with template rewrites.

263

* Convert a List<Token> to List<String>

264

*/

265

public List<String> toStrings(List<Token> tokens);

266

267

/** Given a rule number and a start token index number, return

268

* MEMO_RULE_UNKNOWN if the rule has not parsed input starting from

269

* start index. If this rule has parsed input starting from the

270

* start index before, then return where the rule stopped parsing.

271

* It returns the index of the last token matched by the rule.

272

*

273

* For now we use a hashtable and just the slow Object-based one.

274

* Later, we can make a special one for ints and also one that

275

* tosses out entries after we commit past input position i.

276

*/

277

public Integer getRuleMemoization(int ruleIndex, int ruleStartIndex);

278

279

/** Has this rule already parsed input at the current index in the

280

* input stream? Return the stop token index or MEMO_RULE_UNKNOWN.

281

* If we attempted but failed to parse properly before, return

282

* MEMO_RULE_FAILED.

283

*

284

* This method has a side-effect: if we have seen this input for

285

* this rule before, then seek ahead to 1 past the stop token matched

286

* for this rule last time.

287

*/

288

public boolean alreadyParsedRule(IntStream input, int ruleIndex);

289

290

/** Record whether or not this rule parsed the input at this position

291

* successfully. Use a standard java hashtable for now.

292

*/

293

public void memoize(IntStream input, int ruleIndex, int ruleStartIndex);

294

}

295

```

296

297

**Usage Examples:**

298

299

```java

300

// Error handling in parser

301

public class MyParser extends Parser {

302

public void displayRecognitionError(String[] tokenNames, RecognitionException e) {

303

String hdr = getErrorHeader(e);

304

String msg = getErrorMessage(e, tokenNames);

305

System.err.println(hdr + " " + msg);

306

}

307

308

public String getErrorMessage(RecognitionException e, String[] tokenNames) {

309

// Custom error message formatting

310

return "Syntax error at line " + e.line + ": " + super.getErrorMessage(e, tokenNames);

311

}

312

}

313

314

// Using follow sets for error recovery

315

BitSet follow = new BitSet();

316

follow.add(MyParser.SEMICOLON);

317

follow.add(MyParser.RBRACE);

318

Token recovered = (Token)recoverFromMismatchedToken(input, MyParser.IDENTIFIER, follow);

319

```

320

321

### Rule Return Scopes

322

323

Return value containers for parser rules supporting tokens, trees, and custom attributes.

324

325

```java { .api }

326

/**

327

* Return value holder for rule invocations

328

*/

329

public class RuleReturnScope {

330

/** Return the start token or tree */

331

public Object getStart();

332

public void setStart(Object start);

333

334

/** Return the stop token or tree */

335

public Object getStop();

336

public void setStop(Object stop);

337

338

/** Has a value potentially if output=AST; */

339

public Object getTree();

340

public void setTree(Object tree);

341

}

342

343

/**

344

* Parser rule return scope

345

*/

346

public class ParserRuleReturnScope extends RuleReturnScope {

347

/** First token matched by this rule */

348

public Token start;

349

/** Last token matched by this rule */

350

public Token stop;

351

352

public Object getStart();

353

public Object getStop();

354

}

355

```

356

357

**Usage Examples:**

358

359

```java

360

// Generated parser rule return type

361

public static class expression_return extends ParserRuleReturnScope {

362

public CommonTree tree;

363

public Object getTree() { return tree; }

364

};

365

366

// Using rule return in parser

367

public final expression_return expression() throws RecognitionException {

368

expression_return retval = new expression_return();

369

retval.start = input.LT(1);

370

371

try {

372

// Rule implementation...

373

374

retval.stop = input.LT(-1);

375

// retval.tree = ...

376

377

} catch (RecognitionException re) {

378

reportError(re);

379

recover(input, re);

380

}

381

382

return retval;

383

}

384

385

// Using returned values

386

MyParser.expression_return result = parser.expression();

387

Token startToken = result.start;

388

Token stopToken = result.stop;

389

CommonTree tree = (CommonTree) result.getTree();

390

```

391

392

### Recognizer Shared State

393

394

Shared state object for managing parser state across rule invocations.

395

396

```java { .api }

397

/**

398

* Shared state between recognizer instances

399

*/

400

public class RecognizerSharedState {

401

/** Track the set of token types that can follow any rule invocation.

402

* Stack grows upwards.

403

*/

404

public BitSet[] following;

405

public int _fsp = -1;

406

407

/** This is true when we see an error and before having successfully

408

* matched a token. Prevents generation of more than one error message

409

* per error.

410

*/

411

public boolean errorRecovery = false;

412

413

/** The index into the input stream where the last error occurred.

414

* This is used to prevent infinite loops where an error is found

415

* but no token is consumed during recovery...another error is found,

416

* ad nauseam. This is a failsafe mechanism to guarantee that at least

417

* one token/tree node is consumed for two errors.

418

*/

419

public int lastErrorIndex = -1;

420

421

/** In lieu of a return value, this indicates that a rule or token

422

* has failed to match. Reset to false upon valid token match.

423

*/

424

public boolean failed = false;

425

426

/** Did the recognizer encounter a syntax error? Track how many. */

427

public int syntaxErrors = 0;

428

429

/** If 0, no backtracking is going on. Safe to exec actions etc...

430

* If >0 then it's the level of backtracking.

431

*/

432

public int backtracking = 0;

433

434

/** An array[size num rules] of Map<Integer,Integer> that tracks

435

* the stop token index for each rule. ruleMemo[ruleIndex] is

436

* the memoization table for ruleIndex. For key ruleStartIndex, you

437

* get back the stop token for associated rule or MEMO_RULE_FAILED.

438

*

439

* This is only used if rule memoization is on.

440

*/

441

public Map<Integer, Integer>[] ruleMemo;

442

443

// LEXER FIELDS (must be in same state object to avoid casting

444

// constantly in generated code and Lexer object)

445

446

/** The goal of all lexer rules/methods is to create a token object.

447

* This is an instance variable as multiple rules may collaborate to

448

* create a single token. nextToken will return this object after

449

* matching lexer rule(s). If you subclass to allow multiple token

450

* emissions, then set this to the last token to be matched or

451

* something nonnull so that the auto token emit mechanism will not

452

* emit another token.

453

*/

454

public Token token;

455

456

/** What character index in the stream did the current token start at? */

457

public int tokenStartCharIndex = -1;

458

459

/** The line on which the first character of the token resides */

460

public int tokenStartLine;

461

462

/** The character position of first character within the line */

463

public int tokenStartCharPositionInLine;

464

465

/** The channel number for the current token */

466

public int channel;

467

468

/** The token type for the current token */

469

public int type;

470

471

/** You can set the text for the current token to override what is in

472

* the input char buffer. Use setText() or can set this instance var.

473

*/

474

public String text;

475

}

476

```

477

478

## Types

479

480

### Parser State Management

481

482

```java { .api }

483

public class RecognizerSharedState {

484

// Following stack for error recovery

485

public BitSet[] following;

486

public int _fsp = -1;

487

488

// Error recovery state

489

public boolean errorRecovery = false;

490

public int lastErrorIndex = -1;

491

public boolean failed = false;

492

public int syntaxErrors = 0;

493

494

// Backtracking state

495

public int backtracking = 0;

496

497

// Memoization tables

498

public Map<Integer, Integer>[] ruleMemo;

499

500

// Token creation state (for lexers)

501

public Token token;

502

public int tokenStartCharIndex = -1;

503

public int tokenStartLine;

504

public int tokenStartCharPositionInLine;

505

public int channel;

506

public int type;

507

public String text;

508

}

509

```

510

511

## Common Patterns

512

513

### Custom Error Recovery

514

515

```java

516

public class MyParser extends Parser {

517

@Override

518

public void recover(IntStream input, RecognitionException re) {

519

// Custom recovery logic

520

if (re instanceof MismatchedTokenException) {

521

MismatchedTokenException mte = (MismatchedTokenException) re;

522

// Try to find a recovery token

523

while (input.LA(1) != Token.EOF &&

524

input.LA(1) != SEMICOLON &&

525

input.LA(1) != RBRACE) {

526

input.consume();

527

}

528

} else {

529

super.recover(input, re);

530

}

531

}

532

}

533

```

534

535

### Rule Memoization

536

537

```java

538

// In generated parser with memoization

539

public final expression_return expression() throws RecognitionException {

540

if (state.backtracking > 0 && alreadyParsedRule(input, 5)) {

541

return null; // Already tried this rule

542

}

543

544

expression_return retval = new expression_return();

545

// ... rule implementation ...

546

547

if (state.backtracking > 0) {

548

memoize(input, 5, retval.start);

549

}

550

551

return retval;

552

}

553

```

554

555

### Backtracking Support

556

557

```java

558

// Generated code with backtracking

559

int alt1 = 2;

560

state.backtracking++;

561

try {

562

alt1 = dfa1.predict(input);

563

} catch (RecognitionException re) {

564

state.failed = true;

565

throw re;

566

} finally {

567

state.backtracking--;

568

}

569

570

if (state.failed) return retval;

571

572

switch (alt1) {

573

case 1 :

574

// First alternative

575

break;

576

case 2 :

577

// Second alternative

578

break;

579

}

580

```

581

582

### Follow Set Management

583

584

```java

585

// Push follow set before rule call

586

pushFollow(FOLLOW_expression_in_assignment123);

587

expression();

588

state._fsp--;

589

590

// Error recovery with follow sets

591

BitSet follow = computeErrorRecoverySet();

592

Token recovered = (Token)recoverFromMismatchedToken(input, IDENTIFIER, follow);

593

```