or run

npx @tessl/cli init
Log in

Version

Tile

Overview

Evals

Files

Files

docs

character-streams.md debug-support.md error-handling.md index.md lexical-analysis.md parsing.md token-streams.md tree-construction.md

lexical-analysis.mddocs/

0

# Lexical Analysis

1

2

Base classes and interfaces for implementing lexers, token sources, and deterministic finite automata. The lexical analysis system transforms character streams into token streams for parsing.

3

4

## Capabilities

5

6

### Lexer Base Class

7

8

Abstract base class for all ANTLR-generated lexers providing tokenization functionality.

9

10

```java { .api }

11

/**

12

* A lexer is a recognizer that draws input symbols from a character stream.

13

* Lexer grammars result in a subclass of this object. A Lexer object

14

* uses simplified match() and error recovery mechanisms in the interest

15

* of speed.

16

*/

17

public abstract class Lexer extends BaseRecognizer implements TokenSource {

18

public static final int DEFAULT_TOKEN_CHANNEL = Token.DEFAULT_CHANNEL;

19

public static final int HIDDEN = Token.HIDDEN_CHANNEL;

20

public static final int EOF = -1;

21

22

/** Where is the lexer drawing characters from? */

23

protected CharStream input;

24

25

public Lexer();

26

public Lexer(CharStream input);

27

public Lexer(CharStream input, RecognizerSharedState state);

28

29

public void reset();

30

31

/** Return a token from this source; i.e., match a token on the char stream. */

32

public Token nextToken();

33

34

/** Instruct the lexer to skip creating a token for current lexer rule and look for another token */

35

public void skip();

36

37

/** This is the lexer entry point that sets instance var 'token' */

38

public abstract void mTokens() throws RecognitionException;

39

40

/** Set the char stream and reset the lexer */

41

public void setCharStream(CharStream input);

42

public CharStream getCharStream();

43

44

public String getSourceName();

45

46

/** Currently does not support multiple emits per nextToken invocation

47

* for efficiency reasons. Subclass and override this method and

48

* nextToken (to push tokens into a list and pull from that list rather

49

* than a single variable as this implementation does).

50

*/

51

public void emit(Token token);

52

53

/** The standard method called to automatically emit a token at the

54

* outermost lexical rule. The token object should point into the

55

* char buffer start..stop. If there is a text override in 'text',

56

* use that to set the token's text. Override this method to emit

57

* custom Token objects.

58

*/

59

public Token emit();

60

61

public void match(String s) throws MismatchedTokenException;

62

public void match(int c) throws MismatchedTokenException;

63

public void matchAny() throws MismatchedTokenException;

64

public void matchRange(int a, int b) throws MismatchedTokenException;

65

66

public int getLine();

67

public int getCharPositionInLine();

68

69

/** What is the index of the current character of lookahead? */

70

public int getCharIndex();

71

72

/** Return the text matched so far for the current token or any text override. */

73

public String getText();

74

public void setText(String text);

75

76

public void reportError(RecognitionException e);

77

public String getErrorMessage(RecognitionException e, String[] tokenNames);

78

public String getCharErrorDisplay(int c);

79

80

/** Lexers can normally match any char in its vocabulary after matching

81

* a token, so do the easy thing and just kill a character and hope

82

* it all works out. You can instead use the rule invocation stack

83

* to do sophisticated error recovery if you are in a fragment rule.

84

*/

85

public void recover(RecognitionException re);

86

87

public void traceIn(String ruleName, int ruleIndex);

88

public void traceOut(String ruleName, int ruleIndex);

89

}

90

```

91

92

**Usage Examples:**

93

94

```java

95

import org.antlr.runtime.*;

96

97

// Create lexer with string input

98

CharStream input = new ANTLRStringStream("hello world 123");

99

MyLexer lexer = new MyLexer(input);

100

101

// Get tokens one by one

102

Token token;

103

while ((token = lexer.nextToken()).getType() != Token.EOF) {

104

System.out.println("Token: " + token.getText() +

105

" Type: " + token.getType() +

106

" Line: " + token.getLine());

107

}

108

109

// Set custom text for token

110

// Inside lexer rule:

111

// setText("custom text");

112

// emit();

113

```

114

115

### Token Implementation

116

117

Standard token implementation containing all token information.

118

119

```java { .api }

120

/**

121

* Default Token implementation

122

*/

123

public class CommonToken implements Token {

124

protected int type;

125

protected int line;

126

protected int charPositionInLine = -1; // set to invalid position

127

protected int channel = DEFAULT_CHANNEL;

128

protected int index = -1;

129

protected CharStream input;

130

protected String text;

131

protected int start;

132

protected int stop;

133

134

public CommonToken(int type);

135

public CommonToken(Token oldToken);

136

public CommonToken(int type, String text);

137

public CommonToken(CharStream input, int type, int channel, int start, int stop);

138

139

public int getType();

140

public void setType(int type);

141

public int getLine();

142

public void setLine(int line);

143

public String getText();

144

public void setText(String text);

145

public int getCharPositionInLine();

146

public void setCharPositionInLine(int charPositionInLine);

147

public int getChannel();

148

public void setChannel(int channel);

149

public int getTokenIndex();

150

public void setTokenIndex(int index);

151

public CharStream getInputStream();

152

public void setInputStream(CharStream input);

153

154

public String toString();

155

}

156

```

157

158

**Usage Examples:**

159

160

```java

161

import org.antlr.runtime.*;

162

163

// Create tokens manually

164

Token identifier = new CommonToken(MyLexer.IDENTIFIER, "variable");

165

identifier.setLine(1);

166

identifier.setCharPositionInLine(0);

167

168

// Create from existing token

169

Token copy = new CommonToken(identifier);

170

171

// Create with all parameters

172

CharStream input = new ANTLRStringStream("test");

173

Token detailed = new CommonToken(input, MyLexer.STRING, Token.DEFAULT_CHANNEL, 0, 3);

174

175

System.out.println("Token text: " + detailed.getText());

176

System.out.println("Token type: " + detailed.getType());

177

```

178

179

### Alternative Token Implementation

180

181

```java { .api }

182

/**

183

* Alternative token implementation

184

*/

185

public class ClassicToken implements Token {

186

protected String text;

187

protected int type;

188

protected int line;

189

protected int charPositionInLine;

190

protected int channel = DEFAULT_CHANNEL;

191

protected int index;

192

protected CharStream input;

193

194

public ClassicToken(int type);

195

public ClassicToken(int type, String text);

196

public ClassicToken(int type, String text, int channel);

197

198

public String getText();

199

public void setText(String text);

200

public int getType();

201

public void setType(int type);

202

public int getLine();

203

public void setLine(int line);

204

public int getCharPositionInLine();

205

public void setCharPositionInLine(int charPositionInLine);

206

public int getChannel();

207

public void setChannel(int channel);

208

public int getTokenIndex();

209

public void setTokenIndex(int index);

210

public CharStream getInputStream();

211

public void setInputStream(CharStream input);

212

213

public String toString();

214

}

215

```

216

217

### Deterministic Finite Automaton

218

219

DFA implementation for efficient lexical recognition with prediction caching.

220

221

```java { .api }

222

/**

223

* Deterministic finite automaton for recognition

224

*/

225

public class DFA {

226

public static final int UNINITIALIZED = -1;

227

228

short[] eot;

229

short[] eof;

230

char[] min;

231

char[] max;

232

short[] accept;

233

short[] special;

234

short[][] transition;

235

236

int decisionNumber;

237

BaseRecognizer recognizer;

238

239

public DFA();

240

public DFA(short[] eot, short[] eof, char[] min, char[] max,

241

short[] accept, short[] special, short[][] transition,

242

int decisionNumber);

243

244

/** From the input stream, predict what alternative will succeed

245

* using this DFA (representing the covering regular approximation

246

* to the underlying CFL). Return an alternative number 1..n.

247

* Throw an exception upon error.

248

*/

249

public int predict(IntStream input) throws RecognitionException;

250

251

protected void error(NoViableAltException nvae);

252

253

public static short[] unpackEncodedString(String encodedString);

254

public static char[] unpackEncodedStringToUnsignedChars(String encodedString);

255

256

/** A hook to allow subclasses to do something useful with the DFA tables */

257

protected int specialStateTransition(int s, IntStream _input) throws NoViableAltException;

258

259

public String getDescription();

260

}

261

```

262

263

**Usage Examples:**

264

265

```java

266

// DFA is typically used internally by generated lexers

267

// Users don't usually create DFA objects directly

268

269

// In generated lexer code:

270

// public class MyLexer extends Lexer {

271

// static final String DFA1_eotS = "...";

272

// static final String DFA1_eofS = "...";

273

// ...

274

// static final short[] DFA1_eot = unpackEncodedString(DFA1_eotS);

275

// ...

276

// class DFA1 extends DFA {

277

// public DFA1(BaseRecognizer recognizer) {

278

// this.recognizer = recognizer;

279

// this.decisionNumber = 1;

280

// this.eot = DFA1_eot;

281

// ...

282

// }

283

// }

284

// }

285

```

286

287

### Token Source Interface

288

289

Interface for objects that produce tokens.

290

291

```java { .api }

292

/**

293

* Source that produces Token objects

294

*/

295

public interface TokenSource {

296

/** Return the next token from the character stream.

297

* Return a token whose type is Token.EOF at end of input.

298

*/

299

public Token nextToken();

300

301

/** Where are you getting tokens from? Normally the implementation will simply

302

* ask the lexer's input stream.

303

*/

304

public String getSourceName();

305

}

306

```

307

308

### Bit Set Utility

309

310

Efficient set operations for token types and character classes.

311

312

```java { .api }

313

/**

314

* Set of integers for efficient FOLLOW set representation

315

*/

316

public class BitSet implements Cloneable {

317

public static final int BITS = 64; // number of bits / long

318

public static final int LOG_BITS = 6; // 2^6 == 64

319

320

protected long[] bits;

321

322

public BitSet();

323

public BitSet(long[] bits_);

324

public BitSet(List<Integer> items);

325

public BitSet(int nbits);

326

327

public static BitSet of(int el);

328

public static BitSet of(int a, int b);

329

public static BitSet of(int a, int b, int c);

330

public static BitSet of(int a, int b, int c, int d);

331

332

public BitSet or(BitSet a);

333

public void add(int el);

334

public BitSet and(BitSet a);

335

public boolean member(int el);

336

public void remove(int el);

337

public boolean isNil();

338

public int numBits();

339

public int lengthInLongWords();

340

public int size();

341

public boolean equals(Object other);

342

public Object clone();

343

public int[] toArray();

344

public long[] toPackedArray();

345

public String toString();

346

public String toString(String[] tokenNames);

347

}

348

```

349

350

**Usage Examples:**

351

352

```java

353

import org.antlr.runtime.BitSet;

354

355

// Create bit sets for token types

356

BitSet keywords = BitSet.of(MyLexer.IF, MyLexer.WHILE, MyLexer.FOR);

357

BitSet operators = BitSet.of(MyLexer.PLUS, MyLexer.MINUS, MyLexer.MULT);

358

359

// Combine sets

360

BitSet combined = keywords.or(operators);

361

362

// Test membership

363

boolean isKeyword = keywords.member(tokenType);

364

365

// Convert to array

366

int[] types = combined.toArray();

367

368

// Display with token names

369

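String[] tokenNames = {"<invalid>", "<EOR>", "<DOWN>", "<UP>", "IF", "WHILE", "FOR", "PLUS", "MINUS", "MULT"}; // indexed by token type; user types start at Token.MIN_TOKEN_TYPE (4)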

370

System.out.println(combined.toString(tokenNames));

371

```

372

373

## Types

374

375

### Token Constants

376

377

```java { .api }

378

public interface Token {

379

/** imaginary tree navigation type; traverse "get child" link */

380

public static final int DOWN = 2;

381

/** imaginary tree navigation type; finish with a child list */

382

public static final int UP = 3;

383

public static final int MIN_TOKEN_TYPE = UP+1;

384

public static final int EOF = CharStream.EOF;

385

public static final int INVALID_TOKEN_TYPE = 0;

386

public static final int DEFAULT_CHANNEL = 0;

387

/** Anything on a different channel than DEFAULT_CHANNEL is not parsed by parser. */

388

public static final int HIDDEN_CHANNEL = 99;

389

390

public static final Token INVALID_TOKEN = new CommonToken(INVALID_TOKEN_TYPE);

391

public static final Token SKIP_TOKEN = new CommonToken(INVALID_TOKEN_TYPE);

392

}

393

```

394

395

## Common Patterns

396

397

### Custom Token Creation

398

399

```java

400

public class MyLexer extends Lexer {

401

public Token emit() {

402

CommonToken t = new CommonToken(input, state.type, state.channel,

403

state.tokenStartCharIndex, getCharIndex()-1);

404

t.setLine(state.tokenStartLine);

405

t.setText(state.text);

406

t.setCharPositionInLine(state.tokenStartCharPositionInLine);

407

emit(t);

408

return t;

409

}

410

}

411

```

412

413

### Skip Tokens

414

415

```java

416

// In lexer rule to skip whitespace

417

public void mWHITESPACE() throws RecognitionException {

418

// ... match whitespace characters ...

419

skip(); // Don't create token, get next one

420

}

421

```

422

423

### Channel-based Token Routing

424

425

```java

426

// In lexer rule to send comments to hidden channel

427

public void mCOMMENT() throws RecognitionException {

428

// ... match comment ...

429

state.channel = HIDDEN; // Send to hidden channel

430

}

431

```

432

433

### Error Recovery in Lexer

434

435

```java

436

public void recover(RecognitionException re) {

437

// Simple: skip the current character and try again

438

if (input.LA(1) != CharStream.EOF) {

439

input.consume();

440

}

441

}

442

```

443

444

### Token Text Override

445

446

```java

447

// In lexer rule to modify token text

448

public void mSTRING_LITERAL() throws RecognitionException {

449

// ... match string including quotes ...

450

451

// Remove quotes from token text

452

String s = getText();

453

setText(s.substring(1, s.length()-1)); // Remove first and last char

454

}

455

```