or run

npx @tessl/cli init
Log in

Version

Tile

Overview

Evals

Files

Files

docs

core-parsing.md, exceptions.md, index.md, tokens-lexing.md, tree-processing.md, utilities.md

docs/tokens-lexing.md

0

# Tokens and Lexing

1

2

Token representation and lexical analysis including Token class for lexical units, lexer configuration, and specialized handling for indentation-sensitive languages.

3

4

## Capabilities

5

6

### Token Representation

7

8

The Token class represents lexical units produced by the lexer, inheriting from str while adding metadata.

9

10

```python { .api }

11

class Token(str):

12

"""

13

String with meta-information representing a lexical token.

14

Inherits from str so it can be used anywhere a string is expected.

15

"""

16

17

def __new__(cls, type_: str, value: str, start_pos: int = None,

18

line: int = None, column: int = None, end_line: int = None,

19

end_column: int = None, end_pos: int = None,

20

pos_in_stream: int = None) -> 'Token':

21

"""

22

Create new token instance.

23

24

Parameters:

25

- type_: Token type name (terminal name from grammar)

26

- value: Token string value

27

- start_pos: Starting position in input text

28

- line: Line number (1-based)

29

- column: Column number (1-based)

30

- end_line: Ending line number

31

- end_column: Ending column number

32

- end_pos: Ending position in input text

33

- pos_in_stream: Position in token stream

34

35

Returns:

36

Token: New token instance

37

"""

38

39

def update(self, type_: str = None, value: str = None) -> 'Token':

40

"""

41

Create updated copy of token with new type or value.

42

43

Parameters:

44

- type_: New token type (optional)

45

- value: New token value (optional)

46

47

Returns:

48

Token: Updated token copy

49

"""

50

51

@classmethod

52

def new_borrow_pos(cls, type_: str, value: str, borrow_t: 'Token') -> 'Token':

53

"""

54

Create token borrowing position information from another token.

55

56

Parameters:

57

- type_: Token type name

58

- value: Token string value

59

- borrow_t: Token to borrow position from

60

61

Returns:

62

Token: New token with borrowed position

63

"""

64

65

# Attributes

66

type: str # Token type name

67

value: str # Token string value (same as str content)

68

start_pos: int # Start position in input

69

line: int # Line number (1-based)

70

column: int # Column number (1-based)

71

end_line: int # End line number

72

end_column: int # End column number

73

end_pos: int # End position in input

74

pos_in_stream: int # Position in token stream

75

```

76

77

### Indentation Handling

78

79

Post-lexer processor for handling Python-style indentation with INDENT/DEDENT tokens.

80

81

```python { .api }

82

class Indenter:

83

"""

84

PostLex processor for Python-like indentation handling.

85

Converts whitespace at line beginnings into INDENT/DEDENT tokens.

86

"""

87

88

def __init__(self, tab_len: int = 8):

89

"""

90

Initialize indenter.

91

92

Parameters:

93

- tab_len: Number of spaces equivalent to one tab

94

"""

95

96

def process(self, stream: Iterator[Token]) -> Iterator[Token]:

97

"""

98

Process token stream, converting indentation to INDENT/DEDENT tokens.

99

100

Parameters:

101

- stream: Input token stream

102

103

Returns:

104

Iterator[Token]: Stream with indentation tokens

105

"""

106

107

def handle_NL(self, token: Token) -> Iterator[Token]:

108

"""

109

Handle newline tokens for indentation tracking.

110

111

Parameters:

112

- token: Newline token

113

114

Returns:

115

Iterator[Token]: Processed tokens

116

"""

117

118

def handle_OPEN_PAREN(self, token: Token) -> Iterator[Token]:

119

"""

120

Handle opening parenthesis tokens.

121

122

Parameters:

123

- token: Opening parenthesis token

124

125

Returns:

126

Iterator[Token]: Processed tokens

127

"""

128

129

def handle_CLOSE_PAREN(self, token: Token) -> Iterator[Token]:

130

"""

131

Handle closing parenthesis tokens.

132

133

Parameters:

134

- token: Closing parenthesis token

135

136

Returns:

137

Iterator[Token]: Processed tokens

138

"""

139

140

# Configuration attributes

141

always_accept: Tuple[str, ...] = ('NL', 'COMMENT') # Always accepted tokens

142

NL_type: str = 'NL' # Newline token type

143

OPEN_PAREN_types: Tuple[str, ...] = () # Open paren types

144

CLOSE_PAREN_types: Tuple[str, ...] = () # Close paren types

145

INDENT_type: str = 'INDENT' # Indent token type

146

DEDENT_type: str = 'DEDENT' # Dedent token type

147

tab_len: int # Tab length in spaces

148

```

149

150

### Python-Specific Indenter

151

152

Pre-configured indenter for Python syntax.

153

154

```python { .api }

155

class PythonIndenter(Indenter):

156

"""

157

Indenter configured for Python language syntax.

158

"""

159

160

NL_type = 'NEWLINE'

161

OPEN_PAREN_types = ('LPAR', 'LSQB', 'LBRACE')

162

CLOSE_PAREN_types = ('RPAR', 'RSQB', 'RBRACE')

163

INDENT_type = 'INDENT'

164

DEDENT_type = 'DEDENT'

165

tab_len = 8

166

```

167

168

### Indentation Errors

169

170

Exception raised for invalid dedentation patterns.

171

172

```python { .api }

173

class DedentError(LarkError):

174

"""

175

Raised when dedentation doesn't match any previous indentation level.

176

"""

177

```

178

179

### Lexer Configuration

180

181

Configuration classes for lexer behavior and terminal definitions.

182

183

```python { .api }

184

class LexerConf:

185

"""

186

Configuration object for lexer components.

187

"""

188

189

def __init__(self, terminals: List, ignore: List = None,

190

g_regex_flags: int = 0, use_bytes: bool = False,

191

lexer_type: str = None, callbacks: Dict = None):

192

"""

193

Initialize lexer configuration.

194

195

Parameters:

196

- terminals: List of terminal definitions

197

- ignore: List of terminals to ignore

198

- g_regex_flags: Global regex flags

199

- use_bytes: Whether to use bytes instead of str

200

- lexer_type: Type of lexer to use

201

- callbacks: Token callbacks dictionary

202

"""

203

204

terminals: List # Terminal definitions

205

ignore: List # Ignored terminals

206

g_regex_flags: int # Global regex flags

207

use_bytes: bool # Use bytes input

208

lexer_type: str # Lexer type

209

callbacks: Dict # Token callbacks

210

```

211

212

### Terminal Definitions

213

214

Classes representing terminal symbol definitions in grammars.

215

216

```python { .api }

217

class TerminalDef:

218

"""

219

Definition of a terminal symbol in the grammar.

220

"""

221

222

def __init__(self, name: str, pattern, options: List = None):

223

"""

224

Initialize terminal definition.

225

226

Parameters:

227

- name: Terminal name

228

- pattern: Pattern object or string

229

- options: List of terminal options

230

"""

231

232

name: str # Terminal name

233

pattern: Pattern # Pattern for matching

234

options: List # Terminal options

235

```

236

237

### Lexer Classes

238

239

Core lexer implementations for tokenizing input text.

240

241

```python { .api }

242

class Lexer:

243

"""

244

Abstract base lexer class.

245

"""

246

247

def lex(self, text: str, dont_ignore: bool = False) -> Iterator[Token]:

248

"""

249

Tokenize input text.

250

251

Parameters:

252

- text: Input text to tokenize

253

- dont_ignore: Include normally ignored tokens

254

255

Returns:

256

Iterator[Token]: Token stream

257

"""

258

259

class TraditionalLexer(Lexer):

260

"""

261

Traditional regex-based lexer implementation.

262

"""

263

264

class LexerThread:

265

"""

266

Lexer state for incremental tokenization.

267

"""

268

269

def lex(self, stream: Iterator, newline_types: Set[str],

270

ignore_types: Set[str]) -> Iterator[Token]:

271

"""

272

Perform lexical analysis on character stream.

273

274

Parameters:

275

- stream: Character stream

276

- newline_types: Set of newline token types

277

- ignore_types: Set of token types to ignore

278

279

Returns:

280

Iterator[Token]: Token stream

281

"""

282

```

283

284

## Usage Examples

285

286

### Basic Token Usage

287

288

```python

289

from lark import Lark, Token

290

291

parser = Lark(grammar)

292

293

# Get tokens without parsing

294

tokens = list(parser.lex("x = 42"))

295

for token in tokens:

296

print(f"Type: {token.type}, Value: '{token.value}', Line: {token.line}")

297

298

# Tokens are strings

299

token = Token('IDENTIFIER', 'variable_name', line=1, column=5)

300

print(f"Token as string: {token}") # Prints: variable_name

301

print(f"Token type: {token.type}") # Prints: IDENTIFIER

302

```

303

304

### Creating Custom Tokens

305

306

```python

307

from lark import Token

308

309

# Create token with position information

310

token = Token(

311

type_='NUMBER',

312

value='123',

313

start_pos=10,

314

line=2,

315

column=5,

316

end_line=2,

317

end_column=8,

318

end_pos=13

319

)

320

321

# Update token

322

new_token = token.update(type_='INTEGER')

323

print(f"Updated type: {new_token.type}")

324

325

# Borrow position from another token

326

borrowed = Token.new_borrow_pos('IDENTIFIER', 'x', token)

327

print(f"Borrowed position - Line: {borrowed.line}, Column: {borrowed.column}")

328

```

329

330

### Python Indentation Handling

331

332

```python

333

from lark import Lark

334

from lark.indenter import PythonIndenter

335

336

# Grammar for Python-like syntax

337

python_grammar = """

338

?start: suite

339

340

suite: NEWLINE INDENT stmt+ DEDENT

341

342

stmt: simple_stmt NEWLINE

343

| compound_stmt

344

345

simple_stmt: expr_stmt

346

compound_stmt: if_stmt

347

348

if_stmt: "if" expr ":" suite

349

350

expr_stmt: NAME "=" NUMBER

351

352

%import common.NAME

353

%import common.NUMBER

354

%import common.NEWLINE

355

%import common.WS

356

%ignore WS

357

"""

358

359

# Use Python indenter for handling indentation

360

parser = Lark(

361

python_grammar,

362

postlex=PythonIndenter(),

363

parser='lalr'

364

)

365

366

# Parse indented code

367

code = '''

368

if x:

369

    y = 1

370

    z = 2

371

'''

372

373

tree = parser.parse(code)

374

print(tree.pretty())

375

```

376

377

### Custom Indenter

378

379

```python

380

from lark.indenter import Indenter

381

382

class CustomIndenter(Indenter):

383

"""Custom indenter for specific syntax."""

384

385

NL_type = 'NEWLINE'

386

OPEN_PAREN_types = ('LPAREN', 'LBRACE')

387

CLOSE_PAREN_types = ('RPAREN', 'RBRACE')

388

INDENT_type = 'INDENT'

389

DEDENT_type = 'DEDENT'

390

tab_len = 4 # 4 spaces per indent level

391

392

# Use custom indenter

393

parser = Lark(grammar, postlex=CustomIndenter())

394

```

395

396

### Token Callbacks

397

398

```python

399

from lark import Lark, Token

400

401

def uppercase_identifiers(token):

402

"""Convert identifier tokens to uppercase."""

403

if token.type == 'IDENTIFIER':

404

return Token(token.type, token.value.upper(),

405

line=token.line, column=token.column)

406

return token

407

408

def log_numbers(token):

409

"""Log all number tokens."""

410

if token.type == 'NUMBER':

411

print(f"Found number: {token.value} at line {token.line}")

412

return token

413

414

# Apply callbacks during lexing

415

parser = Lark(

416

grammar,

417

lexer_callbacks={

418

'IDENTIFIER': uppercase_identifiers,

419

'NUMBER': log_numbers

420

}

421

)

422

423

result = parser.parse("x = 123")

424

```

425

426

### Lexer Configuration

427

428

```python

429

from lark import Lark

430

from lark.common import LexerConf

431

import re

432

433

# Configure lexer with specific options

434

lexer_conf = LexerConf(

435

terminals=terminal_list,

436

ignore=['WS', 'COMMENT'],

437

g_regex_flags=re.IGNORECASE | re.MULTILINE,

438

use_bytes=False,

439

lexer_type='standard'

440

)

441

442

parser = Lark(grammar, lexer='standard')

443

```

444

445

### Position Tracking

446

447

```python

448

from lark import Lark

449

450

# Enable position tracking

451

parser = Lark(grammar, propagate_positions=True)

452

tree = parser.parse(text)

453

454

# Access position information

455

def print_positions(tree):

456

if hasattr(tree, 'meta') and tree.meta:

457

print(f"Rule '{tree.data}' at line {tree.meta.line}, "

458

f"column {tree.meta.column}")

459

460

for child in tree.children:

461

if hasattr(child, 'children'): # It's a Tree

462

print_positions(child)

463

else: # It's a Token

464

print(f"Token '{child.type}': '{child.value}' at "

465

f"line {child.line}, column {child.column}")

466

467

print_positions(tree)

468

```

469

470

### Advanced Token Processing

471

472

```python

473

from lark import Lark, Token

474

475

class TokenProcessor:

476

"""Advanced token processing with state."""

477

478

def __init__(self):

479

self.line_count = 0

480

481

def process_newlines(self, token):

482

if token.type == 'NEWLINE':

483

self.line_count += 1

484

# Add line number to token value

485

return Token(token.type, f"\\n#{self.line_count}",

486

line=token.line, column=token.column)

487

return token

488

489

processor = TokenProcessor()

490

parser = Lark(

491

grammar,

492

lexer_callbacks={'NEWLINE': processor.process_newlines}

493

)

494

```