Tessl Tile for npm/parse5@8.0.0

or run

npx @tessl/cli init

Version

Tile

Overview

Evals

Files

docs

error-handling.md html-utilities.md index.md parsing.md serialization.md tokenization.md tree-adapters.md

tokenization.mddocs/

0
# HTML Tokenization
1

2
Low-level HTML tokenization functionality for advanced use cases that require direct access to the tokenization process. The tokenizer converts HTML text into a stream of tokens that the parser then processes into an AST.
3

4
## Capabilities
5

6
### Tokenizer Class
7

8
Core tokenizer class that processes HTML text into tokens.
9

10
```typescript { .api }
11
/**
12
 * HTML tokenizer class for low-level token processing
13
 * @internal - Advanced API for specialized use cases
14
 */
15
class Tokenizer {
16
  /**
17
   * Creates a new tokenizer instance
18
   * @param options - Tokenizer configuration options
19
   * @param handler - Token handler for processing tokens
20
   */
21
  constructor(options: TokenizerOptions, handler: TokenHandler);
22

23
  /**
24
   * Write HTML text to the tokenizer for processing
25
   * @param chunk - HTML text chunk to tokenize
26
   * @param isLastChunk - Whether this is the final chunk
27
   */
28
  write(chunk: string, isLastChunk: boolean): void;
29

30
  /**
31
   * Insert HTML text at the current position
32
   * @param chunk - HTML text to insert
33
   */
34
  insertHtmlAtCurrentPos(chunk: string): void;
35

36
  /**
37
   * Start new named entity consumption
38
   * @param startCp - Starting code point
39
   * @param endCp - Ending code point  
40
   */
41
  startNamedEntityConsumption(startCp: number, endCp: number): void;
42

43
  /**
44
   * Emit current character as token
45
   */
46
  emitCurrentCharacter(): void;
47

48
  /**
49
   * Emit EOF token
50
   */
51
  emitEOFToken(): void;
52

53
  /**
54
   * Get current tokenizer state
55
   */
56
  get state(): State;
57

58
  /**
59
   * Set tokenizer state
60
   */
61
  set state(newState: State);
62
}
63
```
64

65
### Tokenizer Options
66

67
Configuration options for the tokenizer.
68

69
```typescript { .api }
70
/**
71
 * Tokenizer configuration options
72
 */
73
interface TokenizerOptions {
74
  /**
75
   * Enable source code location information tracking.
76
   * When enabled, tokens will include location data.
77
   * Defaults to false.
78
   */
79
  sourceCodeLocationInfo?: boolean;
80
}
81
```
82

83
### Tokenizer Modes
84

85
Constants defining different tokenizer parsing modes based on context.
86

87
```typescript { .api }
88
/**
89
 * Tokenizer mode constants for different parsing contexts
90
 */
91
const TokenizerMode: {
92
  readonly DATA: State.DATA;
93
  readonly RCDATA: State.RCDATA;
94
  readonly RAWTEXT: State.RAWTEXT;
95
  readonly SCRIPT_DATA: State.SCRIPT_DATA;
96
  readonly PLAINTEXT: State.PLAINTEXT;
97
  readonly CDATA_SECTION: State.CDATA_SECTION;
98
};
99

100
/**
101
 * Internal tokenizer states (used by TokenizerMode)
102
 */
103
enum State {
104
  DATA = 0,
105
  RCDATA = 1,
106
  RAWTEXT = 2,
107
  SCRIPT_DATA = 3,
108
  PLAINTEXT = 4,
109
  CDATA_SECTION = 5,
110
  // ... additional internal states
111
}
112
```
113

114
**Usage Examples:**
115

116
```typescript
117
import { Tokenizer, TokenizerMode, type TokenizerOptions, type TokenHandler } from "parse5";
118

119
// Create token handler
120
const handler: TokenHandler = {
121
  onComment: (token) => console.log('Comment:', token.data),
122
  onDoctype: (token) => console.log('DOCTYPE:', token.name),
123
  onStartTag: (token) => console.log('Start tag:', token.tagName),
124
  onEndTag: (token) => console.log('End tag:', token.tagName),
125
  onEof: (token) => console.log('EOF reached'),
126
  onCharacter: (token) => console.log('Character:', token.chars),
127
  onNullCharacter: (token) => console.log('Null character'),
128
  onWhitespaceCharacter: (token) => console.log('Whitespace:', token.chars)
129
};
130

131
// Create tokenizer with location tracking
132
const tokenizer = new Tokenizer({ sourceCodeLocationInfo: true }, handler);
133

134
// Process HTML text
135
tokenizer.write('<div>Hello <span>World</span></div>', true);
136

137
// Set specific tokenizer mode for different contexts
138
tokenizer.state = TokenizerMode.SCRIPT_DATA; // For script content
139
tokenizer.state = TokenizerMode.RAWTEXT;     // For style content (title/textarea content uses RCDATA)
140
```
141

142
### Token Handler Interface
143

144
Interface for handling tokens emitted by the tokenizer.
145

146
```typescript { .api }
147
/**
148
 * Token handler interface for processing tokenizer output
149
 */
150
interface TokenHandler {
151
  /**
152
   * Handle comment tokens
153
   * @param token - Comment token
154
   */
155
  onComment(token: CommentToken): void;
156

157
  /**
158
   * Handle DOCTYPE tokens
159
   * @param token - DOCTYPE token
160
   */
161
  onDoctype(token: DoctypeToken): void;
162

163
  /**
164
   * Handle start tag tokens
165
   * @param token - Start tag token
166
   */
167
  onStartTag(token: TagToken): void;
168

169
  /**
170
   * Handle end tag tokens
171
   * @param token - End tag token
172
   */
173
  onEndTag(token: TagToken): void;
174

175
  /**
176
   * Handle end of file tokens
177
   * @param token - EOF token
178
   */
179
  onEof(token: EOFToken): void;
180

181
  /**
182
   * Handle character tokens
183
   * @param token - Character token
184
   */
185
  onCharacter(token: CharacterToken): void;
186

187
  /**
188
   * Handle null character tokens
189
   * @param token - Null character token
190
   */
191
  onNullCharacter(token: CharacterToken): void;
192

193
  /**
194
   * Handle whitespace character tokens
195
   * @param token - Whitespace character token
196
   */
197
  onWhitespaceCharacter(token: CharacterToken): void;
198

199
  /**
200
   * Optional error handler for parsing errors
201
   * @param error - Parser error information
202
   */
203
  onParseError?: ParserErrorHandler | null;
204
}
205
```
206

207
## Token Types
208

209
### Token Base Interface
210

211
Base interface shared by all token types.
212

213
```typescript { .api }
214
/**
215
 * Base interface for all token types
216
 */
217
interface TokenBase {
218
  /** Location information if sourceCodeLocationInfo is enabled */
219
  location?: Location;
220
}
221

222
/**
223
 * Union type of all token types
224
 */
225
type Token = DoctypeToken | TagToken | CommentToken | EOFToken | CharacterToken;
226
```
227

228
### Tag Tokens
229

230
Tokens representing HTML tags (both start and end tags).
231

232
```typescript { .api }
233
/**
234
 * Tag token representing HTML start and end tags
235
 */
236
interface TagToken extends TokenBase {
237
  /** Tag name (e.g., 'div', 'span') */
238
  tagName: string;
239
  
240
  /** Tag ID for efficient comparison */
241
  tagID: TAG_ID;
242
  
243
  /** Whether this is a self-closing tag */
244
  selfClosing: boolean;
245
  
246
  /** Whether the self-closing flag of this tag has been acknowledged by the consumer */
247
  ackSelfClosing: boolean;
248
  
249
  /** Tag attributes */
250
  attrs: Attribute[];
251
  
252
  /** Location info for attributes if enabled */
253
  location?: LocationWithAttributes;
254
}
255

256
/**
257
 * Attribute interface
258
 */
259
interface Attribute {
260
  /** Attribute name */
261
  name: string;
262
  
263
  /** Attribute value */
264
  value: string;
265
  
266
  /** Namespace URI if applicable */
267
  namespace?: string;
268
  
269
  /** Namespace prefix if applicable */
270
  prefix?: string;
271
}
272
```
273

274
### Character Tokens
275

276
Tokens representing text content and character data.
277

278
```typescript { .api }
279
/**
280
 * Character token representing text content
281
 */
282
interface CharacterToken extends TokenBase {
283
  /** Character data */
284
  chars: string;
285
  
286
  /** Location info if enabled */
287
  location?: Location;
288
}
289
```
290

291
### Comment Tokens
292

293
Tokens representing HTML comments.
294

295
```typescript { .api }
296
/**
297
 * Comment token representing HTML comments
298
 */
299
interface CommentToken extends TokenBase {
300
  /** Comment text content */
301
  data: string;
302
  
303
  /** Location info if enabled */
304
  location?: Location;
305
}
306
```
307

308
### DOCTYPE Tokens
309

310
Tokens representing HTML DOCTYPE declarations.
311

312
```typescript { .api }
313
/**
314
 * DOCTYPE token representing document type declarations
315
 */
316
interface DoctypeToken extends TokenBase {
317
  /** DOCTYPE name (usually 'html') */
318
  name: string | null;
319
  
320
  /** Public identifier */
321
  publicId: string | null;
322
  
323
  /** System identifier */
324
  systemId: string | null;
325
  
326
  /** Whether the DOCTYPE is force-quirks */
327
  forceQuirks: boolean;
328
  
329
  /** Location info if enabled */
330
  location?: Location;
331
}
332
```
333

334
### EOF Tokens
335

336
Tokens representing end of file.
337

338
```typescript { .api }
339
/**
340
 * EOF token representing end of file
341
 */
342
interface EOFToken extends TokenBase {
343
  /** Location info if enabled */
344
  location?: Location;
345
}
346
```
347

348
### Token Utilities
349

350
Utility functions for working with tokens.
351

352
```typescript { .api }
353
/**
354
 * Get attribute value from tag token
355
 * @param token - Tag token to search
356
 * @param attrName - Attribute name to find
357
 * @returns Attribute value or null if not found
358
 */
359
function getTokenAttr(token: TagToken, attrName: string): string | null;
360

361
/**
362
 * Token type enumeration
363
 */
364
enum TokenType {
365
  CHARACTER = 0,
366
  NULL_CHARACTER = 1,
367
  WHITESPACE_CHARACTER = 2,
368
  START_TAG = 3,
369
  END_TAG = 4,
370
  COMMENT = 5,
371
  DOCTYPE = 6,
372
  EOF = 7,
373
  HIBERNATION = 8
374
}
375
```
376

377
**Usage Examples:**
378

379
```typescript
380
import { Token, type TagToken, type CharacterToken, type CommentToken } from "parse5";
381

382
// Check token attribute
383
const tagToken: TagToken = /* ... */;
384
const className = Token.getTokenAttr(tagToken, 'class');
385
if (className) {
386
  console.log('Class name:', className);
387
}
388

389
// Handle different token types
390
function processToken(token: Token.Token) {
391
  switch (token.type) {
392
    case Token.TokenType.START_TAG:
393
      console.log('Start tag:', (token as TagToken).tagName);
394
      break;
395
    case Token.TokenType.CHARACTER:
396
      console.log('Text:', (token as CharacterToken).chars);
397
      break;
398
    case Token.TokenType.COMMENT:
399
      console.log('Comment:', (token as CommentToken).data);
400
      break;
401
  }
402
}
403
```
404

405
## Advanced Tokenization Patterns
406

407
### Custom Token Processing
408

409
```typescript
410
import { Tokenizer, type TokenHandler, type TagToken, type CommentToken, type DoctypeToken, type CharacterToken } from "parse5";
411

412
class CustomTokenProcessor implements TokenHandler {
413
  private tagStack: string[] = [];
414

415
  onStartTag(token: TagToken): void {
416
    this.tagStack.push(token.tagName);
417
    console.log(`Entering tag: ${token.tagName}, depth: ${this.tagStack.length}`);
418
    
419
    // Process attributes
420
    token.attrs.forEach(attr => {
421
      console.log(`  Attribute: ${attr.name}="${attr.value}"`);
422
    });
423
  }
424

425
  onEndTag(token: TagToken): void {
426
    const expectedTag = this.tagStack.pop();
427
    if (expectedTag !== token.tagName) {
428
      console.warn(`Mismatched tags: expected ${expectedTag}, got ${token.tagName}`);
429
    }
430
    console.log(`Exiting tag: ${token.tagName}, depth: ${this.tagStack.length}`);
431
  }
432

433
  onComment(token: CommentToken): void {
434
    console.log(`Comment: ${token.data}`);
435
  }
436

437
  onDoctype(token: DoctypeToken): void {
438
    console.log(`DOCTYPE: ${token.name}`);
439
  }
440

441
  onEof(): void {
442
    console.log('End of file reached');
443
  }
444

445
  onCharacter(token: CharacterToken): void {
446
    const trimmed = token.chars.trim();
447
    if (trimmed) {
448
      console.log(`Text content: ${trimmed}`);
449
    }
450
  }
451

452
  onNullCharacter(): void {
453
    console.warn('Null character encountered');
454
  }
455

456
  onWhitespaceCharacter(): void {
457
    // Usually ignore whitespace
458
  }
459
}
460

461
// Use custom processor
462
const processor = new CustomTokenProcessor();
463
const tokenizer = new Tokenizer({ sourceCodeLocationInfo: false }, processor);
464
tokenizer.write('<html><body>Hello World!</body></html>', true);
465
```
466

467
### Location-Aware Tokenization
468

469
```typescript
470
import { Tokenizer, type TokenHandler, type TagToken, type Location } from "parse5";
471

472
class LocationAwareHandler implements TokenHandler {
473
  private html: string;
474

475
  constructor(html: string) {
476
    this.html = html;
477
  }
478

479
  private getSourceSnippet(location: Location): string {
480
    return this.html.substring(location.startOffset, location.endOffset);
481
  }
482

483
  onStartTag(token: TagToken): void {
484
    if (token.location) {
485
      const snippet = this.getSourceSnippet(token.location);
486
      console.log(`Start tag at line ${token.location.startLine}: ${snippet}`);
487
      
488
      // Show attribute locations
489
      if (token.location.attrs) {
490
        Object.entries(token.location.attrs).forEach(([name, attrLocation]) => {
491
          const attrSnippet = this.getSourceSnippet(attrLocation);
492
          console.log(`  Attribute ${name} at line ${attrLocation.startLine}: ${attrSnippet}`);
493
        });
494
      }
495
    }
496
  }
497

498
  // ... implement other methods with location awareness
499
}
500
```

Version

Tile

Files

tokenization.mddocs/

tokenization.mddocs/