# HTML Tokenization

Low-level HTML tokenization functionality for advanced use cases that require direct access to the tokenization process. The tokenizer converts HTML text into a stream of tokens, which the parser then processes into an AST.

## Capabilities

### Tokenizer Class

Core tokenizer class that processes HTML text into tokens.

```typescript { .api }
11
/**
12
* HTML tokenizer class for low-level token processing
13
* @internal - Advanced API for specialized use cases
14
*/
15
class Tokenizer {
16
/**
17
* Creates a new tokenizer instance
18
* @param options - Tokenizer configuration options
19
* @param handler - Token handler for processing tokens
20
*/
21
constructor(options: TokenizerOptions, handler: TokenHandler);
22
23
/**
24
* Write HTML text to the tokenizer for processing
25
* @param chunk - HTML text chunk to tokenize
26
* @param isLastChunk - Whether this is the final chunk
27
*/
28
write(chunk: string, isLastChunk: boolean): void;
29
30
/**
31
* Insert HTML text at the current position
32
* @param chunk - HTML text to insert
33
*/
34
insertHtmlAtCurrentPos(chunk: string): void;
35
36
/**
37
* Start new named entity consumption
38
* @param startCp - Starting code point
39
* @param endCp - Ending code point
40
*/
41
startNamedEntityConsumption(startCp: number, endCp: number): void;
42
43
/**
44
* Emit current character as token
45
*/
46
emitCurrentCharacter(): void;
47
48
/**
49
* Emit EOF token
50
*/
51
emitEOFToken(): void;
52
53
/**
54
* Get current tokenizer state
55
*/
56
get state(): State;
57
58
/**
59
* Set tokenizer state
60
*/
61
set state(newState: State);
62
}
63
```
64
65
### Tokenizer Options
66
67
Configuration options for the tokenizer.
68
69
```typescript { .api }
70
/**
71
* Tokenizer configuration options
72
*/
73
interface TokenizerOptions {
74
/**
75
* Enable source code location information tracking.
76
* When enabled, tokens will include location data.
77
* Defaults to false.
78
*/
79
sourceCodeLocationInfo?: boolean;
80
}
81
```
82
83
### Tokenizer Modes
84
85
Constants defining different tokenizer parsing modes based on context.
86
87
```typescript { .api }
88
/**
89
* Tokenizer mode constants for different parsing contexts
90
*/
91
const TokenizerMode: {
92
readonly DATA: State.DATA;
93
readonly RCDATA: State.RCDATA;
94
readonly RAWTEXT: State.RAWTEXT;
95
readonly SCRIPT_DATA: State.SCRIPT_DATA;
96
readonly PLAINTEXT: State.PLAINTEXT;
97
readonly CDATA_SECTION: State.CDATA_SECTION;
98
};
99
100
/**
101
* Internal tokenizer states (used by TokenizerMode)
102
*/
103
enum State {
104
DATA = 0,
105
RCDATA = 1,
106
RAWTEXT = 2,
107
SCRIPT_DATA = 3,
108
PLAINTEXT = 4,
109
CDATA_SECTION = 5,
110
// ... additional internal states
111
}
112
```
113
114
**Usage Examples:**
115
116
```typescript
117
import { Tokenizer, TokenizerMode, type TokenizerOptions, type TokenHandler } from "parse5";

// Token handler: one callback per kind of token the tokenizer emits.
// Callbacks that do not need the token omit the parameter.
const handler: TokenHandler = {
  onComment: (token) => console.log('Comment:', token.data),
  onDoctype: (token) => console.log('DOCTYPE:', token.name),
  onStartTag: (token) => console.log('Start tag:', token.tagName),
  onEndTag: (token) => console.log('End tag:', token.tagName),
  onEof: () => console.log('EOF reached'),
  onCharacter: (token) => console.log('Character:', token.chars),
  onNullCharacter: () => console.log('Null character'),
  onWhitespaceCharacter: (token) => console.log('Whitespace:', token.chars)
};

// Create tokenizer with location tracking
const options: TokenizerOptions = { sourceCodeLocationInfo: true };
const tokenizer = new Tokenizer(options, handler);

// Process HTML text (single chunk, so it is also the last chunk)
tokenizer.write('<div>Hello <span>World</span></div>', true);

// Set specific tokenizer mode for different contexts
tokenizer.state = TokenizerMode.SCRIPT_DATA; // For script content
tokenizer.state = TokenizerMode.RAWTEXT; // For style content (title/textarea use TokenizerMode.RCDATA)
140
```
141
142
### Token Handler Interface
143
144
Interface for handling tokens emitted by the tokenizer.
145
146
```typescript { .api }
147
/**
148
* Token handler interface for processing tokenizer output
149
*/
150
interface TokenHandler {
151
/**
152
* Handle comment tokens
153
* @param token - Comment token
154
*/
155
onComment(token: CommentToken): void;
156
157
/**
158
* Handle DOCTYPE tokens
159
* @param token - DOCTYPE token
160
*/
161
onDoctype(token: DoctypeToken): void;
162
163
/**
164
* Handle start tag tokens
165
* @param token - Start tag token
166
*/
167
onStartTag(token: TagToken): void;
168
169
/**
170
* Handle end tag tokens
171
* @param token - End tag token
172
*/
173
onEndTag(token: TagToken): void;
174
175
/**
176
* Handle end of file tokens
177
* @param token - EOF token
178
*/
179
onEof(token: EOFToken): void;
180
181
/**
182
* Handle character tokens
183
* @param token - Character token
184
*/
185
onCharacter(token: CharacterToken): void;
186
187
/**
188
* Handle null character tokens
189
* @param token - Null character token
190
*/
191
onNullCharacter(token: CharacterToken): void;
192
193
/**
194
* Handle whitespace character tokens
195
* @param token - Whitespace character token
196
*/
197
onWhitespaceCharacter(token: CharacterToken): void;
198
199
/**
200
* Optional error handler for parsing errors
201
* @param error - Parser error information
202
*/
203
onParseError?: ParserErrorHandler | null;
204
}
205
```
206
207
## Token Types
208
209
### Token Base Interface
210
211
Base interface shared by all token types.
212
213
```typescript { .api }
214
/**
215
* Base interface for all token types
216
*/
217
interface TokenBase {
  /** Discriminator identifying the kind of token; switch on it to tell token kinds apart */
  readonly type: TokenType;

  /** Location information if sourceCodeLocationInfo is enabled */
  location?: Location;
}
221
222
/**
223
* Union type of all token types
224
*/
225
type Token = DoctypeToken | TagToken | CommentToken | EOFToken | CharacterToken;
226
```
227
228
### Tag Tokens
229
230
Tokens representing HTML tags (both start and end tags).
231
232
```typescript { .api }
233
/**
234
* Tag token representing HTML start and end tags
235
*/
236
interface TagToken extends TokenBase {
237
/** Tag name (e.g., 'div', 'span') */
238
tagName: string;
239
240
/** Tag ID for efficient comparison */
241
tagID: TAG_ID;
242
243
/** Whether this is a self-closing tag */
244
selfClosing: boolean;
245
246
/** Set to true once the consumer (normally the parser) has acknowledged the token's self-closing flag */
247
ackSelfClosing: boolean;
248
249
/** Tag attributes */
250
attrs: Attribute[];
251
252
/** Location info for attributes if enabled */
253
location?: LocationWithAttributes;
254
}
255
256
/**
257
* Attribute interface
258
*/
259
interface Attribute {
260
/** Attribute name */
261
name: string;
262
263
/** Attribute value */
264
value: string;
265
266
/** Namespace URI if applicable */
267
namespace?: string;
268
269
/** Namespace prefix if applicable */
270
prefix?: string;
271
}
272
```
273
274
### Character Tokens
275
276
Tokens representing text content and character data.
277
278
```typescript { .api }
279
/**
280
* Character token representing text content
281
*/
282
interface CharacterToken extends TokenBase {
283
/** Character data */
284
chars: string;
285
286
/** Location info if enabled */
287
location?: Location;
288
}
289
```
290
291
### Comment Tokens
292
293
Tokens representing HTML comments.
294
295
```typescript { .api }
296
/**
297
* Comment token representing HTML comments
298
*/
299
interface CommentToken extends TokenBase {
300
/** Comment text content */
301
data: string;
302
303
/** Location info if enabled */
304
location?: Location;
305
}
306
```
307
308
### DOCTYPE Tokens
309
310
Tokens representing HTML DOCTYPE declarations.
311
312
```typescript { .api }
313
/**
314
* DOCTYPE token representing document type declarations
315
*/
316
interface DoctypeToken extends TokenBase {
317
/** DOCTYPE name (usually 'html') */
318
name: string | null;
319
320
/** Public identifier */
321
publicId: string | null;
322
323
/** System identifier */
324
systemId: string | null;
325
326
/** Whether the DOCTYPE is force-quirks */
327
forceQuirks: boolean;
328
329
/** Location info if enabled */
330
location?: Location;
331
}
332
```
333
334
### EOF Tokens
335
336
Tokens representing end of file.
337
338
```typescript { .api }
339
/**
340
* EOF token representing end of file
341
*/
342
interface EOFToken extends TokenBase {
343
/** Location info if enabled */
344
location?: Location;
345
}
346
```
347
348
### Token Utilities
349
350
Utility functions for working with tokens.
351
352
```typescript { .api }
353
/**
354
* Get attribute value from tag token
355
* @param token - Tag token to search
356
* @param attrName - Attribute name to find
357
* @returns Attribute value or null if not found
358
*/
359
function getTokenAttr(token: TagToken, attrName: string): string | null;
360
361
/**
362
* Token type enumeration
363
*/
364
enum TokenType {
365
CHARACTER = 0,
366
NULL_CHARACTER = 1,
367
WHITESPACE_CHARACTER = 2,
368
START_TAG = 3,
369
END_TAG = 4,
370
COMMENT = 5,
371
DOCTYPE = 6,
372
EOF = 7,
373
HIBERNATION = 8
374
}
375
```
376
377
**Usage Examples:**
378
379
```typescript
380
import { Token } from "parse5";

// Check token attribute.
// `tagToken` stands in for a tag token received from a TokenHandler callback.
declare const tagToken: Token.TagToken;
const className = Token.getTokenAttr(tagToken, 'class');
// getTokenAttr returns null when the attribute is absent; compare against null
// so that a present-but-empty attribute (class="") is still reported.
if (className !== null) {
  console.log('Class name:', className);
}

/**
 * Handle different token types.
 * Dispatches on the token's `type` and logs the payload of start tags,
 * character data and comments; every other token kind is ignored.
 * @param token - Any token produced by the tokenizer
 */
function processToken(token: Token.Token): void {
  switch (token.type) {
    case Token.TokenType.START_TAG:
      console.log('Start tag:', (token as Token.TagToken).tagName);
      break;
    case Token.TokenType.CHARACTER:
      console.log('Text:', (token as Token.CharacterToken).chars);
      break;
    case Token.TokenType.COMMENT:
      console.log('Comment:', (token as Token.CommentToken).data);
      break;
  }
}
403
```
404
405
## Advanced Tokenization Patterns
406
407
### Custom Token Processing
408
409
```typescript
410
import { Tokenizer, Token, type TokenHandler } from "parse5";

/**
 * HTML void elements. They never produce an end tag, so they must not be
 * tracked as open elements or every later end tag would look mismatched.
 */
const VOID_ELEMENTS: ReadonlySet<string> = new Set([
  'area', 'base', 'br', 'col', 'embed', 'hr', 'img',
  'input', 'link', 'meta', 'source', 'track', 'wbr',
]);

/**
 * Token handler that logs every token and keeps a stack of open tag names
 * to report nesting depth and mismatched end tags.
 */
class CustomTokenProcessor implements TokenHandler {
  private readonly tagStack: string[] = [];

  /** Logs the start tag with its depth and attributes; non-void tags are pushed on the stack. */
  onStartTag(token: Token.TagToken): void {
    const { tagName, attrs } = token;
    const isVoid = VOID_ELEMENTS.has(tagName);
    if (!isVoid) {
      this.tagStack.push(tagName);
    }
    // A void element is not on the stack, so it sits one level below the open elements.
    const depth = this.tagStack.length + (isVoid ? 1 : 0);
    console.log(`Entering tag: ${tagName}, depth: ${depth}`);

    // Process attributes
    for (const attr of attrs) {
      console.log(`  Attribute: ${attr.name}="${attr.value}"`);
    }
  }

  /** Pops the most recently opened tag and warns when it differs from the end tag seen. */
  onEndTag(token: Token.TagToken): void {
    const expectedTag = this.tagStack.pop();
    if (expectedTag !== token.tagName) {
      console.warn(`Mismatched tags: expected ${expectedTag}, got ${token.tagName}`);
    }
    console.log(`Exiting tag: ${token.tagName}, depth: ${this.tagStack.length}`);
  }

  onComment(token: Token.CommentToken): void {
    console.log(`Comment: ${token.data}`);
  }

  onDoctype(token: Token.DoctypeToken): void {
    console.log(`DOCTYPE: ${token.name}`);
  }

  onEof(): void {
    console.log('End of file reached');
  }

  /** Logs text content, skipping runs that are empty after trimming. */
  onCharacter(token: Token.CharacterToken): void {
    const trimmed = token.chars.trim();
    if (trimmed) {
      console.log(`Text content: ${trimmed}`);
    }
  }

  onNullCharacter(): void {
    console.warn('Null character encountered');
  }

  onWhitespaceCharacter(): void {
    // Usually ignore whitespace
  }
}

// Use custom processor
const processor = new CustomTokenProcessor();
const tokenizer = new Tokenizer({ sourceCodeLocationInfo: false }, processor);
tokenizer.write('<html><body>Hello World!</body></html>', true);
465
```
466
467
### Location-Aware Tokenization
468
469
```typescript
470
import { Tokenizer, Token, type TokenHandler } from "parse5";

/**
 * Token handler that maps token locations back to the original HTML text.
 * Requires a tokenizer created with `sourceCodeLocationInfo: true`; tokens
 * without location data are ignored.
 */
class LocationAwareHandler implements TokenHandler {
  /**
   * @param html - The exact source text that is fed to the tokenizer
   */
  constructor(private readonly html: string) {}

  /** Returns the slice of the source text covered by `location`. */
  private getSourceSnippet(location: Token.Location): string {
    return this.html.substring(location.startOffset, location.endOffset);
  }

  /** Logs the start tag's source text and line, then the same for each attribute. */
  onStartTag(token: Token.TagToken): void {
    const { location } = token;
    if (!location) {
      return;
    }

    const snippet = this.getSourceSnippet(location);
    console.log(`Start tag at line ${location.startLine}: ${snippet}`);

    // Show attribute locations
    if (location.attrs) {
      for (const [name, attrLocation] of Object.entries(location.attrs)) {
        const attrSnippet = this.getSourceSnippet(attrLocation);
        console.log(`  Attribute ${name} at line ${attrLocation.startLine}: ${attrSnippet}`);
      }
    }
  }

  // The remaining TokenHandler callbacks are no-ops so the class satisfies the
  // interface; implement them with location awareness as needed.
  onEndTag(): void {}
  onComment(): void {}
  onDoctype(): void {}
  onEof(): void {}
  onCharacter(): void {}
  onNullCharacter(): void {}
  onWhitespaceCharacter(): void {}
}
500
```