# HTML Tokenization

Low-level HTML tokenization functionality for advanced use cases that require direct access to the tokenization process. The tokenizer converts HTML text into a stream of tokens that the parser then processes into an AST.

## Capabilities

### Tokenizer Class

Core tokenizer class that processes HTML text into tokens.

```typescript { .api }
/**
 * HTML tokenizer class for low-level token processing
 * @internal - Advanced API for specialized use cases
 */
class Tokenizer {
  /**
   * Creates a new tokenizer instance
   * @param options - Tokenizer configuration options
   * @param handler - Token handler for processing tokens
   */
  constructor(options: TokenizerOptions, handler: TokenHandler);

  /**
   * Write HTML text to the tokenizer for processing
   * @param chunk - HTML text chunk to tokenize
   * @param isLastChunk - Whether this is the final chunk
   */
  write(chunk: string, isLastChunk: boolean): void;

  /**
   * Insert HTML text at the current position
   * @param chunk - HTML text to insert
   */
  insertHtmlAtCurrentPos(chunk: string): void;

  /**
   * Start new named entity consumption
   * @param startCp - Starting code point
   * @param endCp - Ending code point
   */
  startNamedEntityConsumption(startCp: number, endCp: number): void;

  /**
   * Emit current character as token
   */
  emitCurrentCharacter(): void;

  /**
   * Emit EOF token
   */
  emitEOFToken(): void;

  /**
   * Get current tokenizer state
   */
  get state(): State;

  /**
   * Set tokenizer state
   */
  set state(newState: State);
}
```

### Tokenizer Options

Configuration options for the tokenizer.

```typescript { .api }
/**
 * Tokenizer configuration options
 */
interface TokenizerOptions {
  /**
   * Enable source code location information tracking.
   * When enabled, tokens will include location data.
   * Defaults to false.
   */
  sourceCodeLocationInfo?: boolean;
}
```

### Tokenizer Modes

Constants defining different tokenizer parsing modes based on context.

```typescript { .api }
/**
 * Tokenizer mode constants for different parsing contexts
 */
const TokenizerMode: {
  readonly DATA: State.DATA;
  readonly RCDATA: State.RCDATA;
  readonly RAWTEXT: State.RAWTEXT;
  readonly SCRIPT_DATA: State.SCRIPT_DATA;
  readonly PLAINTEXT: State.PLAINTEXT;
  readonly CDATA_SECTION: State.CDATA_SECTION;
};

/**
 * Internal tokenizer states (used by TokenizerMode)
 */
enum State {
  DATA = 0,
  RCDATA = 1,
  RAWTEXT = 2,
  SCRIPT_DATA = 3,
  PLAINTEXT = 4,
  CDATA_SECTION = 5,
  // ... additional internal states
}
```

**Usage Examples:**

```typescript
import { Tokenizer, TokenizerMode, type TokenizerOptions, type TokenHandler } from "parse5";

// Create token handler
const handler: TokenHandler = {
  onComment: (token) => console.log('Comment:', token.data),
  onDoctype: (token) => console.log('DOCTYPE:', token.name),
  onStartTag: (token) => console.log('Start tag:', token.tagName),
  onEndTag: (token) => console.log('End tag:', token.tagName),
  onEof: (token) => console.log('EOF reached'),
  onCharacter: (token) => console.log('Character:', token.chars),
  onNullCharacter: (token) => console.log('Null character'),
  onWhitespaceCharacter: (token) => console.log('Whitespace:', token.chars)
};

// Create tokenizer with location tracking
const tokenizer = new Tokenizer({ sourceCodeLocationInfo: true }, handler);

// Process HTML text
tokenizer.write('<div>Hello <span>World</span></div>', true);

// Set specific tokenizer mode for different contexts
tokenizer.state = TokenizerMode.SCRIPT_DATA; // For script content
tokenizer.state = TokenizerMode.RAWTEXT; // For style/title content
```
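
The `isLastChunk` flag also supports incremental input, for example when markup arrives over the network. A minimal sketch, assuming an existing `TokenHandler` implementation (the chunk boundaries below are arbitrary):

```typescript
import { Tokenizer, type TokenHandler } from "parse5";

// Any TokenHandler implementation will do; declared here only to keep the sketch self-contained.
declare const handler: TokenHandler;

const streamingTokenizer = new Tokenizer({ sourceCodeLocationInfo: false }, handler);

// Feed the document in pieces; tokens are delivered to the handler as soon as they
// can be completed, even when a tag spans a chunk boundary.
streamingTokenizer.write('<ul><li>Fir', false);
streamingTokenizer.write('st</li><li>Second</li>', false);
streamingTokenizer.write('</ul>', true); // final chunk, so the tokenizer can finish and report EOF
```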

### Token Handler Interface

Interface for handling tokens emitted by the tokenizer.

```typescript { .api }
/**
 * Token handler interface for processing tokenizer output
 */
interface TokenHandler {
  /**
   * Handle comment tokens
   * @param token - Comment token
   */
  onComment(token: CommentToken): void;

  /**
   * Handle DOCTYPE tokens
   * @param token - DOCTYPE token
   */
  onDoctype(token: DoctypeToken): void;

  /**
   * Handle start tag tokens
   * @param token - Start tag token
   */
  onStartTag(token: TagToken): void;

  /**
   * Handle end tag tokens
   * @param token - End tag token
   */
  onEndTag(token: TagToken): void;

  /**
   * Handle end of file tokens
   * @param token - EOF token
   */
  onEof(token: EOFToken): void;

  /**
   * Handle character tokens
   * @param token - Character token
   */
  onCharacter(token: CharacterToken): void;

  /**
   * Handle null character tokens
   * @param token - Null character token
   */
  onNullCharacter(token: CharacterToken): void;

  /**
   * Handle whitespace character tokens
   * @param token - Whitespace character token
   */
  onWhitespaceCharacter(token: CharacterToken): void;

  /**
   * Optional error handler for parsing errors
   * @param error - Parser error information
   */
  onParseError?: ParserErrorHandler | null;
}
```
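
Every callback above is required, so a compact way to satisfy the interface is a handler that simply records each token for later inspection. A minimal sketch, relying only on the `Token.Token` union described under Token Types below:

```typescript
import { Tokenizer, type TokenHandler, Token } from "parse5";

// Record every emitted token, in order, for later inspection.
const collected: Token.Token[] = [];

const collector: TokenHandler = {
  onComment: (token) => collected.push(token),
  onDoctype: (token) => collected.push(token),
  onStartTag: (token) => collected.push(token),
  onEndTag: (token) => collected.push(token),
  onEof: (token) => collected.push(token),
  onCharacter: (token) => collected.push(token),
  onNullCharacter: (token) => collected.push(token),
  onWhitespaceCharacter: (token) => collected.push(token),
};

new Tokenizer({ sourceCodeLocationInfo: true }, collector)
  .write('<!-- note --><p class="intro">Hello</p>', true);

console.log(collected.length); // tokens observed, typically ending with the EOF token
```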

## Token Types

### Token Base Interface

Base interface shared by all token types.

```typescript { .api }
/**
 * Base interface for all token types
 */
interface TokenBase {
  /** Token type discriminator (see TokenType under Token Utilities) */
  type: TokenType;

  /** Location information if sourceCodeLocationInfo is enabled */
  location?: Location;
}

/**
 * Union type of all token types
 */
type Token = DoctypeToken | TagToken | CommentToken | EOFToken | CharacterToken;
```

### Tag Tokens

Tokens representing HTML tags (both start and end tags).

```typescript { .api }
/**
 * Tag token representing HTML start and end tags
 */
interface TagToken extends TokenBase {
  /** Tag name (e.g., 'div', 'span') */
  tagName: string;

  /** Tag ID for efficient comparison */
  tagID: TAG_ID;

  /** Whether this is a self-closing tag */
  selfClosing: boolean;

  /** Acknowledgment flag for self-closing */
  ackSelfClosing: boolean;

  /** Tag attributes */
  attrs: Attribute[];

  /** Location info for attributes if enabled */
  location?: LocationWithAttributes;
}

/**
 * Attribute interface
 */
interface Attribute {
  /** Attribute name */
  name: string;

  /** Attribute value */
  value: string;

  /** Namespace URI if applicable */
  namespace?: string;

  /** Namespace prefix if applicable */
  prefix?: string;
}
```
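
In a start-tag callback, attributes can be read either by scanning `attrs` directly or through the `getTokenAttr` utility documented under Token Utilities below. A small sketch, assuming the helper is called from an `onStartTag` handler:

```typescript
import { Token } from "parse5";

// Collect hyperlink targets from <a> start tags.
// Intended to be called from a TokenHandler's onStartTag callback.
function collectHref(token: Token.TagToken, out: string[]): void {
  if (token.tagName === 'a') {
    // getTokenAttr returns the attribute value, or null when the tag has no such attribute.
    const href = Token.getTokenAttr(token, 'href');
    if (href !== null) {
      out.push(href);
    }
  }
}
```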

### Character Tokens

Tokens representing text content and character data.

```typescript { .api }
/**
 * Character token representing text content
 */
interface CharacterToken extends TokenBase {
  /** Character data */
  chars: string;

  /** Location info if enabled */
  location?: Location;
}
```
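
Note that the tokenizer reports ordinary characters, whitespace-only runs, and NUL characters through separate callbacks, so rebuilding the document's text means handling more than `onCharacter`. A partial sketch (the remaining `TokenHandler` callbacks are omitted for brevity):

```typescript
import type { TokenHandler } from "parse5";

// Accumulate text content from all character-bearing callbacks.
let text = '';

const textCallbacks: Pick<TokenHandler, 'onCharacter' | 'onWhitespaceCharacter' | 'onNullCharacter'> = {
  onCharacter: (token) => { text += token.chars; },
  onWhitespaceCharacter: (token) => { text += token.chars; },
  onNullCharacter: () => { /* NUL characters are typically dropped or replaced */ },
};
```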

### Comment Tokens

Tokens representing HTML comments.

```typescript { .api }
/**
 * Comment token representing HTML comments
 */
interface CommentToken extends TokenBase {
  /** Comment text content */
  data: string;

  /** Location info if enabled */
  location?: Location;
}
```

### DOCTYPE Tokens

Tokens representing HTML DOCTYPE declarations.

```typescript { .api }
/**
 * DOCTYPE token representing document type declarations
 */
interface DoctypeToken extends TokenBase {
  /** DOCTYPE name (usually 'html') */
  name: string | null;

  /** Public identifier */
  publicId: string | null;

  /** System identifier */
  systemId: string | null;

  /** Whether the DOCTYPE is force-quirks */
  forceQuirks: boolean;

  /** Location info if enabled */
  location?: Location;
}
```
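
For example, a DOCTYPE token can be inspected to see whether the document declares the plain HTML5 doctype. A rough sketch (the HTML standard's actual quirks-mode rules compare public and system identifiers in more detail):

```typescript
import { Token } from "parse5";

// Rough check for `<!DOCTYPE html>` with no public or system identifier.
function isPlainHtml5Doctype(token: Token.DoctypeToken): boolean {
  return (
    !token.forceQuirks &&
    token.name === 'html' &&
    token.publicId === null &&
    token.systemId === null
  );
}
```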

### EOF Tokens

Tokens representing end of file.

```typescript { .api }
/**
 * EOF token representing end of file
 */
interface EOFToken extends TokenBase {
  /** Location info if enabled */
  location?: Location;
}
```

### Token Utilities

Utility functions for working with tokens.

```typescript { .api }
/**
 * Get attribute value from tag token
 * @param token - Tag token to search
 * @param attrName - Attribute name to find
 * @returns Attribute value or null if not found
 */
function getTokenAttr(token: TagToken, attrName: string): string | null;

/**
 * Token type enumeration
 */
enum TokenType {
  CHARACTER = 0,
  NULL_CHARACTER = 1,
  WHITESPACE_CHARACTER = 2,
  START_TAG = 3,
  END_TAG = 4,
  COMMENT = 5,
  DOCTYPE = 6,
  EOF = 7,
  HIBERNATION = 8
}
```

**Usage Examples:**

```typescript
import { Token, type TagToken, type CharacterToken, type CommentToken } from "parse5";

// Check token attribute
declare const tagToken: TagToken; // a start-tag token obtained from a TokenHandler
const className = Token.getTokenAttr(tagToken, 'class');
if (className) {
  console.log('Class name:', className);
}

// Handle different token types
function processToken(token: Token.Token) {
  switch (token.type) {
    case Token.TokenType.START_TAG:
      console.log('Start tag:', (token as TagToken).tagName);
      break;
    case Token.TokenType.CHARACTER:
      console.log('Text:', (token as CharacterToken).chars);
      break;
    case Token.TokenType.COMMENT:
      console.log('Comment:', (token as CommentToken).data);
      break;
  }
}
```

## Advanced Tokenization Patterns

### Custom Token Processing

```typescript
import {
  Tokenizer,
  type TokenHandler,
  type TagToken,
  type CommentToken,
  type DoctypeToken,
  type CharacterToken,
} from "parse5";

class CustomTokenProcessor implements TokenHandler {
  private tagStack: string[] = [];

  onStartTag(token: TagToken): void {
    this.tagStack.push(token.tagName);
    console.log(`Entering tag: ${token.tagName}, depth: ${this.tagStack.length}`);

    // Process attributes
    token.attrs.forEach(attr => {
      console.log(`  Attribute: ${attr.name}="${attr.value}"`);
    });
  }

  onEndTag(token: TagToken): void {
    const expectedTag = this.tagStack.pop();
    if (expectedTag !== token.tagName) {
      console.warn(`Mismatched tags: expected ${expectedTag}, got ${token.tagName}`);
    }
    console.log(`Exiting tag: ${token.tagName}, depth: ${this.tagStack.length}`);
  }

  onComment(token: CommentToken): void {
    console.log(`Comment: ${token.data}`);
  }

  onDoctype(token: DoctypeToken): void {
    console.log(`DOCTYPE: ${token.name}`);
  }

  onEof(): void {
    console.log('End of file reached');
  }

  onCharacter(token: CharacterToken): void {
    const trimmed = token.chars.trim();
    if (trimmed) {
      console.log(`Text content: ${trimmed}`);
    }
  }

  onNullCharacter(): void {
    console.warn('Null character encountered');
  }

  onWhitespaceCharacter(): void {
    // Usually ignore whitespace
  }
}

// Use custom processor
const processor = new CustomTokenProcessor();
const tokenizer = new Tokenizer({ sourceCodeLocationInfo: false }, processor);
tokenizer.write('<html><body>Hello World!</body></html>', true);
```

### Location-Aware Tokenization

```typescript
import { Tokenizer, type TokenHandler, type TagToken, type Location } from "parse5";

class LocationAwareHandler implements TokenHandler {
  private html: string;

  constructor(html: string) {
    this.html = html;
  }

  private getSourceSnippet(location: Location): string {
    return this.html.substring(location.startOffset, location.endOffset);
  }

  onStartTag(token: TagToken): void {
    if (token.location) {
      const snippet = this.getSourceSnippet(token.location);
      console.log(`Start tag at line ${token.location.startLine}: ${snippet}`);

      // Show attribute locations
      if (token.location.attrs) {
        Object.entries(token.location.attrs).forEach(([name, attrLocation]) => {
          const attrSnippet = this.getSourceSnippet(attrLocation);
          console.log(`  Attribute ${name} at line ${attrLocation.startLine}: ${attrSnippet}`);
        });
      }
    }
  }

  // ... implement other methods with location awareness
}
```
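
A hedged usage sketch for the handler above: it is only useful when `sourceCodeLocationInfo` is enabled, and it must be constructed with exactly the string passed to `write`, because location offsets index into that text. This assumes the elided `TokenHandler` methods have been filled in.

```typescript
import { Tokenizer } from "parse5";

const html = '<section id="intro">\n  <h1 class="title">Hello</h1>\n</section>';

// The handler keeps the same string the tokenizer sees, so startOffset/endOffset
// from token locations can be used to slice source snippets out of it.
const locationHandler = new LocationAwareHandler(html);
const locatingTokenizer = new Tokenizer({ sourceCodeLocationInfo: true }, locationHandler);

locatingTokenizer.write(html, true);
```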