Low-Level Tokenization

Direct access to the underlying tokenizer for custom parsing implementations and advanced use cases. The Tokenizer class provides the lowest-level interface to HTML/XML parsing.

Capabilities

Tokenizer Class

The core tokenization engine that processes HTML/XML character streams and fires low-level parsing events.

/**
 * Low-level HTML/XML tokenizer with state machine parsing
 */
class Tokenizer {
  /** Whether the tokenizer is currently running */
  running: boolean;
  
  /**
   * Create a new Tokenizer instance
   * @param options - Tokenizer configuration options
   * @param cbs - Callback object implementing Callbacks interface
   */
  constructor(options: ParserOptions, cbs: Callbacks);
  
  /**
   * Write data to the tokenizer for processing
   * @param chunk - String data to tokenize
   */
  write(chunk: string): void;
  
  /**
   * Signal end of input and complete tokenization
   * @param chunk - Optional final chunk of data
   */
  end(chunk?: string): void;
  
  /** Pause tokenization - can be resumed later */
  pause(): void;
  
  /** Resume tokenization after pause */
  resume(): void;
  
  /** Reset tokenizer to initial state */
  reset(): void;
}
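
The typical lifecycle is construct, write one or more chunks, then end. A minimal sketch, assuming a no-op implementation of the Callbacks interface described below (the sample input and the "done" log are illustrative):

import Tokenizer, { type Callbacks } from "htmlparser2/lib/Tokenizer";

const noop = () => {};
const callbacks: Callbacks = {
  onattribdata: noop, onattribentity: noop, onattribend: noop,
  onattribname: noop, oncdata: noop, onclosetag: noop, oncomment: noop,
  ondeclaration: noop, onend: () => console.log("done"),
  onopentagend: noop, onopentagname: noop, onprocessinginstruction: noop,
  onselfclosingtag: noop, ontext: noop, ontextentity: noop,
};

const tokenizer = new Tokenizer({ decodeEntities: true }, callbacks);

tokenizer.write("<p>Hello &amp; welcome</p>"); // events fire synchronously during write()
tokenizer.end();                               // flush remaining input, then onend()

// pause()/resume() stop and restart event delivery (typically called from
// inside a callback); reset() returns the tokenizer to its initial state.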

Tokenizer Callbacks

The Callbacks interface defines the low-level events fired during tokenization. The start and endIndex arguments are character offsets into the overall input written to the tokenizer, so callers typically keep (or accumulate) the input string in order to slice out token content:

interface Callbacks {
  /** Called for attribute data content */
  onattribdata(start: number, endIndex: number): void;
  
  /** Called for HTML entity in attribute value */
  onattribentity(codepoint: number): void;
  
  /** Called when attribute ends */
  onattribend(quote: QuoteType, endIndex: number): void;
  
  /** Called for attribute name */
  onattribname(start: number, endIndex: number): void;
  
  /** Called for CDATA section content */
  oncdata(start: number, endIndex: number, endOffset: number): void;
  
  /** Called for a closing tag; start and endIndex delimit the tag name */
  onclosetag(start: number, endIndex: number): void;
  
  /** Called for comment content */
  oncomment(start: number, endIndex: number, endOffset: number): void;
  
  /** Called for DOCTYPE declaration */
  ondeclaration(start: number, endIndex: number): void;
  
  /** Called when tokenization ends */
  onend(): void;
  
  /** Called when opening tag ends */
  onopentagend(endIndex: number): void;
  
  /** Called for opening tag name */
  onopentagname(start: number, endIndex: number): void;
  
  /** Called for processing instruction */
  onprocessinginstruction(start: number, endIndex: number): void;
  
  /** Called for self-closing tag */
  onselfclosingtag(endIndex: number): void;
  
  /** Called for text content */
  ontext(start: number, endIndex: number): void;
  
  /** Called for HTML entity in text */
  ontextentity(codepoint: number, endIndex: number): void;
}

Quote Type Enumeration

Defines the types of quotes used for attribute values:

enum QuoteType {
  /** Attribute has no value (e.g., `disabled`) */
  NoValue = 0,
  
  /** Attribute value is unquoted (e.g., `class=button`) */
  Unquoted = 1,
  
  /** Attribute value uses single quotes (e.g., `class='button'`) */
  Single = 2,
  
  /** Attribute value uses double quotes (e.g., `class="button"`) */
  Double = 3
}
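
The quote value is passed as the first argument of onattribend, which is where an implementation can distinguish boolean attributes from attributes with values. A small illustrative helper (describeQuote is hypothetical, not part of htmlparser2):

import { QuoteType } from "htmlparser2/lib/Tokenizer";

function describeQuote(quote: QuoteType): string {
  switch (quote) {
    case QuoteType.NoValue:
      return "boolean attribute (no value)";
    case QuoteType.Unquoted:
      return "unquoted value";
    case QuoteType.Single:
      return "single-quoted value";
    case QuoteType.Double:
      return "double-quoted value";
    default:
      return "unknown";
  }
}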

Advanced Usage Examples

Custom Parser Implementation

import Tokenizer, { type Callbacks, QuoteType } from "htmlparser2/lib/Tokenizer";
import type { ParserOptions } from "htmlparser2";

class CustomHtmlParser implements Callbacks {
  private tokenizer: Tokenizer;
  private buffer: string = '';
  private tags: Array<{name: string, attrs: Record<string, string>}> = [];
  private currentTag: string = '';
  private currentAttr: string = '';
  private attributes: Record<string, string> = {};
  
  constructor(private options: ParserOptions = {}) {
    this.tokenizer = new Tokenizer(options, this);
  }
  
  parse(html: string): Array<{name: string, attrs: Record<string, string>}> {
    this.buffer = html;
    this.tokenizer.write(html);
    this.tokenizer.end();
    return this.tags;
  }
  
  // Implement required callbacks
  onattribdata(start: number, endIndex: number): void {
    // May fire several times per attribute value, so append rather than overwrite
    this.attributes[this.currentAttr] += this.buffer.slice(start, endIndex);
  }
  
  onattribentity(codepoint: number): void {
    // Decoded HTML entity inside an attribute value
    this.attributes[this.currentAttr] += String.fromCodePoint(codepoint);
  }
  
  onattribend(quote: QuoteType, endIndex: number): void {
    // Attribute parsing complete
  }
  
  onattribname(start: number, endIndex: number): void {
    this.currentAttr = this.buffer.slice(start, endIndex);
    this.attributes[this.currentAttr] = '';
  }
  
  onopentagname(start: number, endIndex: number): void {
    this.currentTag = this.buffer.slice(start, endIndex);
    this.attributes = {};
  }
  
  onopentagend(endIndex: number): void {
    this.tags.push({
      name: this.currentTag,
      attrs: { ...this.attributes }
    });
  }
  
  // Implement other required callbacks
  oncdata(): void {}
  onclosetag(): void {}
  oncomment(): void {}
  ondeclaration(): void {}
  onend(): void {}
  onprocessinginstruction(): void {}
  onselfclosingtag(): void {}
  ontext(): void {}
  ontextentity(): void {}
}

// Usage (the raw tokenizer reports tag names exactly as written)
const parser = new CustomHtmlParser();
const tags = parser.parse('<DIV class="test" id=myid><span>Hello</span></DIV>');
console.log(tags);
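// Logs roughly:
// [ { name: "DIV", attrs: { class: "test", id: "myid" } },
//   { name: "span", attrs: {} } ]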

Streaming Tokenizer

import Tokenizer, { type Callbacks } from "htmlparser2/lib/Tokenizer";

class StreamingTokenizer implements Callbacks {
  private buffer: string = '';
  private tokenizer: Tokenizer;
  
  constructor() {
    this.tokenizer = new Tokenizer({}, this);
  }
  
  processChunk(chunk: string): void {
    // Keep the full input around: callback indices are offsets into the
    // entire stream, not into the individual chunk
    this.buffer += chunk;
    this.tokenizer.write(chunk);
  }
  
  finish(): void {
    this.tokenizer.end();
  }
  
  // Track text content positions
  ontext(start: number, endIndex: number): void {
    const text = this.buffer.slice(start, endIndex);
    if (text.trim()) {
      console.log(`Text at ${start}-${endIndex}: "${text}"`);
    }
  }
  
  // Track tag positions
  onopentagname(start: number, endIndex: number): void {
    const tagName = this.buffer.slice(start, endIndex);
    console.log(`Opening tag at ${start}-${endIndex}: <${tagName}>`);
  }
  
  onclosetag(start: number, endIndex: number): void {
    const tagName = this.buffer.slice(start, endIndex); // start/endIndex delimit the tag name itself
    console.log(`Closing tag at ${start}-${endIndex}: </${tagName}>`);
  }
  
  // Implement required callbacks (minimal)
  onattribdata(): void {}
  onattribentity(): void {}
  onattribend(): void {}
  onattribname(): void {}
  oncdata(): void {}
  oncomment(): void {}
  ondeclaration(): void {}
  onend(): void { console.log("Tokenization complete"); }
  onopentagend(): void {}
  onprocessinginstruction(): void {}
  onselfclosingtag(): void {}
  ontextentity(): void {}
}

// Usage for streaming
const tokenizer = new StreamingTokenizer();
tokenizer.processChunk("<html><body>");
tokenizer.processChunk("<h1>Title</h1>");
tokenizer.processChunk("<p>Content</p>");
tokenizer.processChunk("</body></html>");
tokenizer.finish();

Entity Handling

import Tokenizer, { type Callbacks } from "htmlparser2/lib/Tokenizer";

class EntityProcessor implements Callbacks {
  private buffer: string = '';
  private entities: Array<{position: number, codepoint: number, char: string}> = [];
  
  private tokenizer: Tokenizer;
  
  constructor() {
    this.tokenizer = new Tokenizer({ decodeEntities: true }, this);
  }
  
  process(html: string): Array<{position: number, codepoint: number, char: string}> {
    this.buffer = html;
    this.entities = [];
    this.tokenizer.write(html);
    this.tokenizer.end();
    return this.entities;
  }
  
  ontextentity(codepoint: number, endIndex: number): void {
    this.entities.push({
      position: endIndex,
      codepoint,
      char: String.fromCodePoint(codepoint)
    });
  }
  
  onattribentity(codepoint: number): void {
    this.entities.push({
      position: -1, // In attribute
      codepoint,
      char: String.fromCodePoint(codepoint)
    });
  }
  
  // Minimal required callbacks
  onattribdata(): void {}
  onattribend(): void {}
  onattribname(): void {}
  oncdata(): void {}
  onclosetag(): void {}
  oncomment(): void {}
  ondeclaration(): void {}
  onend(): void {}
  onopentagend(): void {}
  onopentagname(): void {}
  onprocessinginstruction(): void {}
  onselfclosingtag(): void {}
  ontext(): void {}
}

// Usage
const processor = new EntityProcessor();
const entities = processor.process('Text &amp; more &lt;tags&gt; &copy; 2025');
console.log(entities);
// Output: entities found with their codepoints and positions

Performance Monitoring

import Tokenizer, { type Callbacks } from "htmlparser2/lib/Tokenizer";

class PerformanceTokenizer implements Callbacks {
  private stats = {
    tags: 0,
    attributes: 0,
    textNodes: 0,
    comments: 0,
    entities: 0,
    startTime: 0,
    endTime: 0
  };
  
  private tokenizer: Tokenizer;
  
  constructor() {
    this.tokenizer = new Tokenizer({}, this);
  }
  
  parse(html: string) {
    this.stats.startTime = Date.now();
    this.tokenizer.write(html);
    this.tokenizer.end();
    return this.stats;
  }
  
  onopentagname(): void {
    this.stats.tags++;
  }
  
  onattribname(): void {
    this.stats.attributes++;
  }
  
  ontext(): void {
    this.stats.textNodes++;
  }
  
  oncomment(): void {
    this.stats.comments++;
  }
  
  ontextentity(): void {
    this.stats.entities++;
  }
  
  onattribentity(): void {
    this.stats.entities++;
  }
  
  onend(): void {
    this.stats.endTime = Date.now();
    console.log(`Parsing completed in ${this.stats.endTime - this.stats.startTime}ms`);
    console.log(`Found: ${this.stats.tags} tags, ${this.stats.attributes} attributes`);
  }
  
  // Minimal required callbacks
  onattribdata(): void {}
  onattribend(): void {}
  oncdata(): void {}
  onclosetag(): void {}
  ondeclaration(): void {}
  onopentagend(): void {}
  onprocessinginstruction(): void {}
  onselfclosingtag(): void {}
}
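
// Usage (input string is illustrative)
const perf = new PerformanceTokenizer();
const stats = perf.parse('<div class="a"><p>Hello &amp; goodbye</p><!-- note --></div>');
console.log(stats);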

Configuration Options

The Tokenizer uses the same ParserOptions as the Parser:

interface ParserOptions {
  /** Enable XML parsing mode */
  xmlMode?: boolean;
  /** Decode HTML entities */
  decodeEntities?: boolean;  
  /** Convert tag names to lowercase */
  lowerCaseTags?: boolean;
  /** Convert attribute names to lowercase */
  lowerCaseAttributeNames?: boolean;
  /** Recognize CDATA sections */
  recognizeCDATA?: boolean;
  /** Recognize self-closing tags */
  recognizeSelfClosing?: boolean;
}

Of these, the tokenizer itself consults xmlMode (which controls special handling of elements such as script, style, and title) and decodeEntities (whether entities are decoded and reported through the entity callbacks). The remaining flags are applied by the higher-level Parser on top of the raw token stream.
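
For instance, decodeEntities controls whether an entity like &amp; in text is decoded and reported through ontextentity, or simply left inside the text reported by ontext. A minimal sketch reusing the no-op callback pattern from the lifecycle example above (the logging callbacks and sample input are illustrative):

import Tokenizer, { type Callbacks } from "htmlparser2/lib/Tokenizer";

function tokenize(decodeEntities: boolean): void {
  const noop = () => {};
  const cbs: Callbacks = {
    onattribdata: noop, onattribentity: noop, onattribend: noop,
    onattribname: noop, oncdata: noop, onclosetag: noop, oncomment: noop,
    ondeclaration: noop, onend: noop, onopentagend: noop, onopentagname: noop,
    onprocessinginstruction: noop, onselfclosingtag: noop,
    ontext: (start, endIndex) => console.log("text", start, endIndex),
    ontextentity: (codepoint) => console.log("entity", String.fromCodePoint(codepoint)),
  };
  const tokenizer = new Tokenizer({ decodeEntities }, cbs);
  tokenizer.write("a &amp; b");
  tokenizer.end();
}

tokenize(true);  // "&amp;" is reported via ontextentity
tokenize(false); // "&amp;" stays in the text reported via ontext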