CtrlK
Blog · Docs · Log in · Get started
Tessl Logo

tessl/npm-htmlparser2

Fast & forgiving HTML/XML parser with callback-based interface and DOM generation capabilities

Pending
Quality

Pending

Does it follow best practices?

Impact

Pending

No eval scenarios have been run

Security by Snyk

Pending

The risk profile of this skill

Overview
Eval results
Files

docs/tokenization.md

Low-Level Tokenization

Direct access to the underlying tokenizer for custom parsing implementations and advanced use cases. The Tokenizer class provides the lowest-level interface to HTML/XML parsing.

Capabilities

Tokenizer Class

The core tokenization engine that processes HTML/XML character streams and fires low-level parsing events.

/**
 * Low-level HTML/XML tokenizer with state machine parsing.
 *
 * Input is fed incrementally via write() and finished with end(); parsing
 * events are delivered through the Callbacks object supplied to the
 * constructor. NOTE(review): callback indices appear to be offsets into the
 * data written so far — presumably absolute across chunks; confirm against
 * the Tokenizer source before relying on this.
 */
class Tokenizer {
  /** Whether the tokenizer is currently running — presumably toggled by pause()/resume(); verify in source */
  running: boolean;
  
  /**
   * Create a new Tokenizer instance
   * @param options - Tokenizer configuration options (same shape as ParserOptions)
   * @param cbs - Callback object implementing Callbacks interface; events fire synchronously during write()/end()
   */
  constructor(options: ParserOptions, cbs: Callbacks);
  
  /**
   * Write data to the tokenizer for processing; may be called repeatedly
   * to stream input in chunks.
   * @param chunk - String data to tokenize
   */
  write(chunk: string): void;
  
  /**
   * Signal end of input and complete tokenization; fires onend() on the
   * callback object.
   * @param chunk - Optional final chunk of data
   */
  end(chunk?: string): void;
  
  /** Pause tokenization - can be resumed later with resume() */
  pause(): void;
  
  /** Resume tokenization after pause */
  resume(): void;
  
  /** Reset tokenizer to initial state, discarding any buffered input */
  reset(): void;
}

Tokenizer Callbacks

The Callbacks interface provides low-level events fired during tokenization:

/**
 * Low-level events fired during tokenization. The (start, endIndex) pairs
 * delimit a span of the input text; slice the text you wrote to the
 * tokenizer with these offsets to recover the content.
 */
interface Callbacks {
  /** Called for attribute value content — NOTE(review): presumably may fire more than once per value (e.g. around entities); confirm in Tokenizer source */
  onattribdata(start: number, endIndex: number): void;
  
  /** Called for a decoded HTML entity inside an attribute value */
  onattribentity(codepoint: number): void;
  
  /** Called when an attribute ends; `quote` reports how the value was quoted */
  onattribend(quote: QuoteType, endIndex: number): void;
  
  /** Called for an attribute name */
  onattribname(start: number, endIndex: number): void;
  
  /** Called for CDATA section content */
  oncdata(start: number, endIndex: number, endOffset: number): void;
  
  /** Called for a closing tag; the span covers the tag name */
  onclosetag(start: number, endIndex: number): void;
  
  /** Called for comment content */
  oncomment(start: number, endIndex: number, endOffset: number): void;
  
  /** Called for a DOCTYPE declaration */
  ondeclaration(start: number, endIndex: number): void;
  
  /** Called once when tokenization ends (after Tokenizer.end()) */
  onend(): void;
  
  /** Called when an opening tag ends (the `>` is reached) */
  onopentagend(endIndex: number): void;
  
  /** Called for an opening tag name */
  onopentagname(start: number, endIndex: number): void;
  
  /** Called for a processing instruction */
  onprocessinginstruction(start: number, endIndex: number): void;
  
  /** Called for a self-closing tag (`/>`) */
  onselfclosingtag(endIndex: number): void;
  
  /** Called for text content */
  ontext(start: number, endIndex: number): void;
  
  /** Called for a decoded HTML entity inside text */
  ontextentity(codepoint: number, endIndex: number): void;
}

Quote Type Enumeration

Defines the types of quotes used for attribute values:

/**
 * How an attribute value was quoted in the source, reported by
 * Callbacks.onattribend(). Mirrors the enum exported by htmlparser2's
 * Tokenizer module.
 */
enum QuoteType {
  /** Attribute has no value (e.g., `disabled`) */
  NoValue = 0,
  
  /** Attribute value is unquoted (e.g., `class=button`) */
  Unquoted = 1,
  
  /** Attribute value uses single quotes (e.g., `class='button'`) */
  Single = 2,
  
  /** Attribute value uses double quotes (e.g., `class="button"`) */
  Double = 3
}

Advanced Usage Examples

Custom Parser Implementation

import Tokenizer, { type Callbacks, QuoteType } from "htmlparser2/lib/Tokenizer";

/**
 * Example: a minimal tag/attribute extractor built directly on the
 * low-level Tokenizer by implementing the Callbacks interface.
 *
 * All callback indices are offsets into the text written to the tokenizer,
 * so the full input is kept in `buffer` for slicing.
 */
class CustomHtmlParser implements Callbacks {
  private buffer: string = '';
  private tags: Array<{name: string, attrs: Record<string, string>}> = [];
  private currentTag: string = '';
  private currentAttr: string = '';
  private attributes: Record<string, string> = {};
  private tokenizer: Tokenizer;
  
  constructor(private options: ParserOptions = {}) {
    this.tokenizer = new Tokenizer(options, this);
  }
  
  /**
   * Tokenize a complete HTML string and collect every opening tag.
   * @param html - Document or fragment to parse
   * @returns One entry per opening tag, with its accumulated attributes
   */
  parse(html: string): Array<{name: string, attrs: Record<string, string>}> {
    this.buffer = html;
    this.tokenizer.write(html);
    this.tokenizer.end();
    return this.tags;
  }
  
  // --- attribute callbacks ---
  
  onattribdata(start: number, endIndex: number): void {
    // A single attribute value can produce several data events (e.g. the
    // text on either side of an entity), so APPEND rather than overwrite.
    this.attributes[this.currentAttr] += this.buffer.slice(start, endIndex);
  }
  
  onattribentity(codepoint: number): void {
    // Append the decoded entity; assigning here would discard any text
    // that preceded the entity in the attribute value.
    this.attributes[this.currentAttr] += String.fromCodePoint(codepoint);
  }
  
  onattribend(quote: QuoteType, endIndex: number): void {
    // Attribute value is complete; nothing extra to do in this example.
  }
  
  onattribname(start: number, endIndex: number): void {
    this.currentAttr = this.buffer.slice(start, endIndex);
    this.attributes[this.currentAttr] = '';
  }
  
  // --- tag callbacks ---
  
  onopentagname(start: number, endIndex: number): void {
    this.currentTag = this.buffer.slice(start, endIndex);
    this.attributes = {};
  }
  
  onopentagend(endIndex: number): void {
    // Snapshot the attributes so later tags don't mutate this entry.
    this.tags.push({
      name: this.currentTag,
      attrs: { ...this.attributes }
    });
  }
  
  // Remaining members are required by the Callbacks interface but unused here.
  oncdata(): void {}
  onclosetag(): void {}
  oncomment(): void {}
  ondeclaration(): void {}
  onend(): void {}
  onprocessinginstruction(): void {}
  onselfclosingtag(): void {}
  ontext(): void {}
  ontextentity(): void {}
}

// Usage
const parser = new CustomHtmlParser({ lowerCaseTags: true });
const tags = parser.parse('<DIV class="test" id=myid><span>Hello</span></DIV>');
console.log(tags);

Streaming Tokenizer

import Tokenizer, { type Callbacks } from "htmlparser2/lib/Tokenizer";

/**
 * Example: position tracking over a streamed document.
 *
 * Chunks are accumulated in `buffer` so that the (start, endIndex) offsets
 * reported by the tokenizer — which refer to the overall input stream —
 * can be sliced back into text at any time.
 */
class StreamingTokenizer implements Callbacks {
  private buffer: string = '';
  private tokenizer: Tokenizer;
  
  constructor() {
    this.tokenizer = new Tokenizer({}, this);
  }
  
  /** Feed one chunk of the document; events fire synchronously. */
  processChunk(chunk: string): void {
    this.buffer += chunk;
    this.tokenizer.write(chunk);
  }
  
  /** Signal end of input; triggers onend(). */
  finish(): void {
    this.tokenizer.end();
  }
  
  // Track text content positions
  ontext(start: number, endIndex: number): void {
    const text = this.buffer.slice(start, endIndex);
    if (text.trim()) {
      console.log(`Text at ${start}-${endIndex}: "${text}"`);
    }
  }
  
  // Track tag positions
  onopentagname(start: number, endIndex: number): void {
    const tagName = this.buffer.slice(start, endIndex);
    console.log(`Opening tag at ${start}-${endIndex}: <${tagName}>`);
  }
  
  onclosetag(start: number, endIndex: number): void {
    // The tokenizer reports the bounds of the tag NAME itself, not the
    // surrounding "</" and ">", so slice the range as-is. (The previous
    // `start + 2, endIndex - 1` adjustment truncated the name.)
    const tagName = this.buffer.slice(start, endIndex);
    console.log(`Closing tag at ${start}-${endIndex}: </${tagName}>`);
  }
  
  // Remaining Callbacks members are required but unused in this example.
  onattribdata(): void {}
  onattribentity(): void {}
  onattribend(): void {}
  onattribname(): void {}
  oncdata(): void {}
  oncomment(): void {}
  ondeclaration(): void {}
  onend(): void { console.log("Tokenization complete"); }
  onopentagend(): void {}
  onprocessinginstruction(): void {}
  onselfclosingtag(): void {}
  ontextentity(): void {}
}

// Example: feed the document to the tokenizer one chunk at a time
const streamer = new StreamingTokenizer();
const chunks = ["<html><body>", "<h1>Title</h1>", "<p>Content</p>", "</body></html>"];
for (const chunk of chunks) {
  streamer.processChunk(chunk);
}
streamer.finish();

Entity Handling

import Tokenizer, { type Callbacks } from "htmlparser2/lib/Tokenizer";

/**
 * Example: collect every decoded HTML entity (in text and in attribute
 * values) with its codepoint, character, and — for text entities — the
 * end index at which it appeared.
 */
class EntityProcessor implements Callbacks {
  private buffer: string = '';
  private entities: Array<{position: number, codepoint: number, char: string}> = [];
  private tokenizer: Tokenizer;
  
  constructor() {
    this.tokenizer = new Tokenizer({ decodeEntities: true }, this);
  }
  
  /**
   * Tokenize `html` and return the entities encountered.
   * Resets the result list, so the instance can be reused.
   */
  process(html: string): Array<{position: number, codepoint: number, char: string}> {
    this.buffer = html;
    this.entities = [];
    this.tokenizer.write(html);
    this.tokenizer.end();
    return this.entities;
  }
  
  // Entity decoded inside text content — position is the reported end index.
  ontextentity(codepoint: number, endIndex: number): void {
    const char = String.fromCodePoint(codepoint);
    this.entities.push({ position: endIndex, codepoint, char });
  }
  
  // Entity decoded inside an attribute value — no index is reported, so
  // position is recorded as -1.
  onattribentity(codepoint: number): void {
    const char = String.fromCodePoint(codepoint);
    this.entities.push({ position: -1, codepoint, char });
  }
  
  // Remaining Callbacks members are required but unused in this example.
  onattribdata(): void {}
  onattribend(): void {}
  onattribname(): void {}
  oncdata(): void {}
  onclosetag(): void {}
  oncomment(): void {}
  ondeclaration(): void {}
  onend(): void {}
  onopentagend(): void {}
  onopentagname(): void {}
  onprocessinginstruction(): void {}
  onselfclosingtag(): void {}
  ontext(): void {}
}

// Usage
const processor = new EntityProcessor();
const entities = processor.process('Text &amp; more &lt;tags&gt; &copy; 2025');
console.log(entities);
// Output: entities found with their codepoints and positions

Performance Monitoring

import Tokenizer, { type Callbacks } from "htmlparser2/lib/Tokenizer";

/**
 * Example: gather parse statistics (tag/attribute/text/comment/entity
 * counts plus wall-clock timing) by counting tokenizer events.
 */
class PerformanceTokenizer implements Callbacks {
  private tokenizer: Tokenizer;
  private stats = {
    tags: 0,
    attributes: 0,
    textNodes: 0,
    comments: 0,
    entities: 0,
    startTime: 0,
    endTime: 0
  };
  
  constructor() {
    this.tokenizer = new Tokenizer({}, this);
  }
  
  /**
   * Tokenize `html` and return the accumulated statistics.
   * Timing is captured in onend(), which fires during end().
   */
  parse(html: string) {
    this.stats.startTime = Date.now();
    this.tokenizer.write(html);
    this.tokenizer.end();
    return this.stats;
  }
  
  // --- counting callbacks ---
  
  onopentagname(): void { this.stats.tags++; }
  
  onattribname(): void { this.stats.attributes++; }
  
  ontext(): void { this.stats.textNodes++; }
  
  oncomment(): void { this.stats.comments++; }
  
  // Entities in text and in attributes count toward the same total.
  ontextentity(): void { this.stats.entities++; }
  
  onattribentity(): void { this.stats.entities++; }
  
  onend(): void {
    this.stats.endTime = Date.now();
    console.log(`Parsing completed in ${this.stats.endTime - this.stats.startTime}ms`);
    console.log(`Found: ${this.stats.tags} tags, ${this.stats.attributes} attributes`);
  }
  
  // Remaining Callbacks members are required but unused in this example.
  onattribdata(): void {}
  onattribend(): void {}
  oncdata(): void {}
  onclosetag(): void {}
  ondeclaration(): void {}
  onopentagend(): void {}
  onprocessinginstruction(): void {}
  onselfclosingtag(): void {}
}

Configuration Options

The Tokenizer uses the same ParserOptions as the Parser:

/**
 * Configuration shared by the Tokenizer and the higher-level Parser.
 * All options are optional; NOTE(review): defaults are not shown here —
 * consult the htmlparser2 documentation for each flag's default value.
 */
interface ParserOptions {
  /** Enable XML parsing mode (stricter, case-sensitive) */
  xmlMode?: boolean;
  /** Decode HTML entities, firing ontextentity/onattribentity events */
  decodeEntities?: boolean;  
  /** Convert tag names to lowercase */
  lowerCaseTags?: boolean;
  /** Convert attribute names to lowercase */
  lowerCaseAttributeNames?: boolean;
  /** Recognize CDATA sections */
  recognizeCDATA?: boolean;
  /** Recognize self-closing tags */
  recognizeSelfClosing?: boolean;
}

These options affect how the tokenizer processes the input stream and what events are fired.

docs

callback-parsing.md

dom-parsing.md

feed-parsing.md

index.md

stream-processing.md

tokenization.md

tile.json