or run

npx @tessl/cli init
Log in

Version

Tile

Overview

Evals

Files

docs

callback-parsing.md dom-parsing.md feed-parsing.md index.md stream-processing.md tokenization.md
tile.json

docs/callback-parsing.md

Callback-Based Parsing

Low-level Parser class with callback interface for memory-efficient parsing. This approach is ideal for processing large documents, streaming data, or when you need maximum control over parsing behavior.

Capabilities

Parser Class

The core Parser class that tokenizes HTML/XML and fires callback events for each parsing event.

/**
 * Main HTML/XML parser class with callback-based interface.
 *
 * This is an API summary: members are listed as signatures only, without
 * bodies. Input is supplied through write()/end() (or parseComplete() for a
 * whole document at once) and parsing events are reported through the
 * callback object passed to the constructor.
 */
class Parser {
  /** The start index of the last event */
  public startIndex: number;
  /** The end index of the last event */
  public endIndex: number;
  
  /**
   * Create a new Parser instance
   * @param cbs - Callback object; it may implement any subset of the Handler
   *   interface (hence Partial), and may also be omitted or null
   * @param options - Optional parser configuration options
   */
  constructor(cbs?: Partial<Handler> | null, options?: ParserOptions);
  
  /**
   * Write data to the parser for processing. Can be called repeatedly to
   * feed a document chunk by chunk.
   * @param chunk - HTML/XML string data to parse
   */
  write(chunk: string): void;
  
  /**
   * Signal end of input and complete parsing
   * @param chunk - Optional final chunk of data
   */
  end(chunk?: string): void;
  
  /** Pause parsing - can be resumed later with resume() */
  pause(): void;
  
  /** Resume parsing after a previous pause() */
  resume(): void;
  
  /** Reset parser to initial state */
  reset(): void;
  
  /**
   * Reset parser and parse complete data in one call
   * @param data - Complete HTML/XML string to parse
   */
  parseComplete(data: string): void;
  
  /**
   * Parse a chunk of data (deprecated alias kept for older callers)
   * @param chunk - HTML/XML string data to parse
   * @deprecated Use write() instead
   */
  parseChunk(chunk: string): void;
  
  /**
   * Signal end of input (deprecated alias kept for older callers)
   * @param chunk - Optional final chunk of data
   * @deprecated Use end() instead
   */
  done(chunk?: string): void;
  
  /**
   * Checks if the current tag is a void element. Being protected, it is only
   * reachable from subclasses, which can override it to specify additional
   * void elements.
   * @param name - Tag name to check
   * @returns True if the tag is a void element
   * @protected
   */
  protected isVoidElement(name: string): boolean;
}

Usage Examples:

import { Parser } from "htmlparser2";

// Basic callback parsing: log every opening tag, text node and closing tag.
const parser = new Parser({
  onopentag: (tagName, attrs) => {
    console.log("Opening tag:", tagName, attrs);
  },
  ontext: (content) => {
    console.log("Text content:", content);
  },
  onclosetag: (closedName) => {
    console.log("Closing tag:", closedName);
  },
});

// Feed the markup, then signal that no more input will follow.
const sampleMarkup = "<div class='content'>Hello <b>world</b>!</div>";
parser.write(sampleMarkup);
parser.end();

// Advanced parsing: one handler per event kind, plus explicit parser options.
const advancedParser = new Parser(
  {
    // Fired once, when the parser is set up.
    onparserinit: (instance) => {
      console.log("Parser initialized");
    },
    // Only <img> tags are of interest here; report their source.
    onopentag: (tagName, attrs, implied) => {
      if (tagName !== "img") return;
      console.log("Image found:", attrs.src);
    },
    onattribute: (attrName, attrValue, quoteChar) => {
      console.log(`Attribute: ${attrName}="${attrValue}" (quote: ${quoteChar})`);
    },
    // Skip whitespace-only text nodes.
    ontext: (raw) => {
      const trimmed = raw.trim();
      if (!trimmed) return;
      console.log("Text:", trimmed);
    },
    oncomment: (body) => {
      console.log("Comment:", body);
    },
    onprocessinginstruction: (target, body) => {
      console.log("Processing instruction:", target, body);
    },
    onerror: (err) => {
      console.error("Parse error:", err);
    },
    onend: () => {
      console.log("Parsing complete");
    },
  },
  { xmlMode: false }
);

Handler Interface

Complete callback interface for handling all parser events:

/**
 * Callback contract for parser events. The Parser constructor accepts
 * Partial<Handler>, so an implementation only needs to supply the callbacks
 * it cares about.
 */
interface Handler {
  /**
   * Called when parser is initialized
   * @param parser - The Parser instance that will invoke this handler
   */
  onparserinit(parser: Parser): void;
  
  /** Called to reset handler state */
  onreset(): void;
  
  /** Called when parsing is complete */
  onend(): void;
  
  /** Called when a parsing error occurs */
  onerror(error: Error): void;
  
  /**
   * Called when a closing tag is found
   * @param name - Tag name
   * @param isImplied - Presumably whether the closing tag was implied by the
   *   parser, mirroring the flag on onopentag - TODO confirm
   */
  onclosetag(name: string, isImplied: boolean): void;
  
  /** Called when an opening tag name is found (before attributes) */
  onopentagname(name: string): void;
  
  /**
   * Called for each attribute found in an opening tag
   * @param name - Attribute name
   * @param value - Attribute value
   * @param quote - Quote character used: "\"" or "'", null for an unquoted
   *   value, undefined when the attribute has no value
   */
  onattribute(
    name: string,
    value: string,
    quote?: string | undefined | null
  ): void;
  
  /**
   * Called when an opening tag is complete (after attributes)
   * @param name - Tag name
   * @param attribs - Object containing all attributes, keyed by attribute name
   * @param isImplied - Whether tag was implied by parser
   */
  onopentag(
    name: string,
    attribs: { [s: string]: string },
    isImplied: boolean
  ): void;
  
  /** Called for text content */
  ontext(data: string): void;
  
  /** Called for HTML comments */
  oncomment(data: string): void;
  
  /** Called at start of CDATA section */
  oncdatastart(): void;
  
  /** Called at end of CDATA section */
  oncdataend(): void;
  
  /** Called at end of comment */
  oncommentend(): void;
  
  /** Called for processing instructions like <?xml ?> */
  onprocessinginstruction(name: string, data: string): void;
}

Parser Configuration

/**
 * Configuration accepted as the second argument of the Parser constructor.
 * Every option is optional; several defaults are derived from xmlMode.
 */
interface ParserOptions {
  /**
   * Enable XML parsing mode for feeds and XML documents
   * Affects tag case sensitivity, self-closing tags, and CDATA handling
   * @default false
   */
  xmlMode?: boolean;
  
  /**
   * Decode HTML entities in text and attribute values
   * @default true
   */
  decodeEntities?: boolean;
  
  /**
   * Convert all tag names to lowercase
   * Defaults to the opposite of xmlMode: on in HTML mode, off in XML mode
   * @default !xmlMode
   */
  lowerCaseTags?: boolean;
  
  /**
   * Convert all attribute names to lowercase
   * Has performance impact but improves compatibility
   * Defaults to the opposite of xmlMode: on in HTML mode, off in XML mode
   * @default !xmlMode
   */
  lowerCaseAttributeNames?: boolean;
  
  /**
   * Recognize CDATA sections even in HTML mode
   * Defaults to the value of xmlMode
   * @default xmlMode
   */
  recognizeCDATA?: boolean;
  
  /**
   * Recognize self-closing tags even in HTML mode
   * Defaults to the value of xmlMode
   * @default xmlMode
   */
  recognizeSelfClosing?: boolean;
  
  /**
   * Custom tokenizer class to use instead of default
   * Takes the class itself (typeof Tokenizer), not an instance
   * Advanced usage for custom parsing behavior
   */
  Tokenizer?: typeof Tokenizer;
}

Parsing Patterns

Streaming Processing

import { Parser } from "htmlparser2";

// Number of <article> elements currently open. A counter (rather than a single
// "current tag" flag) keeps the state correct when articles are nested: closing
// an inner article must not stop text collection for the outer one.
let articleDepth = 0;

/**
 * Parser that reports the start, text content and end of every <article>
 * element. It keeps only a nesting counter as state, so input can be fed in
 * arbitrarily small chunks.
 */
const streamingParser = new Parser({
  onopentag(name, attribs) {
    if (name === 'article') {
      articleDepth++;
      console.log('Article started:', attribs);
    }
  },
  ontext(text) {
    // Ignore whitespace-only nodes and any text outside an article.
    const trimmed = text.trim();
    if (articleDepth > 0 && trimmed) {
      console.log('Article text:', trimmed);
    }
  },
  onclosetag(name) {
    // The depth check keeps the counter from going negative.
    if (name === 'article' && articleDepth > 0) {
      articleDepth--;
      console.log('Article ended');
    }
  }
});

// Process data in chunks
const chunks = ['<html><body>', '<article id="1">', 'Article content here', '</article>', '</body></html>'];
for (const chunk of chunks) {
  streamingParser.write(chunk);
}
streamingParser.end();

Form Processing

import { Parser } from "htmlparser2";

/** One <input> element collected from a form. */
interface FormField {
  /** Value of the name attribute, if present */
  name: string | undefined;
  /** Value of the type attribute; 'text' when missing or empty */
  type: string;
  /** Value of the value attribute, if present */
  value: string | undefined;
}

/** One <form> element together with the inputs found inside it. */
interface FormRecord {
  /** Value of the action attribute, if present */
  action: string | undefined;
  /** Value of the method attribute; 'GET' when missing or empty */
  method: string;
  /** Inputs encountered between the form's opening and closing tags */
  fields: FormField[];
}

// Completed forms, in document order. Typed explicitly: an untyped `[]` /
// `null` initializer gives the closures below nothing to check against.
const forms: FormRecord[] = [];
// The form currently being collected, or null when outside any <form>.
let currentForm: FormRecord | null = null;

/**
 * Parser that collects every <form> and its <input> fields into `forms`.
 * A form is appended to `forms` when its closing tag is seen.
 */
const formParser = new Parser({
  onopentag(name, attribs) {
    if (name === 'form') {
      currentForm = {
        action: attribs.action,
        // `||` (not `??`) on purpose: an empty attribute also gets the default.
        method: attribs.method || 'GET',
        fields: []
      };
    } else if (currentForm && name === 'input') {
      currentForm.fields.push({
        name: attribs.name,
        type: attribs.type || 'text',
        value: attribs.value
      });
    }
  },
  onclosetag(name) {
    if (name === 'form' && currentForm) {
      forms.push(currentForm);
      currentForm = null;
    }
  }
});

Error Handling

import { Parser } from "htmlparser2";

/**
 * Parser with an onerror callback that reports failures.
 */
const parser = new Parser({
  onopentag(name, attribs) {
    // Process tags
  },
  onerror(error) {
    // Report the failure. parser.resume() is deliberately not called here:
    // resume() only continues parsing after an explicit pause() and is not an
    // error-recovery mechanism.
    console.error('Parser error:', error.message);
  }
}, {
  // Normalization options; they shape the reported names and text and do not
  // change how errors are handled.
  lowerCaseTags: true,   // convert tag names to lowercase
  decodeEntities: true   // decode entities in text and attribute values
});