or run

npx @tessl/cli init
Log in

Version

Tile

Overview

Evals

Files

docs

callback-parsing.md · dom-parsing.md · feed-parsing.md · index.md · stream-processing.md · tokenization.md
tile.json

docs/dom-parsing.md

DOM Parsing

High-level functions for parsing HTML/XML into DOM trees using the domhandler integration. These functions provide the easiest way to parse documents into manipulatable DOM structures.

Capabilities

parseDocument

Parses HTML/XML data and returns a complete Document object with full DOM tree structure.

/**
 * Parses a complete HTML/XML string in a single call and returns the resulting document.
 *
 * @param data - The HTML/XML string to parse
 * @param options - Optional settings; `Options` combines `ParserOptions` and `DomHandlerOptions`
 * @returns The root `Document`; the top-level nodes are available through its `children` array
 */
function parseDocument(data: string, options?: Options): Document;

Usage Examples:

import { DomUtils, parseDocument } from "htmlparser2";

// Basic HTML parsing
const doc = parseDocument("<html><body><h1>Hello World</h1></body></html>");
console.log(doc.children[0].name); // "html"

// Parse with options
const xmlDoc = parseDocument(`<?xml version="1.0"?>
<root>
  <item id="1">First</item>
  <item id="2">Second</item>
</root>`, { xmlMode: true });

// Access elements
// The `<?xml ...?>` declaration is parsed as a processing instruction and becomes
// the document's first child, so `xmlDoc.children[0]` is NOT the <root> element
// (and has no `children`). Look elements up by tag name rather than by position;
// this also skips the whitespace text nodes between the tags.
const items = DomUtils.getElementsByTagName("item", xmlDoc);
console.log(items.length); // 2

parseDOM (Deprecated)

Parses data and returns an array of root nodes. This function is deprecated in favor of `parseDocument`.

/**
 * Parses a complete HTML/XML string and returns only the top-level nodes.
 *
 * Note: the returned root nodes still have a Document node as their parent,
 * so the enclosing document remains reachable through `parent`.
 *
 * @param data - The HTML/XML string to parse
 * @param options - Optional settings; `Options` combines `ParserOptions` and `DomHandlerOptions`
 * @returns Array of root child nodes
 * @deprecated Use parseDocument instead
 */
function parseDOM(data: string, options?: Options): ChildNode[];

createDocumentStream

Creates a streaming parser that builds a DOM and returns it via callback when parsing is complete.

/**
 * Creates a parser instance with an attached DOM handler, for input that
 * arrives in chunks. Data is fed through the returned parser; the finished
 * document is handed to `callback` rather than returned.
 *
 * @param callback - Called when parsing completes; receives an error (or `null`) and the document
 * @param options - Optional settings; `Options` combines `ParserOptions` and `DomHandlerOptions`
 * @param elementCallback - Optional callback fired for each completed element
 * @returns Parser instance for writing data
 */
function createDocumentStream(
  callback: (error: Error | null, document: Document) => void,
  options?: Options,
  elementCallback?: (element: Element) => void
): Parser;

Usage Examples:

import { createDocumentStream } from "htmlparser2";

// Stream parsing with callback
const parser = createDocumentStream((error, document) => {
  if (error) {
    console.error("Parsing failed:", error);
    return;
  }

  console.log("Parsed document:", document);
  // Process the complete DOM tree
});

// Write data in chunks (useful for streaming)
parser.write("<html><head><title>");
parser.write("My Page");
parser.write("</title></head>");
parser.write("<body>Content here</body></html>");
// The document callback fires once end() signals that all input has been written
parser.end();

// With element callback for processing elements as they're completed
const streamParser = createDocumentStream(
  (error) => {
    // Check the error here too; a failed parse should not be reported as ready
    if (error) {
      console.error("Parsing failed:", error);
      return;
    }

    console.log("Final document ready");
  },
  { xmlMode: false },
  (element) => {
    // Called for each completed element during parsing
    console.log("Completed element:", element.name);
  }
);

// Neither callback runs until the parser is actually fed data and then ended
streamParser.write("<ul><li>One</li>");
streamParser.write("<li>Two</li></ul>");
streamParser.end();

createDomStream (Deprecated)

Creates a streaming parser that returns an array of child nodes via callback. Deprecated in favor of `createDocumentStream`.

/**
 * Creates a parser instance with an attached DOM handler, for input that
 * arrives in chunks. Unlike createDocumentStream, the callback receives the
 * array of child nodes instead of a Document.
 *
 * @param callback - Called when parsing completes; receives an error (or `null`) and the child nodes
 * @param options - Optional settings; `Options` combines `ParserOptions` and `DomHandlerOptions`
 * @param elementCallback - Optional callback fired for each completed element
 * @returns Parser instance for writing data
 * @deprecated Use createDocumentStream instead
 */
function createDomStream(
  callback: (error: Error | null, dom: ChildNode[]) => void,
  options?: Options,
  elementCallback?: (element: Element) => void
): Parser;

Configuration Options

DOM parsing functions accept an `Options` object combining `ParserOptions` and `DomHandlerOptions`:

/** Combined settings accepted by the DOM parsing functions: every parser option plus every DOM handler option. */
interface Options extends ParserOptions, DomHandlerOptions {}

/**
 * Settings that control how the input is parsed. All fields are optional;
 * several defaults are derived from `xmlMode`, as noted per field.
 */
interface ParserOptions {
  /** Enable XML parsing mode - important for XML documents and feeds */
  xmlMode?: boolean;
  /** Decode HTML entities in text content (default: true) */
  decodeEntities?: boolean;
  /** Convert tag names to lowercase (default: !xmlMode, i.e. on for HTML, off for XML) */
  lowerCaseTags?: boolean;  
  /** Convert attribute names to lowercase (default: !xmlMode, i.e. on for HTML, off for XML) */
  lowerCaseAttributeNames?: boolean;
  /** Recognize CDATA sections even in HTML mode (default: xmlMode) */
  recognizeCDATA?: boolean;
  /** Recognize self-closing tags even in HTML mode (default: xmlMode) */
  recognizeSelfClosing?: boolean;
}

/** Settings that control how the DOM handler builds nodes. All fields are optional. */
interface DomHandlerOptions {
  /** Include location information for nodes */
  withStartIndices?: boolean;
  /** Include end location information for nodes */
  withEndIndices?: boolean;
  /**
   * Normalize whitespace in text content.
   * NOTE(review): this option was deprecated in domhandler 4 and appears to be
   * absent from domhandler 5 - confirm it is still honored by the installed version.
   */
  normalizeWhitespace?: boolean;
}

DOM Types

The parsed DOM uses types from the domhandler package:

/** Root node of a parsed tree, as returned by parseDocument. It has no parent. */
interface Document extends Node {
  type: "root";
  // NOTE(review): domhandler's Document class may not expose a `name` property - confirm before relying on it.
  name: "root";
  /** Top-level nodes of the document, in source order. */
  children: ChildNode[];
  parent: null;
}

/** A tag in the parsed tree, with its attributes and nested child nodes. */
interface Element extends Node {
  type: "tag";
  /** Tag name of the element. */
  name: string;
  /** Attributes of the element, keyed by attribute name. */
  attribs: { [name: string]: string };
  children: ChildNode[];
  /** Containing Document or Element; declared nullable, so check before dereferencing. */
  parent: ParentNode | null;
}

/** A run of text content; a leaf node with no children. */
interface Text extends Node {
  type: "text";
  /** The text content of the node. */
  data: string;
  parent: ParentNode | null;
}

/** A comment node; a leaf node with no children. */
interface Comment extends Node {
  type: "comment"; 
  /** The comment's content. */
  data: string;
  parent: ParentNode | null;
}

/**
 * A directive-type node; a leaf node with no children.
 * NOTE(review): presumably covers constructs such as `<?xml ...?>` and `<!DOCTYPE ...>` - confirm against domhandler.
 */
interface ProcessingInstruction extends Node {
  type: "directive";
  name: string;
  data: string;
  parent: ParentNode | null;
}

// Any node that can appear in a `children` array. Only Element carries `children`
// of its own, so narrow the union before accessing element-only properties.
// NOTE(review): simplified - domhandler's own unions may include further node kinds (e.g. CDATA); confirm.
type ChildNode = Element | Text | Comment | ProcessingInstruction;
// Any node that can contain children, i.e. the possible non-null values of `parent`.
type ParentNode = Document | Element;