htmlparser2 is a fast and forgiving HTML/XML parser that provides both low-level callback-based parsing and high-level DOM generation. It's designed for maximum performance with minimal memory allocations and supports streaming, malformed HTML handling, and comprehensive parsing of RSS/Atom feeds.
npm install htmlparser2
import * as htmlparser2 from "htmlparser2";
import { Parser, parseDocument, parseFeed, WritableStream } from "htmlparser2";
For CommonJS:
const htmlparser2 = require("htmlparser2");
const { Parser, parseDocument, parseFeed, WritableStream } = require("htmlparser2");
For WritableStream (separate export):
import { WritableStream } from "htmlparser2/WritableStream";
import { parseDocument, Parser } from "htmlparser2";
// DOM parsing: turn a complete HTML string into a DOM tree.
const document = parseDocument("<div>Hello <b>world</b>!</div>");
// Step down one level at a time: <div>, then its second child <b>, then the text inside <b>.
const outerDiv = document.children[0];
const boldElement = outerDiv.children[1];
const innerText = boldElement.children[0];
console.log(innerText.data); // "world"
// Callback-based parsing: react to parser events through handlers; suited to minimal memory usage.
const parser = new Parser({
  // Report JavaScript only for <script> tags whose type attribute is exactly "text/javascript".
  onopentag: (name, attributes) => {
    if (name !== "script") return;
    if (attributes.type !== "text/javascript") return;
    console.log("Found JavaScript!");
  },
  // Echo each piece of text the parser reports.
  ontext: (text) => console.log("Text:", text),
  // Echo the name of each tag the parser reports as closed.
  onclosetag: (tagname) => console.log("Closed:", tagname),
});
parser.write("Xyz <script type='text/javascript'>const foo = 'bar';</script>");
parser.end();
htmlparser2 is built around several key components:
High-level functions for parsing HTML/XML into DOM trees using domhandler. Perfect for scraping, template processing, and document analysis.
/**
 * Parses a complete HTML/XML string and returns the resulting DOM tree.
 *
 * @param data - Markup to parse, supplied as a single string.
 * @param options - Optional settings of type `Options`.
 * @returns The root `Document` of the parsed tree.
 */
function parseDocument(data: string, options?: Options): Document;
/** @deprecated Use parseDocument instead */
function parseDOM(data: string, options?: Options): ChildNode[];
Low-level Parser class with a callback interface for memory-efficient streaming parsing. Ideal for large documents and real-time processing.
/**
 * Low-level, callback-driven parser intended for memory-efficient streaming use.
 */
class Parser {
/**
 * @param cbs - Handler callbacks; any subset may be supplied, and `null` or omission is accepted.
 * @param options - Optional `ParserOptions`.
 */
constructor(cbs?: Partial<Handler> | null, options?: ParserOptions);
/** Feeds a chunk of markup into the parser. */
write(chunk: string): void;
/**
 * Signals that the input is finished.
 * @param chunk - Optional final piece of input (presumably processed before finishing — confirm).
 */
end(chunk?: string): void;
}
interface Handler {
onopentag(name: string, attribs: { [s: string]: string }, isImplied: boolean): void;
ontext(data: string): void;
onclosetag(name: string, isImplied: boolean): void;
oncomment(data: string): void;
// ... additional callback methods
}
WritableStream integration for Node.js streams, enabling pipeline processing and integration with other stream-based tools.
class WritableStream extends Writable {
constructor(cbs: Partial<Handler>, options?: ParserOptions);
}
Specialized functionality for parsing RSS, RDF, and Atom feeds with automatic feed detection and structured data extraction.
function parseFeed(feed: string, options?: Options): Feed | null;
Direct access to the underlying tokenizer for custom parsing implementations and advanced use cases.
class Tokenizer {
constructor(options: ParserOptions, cbs: Callbacks);
write(chunk: string): void;
end(chunk?: string): void;
}
interface Options extends ParserOptions, DomHandlerOptions {}
/**
 * Settings for the DOM handler; every field is optional.
 * `Options` combines these with `ParserOptions`.
 */
interface DomHandlerOptions {
/** Include location information for nodes */
withStartIndices?: boolean;
/** Include end location information for nodes */
withEndIndices?: boolean;
/**
 * Normalize whitespace in text content.
 * NOTE(review): newer domhandler releases are believed to have dropped this option —
 * confirm it still applies to the version being documented.
 */
normalizeWhitespace?: boolean;
}
/**
 * Settings accepted by the `Parser`, `WritableStream`, and `Tokenizer` constructors;
 * every field is optional.
 * NOTE(review): default values for these flags are not stated here — confirm before relying on them.
 */
interface ParserOptions {
/** Enable XML parsing mode for feeds and XML documents */
xmlMode?: boolean;
/** Decode HTML entities in text content */
decodeEntities?: boolean;
/** Convert tag names to lowercase */
lowerCaseTags?: boolean;
/** Convert attribute names to lowercase */
lowerCaseAttributeNames?: boolean;
/** Recognize CDATA sections even in HTML mode */
recognizeCDATA?: boolean;
/** Recognize self-closing tags even in HTML mode */
recognizeSelfClosing?: boolean;
/** Custom tokenizer class to use (a class reference, not an instance) */
Tokenizer?: typeof Tokenizer;
}
// DOM types (from domhandler dependency)
/**
 * Root node of a parsed tree; this is the type returned by `parseDocument`.
 * `Node` is the base type and is not declared in this excerpt (presumably from domhandler).
 */
interface Document extends Node {
/** Top-level nodes of the document. */
children: ChildNode[];
}
/** An element node: a name, its attributes, and the nodes nested inside it. */
interface Element extends Node {
/** Name of the element's tag. */
name: string;
/** Attribute values keyed by attribute name. */
attribs: { [name: string]: string };
/** Nodes nested inside this element. */
children: ChildNode[];
}
/** A text node. */
interface Text extends Node {
/** Discriminant that identifies this node as text. */
type: "text";
/** The text content (e.g. "world" for the text inside `<b>world</b>`). */
data: string;
}
/** A comment node. */
interface Comment extends Node {
/** Discriminant that identifies this node as a comment. */
type: "comment";
/** Presumably the comment's content — confirm against domhandler. */
data: string;
}
/** A processing-instruction node; its discriminant is "directive". */
interface ProcessingInstruction extends Node {
/** Discriminant that identifies this node as a directive. */
type: "directive";
/** Presumably the instruction's name — confirm against domhandler. */
name: string;
/** Presumably the instruction's raw content — confirm against domhandler. */
data: string;
}
/** Union of the node kinds that may appear in a `children` array. */
type ChildNode = Element | Text | Comment | ProcessingInstruction;
// DOM Handler classes
/**
 * Handler class that exposes the DOM tree it holds through `root`.
 */
class DomHandler {
/**
 * @param callback - Optional function receiving an error (or `null`) and the resulting nodes;
 *   presumably invoked once parsing finishes — confirm against domhandler.
 * @param options - Optional `DomHandlerOptions`.
 * @param elementCallback - Optional function receiving an `Element`;
 *   presumably invoked per completed element — confirm against domhandler.
 */
constructor(callback?: (error: Error | null, dom: ChildNode[]) => void, options?: DomHandlerOptions, elementCallback?: (element: Element) => void);
/** Root `Document` of the tree. */
root: Document;
}
/**
 * Alias bound to the very same class as `DomHandler`.
 * @deprecated Use DomHandler instead
 */
const DefaultHandler = DomHandler;
// Feed types (from domutils dependency)
/**
 * Structured feed data; this is the result type of `parseFeed` and `DomUtils.getFeed`.
 * NOTE(review): `FeedItem` is not declared in this excerpt.
 */
interface Feed {
/** Feed kind; presumably identifies the detected format (RSS, RDF, or Atom) — confirm. */
type: string;
/** Feed title, when present. */
title?: string;
/** Feed link, when present. */
link?: string;
/** Feed description, when present. */
description?: string;
/** Entries contained in the feed. */
items: FeedItem[];
}
// Namespace exports
/**
 * String constants naming the node kinds.
 * NOTE(review): the concrete string values are not shown here; presumably they match the
 * `type` discriminants used by the node interfaces (e.g. "text", "comment", "directive") — confirm.
 */
namespace ElementType {
const Text: string;
const Directive: string;
const Comment: string;
const Script: string;
const Style: string;
const Tag: string;
const CDATA: string;
const Doctype: string;
}
/** Utilities provided by the domutils package. */
namespace DomUtils {
/**
 * Builds a `Feed` from already-parsed nodes.
 * @param dom - Parsed nodes to inspect.
 * @returns The extracted feed, or `null` (presumably when no feed is found — confirm).
 */
function getFeed(dom: ChildNode[]): Feed | null;
// Additional DOM manipulation utilities from domutils package
}