Handler for htmlparser2 that turns pages into a DOM tree structure
npx @tessl/cli install tessl/npm-domhandler@5.0.0
domhandler is a TypeScript library that provides a DOM handler for htmlparser2, creating tree structures containing all nodes of a parsed HTML page. It serves as a foundational component in the HTML parsing ecosystem, enabling the transformation of HTML markup into structured DOM trees that can be manipulated programmatically.
npm install domhandler
import { DomHandler, type DomHandlerOptions } from "domhandler";
import { Element, Text, Comment, Document, type ChildNode } from "domhandler";
// Default import
import DomHandler from "domhandler";
For CommonJS:
const { DomHandler } = require("domhandler");
const { Element, Text, Comment, Document } = require("domhandler");
// Default import
const DomHandler = require("domhandler").default;
import { Parser } from "htmlparser2";
import { DomHandler } from "domhandler";
const rawHtml = 'Xyz <script language="javascript">var foo = "<<bar>>";</script><!--<!-- Waah! -- -->';
const handler = new DomHandler((error, dom) => {
if (error) {
// Handle error
} else {
// Parsing completed, use the DOM
console.log(dom);
}
});
const parser = new Parser(handler);
parser.write(rawHtml);
parser.end();
domhandler is built around several key components:
The main class that processes HTML parsing events and creates a DOM tree structure.
/**
 * Handler that receives parsing events and builds a DOM tree from them.
 * The resulting nodes are exposed through `dom` and `root`.
 */
class DomHandler {
/** The elements of the DOM */
dom: ChildNode[];
/** The root element for the DOM */
root: Document;
/**
 * @param callback Invoked with `(error, dom)`; may be omitted or null.
 * @param options Handler settings (see `DomHandlerOptions`); may be omitted or null.
 * @param elementCB Receives an `Element`; presumably called once per parsed element (TODO confirm timing).
 */
constructor(
callback?: Callback | null,
options?: DomHandlerOptions | null,
elementCB?: ElementCallback
);
// Parsing-event hooks. Presumably invoked by the parser the handler is
// attached to rather than by user code (TODO confirm).
onparserinit(parser: ParserInterface): void;
onreset(): void;
onend(): void;
onerror(error: Error): void;
onclosetag(): void;
onopentag(name: string, attribs: { [key: string]: string }): void;
ontext(data: string): void;
oncomment(data: string): void;
oncommentend(): void;
oncdatastart(): void;
oncdataend(): void;
onprocessinginstruction(name: string, data: string): void;
}
/** Options accepted by the `DomHandler` constructor; every field is optional. */
interface DomHandlerOptions {
/** Add a startIndex property to nodes (default: false) */
withStartIndices?: boolean;
/** Add an endIndex property to nodes (default: false) */
withEndIndices?: boolean;
/** Treat the markup as XML (default: false) */
xmlMode?: boolean;
}
/**
 * Shape of the parser object passed to `DomHandler.onparserinit`.
 * Presumably the source of the start/end indices recorded on nodes (TODO confirm).
 */
interface ParserInterface {
/** Start index reported by the parser, or null. */
startIndex: number | null;
/** End index reported by the parser, or null. */
endIndex: number | null;
}
type Callback = (error: Error | null, dom: ChildNode[]) => void;
type ElementCallback = (element: Element) => void;
Core node classes representing different types of DOM elements.
/**
 * Abstract base of the node classes listed here. Holds the tree links
 * (`parent`, `prev`, `next`), optional position information, and
 * DOM spec-compatible aliases for the links.
 */
abstract class Node {
/** The type of the node */
abstract readonly type: ElementType;
/** Parent of the node */
parent: ParentNode | null;
/** Previous sibling */
prev: ChildNode | null;
/** Next sibling */
next: ChildNode | null;
/** The start index of the node */
startIndex: number | null;
/** The end index of the node */
endIndex: number | null;
/** parse5 source code location info */
sourceCodeLocation?: SourceCodeLocation | null;
/** DOM spec-compatible node type */
abstract readonly nodeType: number;
/** DOM spec-compatible alias for parent */
get parentNode(): ParentNode | null;
/** DOM spec-compatible alias for prev */
get previousSibling(): ChildNode | null;
/** DOM spec-compatible alias for next */
get nextSibling(): ChildNode | null;
/**
 * Clone this node; the result is typed as the receiver's own type.
 * @param recursive Optional; presumably also clones descendants when true (TODO confirm default).
 */
cloneNode<T extends Node>(this: T, recursive?: boolean): T;
}
/**
 * Abstract base for nodes that carry a string payload in `data`.
 * `Text`, `Comment` and `ProcessingInstruction` extend it.
 */
abstract class DataNode extends Node {
/** @param data The node's string content, stored as the public `data` property. */
constructor(public data: string);
/** DOM spec-compatible alias for data */
get nodeValue(): string;
set nodeValue(data: string);
}
abstract class NodeWithChildren extends Node {
constructor(public children: ChildNode[]);
/** First child of the node */
get firstChild(): ChildNode | null;
/** Last child of the node */
get lastChild(): ChildNode | null;
/** DOM spec-compatible alias for children */
get childNodes(): ChildNode[];
set childNodes(children: ChildNode[]);
}
Individual node types for different HTML elements.
/** Text node: a `DataNode` tagged `ElementType.Text` with DOM node type 3. */
class Text extends DataNode {
readonly type: ElementType.Text;
readonly nodeType: 3;
}
/** Comment node: a `DataNode` tagged `ElementType.Comment` with DOM node type 8. */
class Comment extends DataNode {
readonly type: ElementType.Comment;
readonly nodeType: 8;
}
/**
 * Processing instruction / directive node: carries a `name` in addition to
 * the inherited `data`. The `x-*` fields are documented as parse5-only.
 */
class ProcessingInstruction extends DataNode {
readonly type: ElementType.Directive;
// NOTE(review): 1 is the same value Element declares; the DOM spec's
// PROCESSING_INSTRUCTION_NODE constant is 7. Presumably this mirrors the
// library's actual behavior. Confirm before relying on nodeType to
// tell directives apart from elements; `type` is unambiguous.
readonly nodeType: 1;
/**
 * @param name Name of the instruction, stored as the public `name` property.
 * @param data String content, passed on to `DataNode`.
 */
constructor(public name: string, data: string);
/** Document type name (parse5 only) */
"x-name"?: string;
/** Document type public identifier (parse5 only) */
"x-publicId"?: string;
/** Document type system identifier (parse5 only) */
"x-systemId"?: string;
}
/**
 * CDATA section with DOM node type 4. Unlike `Text` and `Comment` it extends
 * `NodeWithChildren`, so its content is held as child nodes rather than `data`.
 */
class CDATA extends NodeWithChildren {
readonly type: ElementType.CDATA;
readonly nodeType: 4;
}
/**
 * Root node of a tree (tagged `ElementType.Root`, DOM node type 9).
 * This is the type of `DomHandler.root`.
 */
class Document extends NodeWithChildren {
readonly type: ElementType.Root;
readonly nodeType: 9;
/** Document mode (parse5 only) */
"x-mode"?: "no-quirks" | "quirks" | "limited-quirks";
}
class Element extends NodeWithChildren {
readonly nodeType: 1;
constructor(
public name: string,
public attribs: { [name: string]: string },
children?: ChildNode[],
public type?: ElementType.Tag | ElementType.Script | ElementType.Style
);
/** DOM spec-compatible alias for name */
get tagName(): string;
set tagName(name: string);
/** DOM spec-compatible attributes array */
get attributes(): Attribute[];
/** parse5 source code location info with start & end tags */
sourceCodeLocation?: TagSourceCodeLocation | null;
/** Element namespace (parse5 only) */
namespace?: string;
/** Element attribute namespaces (parse5 only) */
"x-attribsNamespace"?: Record<string, string>;
/** Element attribute namespace-related prefixes (parse5 only) */
"x-attribsPrefix"?: Record<string, string>;
}
Helper functions for working with DOM nodes.
// Type guards: each function returns a type predicate, so a `true` result
// narrows `node` to the named type in the calling code.
/** Check if node is an Element */
function isTag(node: Node): node is Element;
/** Check if node is CDATA */
function isCDATA(node: Node): node is CDATA;
/** Check if node is Text */
function isText(node: Node): node is Text;
/** Check if node is Comment */
function isComment(node: Node): node is Comment;
/** Check if node is ProcessingInstruction */
function isDirective(node: Node): node is ProcessingInstruction;
/** Check if node is Document */
function isDocument(node: Node): node is Document;
/** Check if node has children */
function hasChildren(node: Node): node is ParentNode;
/** Clone a node with optional recursive cloning */
function cloneNode<T extends Node>(node: T, recursive?: boolean): T;
// Element type enum (from domelementtype dependency)
/**
 * String-valued tags stored in each node's `type` field.
 * Each node class declares which member(s) it uses.
 */
enum ElementType {
Text = "text",
Directive = "directive",
Comment = "comment",
Script = "script",
Style = "style",
Tag = "tag",
CDATA = "cdata",
Root = "root"
}
// Core type aliases
/** Node classes that hold children (the ones extending `NodeWithChildren`). */
type ParentNode = Document | Element | CDATA;
/** Node classes that may appear in a `children` array; note that `Document` is included. */
type ChildNode = Text | Comment | ProcessingInstruction | Element | CDATA | Document;
/** Union of every concrete node class. */
type AnyNode = ParentNode | ChildNode;
// Attribute interface
/** Entry type of the array returned by `Element.attributes`; `namespace` and `prefix` are optional. */
interface Attribute {
name: string;
value: string;
namespace?: string;
prefix?: string;
}
// Location interfaces
/**
 * Start and end position of a span of source, each given as line, column and offset.
 * Used by `Node.sourceCodeLocation` (parse5 location info).
 */
interface SourceCodeLocation {
startLine: number;
startCol: number;
startOffset: number;
endLine: number;
endCol: number;
endOffset: number;
}
/**
 * Location of an element, optionally with separate locations for its start
 * and end tags. Used by `Element.sourceCodeLocation`.
 */
interface TagSourceCodeLocation extends SourceCodeLocation {
startTag?: SourceCodeLocation;
endTag?: SourceCodeLocation;
}