HTML parser and serializer that is fully compliant with the WHATWG HTML Living Standard.
npx @tessl/cli install tessl/npm-parse5@8.0.0
Parse5 is an HTML parser and serializer that provides fast, standard-compliant HTML parsing for Node.js applications. It implements the WHATWG HTML Living Standard and handles malformed HTML gracefully, making it suitable for server-side HTML processing, DOM manipulation, and web scraping applications.
npm install parse5
// Main parsing and serialization functions
import { parse, parseFragment, serialize, serializeOuter } from "parse5";
// Tree adapters and types
import { defaultTreeAdapter, DefaultTreeAdapterTypes } from "parse5";
import type { TreeAdapter, TreeAdapterTypeMap, DefaultTreeAdapterMap } from "parse5";
// Error handling
import { ErrorCodes, type ParserError, type ParserErrorHandler } from "parse5";
// Options interfaces
import type { ParserOptions, SerializerOptions } from "parse5";
// HTML constants and utilities
import { html, Token } from "parse5";
// Advanced tokenization (internal APIs)
import { Tokenizer, TokenizerMode, type TokenizerOptions, type TokenHandler } from "parse5";
For CommonJS:
const {
parse, parseFragment, serialize, serializeOuter,
defaultTreeAdapter, DefaultTreeAdapterTypes,
ErrorCodes, html, Token,
Tokenizer, TokenizerMode
} = require("parse5");
import { parse, parseFragment, serialize } from "parse5";
// Parse a complete HTML document
const document = parse('<!DOCTYPE html><html><head></head><body>Hello World!</body></html>');
// Parse HTML fragment
const fragment = parseFragment('<div><span>Content</span></div>');
// Serialize back to HTML string
const htmlString = serialize(document);
console.log(htmlString);
Parse5 is built around several core components:
Core HTML parsing functionality that converts HTML strings into abstract syntax trees. Supports both complete documents and fragments with optional context.
function parse<T extends TreeAdapterTypeMap = DefaultTreeAdapterMap>(
html: string,
options?: ParserOptions<T>
): T['document'];
function parseFragment<T extends TreeAdapterTypeMap = DefaultTreeAdapterMap>(
fragmentContext: T['parentNode'] | null,
html: string,
options: ParserOptions<T>
): T['documentFragment'];
function parseFragment<T extends TreeAdapterTypeMap = DefaultTreeAdapterMap>(
html: string,
options?: ParserOptions<T>
): T['documentFragment'];
Serialization functionality for converting parsed AST nodes back to HTML strings. Supports both inner content and complete element serialization.
function serialize<T extends TreeAdapterTypeMap = DefaultTreeAdapterMap>(
node: T['parentNode'],
options?: SerializerOptions<T>
): string;
function serializeOuter<T extends TreeAdapterTypeMap = DefaultTreeAdapterMap>(
node: T['node'],
options?: SerializerOptions<T>
): string;
Pluggable tree adapter system that defines the structure and manipulation of AST nodes. Allows customization of how parsed HTML is represented in memory.
interface TreeAdapter<T extends TreeAdapterTypeMap = TreeAdapterTypeMap> {
// Node creation methods
createDocument(): T['document'];
createDocumentFragment(): T['documentFragment'];
createElement(tagName: string, namespaceURI: string, attrs: Attribute[]): T['element'];
createCommentNode(data: string): T['commentNode'];
createTextNode(value: string): T['textNode'];
// Node manipulation methods
appendChild(parentNode: T['parentNode'], newNode: T['childNode']): void;
insertBefore(parentNode: T['parentNode'], newNode: T['childNode'], referenceNode: T['childNode']): void;
detachNode(node: T['childNode']): void;
// Node inspection and type guards
isElementNode(node: T['node']): node is T['element'];
isTextNode(node: T['node']): node is T['textNode'];
isCommentNode(node: T['node']): node is T['commentNode'];
isDocumentTypeNode(node: T['node']): node is T['documentType'];
}
const defaultTreeAdapter: TreeAdapter<DefaultTreeAdapterMap>;
Comprehensive error handling system that provides detailed parsing error information with source code locations and error codes.
interface ParserError {
code: string;
startLine: number;
startCol: number;
startOffset: number;
endLine: number;
endCol: number;
endOffset: number;
}
type ParserErrorHandler = (error: ParserError) => void;
enum ErrorCodes {
controlCharacterInInputStream = 'control-character-in-input-stream',
noncharacterInInputStream = 'noncharacter-in-input-stream',
unexpectedNullCharacter = 'unexpected-null-character',
unexpectedQuestionMarkInsteadOfTagName = 'unexpected-question-mark-instead-of-tag-name',
// ... many more error codes
}
Low-level HTML tokenization functionality for advanced use cases that require direct access to the tokenization process. Provides tokenizer class, token types, and token handlers.
class Tokenizer {
constructor(options: TokenizerOptions, handler: TokenHandler);
write(chunk: string, isLastChunk: boolean): void;
insertHtmlAtCurrentPos(chunk: string): void;
}
interface TokenizerOptions {
sourceCodeLocationInfo?: boolean;
}
interface TokenHandler {
onComment(token: CommentToken): void;
onDoctype(token: DoctypeToken): void;
onStartTag(token: TagToken): void;
onEndTag(token: TagToken): void;
onEof(token: EOFToken): void;
onCharacter(token: CharacterToken): void;
onNullCharacter(token: CharacterToken): void;
onWhitespaceCharacter(token: CharacterToken): void;
}
const TokenizerMode: {
readonly DATA: State.DATA;
readonly RCDATA: State.RCDATA;
readonly RAWTEXT: State.RAWTEXT;
readonly SCRIPT_DATA: State.SCRIPT_DATA;
readonly PLAINTEXT: State.PLAINTEXT;
readonly CDATA_SECTION: State.CDATA_SECTION;
};
HTML specification constants, enumerations, and utility functions providing access to standardized HTML element names, namespace URIs, document modes, and other HTML5 specification details.
namespace html {
enum NS {
HTML = 'http://www.w3.org/1999/xhtml',
MATHML = 'http://www.w3.org/1998/Math/MathML',
SVG = 'http://www.w3.org/2000/svg',
XLINK = 'http://www.w3.org/1999/xlink',
XML = 'http://www.w3.org/XML/1998/namespace',
XMLNS = 'http://www.w3.org/2000/xmlns/'
}
enum TAG_NAMES {
A = 'a',
DIV = 'div',
SPAN = 'span',
P = 'p',
// ... 100+ tag names
}
enum TAG_ID {
UNKNOWN = 0,
A = 1,
DIV = 27,
SPAN = 100,
// ... corresponding IDs
}
enum DOCUMENT_MODE {
NO_QUIRKS = 'no-quirks',
QUIRKS = 'quirks',
LIMITED_QUIRKS = 'limited-quirks'
}
enum ATTRS {
CLASS = 'class',
ID = 'id',
SRC = 'src',
HREF = 'href',
// ... common attributes
}
function getTagID(tagName: string): TAG_ID;
function hasUnescapedText(tagName: string, scriptingEnabled: boolean): boolean;
}
interface DefaultTreeAdapterMap extends TreeAdapterTypeMap<
Node,
ParentNode,
ChildNode,
Document,
DocumentFragment,
Element,
CommentNode,
TextNode,
Template,
DocumentType
> {}
interface ParserOptions<T extends TreeAdapterTypeMap> {
/** Controls noscript element parsing. Defaults to true */
scriptingEnabled?: boolean;
/** Enables source code location tracking. Defaults to false */
sourceCodeLocationInfo?: boolean;
/** Custom tree adapter for AST node structure */
treeAdapter?: TreeAdapter<T>;
/** Error handling callback */
onParseError?: ParserErrorHandler;
}
interface SerializerOptions<T extends TreeAdapterTypeMap> {
/** Custom tree adapter for AST node structure */
treeAdapter?: TreeAdapter<T>;
/** Controls noscript element serialization. Defaults to true */
scriptingEnabled?: boolean;
}