HTML parser and serializer that is fully compliant with the WHATWG HTML Living Standard.
—
Quality
Pending
Does it follow best practices?
Impact
Pending
No eval scenarios have been run
Core HTML parsing functionality that converts HTML strings into abstract syntax trees. Parse5 implements the WHATWG HTML Living Standard parsing algorithm and handles malformed HTML gracefully.
Parses a complete HTML document string into a document AST node.
/**
* Parses an HTML string into a complete document AST
* @param html - Input HTML string to parse
* @param options - Optional parsing configuration
* @returns Document AST node representing the parsed HTML
*/
function parse<T extends TreeAdapterTypeMap = DefaultTreeAdapterMap>(
html: string,
options?: ParserOptions<T>
): T['document'];

Usage Examples:
import { parse } from "parse5";
// Parse a complete HTML document
const document = parse('<!DOCTYPE html><html><head><title>Test</title></head><body><h1>Hello World</h1></body></html>');
// Access document structure
console.log(document.childNodes[0].nodeName); // '#documentType'
console.log(document.childNodes[1].tagName); // 'html'
// Parse with options
const documentWithLocation = parse('<html><body>Content</body></html>', {
sourceCodeLocationInfo: true,
scriptingEnabled: false
});

Parses HTML fragments with an optional context element. When parsing fragments, the parser's behavior changes based on the context element to match browser behavior.
/**
* Parses HTML fragment with context element
* @param fragmentContext - Context element that affects parsing behavior
* @param html - HTML fragment string to parse
* @param options - Parsing configuration options
* @returns DocumentFragment containing parsed nodes
*/
function parseFragment<T extends TreeAdapterTypeMap = DefaultTreeAdapterMap>(
fragmentContext: T['parentNode'] | null,
html: string,
options: ParserOptions<T>
): T['documentFragment'];
/**
* Parses HTML fragment without context element
* @param html - HTML fragment string to parse
* @param options - Optional parsing configuration
* @returns DocumentFragment containing parsed nodes
*/
function parseFragment<T extends TreeAdapterTypeMap = DefaultTreeAdapterMap>(
html: string,
options?: ParserOptions<T>
): T['documentFragment'];

Usage Examples:
import { parse, parseFragment } from "parse5";
// Parse fragment without context
const fragment = parseFragment('<div><span>Hello</span><p>World</p></div>');
console.log(fragment.childNodes.length); // 1
console.log(fragment.childNodes[0].tagName); // 'div'
// Parse fragment with context for accurate parsing
// parse() always builds html > head > body. This input has no DOCTYPE, so the
// <html> element is the document's first child (index 0), not index 1.
const document = parse('<table></table>');
const tableElement = document.childNodes[0].childNodes[1].childNodes[0]; // html > body > table
const tableRowFragment = parseFragment(
tableElement,
'<tr><td>Cell content</td></tr>',
{ sourceCodeLocationInfo: true }
);
console.log(tableRowFragment.childNodes[0].tagName); // 'tr'
// Parse template content
const templateFragment = parseFragment('<div>Template content</div>');

Control parsing behavior through comprehensive options.
interface ParserOptions<T extends TreeAdapterTypeMap> {
/**
* The scripting flag. If set to true, noscript element content
* will be parsed as text. Defaults to true.
*/
scriptingEnabled?: boolean;
/**
* Enables source code location information. When enabled, each node
* will have a sourceCodeLocation property with position data.
* Defaults to false.
*/
sourceCodeLocationInfo?: boolean;
/**
* Specifies the tree adapter to use for creating and manipulating AST nodes.
* Defaults to the built-in default tree adapter.
*/
treeAdapter?: TreeAdapter<T>;
/**
* Error handling callback function. Called for each parsing error encountered.
*/
onParseError?: ParserErrorHandler;
}

Usage Examples:
import { parse, parseFragment } from "parse5";
// Enable location tracking for debugging
const documentWithLocations = parse('<div>Content</div>', {
  sourceCodeLocationInfo: true
});
// Each element will have a sourceCodeLocation property.
// The input has no DOCTYPE, so <html> is childNodes[0]: html > body > div
const divElement = documentWithLocations.childNodes[0].childNodes[1].childNodes[0];
console.log(divElement.sourceCodeLocation);
// Output (nested startTag / endTag locations omitted for brevity):
// { startLine: 1, startCol: 1, startOffset: 0, endLine: 1, endCol: 19, endOffset: 18 }
// Handle parsing errors: collect one "<code> at line <n>" string per reported error
const errors: string[] = [];
const documentWithErrors = parse('<div><span></div>', {
onParseError: (error) => {
errors.push(`${error.code} at line ${error.startLine}`);
}
});
// The input has no trailing solidus, so 'end-tag-with-trailing-solidus' is not expected here.
// NOTE(review): presumably the list includes 'missing-doctype at line 1' (the input has no DOCTYPE);
// the exact codes depend on the parse5 version — confirm by running.
console.log(errors);
// Disable script execution context
const noScriptDocument = parse('<noscript>This content is visible</noscript>', {
scriptingEnabled: false
});

Advanced users can directly use the Parser class for more control over the parsing process.
/**
* Core HTML parser class. Internal API - use parse() and parseFragment() functions instead.
*/
class Parser<T extends TreeAdapterTypeMap = DefaultTreeAdapterMap> {
/**
* Static method to parse HTML string into document
*/
static parse<T extends TreeAdapterTypeMap = DefaultTreeAdapterMap>(
html: string,
options?: ParserOptions<T>
): T['document'];
/**
* Static method to get fragment parser instance
*/
static getFragmentParser<T extends TreeAdapterTypeMap = DefaultTreeAdapterMap>(
fragmentContext: T['parentNode'] | null,
options?: ParserOptions<T>
): Parser<T>;
/**
* Get parsed fragment from fragment parser
*/
getFragment(): T['documentFragment'];
}

import { parse } from "parse5";
const html = '<!DOCTYPE html><html><head><title>Page</title></head><body><div>Content</div></body></html>';
const document = parse(html);
// Document structure:
// document
// ├── DocumentType node ('#documentType')
// └── Element node ('html')
// ├── Element node ('head')
// │ └── Element node ('title')
// │ └── Text node ('Page')
// └── Element node ('body')
// └── Element node ('div')
// └── Text node ('Content')

import { parse, parseFragment } from "parse5";
// Parse table row requires table context for proper parsing
// The input has no DOCTYPE, so <html> is childNodes[0]: html > body > table
const table = parse('<table></table>');
const tableElement = table.childNodes[0].childNodes[1].childNodes[0];
// The context-element overload of parseFragment declares `options` as required,
// so pass an (empty) options object explicitly.
const fragment = parseFragment(tableElement, '<tr><td>Data</td></tr>', {});
// Without context, the tr would be parsed incorrectly

Parse5 automatically recovers from many HTML errors:
import { parse } from "parse5";
// Missing closing tags
const doc1 = parse('<div><p>Unclosed paragraph<div>Another div</div>');
// Parser automatically closes the <p> tag
// Misplaced elements
const doc2 = parse('<html><div>Content before body</div><body>Body content</body></html>');
// Parser moves the div to the correct location in body

Parse5 provides comprehensive source code location tracking for debugging and development tools. When enabled, each parsed node includes detailed position information about its location in the original HTML source.
/**
* Basic location information interface.
*
* Start positions refer to the first character of the node. End positions
* refer to the position just after the last character (exclusive), so
* `source.substring(startOffset, endOffset)` yields the node's exact text.
* Lines and columns are one-based; offsets are zero-based.
*/
interface Location {
/** One-based line index of the first character */
startLine: number;
/** One-based column index of the first character */
startCol: number;
/** Zero-based first character index */
startOffset: number;
/** One-based line index of the last character */
endLine: number;
/** One-based column index just after the last character (exclusive) */
endCol: number;
/** Zero-based index just after the last character (exclusive) */
endOffset: number;
}
/**
* Location information with attribute positions
*/
interface LocationWithAttributes extends Location {
/** Start tag attributes' location info */
attrs?: Record<string, Location>;
}
/**
* Element location with start and end tag positions
*/
interface ElementLocation extends LocationWithAttributes {
/** Element's start tag location info */
startTag?: Location;
/** Element's end tag location info (undefined if no closing tag) */
endTag?: Location;
}

Location tracking is controlled through the sourceCodeLocationInfo option in ParserOptions:
import { parse, parseFragment } from "parse5";
// Enable location tracking for document parsing
const document = parse('<div class="container">Hello <span>World</span></div>', {
sourceCodeLocationInfo: true
});
// Enable location tracking for fragment parsing
const fragment = parseFragment('<p>Paragraph with <strong>emphasis</strong></p>', {
sourceCodeLocationInfo: true
});

When location tracking is enabled, each node includes a sourceCodeLocation property:
import { parse } from "parse5";
import type { Element, Location, ElementLocation } from "parse5";
const html = `<div class="container">
<h1>Title</h1>
<p>Paragraph with <em>emphasis</em></p>
</div>`;
const document = parse(html, { sourceCodeLocationInfo: true });
// Navigate to elements. The input has no DOCTYPE, so <html> is the document's
// first child; <body> follows <head> inside it, and the <div> is the first
// child of <body>.
const htmlElement = document.childNodes[0] as Element;
const bodyElement = htmlElement.childNodes[1] as Element;
const divElement = bodyElement.childNodes[0] as Element;
// Access location information
const divLocation = divElement.sourceCodeLocation as ElementLocation;
console.log('Div element location:');
console.log(` Start: line ${divLocation.startLine}, col ${divLocation.startCol}`);
console.log(` End: line ${divLocation.endLine}, col ${divLocation.endCol}`);
console.log(` Offset: ${divLocation.startOffset}-${divLocation.endOffset}`);
// Access start tag location
if (divLocation.startTag) {
console.log('Start tag location:');
console.log(` <div class="container"> at line ${divLocation.startTag.startLine}`);
}
// Access end tag location
if (divLocation.endTag) {
console.log('End tag location:');
console.log(` </div> at line ${divLocation.endTag.startLine}`);
}
// Access attribute locations
if (divLocation.attrs && divLocation.attrs.class) {
const classLocation = divLocation.attrs.class;
console.log(`Class attribute at line ${classLocation.startLine}, col ${classLocation.startCol}`);
}

import { parse } from "parse5";
import type { Element, ElementLocation } from "parse5";
/**
 * Maps parsed elements back to the raw HTML text they were parsed from,
 * using the offsets and line numbers stored in `sourceCodeLocation`.
 * Every method returns null when the requested location data is absent.
 */
class SourceExtractor {
  /** The exact HTML string that was handed to the parser. */
  private html: string;

  constructor(html: string) {
    this.html = html;
  }

  /** Returns the full source text of the element (start tag through end tag). */
  getElementSource(element: Element): string | null {
    const loc = element.sourceCodeLocation as ElementLocation;
    return loc ? this.html.substring(loc.startOffset, loc.endOffset) : null;
  }

  /** Returns the source text of the element's start tag only. */
  getStartTagSource(element: Element): string | null {
    const loc = element.sourceCodeLocation as ElementLocation;
    const tag = loc?.startTag;
    if (!tag) {
      return null;
    }
    return this.html.substring(tag.startOffset, tag.endOffset);
  }

  /** Returns the source text of one attribute, looked up by name in `attrs`. */
  getAttributeSource(element: Element, attrName: string): string | null {
    const loc = element.sourceCodeLocation as ElementLocation;
    const attr = loc?.attrs?.[attrName];
    if (!attr) {
      return null;
    }
    return this.html.substring(attr.startOffset, attr.endOffset);
  }

  /**
   * Returns the element's source lines plus `contextLines` lines on each side.
   * Each entry is prefixed with a marker ('>' for lines the element spans,
   * ' ' otherwise) and the one-based line number padded to width 3.
   */
  getElementContext(element: Element, contextLines = 2): string[] | null {
    const loc = element.sourceCodeLocation as ElementLocation;
    if (!loc) {
      return null;
    }

    const allLines = this.html.split('\n');
    // Location line numbers are one-based; array indices are zero-based.
    const firstIndex = Math.max(0, loc.startLine - 1 - contextLines);
    const endIndex = Math.min(allLines.length, loc.endLine + contextLines);

    const rendered: string[] = [];
    for (const [offset, text] of allLines.slice(firstIndex, endIndex).entries()) {
      const lineNo = firstIndex + offset + 1;
      const insideElement = lineNo >= loc.startLine && lineNo <= loc.endLine;
      const prefix = insideElement ? '>' : ' ';
      rendered.push(`${prefix} ${lineNo.toString().padStart(3)}: ${text}`);
    }
    return rendered;
  }
}
// Usage
const html = `<!DOCTYPE html>
<html>
<head>
<title>Test Page</title>
</head>
<body>
<div class="container">
<h1>Main Title</h1>
<p>Content paragraph</p>
</div>
</body>
</html>`;
const document = parse(html, { sourceCodeLocationInfo: true });
const extractor = new SourceExtractor(html);
// Find the div element
/**
 * Depth-first, document-order search for the first node whose `tagName`
 * equals `tagName`. The starting node itself is checked before its children.
 *
 * @param node - Node to start from; children are read from `node.childNodes` when present
 * @param tagName - Tag name to match against each node's `tagName`
 * @returns The first matching node, or null when nothing matches
 */
function findElementByTagName(node: any, tagName: string): Element | null {
  if (node.tagName === tagName) {
    return node;
  }

  const children = node.childNodes;
  if (!children) {
    // Nodes without a childNodes list have nothing further to search.
    return null;
  }

  for (const candidate of children) {
    const match = findElementByTagName(candidate, tagName);
    if (match) {
      return match;
    }
  }
  return null;
}
const divElement = findElementByTagName(document, 'div');
if (divElement) {
console.log('Element source:', extractor.getElementSource(divElement));
console.log('Start tag source:', extractor.getStartTagSource(divElement));
console.log('Class attribute source:', extractor.getAttributeSource(divElement, 'class'));
console.log('Context:');
console.log(extractor.getElementContext(divElement)?.join('\n'));
}

import { parse } from "parse5";
import type { ParserError, Element } from "parse5";
/**
 * Parses HTML while recording every parse error together with a short excerpt
 * of the source line it occurred on, and renders the result as a text report.
 *
 * Errors accumulate across calls to `parseWithLocationTracking` on the same
 * instance; use a fresh reporter for an independent report.
 */
class LocationAwareErrorReporter {
  /** Number of source characters shown on each side of an error. */
  private static readonly CONTEXT_RADIUS = 10;

  private errors: Array<{ error: ParserError; context: string }> = [];

  /**
   * Parses `html` with location tracking enabled and records each parse error.
   *
   * @param html - HTML source to parse
   * @returns The parsed document and the reporter's accumulated error list
   */
  parseWithLocationTracking(html: string) {
    const lines = html.split('\n');
    const document = parse(html, {
      sourceCodeLocationInfo: true,
      onParseError: (error) => {
        // startLine is one-based; fall back to an empty line if it is out of range.
        const line = lines[error.startLine - 1] ?? '';
        const context = LocationAwareErrorReporter.excerpt(line, error.startCol, error.endCol);
        this.errors.push({ error, context });
      }
    });
    return { document, errors: this.errors };
  }

  /**
   * Formats all recorded errors as a numbered, human-readable report.
   *
   * @returns The report text, or a fixed message when no errors were recorded
   */
  generateErrorReport(): string {
    if (this.errors.length === 0) {
      return 'No parsing errors found.';
    }
    const header = `Found ${this.errors.length} parsing error(s):\n\n`;
    const entries = this.errors.map(({ error, context }, index) =>
      `${index + 1}. Error: ${error.code}\n` +
      ` Location: Line ${error.startLine}, Column ${error.startCol}\n` +
      ` Context: "${context}"\n` +
      ` Position: ${error.startOffset}-${error.endOffset}\n\n`
    );
    return header + entries.join('');
  }

  /**
   * Cuts the erroneous span plus CONTEXT_RADIUS characters on each side out of `line`.
   *
   * Columns are one-based and `endCol` points just after the last character,
   * so both are shifted by one to get zero-based `substring` indices. Using the
   * raw columns would show 9 characters before the error and 11 after it.
   */
  private static excerpt(line: string, startCol: number, endCol: number): string {
    const radius = LocationAwareErrorReporter.CONTEXT_RADIUS;
    const from = Math.max(0, startCol - 1 - radius);
    // Never let the end fall before the start: substring() would silently swap them.
    const to = Math.max(from, Math.min(line.length, endCol - 1 + radius));
    return line.substring(from, to);
  }
}
// Usage
const reporter = new LocationAwareErrorReporter();
const result = reporter.parseWithLocationTracking('<div><span></div>'); // Missing closing span tag
console.log(reporter.generateErrorReport());

Location tracking adds overhead to parsing performance and memory usage:
import { parse } from "parse5";
// Benchmark parsing with and without location tracking
/**
 * Measures how long `iterations` calls to parse() take with and without
 * source location tracking, and logs both timings plus the relative overhead.
 *
 * @param html - HTML source parsed on every iteration
 * @param iterations - Number of parse() calls per measurement (default 1000)
 */
function benchmarkParsing(html: string, iterations = 1000) {
  console.log('Benchmarking parsing performance...');

  // Runs the parse loop once for the given setting and returns elapsed milliseconds.
  const measure = (sourceCodeLocationInfo: boolean): number => {
    const startedAt = Date.now();
    for (let i = 0; i < iterations; i++) {
      parse(html, { sourceCodeLocationInfo });
    }
    return Date.now() - startedAt;
  };

  const timeWithout = measure(false);
  const timeWith = measure(true);

  console.log(`Without location tracking: ${timeWithout}ms`);
  console.log(`With location tracking: ${timeWith}ms`);

  // Date.now() has millisecond resolution, so a fast baseline can measure 0ms;
  // dividing by it would print "NaN%" or "Infinity%".
  if (timeWithout > 0) {
    console.log(`Overhead: ${((timeWith - timeWithout) / timeWithout * 100).toFixed(1)}%`);
  } else {
    console.log('Overhead: n/a (baseline too fast to measure; increase iterations)');
  }
}
// Test with sample HTML
const sampleHtml = '<div><p>Hello</p><span>World</span></div>'.repeat(100);
benchmarkParsing(sampleHtml);

Best Practices:
Leave sourceCodeLocationInfo set to false (the default) for better performance when location data is not needed.

Install with Tessl CLI
npx tessl i tessl/npm-parse5