HTML parser and serializer that is fully compliant with the WHATWG HTML Living Standard.
—
Quality: Pending — not yet reviewed against best practices.
Impact: Pending — no eval scenarios have been run.
Low-level HTML tokenization functionality for advanced use cases that require direct access to the tokenization process. The tokenizer converts HTML text into a stream of tokens that the parser then processes into an AST.
Core tokenizer class that processes HTML text into tokens.
/**
 * HTML tokenizer class for low-level token processing.
 * @internal - Advanced API for specialized use cases
 */
class Tokenizer {
    /**
     * Creates a new tokenizer instance.
     * @param options - Tokenizer configuration options
     * @param handler - Token handler for processing tokens
     */
    constructor(options: TokenizerOptions, handler: TokenHandler);
    /**
     * Write HTML text to the tokenizer for processing.
     * @param chunk - HTML text chunk to tokenize
     * @param isLastChunk - Whether this is the final chunk
     */
    write(chunk: string, isLastChunk: boolean): void;
    /**
     * Insert HTML text at the current position.
     * @param chunk - HTML text to insert
     */
    insertHtmlAtCurrentPos(chunk: string): void;
    /**
     * Start new named entity consumption.
     * @param startCp - Starting code point
     * @param endCp - Ending code point
     */
    startNamedEntityConsumption(startCp: number, endCp: number): void;
    /**
     * Emit current character as token.
     */
    emitCurrentCharacter(): void;
    /**
     * Emit EOF token.
     */
    emitEOFToken(): void;
    /**
     * Get current tokenizer state.
     */
    get state(): State;
    /**
     * Set tokenizer state (e.g. one of the TokenizerMode values).
     */
    set state(newState: State);
}

Configuration options for the tokenizer.
/**
 * Tokenizer configuration options
 */
interface TokenizerOptions {
    /**
     * Enable source code location information tracking.
     * When enabled, tokens will include location data.
     * Defaults to false.
     */
    sourceCodeLocationInfo?: boolean;
}

Constants defining different tokenizer parsing modes based on context.
/**
 * Tokenizer mode constants for different parsing contexts.
 * Each value is one of the internal `State` entries; assign a mode to the
 * tokenizer's `state` setter to switch how subsequent input is tokenized
 * (see the usage examples further down in this document).
 */
const TokenizerMode: {
readonly DATA: State.DATA;
readonly RCDATA: State.RCDATA;
readonly RAWTEXT: State.RAWTEXT;
readonly SCRIPT_DATA: State.SCRIPT_DATA;
readonly PLAINTEXT: State.PLAINTEXT;
readonly CDATA_SECTION: State.CDATA_SECTION;
};
/**
 * Internal tokenizer states (used by TokenizerMode)
 */
enum State {
    DATA = 0,
    RCDATA = 1,
    RAWTEXT = 2,
    SCRIPT_DATA = 3,
    PLAINTEXT = 4,
    CDATA_SECTION = 5,
    // ... additional internal states
}

Usage Examples:
import { Tokenizer, TokenizerMode, type TokenizerOptions, type TokenHandler } from "parse5";
// Create token handler
const handler: TokenHandler = {
onComment: (token) => console.log('Comment:', token.data),
onDoctype: (token) => console.log('DOCTYPE:', token.name),
onStartTag: (token) => console.log('Start tag:', token.tagName),
onEndTag: (token) => console.log('End tag:', token.tagName),
onEof: (token) => console.log('EOF reached'),
onCharacter: (token) => console.log('Character:', token.chars),
onNullCharacter: (token) => console.log('Null character'),
onWhitespaceCharacter: (token) => console.log('Whitespace:', token.chars)
};
// Create tokenizer with location tracking
const tokenizer = new Tokenizer({ sourceCodeLocationInfo: true }, handler);
// Process HTML text
tokenizer.write('<div>Hello <span>World</span></div>', true);
// Set specific tokenizer mode for different contexts
tokenizer.state = TokenizerMode.SCRIPT_DATA; // For script content
tokenizer.state = TokenizerMode.RAWTEXT; // For style/title contentInterface for handling tokens emitted by the tokenizer.
/**
 * Token handler interface for processing tokenizer output
 */
interface TokenHandler {
    /**
     * Handle comment tokens.
     * @param token - Comment token
     */
    onComment(token: CommentToken): void;
    /**
     * Handle DOCTYPE tokens.
     * @param token - DOCTYPE token
     */
    onDoctype(token: DoctypeToken): void;
    /**
     * Handle start tag tokens.
     * @param token - Start tag token
     */
    onStartTag(token: TagToken): void;
    /**
     * Handle end tag tokens.
     * @param token - End tag token
     */
    onEndTag(token: TagToken): void;
    /**
     * Handle end of file tokens.
     * @param token - EOF token
     */
    onEof(token: EOFToken): void;
    /**
     * Handle character tokens.
     * @param token - Character token
     */
    onCharacter(token: CharacterToken): void;
    /**
     * Handle null character tokens.
     * @param token - Null character token
     */
    onNullCharacter(token: CharacterToken): void;
    /**
     * Handle whitespace character tokens.
     * @param token - Whitespace character token
     */
    onWhitespaceCharacter(token: CharacterToken): void;
    /**
     * Optional error handler (a property, not a method) invoked with
     * parsing error information.
     */
    onParseError?: ParserErrorHandler | null;
}

Base interface shared by all token types.
/**
 * Base interface for all token types.
 * Provides the optional source-location field shared by every token.
 */
interface TokenBase {
/** Location information, present only when `sourceCodeLocationInfo` is enabled */
location?: Location;
}
/**
 * Union type of all token types emitted by the tokenizer
 */
type Token = DoctypeToken | TagToken | CommentToken | EOFToken | CharacterToken;

Tokens representing HTML tags (both start and end tags).
/**
 * Tag token representing HTML start and end tags
 */
interface TagToken extends TokenBase {
/** Tag name (e.g., 'div', 'span') */
tagName: string;
/** Numeric tag ID used for efficient tag comparison */
tagID: TAG_ID;
/** Whether the tag was written as self-closing (e.g. `<br/>`) */
selfClosing: boolean;
/** Whether the self-closing flag was acknowledged — NOTE(review): confirm exact semantics against parse5 internals */
ackSelfClosing: boolean;
/** Tag attributes */
attrs: Attribute[];
/** Location info, including per-attribute locations, if enabled */
location?: LocationWithAttributes;
}
/**
 * Attribute interface
 */
interface Attribute {
    /** Attribute name */
    name: string;
    /** Attribute value */
    value: string;
    /** Namespace URI if applicable */
    namespace?: string;
    /** Namespace prefix if applicable */
    prefix?: string;
}

Tokens representing text content and character data.
/**
 * Character token representing text content
 */
interface CharacterToken extends TokenBase {
    /** Character data */
    chars: string;
    /** Location info if enabled */
    location?: Location;
}

Tokens representing HTML comments.
/**
 * Comment token representing HTML comments
 */
interface CommentToken extends TokenBase {
    /** Comment text content */
    data: string;
    /** Location info if enabled */
    location?: Location;
}

Tokens representing HTML DOCTYPE declarations.
/**
 * DOCTYPE token representing document type declarations
 */
interface DoctypeToken extends TokenBase {
    /** DOCTYPE name (usually 'html') */
    name: string | null;
    /** Public identifier */
    publicId: string | null;
    /** System identifier */
    systemId: string | null;
    /** Whether the DOCTYPE is force-quirks */
    forceQuirks: boolean;
    /** Location info if enabled */
    location?: Location;
}

Tokens representing end of file.
/**
 * EOF token representing end of file
 */
interface EOFToken extends TokenBase {
    /** Location info if enabled */
    location?: Location;
}

Utility functions for working with tokens.
/**
 * Get an attribute's value from a tag token by searching its attributes.
 * @param token - Tag token to search
 * @param attrName - Attribute name to find
 * @returns The attribute's value, or null if no attribute with that name exists
 */
function getTokenAttr(token: TagToken, attrName: string): string | null;
/**
 * Token type enumeration
 */
enum TokenType {
    CHARACTER = 0,
    NULL_CHARACTER = 1,
    WHITESPACE_CHARACTER = 2,
    START_TAG = 3,
    END_TAG = 4,
    COMMENT = 5,
    DOCTYPE = 6,
    EOF = 7,
    HIBERNATION = 8
}

Usage Examples:
import { Token, type TagToken, type CharacterToken, type CommentToken } from "parse5";

// Check token attribute
const tagToken: TagToken = /* ... */;
const className = Token.getTokenAttr(tagToken, 'class');
if (className) {
    console.log('Class name:', className);
}

// Handle different token types
function processToken(token: Token.Token) {
    switch (token.type) {
        case Token.TokenType.START_TAG:
            console.log('Start tag:', (token as TagToken).tagName);
            break;
        case Token.TokenType.CHARACTER:
            console.log('Text:', (token as CharacterToken).chars);
            break;
        case Token.TokenType.COMMENT:
            console.log('Comment:', (token as CommentToken).data);
            break;
    }
}

import { Tokenizer, type TokenHandler, type TagToken, type CommentToken, type DoctypeToken, type CharacterToken } from "parse5";
class CustomTokenProcessor implements TokenHandler {
    // Stack of currently-open tag names, used to detect mismatched end tags
    private tagStack: string[] = [];

    onStartTag(token: TagToken): void {
        this.tagStack.push(token.tagName);
        console.log(`Entering tag: ${token.tagName}, depth: ${this.tagStack.length}`);
        // Process attributes
        token.attrs.forEach(attr => {
            console.log(`  Attribute: ${attr.name}="${attr.value}"`);
        });
    }

    onEndTag(token: TagToken): void {
        const expectedTag = this.tagStack.pop();
        if (expectedTag !== token.tagName) {
            console.warn(`Mismatched tags: expected ${expectedTag}, got ${token.tagName}`);
        }
        console.log(`Exiting tag: ${token.tagName}, depth: ${this.tagStack.length}`);
    }

    onComment(token: CommentToken): void {
        console.log(`Comment: ${token.data}`);
    }

    onDoctype(token: DoctypeToken): void {
        console.log(`DOCTYPE: ${token.name}`);
    }

    onEof(): void {
        console.log('End of file reached');
    }

    onCharacter(token: CharacterToken): void {
        const trimmed = token.chars.trim();
        if (trimmed) {
            console.log(`Text content: ${trimmed}`);
        }
    }

    onNullCharacter(): void {
        console.warn('Null character encountered');
    }

    onWhitespaceCharacter(): void {
        // Usually ignore whitespace
    }
}

// Use custom processor
const processor = new CustomTokenProcessor();
const tokenizer = new Tokenizer({ sourceCodeLocationInfo: false }, processor);
tokenizer.write('<html><body>Hello World!</body></html>', true);

import { Tokenizer, type TokenHandler, type TagToken, type Location } from "parse5";
class LocationAwareHandler implements TokenHandler {
    private html: string;

    constructor(html: string) {
        this.html = html;
    }

    /** Extract the raw source text covered by a location's offsets */
    private getSourceSnippet(location: Location): string {
        return this.html.substring(location.startOffset, location.endOffset);
    }

    onStartTag(token: TagToken): void {
        if (token.location) {
            const snippet = this.getSourceSnippet(token.location);
            console.log(`Start tag at line ${token.location.startLine}: ${snippet}`);
            // Show attribute locations
            if (token.location.attrs) {
                Object.entries(token.location.attrs).forEach(([name, attrLocation]) => {
                    const attrSnippet = this.getSourceSnippet(attrLocation);
                    console.log(`  Attribute ${name} at line ${attrLocation.startLine}: ${attrSnippet}`);
                });
            }
        }
    }
    // ... implement other methods with location awareness
}

Install with Tessl CLI
npx tessl i tessl/npm-parse5