Fast & forgiving HTML/XML parser with callback-based interface and DOM generation capabilities
Review status:
- Best practices: Pending — does it follow best practices?
- Impact: Pending — no eval scenarios have been run.
- Risk: Pending — the risk profile of this skill has not been assessed.
Direct access to the underlying tokenizer for custom parsing implementations and advanced use cases. The Tokenizer class provides the lowest-level interface to HTML/XML parsing.
The core tokenization engine that processes HTML/XML character streams and fires low-level parsing events.
/**
 * Low-level HTML/XML tokenizer driven by a state machine.
 *
 * Data is pushed in with write()/end(); findings are reported through the
 * Callbacks object passed to the constructor as index ranges into the
 * written input — the tokenizer never hands back substrings itself, so
 * callers must keep their own copy of the data to slice from.
 */
class Tokenizer {
/** Whether the tokenizer is currently running (false after pause()) */
running: boolean;
/**
 * Create a new Tokenizer instance
 * @param options - Tokenizer configuration options (see ParserOptions)
 * @param cbs - Callback object implementing the Callbacks interface
 */
constructor(options: ParserOptions, cbs: Callbacks);
/**
 * Write a chunk of data to the tokenizer for processing; may be called
 * repeatedly for streaming input (state carries across calls)
 * @param chunk - String data to tokenize
 */
write(chunk: string): void;
/**
 * Signal end of input and complete tokenization; after this, reset()
 * is required before the instance can process new input
 * @param chunk - Optional final chunk of data
 */
end(chunk?: string): void;
/** Pause tokenization - can be resumed later with resume() */
pause(): void;
/** Resume tokenization after pause */
resume(): void;
/** Reset tokenizer to its initial state so it can be reused */
reset(): void;
}
The Callbacks interface provides low-level events fired during tokenization:
/**
 * Low-level events fired during tokenization. All `start`/`endIndex`
 * arguments are index ranges into the data written to the tokenizer;
 * slice your own copy of the input to recover the text.
 */
interface Callbacks {
/** Called for attribute value data; may fire multiple times per value
 * (e.g. around entities), so consumers should append, not overwrite */
onattribdata(start: number, endIndex: number): void;
/** Called for a decoded HTML entity inside an attribute value */
onattribentity(codepoint: number): void;
/** Called when an attribute ends; `quote` describes its quoting style */
onattribend(quote: QuoteType, endIndex: number): void;
/** Called for an attribute name */
onattribname(start: number, endIndex: number): void;
/** Called for CDATA section content */
oncdata(start: number, endIndex: number, endOffset: number): void;
/** Called for a closing tag; the range covers the tag name only */
onclosetag(start: number, endIndex: number): void;
/** Called for comment content */
oncomment(start: number, endIndex: number, endOffset: number): void;
/** Called for a DOCTYPE declaration */
ondeclaration(start: number, endIndex: number): void;
/** Called once when tokenization ends */
onend(): void;
/** Called when an opening tag ends */
onopentagend(endIndex: number): void;
/** Called for an opening tag name */
onopentagname(start: number, endIndex: number): void;
/** Called for a processing instruction */
onprocessinginstruction(start: number, endIndex: number): void;
/** Called for a self-closing tag */
onselfclosingtag(endIndex: number): void;
/** Called for text content */
ontext(start: number, endIndex: number): void;
/** Called for a decoded HTML entity inside text */
ontextentity(codepoint: number, endIndex: number): void;
}
Defines the types of quotes used for attribute values:
/** Quoting style of an attribute value, reported via onattribend. */
enum QuoteType {
/** Attribute has no value (e.g., `disabled`) */
NoValue = 0,
/** Attribute value is unquoted (e.g., `class=button`) */
Unquoted = 1,
/** Attribute value uses single quotes (e.g., `class='button'`) */
Single = 2,
/** Attribute value uses double quotes (e.g., `class="button"`) */
Double = 3
}
import Tokenizer, { type Callbacks, QuoteType } from "htmlparser2/lib/Tokenizer";
/**
 * Minimal tag/attribute extractor built directly on the htmlparser2
 * Tokenizer. Collects every opening tag together with its attributes,
 * resolving the tokenizer's index ranges against the full input kept
 * in `buffer`.
 */
class CustomHtmlParser implements Callbacks {
  private tokenizer: Tokenizer;
  private buffer: string = '';
  private tags: Array<{name: string, attrs: Record<string, string>}> = [];
  private currentTag: string = '';
  private currentAttr: string = '';
  private attributes: Record<string, string> = {};

  // NOTE(review): lowerCaseTags/lowerCaseAttributeNames are documented as
  // Parser-level options; confirm the raw Tokenizer honors them in the
  // htmlparser2 version in use.
  constructor(private options: ParserOptions = {}) {
    this.tokenizer = new Tokenizer(options, this);
  }

  /**
   * Tokenize a complete HTML string and return the opening tags found.
   * Safe to call repeatedly: all per-run state is cleared first and the
   * tokenizer is reset (a previous end() leaves it finished otherwise).
   * @param html - Complete markup to parse
   * @returns One entry per opening tag, in document order
   */
  parse(html: string): Array<{name: string, attrs: Record<string, string>}> {
    this.tokenizer.reset();
    this.buffer = html;
    this.tags = [];
    this.currentTag = '';
    this.currentAttr = '';
    this.attributes = {};
    this.tokenizer.write(html);
    this.tokenizer.end();
    return this.tags;
  }

  /** Attribute value data; fires in chunks around entities, so append.
   * (Assigning `=` here would keep only the last chunk, e.g. the "b"
   * of `title="a&amp;b"`.) */
  onattribdata(start: number, endIndex: number): void {
    this.attributes[this.currentAttr] += this.buffer.slice(start, endIndex);
  }

  /** Decoded entity inside an attribute value; append, don't overwrite. */
  onattribentity(codepoint: number): void {
    this.attributes[this.currentAttr] += String.fromCodePoint(codepoint);
  }

  onattribend(quote: QuoteType, endIndex: number): void {
    // Attribute parsing complete; value already accumulated above.
  }

  /** Start a new attribute: remember its name, seed an empty value. */
  onattribname(start: number, endIndex: number): void {
    this.currentAttr = this.buffer.slice(start, endIndex);
    this.attributes[this.currentAttr] = '';
  }

  /** New opening tag: remember its name, start a fresh attribute map. */
  onopentagname(start: number, endIndex: number): void {
    this.currentTag = this.buffer.slice(start, endIndex);
    this.attributes = {};
  }

  /** Opening tag complete: record it with a copy of its attributes. */
  onopentagend(endIndex: number): void {
    this.tags.push({
      name: this.currentTag,
      attrs: { ...this.attributes }
    });
  }

  // Remaining Callbacks members: intentionally no-ops for this extractor.
  oncdata(): void {}
  onclosetag(): void {}
  oncomment(): void {}
  ondeclaration(): void {}
  onend(): void {}
  onprocessinginstruction(): void {}
  onselfclosingtag(): void {}
  ontext(): void {}
  ontextentity(): void {}
}
// Usage: collect every opening tag (with its attributes) from a snippet.
// Note the unquoted attribute value (id=myid) in the sample input.
const parser = new CustomHtmlParser({ lowerCaseTags: true });
const tags = parser.parse('<DIV class="test" id=myid><span>Hello</span></DIV>');
console.log(tags);
import Tokenizer, { type Callbacks } from "htmlparser2/lib/Tokenizer";
/**
 * Streaming position tracker: feeds chunks to the Tokenizer and logs the
 * absolute offsets of text and tags. The full input is accumulated in
 * `buffer` because the tokenizer reports absolute indices into the stream.
 */
class StreamingTokenizer implements Callbacks {
  private buffer: string = '';
  private tokenizer: Tokenizer;

  constructor() {
    this.tokenizer = new Tokenizer({}, this);
  }

  /** Feed one chunk; keeps a copy so index ranges can be resolved later. */
  processChunk(chunk: string): void {
    this.buffer += chunk;
    this.tokenizer.write(chunk);
  }

  /** Signal end of input so any trailing tokens are flushed. */
  finish(): void {
    this.tokenizer.end();
  }

  /** Log non-whitespace text content with its absolute position. */
  ontext(start: number, endIndex: number): void {
    const text = this.buffer.slice(start, endIndex);
    if (text.trim()) {
      console.log(`Text at ${start}-${endIndex}: "${text}"`);
    }
  }

  /** Log an opening tag name with its absolute position. */
  onopentagname(start: number, endIndex: number): void {
    const tagName = this.buffer.slice(start, endIndex);
    console.log(`Opening tag at ${start}-${endIndex}: <${tagName}>`);
  }

  /** Log a closing tag name with its absolute position. */
  onclosetag(start: number, endIndex: number): void {
    // start/endIndex already delimit just the tag name — the tokenizer
    // excludes the leading "</" and trailing ">" — so slice directly.
    // (The previous start + 2 / endIndex - 1 adjustment double-corrected
    // and produced a mangled or empty name.)
    const tagName = this.buffer.slice(start, endIndex);
    console.log(`Closing tag at ${start}-${endIndex}: </${tagName}>`);
  }

  // Remaining Callbacks members: intentionally no-ops for this tracker.
  onattribdata(): void {}
  onattribentity(): void {}
  onattribend(): void {}
  onattribname(): void {}
  oncdata(): void {}
  oncomment(): void {}
  ondeclaration(): void {}
  onend(): void { console.log("Tokenization complete"); }
  onopentagend(): void {}
  onprocessinginstruction(): void {}
  onselfclosingtag(): void {}
  ontextentity(): void {}
}
// Usage for streaming: feed markup at arbitrary chunk boundaries; the
// tokenizer carries its state across write() calls.
const tokenizer = new StreamingTokenizer();
tokenizer.processChunk("<html><body>");
tokenizer.processChunk("<h1>Title</h1>");
tokenizer.processChunk("<p>Content</p>");
tokenizer.processChunk("</body></html>");
tokenizer.finish();
import Tokenizer, { type Callbacks } from "htmlparser2/lib/Tokenizer";
/**
 * Collects every decoded HTML entity (in text and in attribute values)
 * encountered while tokenizing an input string.
 */
class EntityProcessor implements Callbacks {
  private tokenizer: Tokenizer;
  private buffer: string = '';
  private entities: Array<{position: number, codepoint: number, char: string}> = [];

  constructor() {
    // decodeEntities makes the tokenizer fire ontextentity/onattribentity
    // for entity references instead of passing them through as data.
    this.tokenizer = new Tokenizer({ decodeEntities: true }, this);
  }

  /**
   * Tokenize `html` and return the entities found.
   * Safe to call repeatedly: the tokenizer is reset between runs so a
   * previous end() does not leave it in a finished state.
   * @param html - Markup possibly containing entity references
   * @returns One record per decoded entity, in encounter order
   */
  process(html: string): Array<{position: number, codepoint: number, char: string}> {
    this.tokenizer.reset();
    this.buffer = html;
    this.entities = [];
    this.tokenizer.write(html);
    this.tokenizer.end();
    return this.entities;
  }

  /** Entity decoded inside text content; endIndex marks where it ended. */
  ontextentity(codepoint: number, endIndex: number): void {
    this.entities.push({
      position: endIndex,
      codepoint,
      char: String.fromCodePoint(codepoint)
    });
  }

  /** Entity decoded inside an attribute value (no position reported). */
  onattribentity(codepoint: number): void {
    this.entities.push({
      position: -1, // sentinel: entity occurred inside an attribute
      codepoint,
      char: String.fromCodePoint(codepoint)
    });
  }

  // Remaining Callbacks members: intentionally no-ops for this collector.
  onattribdata(): void {}
  onattribend(): void {}
  onattribname(): void {}
  oncdata(): void {}
  onclosetag(): void {}
  oncomment(): void {}
  ondeclaration(): void {}
  onend(): void {}
  onopentagend(): void {}
  onopentagname(): void {}
  onprocessinginstruction(): void {}
  onselfclosingtag(): void {}
  ontext(): void {}
}
// Usage
const processor = new EntityProcessor();
// The input must contain actual entity references for any entity events
// to fire. (The previous literal held already-decoded characters — a bare
// "&", "<tags>", "©" — which produce no onattribentity/ontextentity calls.)
const entities = processor.process('Text &amp; more &lt;tags&gt; &copy; 2025');
console.log(entities);
// Output: entities found with their codepoints and positions
import Tokenizer, { type Callbacks } from "htmlparser2/lib/Tokenizer";
/**
 * Counts tokenizer events (tags, attributes, text nodes, comments,
 * entities) and reports wall-clock parse time from onend().
 */
class PerformanceTokenizer implements Callbacks {
  private tokenizer: Tokenizer;
  private stats = {
    tags: 0,
    attributes: 0,
    textNodes: 0,
    comments: 0,
    entities: 0,
    startTime: 0,
    endTime: 0
  };

  constructor() {
    this.tokenizer = new Tokenizer({}, this);
  }

  /**
   * Tokenize `html` and return the collected statistics.
   * Counters are zeroed and the tokenizer reset on every call, so the
   * instance can be reused; previously repeat calls accumulated counts
   * into stale stats and drove a tokenizer that had already end()ed.
   * @param html - Markup to measure
   * @returns Event counts plus start/end timestamps (ms since epoch)
   */
  parse(html: string) {
    this.stats = {
      tags: 0,
      attributes: 0,
      textNodes: 0,
      comments: 0,
      entities: 0,
      startTime: Date.now(),
      endTime: 0
    };
    this.tokenizer.reset();
    this.tokenizer.write(html);
    this.tokenizer.end();
    return this.stats;
  }

  onopentagname(): void {
    this.stats.tags++;
  }
  onattribname(): void {
    this.stats.attributes++;
  }
  ontext(): void {
    this.stats.textNodes++;
  }
  oncomment(): void {
    this.stats.comments++;
  }
  ontextentity(): void {
    this.stats.entities++;
  }
  onattribentity(): void {
    this.stats.entities++;
  }

  /** End of input: record timing and print a short summary. */
  onend(): void {
    this.stats.endTime = Date.now();
    console.log(`Parsing completed in ${this.stats.endTime - this.stats.startTime}ms`);
    console.log(`Found: ${this.stats.tags} tags, ${this.stats.attributes} attributes`);
  }

  // Remaining Callbacks members: intentionally no-ops for this counter.
  onattribdata(): void {}
  onattribend(): void {}
  oncdata(): void {}
  onclosetag(): void {}
  ondeclaration(): void {}
  onopentagend(): void {}
  onprocessinginstruction(): void {}
  onselfclosingtag(): void {}
}
The Tokenizer uses the same ParserOptions as the Parser:
/**
 * Options shared by the Parser and the Tokenizer.
 * NOTE(review): lowerCaseTags/lowerCaseAttributeNames are typically
 * applied at the Parser layer — confirm whether the raw Tokenizer
 * honors them in the htmlparser2 version in use.
 */
interface ParserOptions {
/** Enable XML parsing mode */
xmlMode?: boolean;
/** Decode HTML entities (fires ontextentity/onattribentity) */
decodeEntities?: boolean;
/** Convert tag names to lowercase */
lowerCaseTags?: boolean;
/** Convert attribute names to lowercase */
lowerCaseAttributeNames?: boolean;
/** Recognize CDATA sections */
recognizeCDATA?: boolean;
/** Recognize self-closing tags */
recognizeSelfClosing?: boolean;
}
These options affect how the tokenizer processes the input stream and what events are fired.