The Tokenizer class is the lowest-level interface to HTML/XML parsing in htmlparser2: a core tokenization engine that processes character streams and fires low-level parsing events. Access it directly for custom parsing implementations and other advanced use cases.
/**
 * Low-level HTML/XML tokenizer with state-machine parsing.
 *
 * Events report numeric offsets into the written input rather than strings
 * (see the Callbacks interface below); consumers slice the original input
 * themselves, as the example parsers in this document do.
 */
class Tokenizer {
/** Whether the tokenizer is currently running (see pause()/resume()) */
running: boolean;
/**
 * Create a new Tokenizer instance
 * @param options - Tokenizer configuration options (same shape as ParserOptions)
 * @param cbs - Callback object implementing the Callbacks interface; receives
 *              every low-level tokenization event
 */
constructor(options: ParserOptions, cbs: Callbacks);
/**
 * Write data to the tokenizer for processing. May be called repeatedly for
 * streaming input; offsets reported to callbacks appear to be relative to the
 * concatenation of all chunks written so far (the StreamingTokenizer example
 * below accumulates chunks for this reason).
 * @param chunk - String data to tokenize
 */
write(chunk: string): void;
/**
 * Signal end of input and complete tokenization (fires Callbacks.onend)
 * @param chunk - Optional final chunk of data to process before finishing
 */
end(chunk?: string): void;
/** Pause tokenization - can be resumed later with resume() */
pause(): void;
/** Resume tokenization after pause() */
resume(): void;
/** Reset tokenizer to initial state so the instance can be reused */
reset(): void;
}The Callbacks interface provides low-level events fired during tokenization:
interface Callbacks {
/** Called with the offsets of a run of attribute-value data; may fire more than once per value when entities split it — NOTE(review): confirm against tokenizer source */
onattribdata(start: number, endIndex: number): void;
/** Called with the decoded code point of an HTML entity inside an attribute value */
onattribentity(codepoint: number): void;
/** Called when an attribute ends; `quote` describes how the value was quoted */
onattribend(quote: QuoteType, endIndex: number): void;
/** Called with the offsets of an attribute name */
onattribname(start: number, endIndex: number): void;
/** Called for CDATA section content; endOffset presumably covers the closing delimiter — TODO confirm */
oncdata(start: number, endIndex: number, endOffset: number): void;
/** Called for a closing tag — NOTE(review): offsets appear to delimit the tag name itself, excluding "</" and ">"; verify against tokenizer source */
onclosetag(start: number, endIndex: number): void;
/** Called for comment content; endOffset presumably covers the closing "-->" — TODO confirm */
oncomment(start: number, endIndex: number, endOffset: number): void;
/** Called for a DOCTYPE declaration */
ondeclaration(start: number, endIndex: number): void;
/** Called once when tokenization ends (after Tokenizer.end()) */
onend(): void;
/** Called when an opening tag's ">" is reached */
onopentagend(endIndex: number): void;
/** Called with the offsets of an opening tag's name */
onopentagname(start: number, endIndex: number): void;
/** Called for a processing instruction (e.g. <?xml ... ?>) */
onprocessinginstruction(start: number, endIndex: number): void;
/** Called for a self-closing tag's "/>" */
onselfclosingtag(endIndex: number): void;
/** Called with the offsets of a run of text content */
ontext(start: number, endIndex: number): void;
/** Called with the decoded code point of an HTML entity in text */
ontextentity(codepoint: number, endIndex: number): void;
}Defines the types of quotes used for attribute values:
/** How an attribute's value was quoted; passed to Callbacks.onattribend. */
enum QuoteType {
/** Attribute has no value at all (e.g., `disabled`) */
NoValue = 0,
/** Attribute value is unquoted (e.g., `class=button`) */
Unquoted = 1,
/** Attribute value uses single quotes (e.g., `class='button'`) */
Single = 2,
/** Attribute value uses double quotes (e.g., `class="button"`) */
Double = 3
}import Tokenizer, { type Callbacks, QuoteType } from "htmlparser2/lib/Tokenizer";
class CustomHtmlParser implements Callbacks {
  /** Full input text; callback offsets are sliced out of this. */
  private buffer = '';
  /** One record per opening tag encountered. */
  private tags: Array<{ name: string; attrs: Record<string, string> }> = [];
  private currentTag = '';
  private currentAttr = '';
  private attributes: Record<string, string> = {};
  private tokenizer: Tokenizer;

  constructor(private options: ParserOptions = {}) {
    this.tokenizer = new Tokenizer(options, this);
  }

  /**
   * Tokenize `html` and return one `{name, attrs}` record per opening tag.
   * Resets all per-parse state first, so an instance can be reused safely.
   */
  parse(html: string): Array<{ name: string; attrs: Record<string, string> }> {
    // Fix: reset state; the original accumulated tags across parse() calls,
    // and stale tokenizer offsets would point past the new buffer.
    this.buffer = html;
    this.tags = [];
    this.currentTag = '';
    this.currentAttr = '';
    this.attributes = {};
    this.tokenizer.reset();
    this.tokenizer.write(html);
    this.tokenizer.end();
    return this.tags;
  }

  // --- Callbacks implementation ---

  onattribdata(start: number, endIndex: number): void {
    // Fix: append instead of assign — a single attribute value can arrive as
    // several data/entity events (e.g. title="a&amp;b"), and assignment
    // discarded everything collected so far.
    this.attributes[this.currentAttr] += this.buffer.slice(start, endIndex);
  }

  onattribentity(codepoint: number): void {
    // Fix: append the decoded entity instead of overwriting the value.
    this.attributes[this.currentAttr] += String.fromCodePoint(codepoint);
  }

  onattribend(quote: QuoteType, endIndex: number): void {
    // Attribute parsing complete; value already accumulated above.
  }

  onattribname(start: number, endIndex: number): void {
    this.currentAttr = this.buffer.slice(start, endIndex);
    this.attributes[this.currentAttr] = ''; // seed so += above is safe
  }

  onopentagname(start: number, endIndex: number): void {
    this.currentTag = this.buffer.slice(start, endIndex);
    this.attributes = {}; // fresh attribute map per tag
  }

  onopentagend(endIndex: number): void {
    this.tags.push({
      name: this.currentTag,
      attrs: { ...this.attributes } // copy: this.attributes is reused
    });
  }

  // Remaining callbacks are required by the interface but unused here.
  oncdata(): void {}
  onclosetag(): void {}
  oncomment(): void {}
  ondeclaration(): void {}
  onend(): void {}
  onprocessinginstruction(): void {}
  onselfclosingtag(): void {}
  ontext(): void {}
  ontextentity(): void {}
}
// Usage
const parser = new CustomHtmlParser({ lowerCaseTags: true });
// NOTE(review): tag names are sliced verbatim from the input buffer, so
// lowerCaseTags may have no effect on the returned names at this level —
// verify against the tokenizer implementation.
const tags = parser.parse('<DIV class="test" id=myid><span>Hello</span></DIV>');
console.log(tags);

import Tokenizer, { type Callbacks } from "htmlparser2/lib/Tokenizer";
class StreamingTokenizer implements Callbacks {
  /**
   * Concatenation of every chunk written so far. Kept because callback
   * offsets refer to the whole stream, not the current chunk.
   * NOTE: grows for the lifetime of the instance.
   */
  private buffer = '';
  private tokenizer: Tokenizer;

  constructor() {
    this.tokenizer = new Tokenizer({}, this);
  }

  /** Feed one chunk of input; safe to call repeatedly. */
  processChunk(chunk: string): void {
    this.buffer += chunk;
    this.tokenizer.write(chunk);
  }

  /** Signal end of input; fires the final onend() callback. */
  finish(): void {
    this.tokenizer.end();
  }

  // Track text content positions
  ontext(start: number, endIndex: number): void {
    const text = this.buffer.slice(start, endIndex);
    if (text.trim()) {
      console.log(`Text at ${start}-${endIndex}: "${text}"`);
    }
  }

  // Track tag positions
  onopentagname(start: number, endIndex: number): void {
    const tagName = this.buffer.slice(start, endIndex);
    console.log(`Opening tag at ${start}-${endIndex}: <${tagName}>`);
  }

  onclosetag(start: number, endIndex: number): void {
    // Fix: like onopentagname, onclosetag reports the boundaries of the tag
    // NAME itself — the "</" and ">" lie outside the reported span — so the
    // original start+2 / endIndex-1 adjustment chopped characters off the name.
    const tagName = this.buffer.slice(start, endIndex);
    console.log(`Closing tag at ${start}-${endIndex}: </${tagName}>`);
  }

  // Remaining callbacks are required by the interface but unused here.
  onattribdata(): void {}
  onattribentity(): void {}
  onattribend(): void {}
  onattribname(): void {}
  oncdata(): void {}
  oncomment(): void {}
  ondeclaration(): void {}
  onend(): void { console.log("Tokenization complete"); }
  onopentagend(): void {}
  onprocessinginstruction(): void {}
  onselfclosingtag(): void {}
  ontextentity(): void {}
}
// Usage for streaming: the document is fed in several chunks and the
// tokenizer carries its state across write() calls.
const tokenizer = new StreamingTokenizer();
tokenizer.processChunk("<html><body>");
tokenizer.processChunk("<h1>Title</h1>");
tokenizer.processChunk("<p>Content</p>");
tokenizer.processChunk("</body></html>");
tokenizer.finish();

import Tokenizer, { type Callbacks } from "htmlparser2/lib/Tokenizer";
class EntityProcessor implements Callbacks {
  /** Entities found during the most recent process() call. */
  private entities: Array<{ position: number; codepoint: number; char: string }> = [];
  private tokenizer: Tokenizer;
  // (Removed the `buffer` field from the original: it was assigned but never read.)

  constructor() {
    this.tokenizer = new Tokenizer({ decodeEntities: true }, this);
  }

  /**
   * Tokenize `html` and collect every decoded entity with its position.
   * Resets internal state first so an instance can be reused.
   */
  process(html: string): Array<{ position: number; codepoint: number; char: string }> {
    this.entities = [];
    // Fix: without a reset, a second process() call would report offsets
    // continuing from the previous input stream.
    this.tokenizer.reset();
    this.tokenizer.write(html);
    this.tokenizer.end();
    return this.entities;
  }

  ontextentity(codepoint: number, endIndex: number): void {
    this.entities.push({
      position: endIndex,
      codepoint,
      char: String.fromCodePoint(codepoint)
    });
  }

  onattribentity(codepoint: number): void {
    this.entities.push({
      // No offset is reported for attribute entities; -1 marks "in attribute".
      position: -1,
      codepoint,
      char: String.fromCodePoint(codepoint)
    });
  }

  // Remaining callbacks are required by the interface but unused here.
  onattribdata(): void {}
  onattribend(): void {}
  onattribname(): void {}
  oncdata(): void {}
  onclosetag(): void {}
  oncomment(): void {}
  ondeclaration(): void {}
  onend(): void {}
  onopentagend(): void {}
  onopentagname(): void {}
  onprocessinginstruction(): void {}
  onselfclosingtag(): void {}
  ontext(): void {}
}
// Usage
const processor = new EntityProcessor();
// NOTE(review): this sample input contains a literal "&" and a literal "©",
// not entity references such as "&amp;" or "&copy;" — verify that the
// expected output actually contains entities, or adjust the sample.
const entities = processor.process('Text & more <tags> © 2025');
// Output: entities found with their codepoints and positionsimport Tokenizer, { type Callbacks } from "htmlparser2/lib/Tokenizer";
class PerformanceTokenizer implements Callbacks {
  /** Counters and timings for the most recent parse() call. */
  private stats = {
    tags: 0,
    attributes: 0,
    textNodes: 0,
    comments: 0,
    entities: 0,
    startTime: 0,
    endTime: 0
  };
  private tokenizer: Tokenizer;

  constructor() {
    this.tokenizer = new Tokenizer({}, this);
  }

  /**
   * Tokenize `html` and return event counts plus wall-clock timing.
   * Counters are reset on every call so a reused instance does not accumulate.
   */
  parse(html: string) {
    // Fix: reset all counters; the original accumulated across parse() calls.
    this.stats = {
      tags: 0,
      attributes: 0,
      textNodes: 0,
      comments: 0,
      entities: 0,
      startTime: Date.now(),
      endTime: 0
    };
    this.tokenizer.reset(); // fresh tokenizer state for an independent document
    this.tokenizer.write(html);
    this.tokenizer.end();
    return this.stats;
  }

  onopentagname(): void {
    this.stats.tags++;
  }

  onattribname(): void {
    this.stats.attributes++;
  }

  ontext(): void {
    this.stats.textNodes++;
  }

  oncomment(): void {
    this.stats.comments++;
  }

  ontextentity(): void {
    this.stats.entities++;
  }

  onattribentity(): void {
    this.stats.entities++;
  }

  onend(): void {
    this.stats.endTime = Date.now();
    console.log(`Parsing completed in ${this.stats.endTime - this.stats.startTime}ms`);
    console.log(`Found: ${this.stats.tags} tags, ${this.stats.attributes} attributes`);
  }

  // Remaining callbacks are required by the interface but unused here.
  onattribdata(): void {}
  onattribend(): void {}
  oncdata(): void {}
  onclosetag(): void {}
  ondeclaration(): void {}
  onopentagend(): void {}
  onprocessinginstruction(): void {}
  onselfclosingtag(): void {}
}
The Tokenizer uses the same ParserOptions as the Parser:
interface ParserOptions {
/** Enable XML parsing mode (stricter tokenization rules) */
xmlMode?: boolean;
/** Decode HTML entities and report them via ontextentity/onattribentity */
decodeEntities?: boolean;
/** Convert tag names to lowercase — NOTE(review): at the tokenizer level events report offsets into the raw input, so this may only take effect in the higher-level Parser; confirm */
lowerCaseTags?: boolean;
/** Convert attribute names to lowercase — NOTE(review): same caveat as lowerCaseTags */
lowerCaseAttributeNames?: boolean;
/** Recognize CDATA sections and report them via oncdata */
recognizeCDATA?: boolean;
/** Recognize self-closing tags and report them via onselfclosingtag */
recognizeSelfClosing?: boolean;
}These options affect how the tokenizer processes the input stream and what events are fired.