Streaming entity decoder and low-level decoding utilities for performance-critical applications and custom parsing scenarios.
Stateful decoder class that supports streaming/partial entity processing and custom error handling.
/**
 * Token decoder with support for writing partial entities.
 * Useful for streaming parsers and custom entity processing.
 *
 * Call sequence used by the examples in this document: construct the decoder
 * once, call `startEntity()` before each entity, then feed text with `write()`.
 * Decoded code points are delivered through the `emitCodePoint` callback;
 * `write()` itself only reports how many characters were consumed.
 */
class EntityDecoder {
/**
 * Creates a new EntityDecoder instance.
 * @param decodeTree - Decode tree data (Uint16Array), e.g. `htmlDecodeTree` or `xmlDecodeTree`
 * @param emitCodePoint - Callback invoked with each decoded code point (`cp`) and a
 *   `consumed` count (presumably the number of input characters the entity spanned; confirm)
 * @param errors - Optional error producer whose callbacks report validation problems
 */
constructor(
decodeTree: Uint16Array,
emitCodePoint: (cp: number, consumed: number) => void,
errors?: EntityErrorProducer
);
/**
 * Resets the decoder state so the instance can be reused for the next entity,
 * and selects the decoding mode for it.
 * @param decodeMode - Strictness mode for decoding
 */
startEntity(decodeMode: DecodingMode): void;
/**
 * Processes entity data incrementally. May be called repeatedly with
 * successive pieces of the same entity.
 *
 * NOTE(review): in the upstream `entities` implementation `offset` must point
 * at the first character AFTER the `&` on the first call (and be 0 on
 * continuation calls), and the returned count includes the `&`. Confirm
 * against the library version in use.
 *
 * @param input - String containing entity data
 * @param offset - Starting offset in the input string
 * @returns Number of characters consumed, or -1 if entity incomplete
 */
write(input: string, offset: number): number;
}
/**
 * Error callback interface for character reference validation.
 * An object providing these three methods can be passed as the optional
 * `errors` argument of the `EntityDecoder` constructor.
 */
interface EntityErrorProducer {
/** Called when an entity is missing a trailing semicolon. Receives no arguments. */
missingSemicolonAfterCharacterReference(): void;
/**
 * Called when a numeric entity has no digits.
 * @param consumedCharacters - Number of characters consumed before error
 */
absenceOfDigitsInNumericCharacterReference(consumedCharacters: number): void;
/**
 * Called to validate a numeric character reference value.
 * The callback returns nothing; implementations report problems on their own
 * (e.g. by logging or collecting them).
 * @param code - The numeric code point value
 */
validateNumericCharacterReference(code: number): void;
}
enum DecodingMode {
/** Entities in text nodes that can end with any character */
Legacy = 0,
/** Only allow entities terminated with a semicolon */
Strict = 1,
/** Entities in attributes have limitations on ending characters */
Attribute = 2
}
Usage Examples:
import { EntityDecoder, DecodingMode, htmlDecodeTree } from "entities";
// Custom entity processing with error handling
/**
 * Example error handler that reports character-reference problems on the console.
 * It provides the three callbacks of the EntityErrorProducer interface.
 */
class MyErrorHandler {
  /** Warns that an entity was not terminated by a semicolon. */
  missingSemicolonAfterCharacterReference() {
    console.warn("Missing semicolon in entity");
  }

  /**
   * Reports a numeric character reference that contained no digits.
   * @param consumed - Number of characters consumed before the error
   */
  absenceOfDigitsInNumericCharacterReference(consumed: number) {
    console.error(`No digits found after consuming ${consumed} characters`);
  }

  /**
   * Reports numeric references that cannot be represented as a character:
   * values above U+10FFFF and values inside the surrogate range
   * U+D800..U+DFFF (the same rule EntityErrorCollector applies).
   * @param code - The numeric code point value
   */
  validateNumericCharacterReference(code: number) {
    const aboveUnicodeRange = code > 0x10ffff;
    const isSurrogate = code >= 0xd800 && code <= 0xdfff;
    if (aboveUnicodeRange || isSurrogate) {
      console.error(`Invalid code point: ${code}`);
    }
  }
}
// Streaming decoder example
const results: string[] = [];
const decoder = new EntityDecoder(
  htmlDecodeTree,
  (codePoint) => {
    results.push(String.fromCodePoint(codePoint));
  },
  new MyErrorHandler()
);
// Process partial entity data: feed the entity "&amp;" in two pieces
decoder.startEntity(DecodingMode.Legacy);
// First call: the offset points at the character right after the "&"
let consumed = decoder.write("&am", 1); // Returns -1 (incomplete)
// Continuation call: offset 0; the returned total counts the leading "&"
consumed = decoder.write("p;", 0); // Returns 5, emits '&'
// Direct access to decoding trees and utility functions for custom implementations.
/**
 * HTML entity decode tree data.
 * Binary trie structure for fast HTML entity lookup.
 * This is the value the examples pass as the first argument of `EntityDecoder`.
 */
const htmlDecodeTree: Uint16Array;
/**
 * XML entity decode tree data.
 * Binary trie structure for fast XML entity lookup.
 * Same element type as `htmlDecodeTree` (Uint16Array); intended for XML-only content.
 */
const xmlDecodeTree: Uint16Array;
/**
 * Polyfill for String.fromCodePoint.
 * Creates a string from Unicode code points.
 * Performs no validation itself; the examples run values through
 * `replaceCodePoint` first when the input may be invalid.
 * @param codePoints - One or more Unicode code points
 * @returns String representation of the code points
 */
function fromCodePoint(...codePoints: number[]): string;
/**
 * Replaces invalid code points with the replacement character.
 * Handles surrogates and out-of-range values.
 * For instance, the usage example documents `replaceCodePoint(0x999999)`
 * as returning 0xFFFD.
 * @param codePoint - Unicode code point to validate
 * @returns Valid code point or replacement character (0xFFFD)
 */
function replaceCodePoint(codePoint: number): number;
/**
* @deprecated Use fromCodePoint(replaceCodePoint(codePoint)) instead
* Decodes a code point to string with validation
* @param codePoint - Code point to decode
* @returns String representation
*/
function decodeCodePoint(codePoint: number): string;
Usage Examples:
import {
htmlDecodeTree,
xmlDecodeTree,
fromCodePoint,
replaceCodePoint
} from "entities/decode";
// Advanced usage: inspect the raw decode tries directly
for (const trie of [htmlDecodeTree, xmlDecodeTree]) {
  console.log(trie.length); // Size of the HTML entity trie, then of the XML entity trie
}
// Sanitize a code point before converting it to a string
const safeCodePoint = replaceCodePoint(0x1F600); // 😀 emoji
const emoji = fromCodePoint(safeCodePoint);
// An out-of-range value is swapped for the replacement character
const invalidCodePoint = replaceCodePoint(0x999999); // Returns 0xFFFD
const replacement = fromCodePoint(invalidCodePoint); // Returns "�"
import { EntityDecoder, DecodingMode, htmlDecodeTree } from "entities";
/**
 * Minimal tokenizer example that decodes HTML entities embedded in text.
 * A single EntityDecoder is reused for every entity; its callback appends
 * each decoded code point to `currentEntity`.
 */
class HTMLTokenizer {
  private readonly entityDecoder: EntityDecoder;
  /** Characters emitted by the decoder for the entity currently being processed. */
  private currentEntity = '';

  constructor() {
    this.entityDecoder = new EntityDecoder(htmlDecodeTree, (codePoint) => {
      this.currentEntity += String.fromCodePoint(codePoint);
    });
  }

  /**
   * Returns `text` with every recognized entity replaced by its decoded characters.
   * An `&` that does not start a decodable entity is copied through unchanged.
   *
   * Limitation: an entity cut off by the end of `text` (write() returns -1) is
   * left undecoded, because this example never flushes the decoder.
   *
   * @param text - Input that may contain character references
   * @returns The decoded text
   */
  processText(text: string): string {
    let result = '';
    let i = 0;
    while (i < text.length) {
      if (text[i] !== '&') {
        result += text[i];
        i++;
        continue;
      }
      // Found potential entity
      this.entityDecoder.startEntity(DecodingMode.Legacy);
      this.currentEntity = '';
      // The upstream decoder expects the offset of the first character AFTER
      // the '&'. Passing the full string plus an offset also avoids copying
      // the remainder of the input for every entity.
      const consumed = this.entityDecoder.write(text, i + 1);
      if (consumed > 0) {
        // `consumed` counts the leading '&', so it is the full entity length.
        result += this.currentEntity;
        i += consumed;
      } else {
        // 0: not an entity; -1: input ended mid-entity. Keep the '&' literally.
        result += text[i];
        i++;
      }
    }
    return result;
  }
}
import { EntityDecoder, DecodingMode, htmlDecodeTree } from "entities";
/**
 * Example decoder for input that arrives in chunks.
 * Text and decoded entities are returned as soon as they are complete; an
 * entity split across a chunk boundary is held back until more input arrives.
 */
class StreamingDecoder {
  private readonly decoder: EntityDecoder;
  /** Unprocessed input; after each call it is empty or starts at an incomplete entity's '&'. */
  private buffer = '';
  /** Code points emitted by the decoder for the entity currently being decoded. */
  private results: string[] = [];

  constructor() {
    this.decoder = new EntityDecoder(htmlDecodeTree, (codePoint) => {
      this.results.push(String.fromCodePoint(codePoint));
    });
  }

  /**
   * Appends `chunk` to the pending input and returns everything that can be
   * emitted now, in order: plain text runs and decoded entity characters.
   *
   * Limitation: an entity that is still incomplete when the stream ends stays
   * buffered, because this example has no flush step.
   *
   * @param chunk - Next piece of the input stream
   * @returns Output fragments produced by this chunk
   */
  processChunk(chunk: string): string[] {
    this.buffer += chunk;
    const newResults: string[] = [];
    let position = 0;
    let entityStart = this.buffer.indexOf('&', position);
    while (entityStart !== -1) {
      // Add text before entity
      if (entityStart > position) {
        newResults.push(this.buffer.slice(position, entityStart));
      }
      // Try to decode entity; start from a clean per-entity result list so the
      // list does not grow for the lifetime of the stream.
      this.results = [];
      this.decoder.startEntity(DecodingMode.Legacy);
      // The upstream decoder expects the offset just past the '&'.
      const consumed = this.decoder.write(this.buffer, entityStart + 1);
      if (consumed < 0) {
        // Incomplete entity: keep it (from the '&' on) for the next chunk and
        // drop the text already emitted so it is not returned twice.
        this.buffer = this.buffer.slice(entityStart);
        return newResults;
      }
      if (consumed === 0) {
        // Not an entity: emit the '&' literally and continue after it.
        newResults.push('&');
        position = entityStart + 1;
      } else {
        // Entity was decoded; `consumed` includes the leading '&'.
        newResults.push(...this.results);
        position = entityStart + consumed;
      }
      entityStart = this.buffer.indexOf('&', position);
    }
    // No further '&': the remaining text cannot be part of an entity.
    if (position < this.buffer.length) {
      newResults.push(this.buffer.slice(position));
    }
    this.buffer = '';
    return newResults;
  }
}
import { EntityDecoder, DecodingMode, htmlDecodeTree, EntityErrorProducer } from "entities";
/**
 * Error producer that records every reported problem instead of logging it.
 * Each record carries a `type` tag and, where available, a `details` payload.
 */
class EntityErrorCollector implements EntityErrorProducer {
  /** All problems reported so far, in the order they were reported. */
  public errors: Array<{type: string, details?: any}> = [];

  /** Records an entity that lacked its terminating semicolon. */
  missingSemicolonAfterCharacterReference() {
    this.errors.push({ type: 'missing_semicolon' });
  }

  /**
   * Records a numeric reference without digits.
   * @param consumed - Number of characters consumed before the error
   */
  absenceOfDigitsInNumericCharacterReference(consumed: number) {
    const details = { consumed };
    this.errors.push({ type: 'no_digits', details });
  }

  /**
   * Records code points above U+10FFFF or inside the surrogate range.
   * @param code - The numeric code point value
   */
  validateNumericCharacterReference(code: number) {
    const tooLarge = code > 0x10FFFF;
    const isSurrogate = code >= 0xD800 && code <= 0xDFFF;
    if (!tooLarge && !isSurrogate) {
      return;
    }
    const details = { code };
    this.errors.push({ type: 'invalid_codepoint', details });
  }

  /**
   * Counts the recorded problems per type.
   * @returns Map from error type to number of occurrences
   */
  getErrorSummary() {
    const counts = {} as Record<string, number>;
    for (const { type } of this.errors) {
      counts[type] = (counts[type] || 0) + 1;
    }
    return counts;
  }
}
// Usage: attach the collector as the decoder's error producer
const errorCollector = new EntityErrorCollector();
const handleCodePoint = (cp: number): void => { /* handle decoded character */ };
const decoder = new EntityDecoder(htmlDecodeTree, handleCodePoint, errorCollector);
// After processing, check for errors
console.log(errorCollector.getErrorSummary());
- Use startEntity() to reset a decoder rather than creating new instances
- Use xmlDecodeTree for XML-only content
// Efficient reuse pattern
const decoder = new EntityDecoder(htmlDecodeTree, emitCodePoint);
for (const entityText of manyEntities) {
decoder.startEntity(DecodingMode.Legacy);
decoder.write(entityText, 0);
}
The EntityDecoder class powers the high-level decoding functions:
// These high-level functions use EntityDecoder internally:
import { decodeHTML, decodeXML } from "entities";
// Equivalent to using EntityDecoder with appropriate trees and settings
const htmlResult = decodeHTML(text);
const xmlResult = decodeXML(text);
Use EntityDecoder directly when you need:
Use high-level functions (decode, decodeHTML, etc.) for: