CtrlK
Blog · Docs · Log in · Get started
Tessl Logo

tessl/npm-parse5

HTML parser and serializer that is fully compliant with the WHATWG HTML Living Standard.

Pending

Quality

Pending

Does it follow best practices?

Impact

Pending

No eval scenarios have been run

Overview
Eval results
Files

docs/tokenization.md

HTML Tokenization

Low-level HTML tokenization functionality for advanced use cases that require direct access to the tokenization process. The tokenizer converts HTML text into a stream of tokens that the parser then processes into an AST.

Capabilities

Tokenizer Class

Core tokenizer class that processes HTML text into tokens.

/**
 * HTML tokenizer class for low-level token processing
 * @internal - Advanced API for specialized use cases
 */
class Tokenizer {
  /**
   * Creates a new tokenizer instance
   * @param options - Tokenizer configuration options
   * @param handler - Token handler for processing tokens
   */
  constructor(options: TokenizerOptions, handler: TokenHandler);

  /**
   * Write HTML text to the tokenizer for processing
   * @param chunk - HTML text chunk to tokenize
   * @param isLastChunk - Whether this is the final chunk
   */
  write(chunk: string, isLastChunk: boolean): void;

  /**
   * Insert HTML text at the current position
   * @param chunk - HTML text to insert
   */
  insertHtmlAtCurrentPos(chunk: string): void;

  /**
   * Start new named entity consumption
   * @param startCp - Starting code point
   * @param endCp - Ending code point  
   */
  startNamedEntityConsumption(startCp: number, endCp: number): void;

  /**
   * Emit current character as token
   */
  emitCurrentCharacter(): void;

  /**
   * Emit EOF token
   */
  emitEOFToken(): void;

  /**
   * Get current tokenizer state
   */
  get state(): State;

  /**
   * Set tokenizer state
   */
  set state(newState: State);
}

Tokenizer Options

Configuration options for the tokenizer.

/**
 * Tokenizer configuration options
 */
interface TokenizerOptions {
  /**
   * Enable source code location information tracking.
   * When enabled, tokens will include location data.
   * Defaults to false.
   */
  sourceCodeLocationInfo?: boolean;
}

Tokenizer Modes

Constants defining different tokenizer parsing modes based on context.

/**
 * Tokenizer mode constants for different parsing contexts
 */
const TokenizerMode: {
  readonly DATA: State.DATA;
  readonly RCDATA: State.RCDATA;
  readonly RAWTEXT: State.RAWTEXT;
  readonly SCRIPT_DATA: State.SCRIPT_DATA;
  readonly PLAINTEXT: State.PLAINTEXT;
  readonly CDATA_SECTION: State.CDATA_SECTION;
};

/**
 * Internal tokenizer states (used by TokenizerMode)
 */
enum State {
  DATA = 0,
  RCDATA = 1,
  RAWTEXT = 2,
  SCRIPT_DATA = 3,
  PLAINTEXT = 4,
  CDATA_SECTION = 5,
  // ... additional internal states
}

Usage Examples:

import { Tokenizer, TokenizerMode, type TokenizerOptions, type TokenHandler } from "parse5";

// Create token handler
const handler: TokenHandler = {
  onComment: (token) => console.log('Comment:', token.data),
  onDoctype: (token) => console.log('DOCTYPE:', token.name),
  onStartTag: (token) => console.log('Start tag:', token.tagName),
  onEndTag: (token) => console.log('End tag:', token.tagName),
  onEof: (token) => console.log('EOF reached'),
  onCharacter: (token) => console.log('Character:', token.chars),
  onNullCharacter: (token) => console.log('Null character'),
  onWhitespaceCharacter: (token) => console.log('Whitespace:', token.chars)
};

// Create tokenizer with location tracking
const tokenizer = new Tokenizer({ sourceCodeLocationInfo: true }, handler);

// Process HTML text
tokenizer.write('<div>Hello <span>World</span></div>', true);

// Set specific tokenizer mode for different contexts
tokenizer.state = TokenizerMode.SCRIPT_DATA; // For script content
tokenizer.state = TokenizerMode.RAWTEXT;     // For style/title content

Token Handler Interface

Interface for handling tokens emitted by the tokenizer.

/**
 * Token handler interface for processing tokenizer output
 */
interface TokenHandler {
  /**
   * Handle comment tokens
   * @param token - Comment token
   */
  onComment(token: CommentToken): void;

  /**
   * Handle DOCTYPE tokens
   * @param token - DOCTYPE token
   */
  onDoctype(token: DoctypeToken): void;

  /**
   * Handle start tag tokens
   * @param token - Start tag token
   */
  onStartTag(token: TagToken): void;

  /**
   * Handle end tag tokens
   * @param token - End tag token
   */
  onEndTag(token: TagToken): void;

  /**
   * Handle end of file tokens
   * @param token - EOF token
   */
  onEof(token: EOFToken): void;

  /**
   * Handle character tokens
   * @param token - Character token
   */
  onCharacter(token: CharacterToken): void;

  /**
   * Handle null character tokens
   * @param token - Null character token
   */
  onNullCharacter(token: CharacterToken): void;

  /**
   * Handle whitespace character tokens
   * @param token - Whitespace character token
   */
  onWhitespaceCharacter(token: CharacterToken): void;

  /**
   * Optional error handler for parsing errors
   * @param error - Parser error information
   */
  onParseError?: ParserErrorHandler | null;
}

Token Types

Token Base Interface

Base interface shared by all token types.

/**
 * Base interface for all token types
 */
interface TokenBase {
  /** Location information if sourceCodeLocationInfo is enabled */
  location?: Location;
}

/**
 * Union type of all token types
 */
type Token = DoctypeToken | TagToken | CommentToken | EOFToken | CharacterToken;

Tag Tokens

Tokens representing HTML tags (both start and end tags).

/**
 * Tag token representing HTML start and end tags
 */
interface TagToken extends TokenBase {
  /** Tag name (e.g., 'div', 'span') */
  tagName: string;
  
  /** Tag ID for efficient comparison */
  tagID: TAG_ID;
  
  /** Whether this is a self-closing tag */
  selfClosing: boolean;
  
  /** Acknowledgment flag for self-closing */
  ackSelfClosing: boolean;
  
  /** Tag attributes */
  attrs: Attribute[];
  
  /** Location info for attributes if enabled */
  location?: LocationWithAttributes;
}

/**
 * Attribute interface
 */
interface Attribute {
  /** Attribute name */
  name: string;
  
  /** Attribute value */
  value: string;
  
  /** Namespace URI if applicable */
  namespace?: string;
  
  /** Namespace prefix if applicable */
  prefix?: string;
}

Character Tokens

Tokens representing text content and character data.

/**
 * Character token representing text content
 */
interface CharacterToken extends TokenBase {
  /** Character data */
  chars: string;
  
  /** Location info if enabled */
  location?: Location;
}

Comment Tokens

Tokens representing HTML comments.

/**
 * Comment token representing HTML comments
 */
interface CommentToken extends TokenBase {
  /** Comment text content */
  data: string;
  
  /** Location info if enabled */
  location?: Location;
}

DOCTYPE Tokens

Tokens representing HTML DOCTYPE declarations.

/**
 * DOCTYPE token representing document type declarations
 */
interface DoctypeToken extends TokenBase {
  /** DOCTYPE name (usually 'html') */
  name: string | null;
  
  /** Public identifier */
  publicId: string | null;
  
  /** System identifier */
  systemId: string | null;
  
  /** Whether the DOCTYPE is force-quirks */
  forceQuirks: boolean;
  
  /** Location info if enabled */
  location?: Location;
}

EOF Tokens

Tokens representing end of file.

/**
 * EOF token representing end of file
 */
interface EOFToken extends TokenBase {
  /** Location info if enabled */
  location?: Location;
}

Token Utilities

Utility functions for working with tokens.

/**
 * Get attribute value from tag token
 * @param token - Tag token to search
 * @param attrName - Attribute name to find
 * @returns Attribute value or null if not found
 */
function getTokenAttr(token: TagToken, attrName: string): string | null;

/**
 * Token type enumeration
 */
enum TokenType {
  CHARACTER = 0,
  NULL_CHARACTER = 1,
  WHITESPACE_CHARACTER = 2,
  START_TAG = 3,
  END_TAG = 4,
  COMMENT = 5,
  DOCTYPE = 6,
  EOF = 7,
  HIBERNATION = 8
}

Usage Examples:

import { Token, type TagToken, type CharacterToken, type CommentToken } from "parse5";

// Check token attribute
const tagToken: TagToken = /* ... */;
const className = Token.getTokenAttr(tagToken, 'class');
if (className) {
  console.log('Class name:', className);
}

// Handle different token types
function processToken(token: Token.Token) {
  switch (token.type) {
    case Token.TokenType.START_TAG:
      console.log('Start tag:', (token as TagToken).tagName);
      break;
    case Token.TokenType.CHARACTER:
      console.log('Text:', (token as CharacterToken).chars);
      break;
    case Token.TokenType.COMMENT:
      console.log('Comment:', (token as CommentToken).data);
      break;
  }
}

Advanced Tokenization Patterns

Custom Token Processing

import { Tokenizer, type TokenHandler, type TagToken, type CommentToken, type DoctypeToken, type CharacterToken } from "parse5";

class CustomTokenProcessor implements TokenHandler {
  private tagStack: string[] = [];

  onStartTag(token: TagToken): void {
    this.tagStack.push(token.tagName);
    console.log(`Entering tag: ${token.tagName}, depth: ${this.tagStack.length}`);
    
    // Process attributes
    token.attrs.forEach(attr => {
      console.log(`  Attribute: ${attr.name}="${attr.value}"`);
    });
  }

  onEndTag(token: TagToken): void {
    const expectedTag = this.tagStack.pop();
    if (expectedTag !== token.tagName) {
      console.warn(`Mismatched tags: expected ${expectedTag}, got ${token.tagName}`);
    }
    console.log(`Exiting tag: ${token.tagName}, depth: ${this.tagStack.length}`);
  }

  onComment(token: CommentToken): void {
    console.log(`Comment: ${token.data}`);
  }

  onDoctype(token: DoctypeToken): void {
    console.log(`DOCTYPE: ${token.name}`);
  }

  onEof(): void {
    console.log('End of file reached');
  }

  onCharacter(token: CharacterToken): void {
    const trimmed = token.chars.trim();
    if (trimmed) {
      console.log(`Text content: ${trimmed}`);
    }
  }

  onNullCharacter(): void {
    console.warn('Null character encountered');
  }

  onWhitespaceCharacter(): void {
    // Usually ignore whitespace
  }
}

// Use custom processor
const processor = new CustomTokenProcessor();
const tokenizer = new Tokenizer({ sourceCodeLocationInfo: false }, processor);
tokenizer.write('<html><body>Hello World!</body></html>', true);

Location-Aware Tokenization

import { Tokenizer, type TokenHandler, type TagToken, type Location } from "parse5";

class LocationAwareHandler implements TokenHandler {
  private html: string;

  constructor(html: string) {
    this.html = html;
  }

  private getSourceSnippet(location: Location): string {
    return this.html.substring(location.startOffset, location.endOffset);
  }

  onStartTag(token: TagToken): void {
    if (token.location) {
      const snippet = this.getSourceSnippet(token.location);
      console.log(`Start tag at line ${token.location.startLine}: ${snippet}`);
      
      // Show attribute locations
      if (token.location.attrs) {
        Object.entries(token.location.attrs).forEach(([name, attrLocation]) => {
          const attrSnippet = this.getSourceSnippet(attrLocation);
          console.log(`  Attribute ${name} at line ${attrLocation.startLine}: ${attrSnippet}`);
        });
      }
    }
  }

  // ... implement other methods with location awareness
}

Install with Tessl CLI

npx tessl i tessl/npm-parse5

docs

error-handling.md

html-utilities.md

index.md

parsing.md

serialization.md

tokenization.md

tree-adapters.md

tile.json