CtrlK
Blog · Docs · Log in · Get started
Tessl Logo

tessl/npm-parse5

HTML parser and serializer that is fully compliant with the WHATWG HTML Living Standard.

Pending

Quality

Pending

Does it follow best practices?

Impact

Pending

No eval scenarios have been run

Overview
Eval results
Files

docs/tokenization.md

HTML Tokenization

Low-level HTML tokenization functionality for advanced use cases that require direct access to the tokenization process. The tokenizer converts HTML text into a stream of tokens that the parser then processes into an AST.

Capabilities

Tokenizer Class

Core tokenizer class that processes HTML text into tokens.

/**
 * HTML tokenizer class for low-level token processing
 * @internal - Advanced API for specialized use cases
 */
class Tokenizer {
  /**
   * Creates a new tokenizer instance
   * @param options - Tokenizer configuration options
   * @param handler - Token handler for processing tokens
   */
  constructor(options: TokenizerOptions, handler: TokenHandler);

  /**
   * Write HTML text to the tokenizer for processing
   * @param chunk - HTML text chunk to tokenize
   * @param isLastChunk - Whether this is the final chunk
   */
  write(chunk: string, isLastChunk: boolean): void;

  /**
   * Insert HTML text at the current position
   * @param chunk - HTML text to insert
   */
  insertHtmlAtCurrentPos(chunk: string): void;

  /**
   * Start new named entity consumption
   * @param startCp - Starting code point
   * @param endCp - Ending code point  
   */
  startNamedEntityConsumption(startCp: number, endCp: number): void;

  /**
   * Emit current character as token
   */
  emitCurrentCharacter(): void;

  /**
   * Emit EOF token
   */
  emitEOFToken(): void;

  /**
   * Get current tokenizer state
   */
  get state(): State;

  /**
   * Set tokenizer state
   */
  set state(newState: State);
}

Tokenizer Options

Configuration options for the tokenizer.

/**
 * Tokenizer configuration options
 */
interface TokenizerOptions {
  /**
   * Enable source code location information tracking.
   * When enabled, tokens will include location data.
   * Defaults to false.
   */
  sourceCodeLocationInfo?: boolean;
}

Tokenizer Modes

Constants defining different tokenizer parsing modes based on context.

/**
 * Tokenizer mode constants for different parsing contexts
 */
const TokenizerMode: {
  readonly DATA: State.DATA;
  readonly RCDATA: State.RCDATA;
  readonly RAWTEXT: State.RAWTEXT;
  readonly SCRIPT_DATA: State.SCRIPT_DATA;
  readonly PLAINTEXT: State.PLAINTEXT;
  readonly CDATA_SECTION: State.CDATA_SECTION;
};

/**
 * Internal tokenizer states (used by TokenizerMode)
 */
enum State {
  DATA = 0,
  RCDATA = 1,
  RAWTEXT = 2,
  SCRIPT_DATA = 3,
  PLAINTEXT = 4,
  CDATA_SECTION = 5,
  // ... additional internal states
}

Usage Examples:

import { Tokenizer, TokenizerMode, type TokenizerOptions, type TokenHandler } from "parse5";

// Create token handler
const handler: TokenHandler = {
  onComment: (token) => console.log('Comment:', token.data),
  onDoctype: (token) => console.log('DOCTYPE:', token.name),
  onStartTag: (token) => console.log('Start tag:', token.tagName),
  onEndTag: (token) => console.log('End tag:', token.tagName),
  onEof: (token) => console.log('EOF reached'),
  onCharacter: (token) => console.log('Character:', token.chars),
  onNullCharacter: (token) => console.log('Null character'),
  onWhitespaceCharacter: (token) => console.log('Whitespace:', token.chars)
};

// Create tokenizer with location tracking
const tokenizer = new Tokenizer({ sourceCodeLocationInfo: true }, handler);

// Process HTML text
tokenizer.write('<div>Hello <span>World</span></div>', true);

// Set specific tokenizer mode for different contexts
tokenizer.state = TokenizerMode.SCRIPT_DATA; // For script content
tokenizer.state = TokenizerMode.RAWTEXT;     // For style/title content

Token Handler Interface

Interface for handling tokens emitted by the tokenizer.

/**
 * Token handler interface for processing tokenizer output
 */
interface TokenHandler {
  /**
   * Handle comment tokens
   * @param token - Comment token
   */
  onComment(token: CommentToken): void;

  /**
   * Handle DOCTYPE tokens
   * @param token - DOCTYPE token
   */
  onDoctype(token: DoctypeToken): void;

  /**
   * Handle start tag tokens
   * @param token - Start tag token
   */
  onStartTag(token: TagToken): void;

  /**
   * Handle end tag tokens
   * @param token - End tag token
   */
  onEndTag(token: TagToken): void;

  /**
   * Handle end of file tokens
   * @param token - EOF token
   */
  onEof(token: EOFToken): void;

  /**
   * Handle character tokens
   * @param token - Character token
   */
  onCharacter(token: CharacterToken): void;

  /**
   * Handle null character tokens
   * @param token - Null character token
   */
  onNullCharacter(token: CharacterToken): void;

  /**
   * Handle whitespace character tokens
   * @param token - Whitespace character token
   */
  onWhitespaceCharacter(token: CharacterToken): void;

  /**
   * Optional error handler for parsing errors
   * @param error - Parser error information
   */
  onParseError?: ParserErrorHandler | null;
}

Token Types

Token Base Interface

Base interface shared by all token types.

/**
 * Base interface for all token types
 */
interface TokenBase {
  /** Location information if sourceCodeLocationInfo is enabled */
  location?: Location;
}

/**
 * Union type of all token types
 */
type Token = DoctypeToken | TagToken | CommentToken | EOFToken | CharacterToken;

Tag Tokens

Tokens representing HTML tags (both start and end tags).

/**
 * Tag token representing HTML start and end tags
 */
interface TagToken extends TokenBase {
  /** Tag name (e.g., 'div', 'span') */
  tagName: string;
  
  /** Tag ID for efficient comparison */
  tagID: TAG_ID;
  
  /** Whether this is a self-closing tag */
  selfClosing: boolean;
  
  /** Acknowledgment flag for self-closing */
  ackSelfClosing: boolean;
  
  /** Tag attributes */
  attrs: Attribute[];
  
  /** Location info for attributes if enabled */
  location?: LocationWithAttributes;
}

/**
 * Attribute interface
 */
interface Attribute {
  /** Attribute name */
  name: string;
  
  /** Attribute value */
  value: string;
  
  /** Namespace URI if applicable */
  namespace?: string;
  
  /** Namespace prefix if applicable */
  prefix?: string;
}

Character Tokens

Tokens representing text content and character data.

/**
 * Character token representing text content
 */
interface CharacterToken extends TokenBase {
  /** Character data */
  chars: string;
  
  /** Location info if enabled */
  location?: Location;
}

Comment Tokens

Tokens representing HTML comments.

/**
 * Comment token representing HTML comments
 */
interface CommentToken extends TokenBase {
  /** Comment text content */
  data: string;
  
  /** Location info if enabled */
  location?: Location;
}

DOCTYPE Tokens

Tokens representing HTML DOCTYPE declarations.

/**
 * DOCTYPE token representing document type declarations
 */
interface DoctypeToken extends TokenBase {
  /** DOCTYPE name (usually 'html') */
  name: string | null;
  
  /** Public identifier */
  publicId: string | null;
  
  /** System identifier */
  systemId: string | null;
  
  /** Whether the DOCTYPE is force-quirks */
  forceQuirks: boolean;
  
  /** Location info if enabled */
  location?: Location;
}

EOF Tokens

Tokens representing end of file.

/**
 * EOF token representing end of file
 */
interface EOFToken extends TokenBase {
  /** Location info if enabled */
  location?: Location;
}

Token Utilities

Utility functions for working with tokens.

/**
 * Get attribute value from tag token
 * @param token - Tag token to search
 * @param attrName - Attribute name to find
 * @returns Attribute value or null if not found
 */
function getTokenAttr(token: TagToken, attrName: string): string | null;

/**
 * Token type enumeration
 */
enum TokenType {
  CHARACTER = 0,
  NULL_CHARACTER = 1,
  WHITESPACE_CHARACTER = 2,
  START_TAG = 3,
  END_TAG = 4,
  COMMENT = 5,
  DOCTYPE = 6,
  EOF = 7,
  HIBERNATION = 8
}

Usage Examples:

import { Token, type TagToken, type CharacterToken, type CommentToken } from "parse5";

// Check token attribute
const tagToken: TagToken = /* ... */;
const className = Token.getTokenAttr(tagToken, 'class');
if (className) {
  console.log('Class name:', className);
}

// Handle different token types
function processToken(token: Token.Token) {
  switch (token.type) {
    case Token.TokenType.START_TAG:
      console.log('Start tag:', (token as TagToken).tagName);
      break;
    case Token.TokenType.CHARACTER:
      console.log('Text:', (token as CharacterToken).chars);
      break;
    case Token.TokenType.COMMENT:
      console.log('Comment:', (token as CommentToken).data);
      break;
  }
}

Advanced Tokenization Patterns

Custom Token Processing

import { Tokenizer, type TokenHandler, type TagToken, type CommentToken, type DoctypeToken, type CharacterToken } from "parse5";

class CustomTokenProcessor implements TokenHandler {
  private tagStack: string[] = [];

  onStartTag(token: TagToken): void {
    this.tagStack.push(token.tagName);
    console.log(`Entering tag: ${token.tagName}, depth: ${this.tagStack.length}`);
    
    // Process attributes
    token.attrs.forEach(attr => {
      console.log(`  Attribute: ${attr.name}="${attr.value}"`);
    });
  }

  onEndTag(token: TagToken): void {
    const expectedTag = this.tagStack.pop();
    if (expectedTag !== token.tagName) {
      console.warn(`Mismatched tags: expected ${expectedTag}, got ${token.tagName}`);
    }
    console.log(`Exiting tag: ${token.tagName}, depth: ${this.tagStack.length}`);
  }

  onComment(token: CommentToken): void {
    console.log(`Comment: ${token.data}`);
  }

  onDoctype(token: DoctypeToken): void {
    console.log(`DOCTYPE: ${token.name}`);
  }

  onEof(): void {
    console.log('End of file reached');
  }

  onCharacter(token: CharacterToken): void {
    const trimmed = token.chars.trim();
    if (trimmed) {
      console.log(`Text content: ${trimmed}`);
    }
  }

  onNullCharacter(): void {
    console.warn('Null character encountered');
  }

  onWhitespaceCharacter(): void {
    // Usually ignore whitespace
  }
}

// Use custom processor
const processor = new CustomTokenProcessor();
const tokenizer = new Tokenizer({ sourceCodeLocationInfo: false }, processor);
tokenizer.write('<html><body>Hello World!</body></html>', true);

Location-Aware Tokenization

import { Tokenizer, type TokenHandler, type TagToken, type Location } from "parse5";

class LocationAwareHandler implements TokenHandler {
  private html: string;

  constructor(html: string) {
    this.html = html;
  }

  private getSourceSnippet(location: Location): string {
    return this.html.substring(location.startOffset, location.endOffset);
  }

  onStartTag(token: TagToken): void {
    if (token.location) {
      const snippet = this.getSourceSnippet(token.location);
      console.log(`Start tag at line ${token.location.startLine}: ${snippet}`);
      
      // Show attribute locations
      if (token.location.attrs) {
        Object.entries(token.location.attrs).forEach(([name, attrLocation]) => {
          const attrSnippet = this.getSourceSnippet(attrLocation);
          console.log(`  Attribute ${name} at line ${attrLocation.startLine}: ${attrSnippet}`);
        });
      }
    }
  }

  // ... implement other methods with location awareness
}

Install with Tessl CLI

npx tessl i tessl/npm-parse5

docs

error-handling.md

html-utilities.md

index.md

parsing.md

serialization.md

tokenization.md

tree-adapters.md

tile.json