Tokenizer

The Tokenizer class provides low-level methods for recognizing and creating tokens from markdown syntax patterns. It is used by the Lexer during tokenization.

Class: Tokenizer

class Tokenizer {
  constructor(options?: MarkedOptions);

  /**
   * Tokenizer options
   * Affects tokenization behavior
   */
  options: MarkedOptions;

  /**
   * Regular expression rules for tokenization (set by lexer)
   * Contains block and inline rule sets
   */
  rules: Rules;

  /**
   * Reference to the lexer instance (set by lexer)
   * Use for recursive tokenization of nested content
   */
  lexer: Lexer;

  // Block-level tokenizer methods

  /**
   * Tokenize whitespace/newlines
   * @param src - Source starting with potential space
   * @returns Space token or undefined if no match
   */
  space(src: string): Tokens.Space | undefined;

  /**
   * Tokenize indented code block (4+ spaces)
   * @param src - Source starting with potential indented code
   * @returns Code token or undefined if no match
   */
  code(src: string): Tokens.Code | undefined;

  /**
   * Tokenize fenced code block (``` or ~~~)
   * @param src - Source starting with potential fenced code
   * @returns Code token or undefined if no match
   */
  fences(src: string): Tokens.Code | undefined;

  /**
   * Tokenize ATX heading (# Heading)
   * @param src - Source starting with potential heading
   * @returns Heading token or undefined if no match
   */
  heading(src: string): Tokens.Heading | undefined;

  /**
   * Tokenize horizontal rule (---, ***, ___)
   * Requires at least 3 characters
   * @param src - Source starting with potential hr
   * @returns Hr token or undefined if no match
   */
  hr(src: string): Tokens.Hr | undefined;

  /**
   * Tokenize blockquote (> quote)
   * @param src - Source starting with potential blockquote
   * @returns Blockquote token or undefined if no match
   */
  blockquote(src: string): Tokens.Blockquote | undefined;

  /**
   * Tokenize list (ordered or unordered)
   * Handles nested lists and task lists
   * @param src - Source starting with potential list
   * @returns List token or undefined if no match
   */
  list(src: string): Tokens.List | undefined;

  /**
   * Tokenize block-level HTML
   * @param src - Source starting with potential HTML block
   * @returns HTML token or undefined if no match
   */
  html(src: string): Tokens.HTML | undefined;

  /**
   * Tokenize link definition ([label]: url "title")
   * @param src - Source starting with potential definition
   * @returns Def token or undefined if no match
   */
  def(src: string): Tokens.Def | undefined;

  /**
   * Tokenize table (GFM)
   * Requires header row and separator row
   * @param src - Source starting with potential table
   * @returns Table token or undefined if no match
   */
  table(src: string): Tokens.Table | undefined;

  /**
   * Tokenize setext heading (underlined heading)
   * Text followed by === or ---
   * @param src - Source starting with potential lheading
   * @returns Heading token or undefined if no match
   */
  lheading(src: string): Tokens.Heading | undefined;

  /**
   * Tokenize paragraph
   * @param src - Source starting with potential paragraph
   * @returns Paragraph token or undefined if no match
   */
  paragraph(src: string): Tokens.Paragraph | undefined;

  /**
   * Tokenize block-level text (fallback)
   * @param src - Source to tokenize as text
   * @returns Text token or undefined if no match
   */
  text(src: string): Tokens.Text | undefined;

  // Inline-level tokenizer methods

  /**
   * Tokenize escape sequence (\*)
   * @param src - Source starting with potential escape
   * @returns Escape token or undefined if no match
   */
  escape(src: string): Tokens.Escape | undefined;

  /**
   * Tokenize inline HTML tag
   * @param src - Source starting with potential tag
   * @returns Tag token or undefined if no match
   */
  tag(src: string): Tokens.Tag | undefined;

  /**
   * Tokenize link or image ([text](url) or ![alt](url))
   * @param src - Source starting with potential link/image
   * @returns Link or Image token or undefined if no match
   */
  link(src: string): Tokens.Link | Tokens.Image | undefined;

  /**
   * Tokenize reference link or image ([text][ref])
   * @param src - Source starting with potential reflink
   * @param links - Map of link references
   * @returns Link, Image, or Text token, or undefined if no match
   */
  reflink(src: string, links: Links): Tokens.Link | Tokens.Image | Tokens.Text | undefined;

  /**
   * Tokenize emphasis or strong (*em* or **strong**)
   * Handles complex nesting and precedence
   * @param src - Source starting with potential em/strong
   * @param maskedSrc - Source with masked regions (from emStrongMask hook)
   * @param prevChar - Previous character for context (affects delimiter rules)
   * @returns Em or Strong token or undefined if no match
   */
  emStrong(src: string, maskedSrc: string, prevChar: string): Tokens.Em | Tokens.Strong | undefined;

  /**
   * Tokenize inline code (`code`)
   * Supports multiple backticks
   * @param src - Source starting with potential codespan
   * @returns Codespan token or undefined if no match
   */
  codespan(src: string): Tokens.Codespan | undefined;

  /**
   * Tokenize line break (two spaces + \n or just \n with breaks: true)
   * @param src - Source starting with potential br
   * @returns Br token or undefined if no match
   */
  br(src: string): Tokens.Br | undefined;

  /**
   * Tokenize strikethrough (~~deleted~~) (GFM)
   * Requires gfm: true
   * @param src - Source starting with potential strikethrough
   * @returns Del token or undefined if no match
   */
  del(src: string): Tokens.Del | undefined;

  /**
   * Tokenize autolink (<url> or <email>)
   * @param src - Source starting with potential autolink
   * @returns Link token or undefined if no match
   */
  autolink(src: string): Tokens.Link | undefined;

  /**
   * Tokenize raw URL (GFM)
   * Requires gfm: true
   * @param src - Source starting with potential URL
   * @returns Link token or undefined if no match
   */
  url(src: string): Tokens.Link | undefined;

  /**
   * Tokenize inline text (fallback)
   * Matches any text that doesn't match other inline patterns
   * @param src - Source to tokenize as text
   * @returns Text token or undefined if no match
   */
  inlineText(src: string): Tokens.Text | undefined;
}

Usage

Direct Tokenizer Usage

import { Tokenizer, Lexer } from "marked";

const tokenizer = new Tokenizer();

// A tokenizer needs rules, options, and a lexer reference before its
// methods can be called. Constructing a Lexer with the tokenizer wires
// up all three automatically.
const lexer = new Lexer({ tokenizer });

// Tokenize specific element
const headingToken = tokenizer.heading('# Hello World');

console.log(headingToken);
// {
//   type: 'heading',
//   raw: '# Hello World',
//   depth: 1,
//   text: 'Hello World',
//   tokens: [...] // queued; filled in during a full lex pass
// }

Direct Usage Notes:

  • Rarely needed (use extensions instead)
  • Must configure rules and lexer reference
  • Useful for testing or custom workflows (see the test sketch below)
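
For example, a minimal test sketch using Node's built-in assert (the wiring mirrors the example above):

import assert from "node:assert";
import { Tokenizer, Lexer } from "marked";

const tokenizer = new Tokenizer();
new Lexer({ tokenizer }); // wires rules, options, and the lexer reference

// The hr tokenizer should recognize a thematic break
const token = tokenizer.hr('---\n');
assert.strictEqual(token?.type, 'hr');

// Non-matching input should yield undefined
assert.strictEqual(tokenizer.hr('not a rule'), undefined);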

Override Tokenizer Methods

import { marked, Tokenizer } from "marked";

const tokenizer = new Tokenizer();

// Override heading tokenization to add custom metadata
tokenizer.heading = function(src) {
  // Match a single heading line with optional {#id} syntax
  // (match up to a newline; a bare $ only matches at end of input)
  const match = src.match(/^(#{1,6})[ \t]+(.+?)(?:[ \t]+\{#([^}]+)\})?[ \t]*(?:\n|$)/);

  if (match) {
    const depth = match[1].length;
    const text = match[2];
    const customId = match[3];

    return {
      type: 'heading',
      raw: match[0],
      depth,
      text,
      tokens: this.lexer.inline(text),
      customId // Custom property
    };
  }
  // Return undefined if no match
};

marked.setOptions({ tokenizer });

// Now "# Title {#my-id}" will include customId in token
const html = marked.parse('# Title {#my-id}');

Override Best Practices:

  • Always return undefined when no match (not null or false)
  • Include raw property in returned token
  • Use this.lexer for nested tokenization
  • Match from start of source (use ^ regex anchor)

Custom Code Block Tokenizer

import { marked, Tokenizer } from "marked";

const tokenizer = new Tokenizer();

// Save the default implementation so ordinary fences still work
const originalFences = tokenizer.fences.bind(tokenizer);

// Add filename support to fenced code blocks
tokenizer.fences = function(src) {
  // Match ```lang [filename]
  const match = src.match(/^```(\w+)?(?:[ \t]+\[([^\]]+)\])?\n([\s\S]*?)\n```(?:\n|$)/);

  if (match) {
    return {
      type: 'code',
      raw: match[0],
      lang: match[1] || '',
      filename: match[2] || null, // Custom property
      text: match[3]
    };
  }
  // Fall back to the default fence handling
  return originalFences(src);
};

marked.setOptions({ tokenizer });

// Usage: ```js [example.js]
// Token will include filename property

Code Block Customization:

  • Add metadata such as filenames or line numbers (sketch below)
  • Support alternative fence characters
  • Extract code attributes
  • Custom language tags
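
As a sketch of line-number metadata, the same override pattern can capture a {1,3-5} suffix after the language (a hypothetical syntax; highlightLines is a custom property):

import { marked, Tokenizer } from "marked";

const tokenizer = new Tokenizer();
const originalFences = tokenizer.fences.bind(tokenizer);

tokenizer.fences = function(src) {
  // Match ```js {1,3-5}
  const match = src.match(/^```(\w+)?(?:[ \t]*\{([\d,\s-]+)\})?\n([\s\S]*?)\n```(?:\n|$)/);
  if (match) {
    return {
      type: 'code',
      raw: match[0],
      lang: match[1] || '',
      highlightLines: match[2] || null, // e.g. "1,3-5"
      text: match[3]
    };
  }
  return originalFences(src);
};

marked.setOptions({ tokenizer });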

Extend List Tokenizer

import { marked, Tokenizer } from "marked";

const tokenizer = new Tokenizer();

// Save original list tokenizer
const originalList = tokenizer.list.bind(tokenizer);

tokenizer.list = function(src) {
  // Try original tokenizer first
  const token = originalList(src);
  if (token) {
    return token;
  }

  // Try custom marker (e.g., ☐ for tasks)
  const match = src.match(/^(☐|☑)\s+(.+)/);
  if (match) {
    const checked = match[1] === '☑';
    const text = match[2];
    
    return {
      type: 'list',
      raw: match[0],
      ordered: false,
      start: '',
      loose: false,
      items: [{
        type: 'list_item',
        raw: match[0],
        task: true,
        checked: checked,
        loose: false,
        text: text,
        tokens: this.lexer.inline(text)
      }]
    };
  }
  
  return undefined;
};

marked.setOptions({ tokenizer });

// Now supports: ☐ Task item

List Customization:

  • Custom list markers
  • Nested list handling
  • Task list variants
  • Definition lists

Override Inline Tokenizers

import { marked, Tokenizer } from "marked";

const tokenizer = new Tokenizer();

// Add support for ==highlighted text==
const originalEmStrong = tokenizer.emStrong.bind(tokenizer);

tokenizer.emStrong = function(src, maskedSrc, prevChar) {
  // Check for highlight first
  const highlightMatch = src.match(/^==([^=]+)==/);
  if (highlightMatch) {
    return {
      type: 'highlight', // Custom token type
      raw: highlightMatch[0],
      text: highlightMatch[1],
      tokens: this.lexer.inlineTokens(highlightMatch[1])
    };
  }

  // Fall back to original em/strong handling
  return originalEmStrong(src, maskedSrc, prevChar);
};

// A custom token type needs its renderer registered as an extension;
// renderer method overrides only apply to marked's built-in token types
marked.setOptions({ tokenizer });

marked.use({
  extensions: [{
    name: 'highlight',
    renderer(token) {
      return `<mark>${this.parser.parseInline(token.tokens)}</mark>`;
    }
  }]
});

Inline Customization:

  • Custom emphasis markers
  • Additional formatting
  • Custom link syntax
  • Inline extensions

Math Expression Support

import { marked, Tokenizer } from "marked";

const tokenizer = new Tokenizer();

// Add support for inline math: $x^2$
const originalCodespan = tokenizer.codespan.bind(tokenizer);

tokenizer.codespan = function(src) {
  // Check for math expression first
  const mathMatch = src.match(/^\$([^$\n]+)\$/);
  if (mathMatch) {
    return {
      type: 'inlineMath',
      raw: mathMatch[0],
      text: mathMatch[1]
    };
  }

  // Fall back to regular codespan
  return originalCodespan(src);
};

// Add support for block math: $$...$$
const originalFences = tokenizer.fences.bind(tokenizer);

tokenizer.fences = function(src) {
  const mathMatch = src.match(/^\$\$\n([\s\S]+?)\n\$\$(?:\n|$)/);
  if (mathMatch) {
    return {
      type: 'blockMath',
      raw: mathMatch[0],
      text: mathMatch[1]
    };
  }

  // Fall back to regular fenced code handling
  return originalFences(src);
};

// Register the tokenizer; custom token types need extension renderers
marked.setOptions({ tokenizer });

marked.use({
  extensions: [
    {
      name: 'inlineMath',
      renderer(token) {
        return `<span class="math-inline">\\(${token.text}\\)</span>`;
      }
    },
    {
      name: 'blockMath',
      renderer(token) {
        return `<div class="math-block">\\[${token.text}\\]</div>\n`;
      }
    }
  ]
});

Math Support Patterns:

  • Inline math: $...$
  • Block math: $$...$$
  • LaTeX commands
  • MathJax/KaTeX integration (KaTeX sketch below)
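
For server-side rendering, the extension renderers above could call KaTeX directly (a sketch assuming the katex package is installed; renderToString returns HTML markup):

import katex from "katex";
import { marked } from "marked";

marked.use({
  extensions: [
    {
      name: 'inlineMath',
      renderer(token) {
        return katex.renderToString(token.text, { displayMode: false });
      }
    },
    {
      name: 'blockMath',
      renderer(token) {
        return katex.renderToString(token.text, { displayMode: true });
      }
    }
  ]
});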

Preserve Method Chain

When overriding tokenizers, you can chain to the default implementation:

import { marked, Tokenizer } from "marked";

const tokenizer = new Tokenizer();

// Save original method
const originalLink = tokenizer.link.bind(tokenizer);

tokenizer.link = function(src) {
  // Custom handling for wiki links: [[Page]]
  const wikiMatch = src.match(/^\[\[([^\]]+)\]\]/);
  if (wikiMatch) {
    const page = wikiMatch[1];
    const slug = page.toLowerCase().replace(/\s+/g, '-');
    
    return {
      type: 'link',
      raw: wikiMatch[0],
      href: `/wiki/${slug}`,
      title: null,
      text: page,
      tokens: this.lexer.inlineTokens(page)
    };
  }

  // Fall back to original for standard links
  return originalLink(src);
};

marked.setOptions({ tokenizer });

// Now supports: [[Wiki Page]]

Chaining Pattern:

  1. Save original method
  2. Try custom pattern first
  3. Fall back to original if no match
  4. Preserves default behavior

Context Access

Tokenizer methods have access to this.lexer for recursive tokenization:

import { Tokenizer } from "marked";

const tokenizer = new Tokenizer();

tokenizer.blockquote = function(src) {
  const match = src.match(/^> (.+)/);
  if (match) {
    const text = match[1];

    return {
      type: 'blockquote',
      raw: match[0],
      text,
      // Use lexer to recursively tokenize blockquote content
      tokens: this.lexer.blockTokens(text)
    };
  }
  return undefined;
};

Context Properties:

  • this.lexer: Lexer instance for nested tokenization
  • this.options: Current marked options
  • this.rules: Regular expression rules

Lexer Methods Available:

  • this.lexer.inline(src): Queue inline tokens (the array fills in at the end of a full lex pass)
  • this.lexer.inlineTokens(src): Process inline tokens immediately (difference sketched below)
  • this.lexer.blockTokens(src): Process block tokens
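
A quick sketch of queued versus immediate inline tokenization (wiring as in the earlier examples):

import { Tokenizer, Lexer } from "marked";

const tokenizer = new Tokenizer();
const lexer = new Lexer({ tokenizer });

// Immediate: tokens are produced right away
const now = lexer.inlineTokens('**bold**');
console.log(now[0].type); // 'strong'

// Queued: the returned array fills in only when a full lex() pass completes
const later = lexer.inline('**bold**');
console.log(later.length); // 0 for a direct call like this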

Conditional Tokenization

import { marked, Tokenizer } from "marked";

const tokenizer = new Tokenizer();

// Save the default list tokenizer for fallback
const originalList = tokenizer.list.bind(tokenizer);

// Only apply custom task-list parsing when GFM is enabled
tokenizer.list = function(src) {
  if (!this.options.gfm) {
    // GFM off: keep the default (non-GFM) handling
    return originalList(src);
  }

  // Custom GFM task list parsing...
  const match = src.match(/^- \[([ x])\] (.+)/);
  if (match) {
    const checked = match[1] === 'x';
    const text = match[2];

    return {
      type: 'list',
      raw: match[0],
      ordered: false,
      start: '',
      loose: false,
      items: [{
        type: 'list_item',
        raw: match[0],
        task: true,
        checked: checked,
        loose: false,
        text: text,
        tokens: this.lexer.inline(text)
      }]
    };
  }

  // Ordinary lists fall back to the default tokenizer
  return originalList(src);
};

marked.setOptions({ tokenizer, gfm: true });

Conditional Logic Uses:

  • Check this.options.gfm for GFM features
  • Check this.options.pedantic for strict mode
  • Check this.options.breaks for line break handling
  • Custom option checks (see the sketch below)
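
A sketch of a custom option check (allowMath is a hypothetical option; unrecognized options pass through setOptions and appear on this.options):

import { marked, Tokenizer } from "marked";

const tokenizer = new Tokenizer();
const originalCodespan = tokenizer.codespan.bind(tokenizer);

tokenizer.codespan = function(src) {
  if (this.options.allowMath) { // custom option check
    const match = src.match(/^\$([^$\n]+)\$/);
    if (match) {
      return { type: 'inlineMath', raw: match[0], text: match[1] };
    }
  }
  return originalCodespan(src);
};

marked.setOptions({ tokenizer, allowMath: true });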

Working with Rules

Tokenizers use regular expression rules stored in this.rules:

import { Tokenizer, Lexer } from "marked";

const tokenizer = new Tokenizer();
// Let a Lexer assign the rule set for its options; the static Lexer.rules
// exposes the raw normal/gfm/pedantic variants rather than a flat set
const lexer = new Lexer({ tokenizer, gfm: true });

// Access rules
console.log(tokenizer.rules.block.heading); // Regex for headings
console.log(tokenizer.rules.inline.link); // Regex for links

// Rules vary by options (GFM, pedantic, breaks)
// Example: GFM rules include table patterns
console.log(tokenizer.rules.block.table); // Table regex (GFM only)

Rule Structure:

  • rules.block: Block-level patterns
  • rules.inline: Inline-level patterns
  • Compiled regex patterns
  • Optimized for performance

Best Practices

Return undefined for No Match

tokenizer.heading = function(src) {
  const match = src.match(/^(#{1,6})\s+(.+)/);
  if (!match) {
    return undefined; // Explicit undefined (not null or false)
  }

  return {
    type: 'heading',
    raw: match[0],
    depth: match[1].length, // count the # characters
    text: match[2],
    tokens: this.lexer.inline(match[2])
  };
};

Return Value Rules:

  • Return token object if matched
  • Return undefined if no match
  • Never return null or false from tokenizer
  • (false is for renderer fallbacks only)

Include raw Property

Every token must include the raw property:

tokenizer.custom = function(src) {
  const match = src.match(/^@(\w+)/);
  if (match) {
    return {
      type: 'custom',
      raw: match[0], // REQUIRED: original matched text
      value: match[1]
    };
  }
  return undefined;
};

raw Property Importance:

  • Required for the lexer to advance its position (see the sketch below)
  • Must match exactly what was consumed
  • Used for error reporting
  • Enables token reconstruction
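
A simplified sketch of why raw matters: the lexer advances its cursor by exactly raw.length after each matched token (paraphrasing marked's internals):

import { Tokenizer, Lexer } from "marked";

const tokenizer = new Tokenizer();
new Lexer({ tokenizer }); // wire rules and lexer reference

let src = '# Title\nMore text';
const token = tokenizer.heading(src);
if (token) {
  // The lexer consumes exactly token.raw from the front of src;
  // too short and text is tokenized twice, too long and text is skipped
  src = src.substring(token.raw.length);
}
console.log(src); // 'More text'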

Use Lexer for Nested Content

tokenizer.blockquote = function(src) {
  const match = src.match(/^> (.+)/);
  if (match) {
    return {
      type: 'blockquote',
      raw: match[0],
      text: match[1],
      // Use lexer to parse nested markdown
      tokens: this.lexer.blockTokens(match[1])
    };
  }
  return undefined;
};

Nested Content Handling:

  • Block content: this.lexer.blockTokens()
  • Inline content: this.lexer.inlineTokens() or this.lexer.inline()
  • Preserves markdown parsing in nested content
  • Maintains consistent behavior

Avoid Greedy Matching

// Bad: greedy, matches too much
const match = src.match(/^(.+)/);

// Good: specific, matches what's needed
const match = src.match(/^[^\n]+/); // Match to end of line

// Better: precise pattern
const match = src.match(/^@\[(\w+)\]\(([^)]+)\)/); // Specific syntax

Matching Guidelines:

  • Use specific patterns
  • Avoid .+ or .* without constraints
  • Use anchors (^, $)
  • Test with edge cases

Handle Edge Cases

tokenizer.heading = function(src) {
  // Match heading, tolerating empty text and trailing whitespace
  // ([ \t] instead of \s so the pattern cannot swallow the newline)
  const match = src.match(/^(#{1,6})(?:[ \t]+(.+?))?[ \t]*(?:\n|$)/);
  if (match) {
    const depth = match[1].length;
    const text = match[2] || ''; // Handle empty headings
    
    return {
      type: 'heading',
      raw: match[0],
      depth,
      text,
      tokens: text ? this.lexer.inline(text) : []
    };
  }
  return undefined;
};

Common Edge Cases:

  • Empty content
  • Missing optional parts
  • Whitespace variations
  • End of input
  • Escaped characters

Tokenizer Return Values

  • Return a token object if the source matches
  • Return undefined if no match
  • Never return null or false from tokenizer methods
    • Use false only in renderer/extension overrides to fall back
    • Tokenizers should return undefined for no match

// Correct
tokenizer.custom = function(src) {
  const match = src.match(/^pattern/);
  if (match) {
    return { type: 'custom', raw: match[0], /* ... */ };
  }
  return undefined; // Correct
};

// Incorrect
tokenizer.custom = function(src) {
  const match = src.match(/^pattern/);
  if (match) {
    return { type: 'custom', raw: match[0], /* ... */ };
  }
  return false; // Wrong! Use undefined
};

Performance Considerations

  • Tokenizers are called frequently during parsing
  • Use efficient regular expressions
  • Avoid expensive operations in tokenizer methods
  • Consider caching compiled regexes

import { Tokenizer } from "marked";

// Cache regex outside method (compiled once)
const customRegex = /^@\[(\w+)\]\(([^)]+)\)/;

const tokenizer = new Tokenizer();

tokenizer.custom = function(src) {
  const match = customRegex.exec(src); // Reuse compiled regex
  if (match) {
    return {
      type: 'custom',
      raw: match[0],
      name: match[1],
      value: match[2]
    };
  }
  return undefined;
};

Performance Tips:

  • Compile regex once, reuse many times
  • Use start functions in extensions as an optimization hint (sketch below)
  • Avoid capturing groups if not needed
  • Test performance with large documents
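
A sketch of the start hint (the mention extension is hypothetical):

import { marked } from "marked";

marked.use({
  extensions: [{
    name: 'mention',
    level: 'inline',
    // start tells the lexer where a match might begin, so the
    // tokenizer regex only runs near candidate positions
    start(src) { return src.indexOf('@'); },
    tokenizer(src) {
      const match = /^@(\w+)/.exec(src);
      if (match) {
        return { type: 'mention', raw: match[0], user: match[1] };
      }
    },
    renderer(token) {
      return `<a href="/users/${token.user}">@${token.user}</a>`;
    }
  }]
});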

Advanced Examples

Custom Container Syntax

import { marked, Tokenizer } from "marked";

const tokenizer = new Tokenizer();

tokenizer.container = function(src) {
  // Match :::type ... ::: syntax
  const match = src.match(/^:::(\w+)\n([\s\S]*?)\n:::/);
  if (match) {
    return {
      type: 'container',
      raw: match[0],
      containerType: match[1],
      text: match[2],
      tokens: this.lexer.blockTokens(match[2])
    };
  }
  return undefined;
};

// Register as extension
marked.use({
  extensions: [{
    name: 'container',
    level: 'block',
    tokenizer: tokenizer.container,
    renderer(token) {
      const body = this.parser.parse(token.tokens);
      return `<div class="container-${token.containerType}">${body}</div>\n`;
    }
  }]
});
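
// Usage:
// :::warning
// Be careful!
// :::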

Definition List Support

import { marked, Tokenizer } from "marked";

const tokenizer = new Tokenizer();

tokenizer.deflist = function(src) {
  // Match term followed by : definition
  const match = src.match(/^(.+)\n:\s+(.+)/);
  if (match) {
    return {
      type: 'deflist',
      raw: match[0],
      term: match[1],
      definition: match[2],
      termTokens: this.lexer.inlineTokens(match[1]),
      defTokens: this.lexer.inlineTokens(match[2])
    };
  }
  return undefined;
};

marked.use({
  extensions: [{
    name: 'deflist',
    level: 'block',
    tokenizer: tokenizer.deflist,
    renderer(token) {
      const term = this.parser.parseInline(token.termTokens);
      const def = this.parser.parseInline(token.defTokens);
      return `<dl><dt>${term}</dt><dd>${def}</dd></dl>\n`;
    }
  }]
});
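
// Usage:
// Term
// : Definition text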

Footnote Tokenizer

import { marked, Tokenizer } from "marked";

const tokenizer = new Tokenizer();

// Footnote reference: [^1]
tokenizer.footnoteRef = function(src) {
  const match = src.match(/^\[\^(\w+)\]/);
  if (match) {
    return {
      type: 'footnoteRef',
      raw: match[0],
      id: match[1]
    };
  }
  return undefined;
};

// Footnote definition: [^1]: Text
tokenizer.footnoteDef = function(src) {
  const match = src.match(/^\[\^(\w+)\]:\s+(.+)/);
  if (match) {
    return {
      type: 'footnoteDef',
      raw: match[0],
      id: match[1],
      text: match[2]
    };
  }
  return undefined;
};

marked.use({
  extensions: [
    {
      name: 'footnoteRef',
      level: 'inline',
      start: (src) => src.indexOf('[^'),
      tokenizer: tokenizer.footnoteRef,
      renderer(token) {
        return `<sup><a href="#fn-${token.id}" id="fnref-${token.id}">${token.id}</a></sup>`;
      }
    },
    {
      name: 'footnoteDef',
      level: 'block',
      tokenizer: tokenizer.footnoteDef,
      renderer() {
        return ''; // Rendered separately in postprocess hook
      }
    }
  ]
});
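
// Usage:
// Here is a claim.[^1]
//
// [^1]: Supporting detail.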

Attribute Support

import { marked, Tokenizer } from "marked";

const tokenizer = new Tokenizer();

// Override heading to support {.class #id} attributes
const originalHeading = tokenizer.heading.bind(tokenizer);

tokenizer.heading = function(src) {
  const match = src.match(/^(#{1,6})[ \t]+(.+?)(?:[ \t]+\{([^}]+)\})?[ \t]*(?:\n|$)/);
  if (match) {
    const depth = match[1].length;
    const text = match[2];
    const attrs = match[3];
    
    // Parse attributes
    const classes = attrs ? [...attrs.matchAll(/\.([^\s#]+)/g)].map(m => m[1]) : [];
    const id = attrs ? attrs.match(/#([^\s.]+)/)?.[1] : null;
    
    return {
      type: 'heading',
      raw: match[0],
      depth,
      text,
      tokens: this.lexer.inline(text),
      classes,
      id
    };
  }
  
  // Fall back to original
  return originalHeading(src);
};

marked.setOptions({ tokenizer });

marked.use({
  renderer: {
    heading(token) {
      const text = this.parser.parseInline(token.tokens);
      const id = token.id ? ` id="${token.id}"` : '';
      const classes = token.classes?.length ? ` class="${token.classes.join(' ')}"` : '';
      return `<h${token.depth}${id}${classes}>${text}</h${token.depth}>\n`;
    }
  }
});

// Usage: # Heading {.my-class #my-id}