Lexer

The Lexer class tokenizes markdown text into a structured token tree. It handles block-level elements (paragraphs, headings, lists, etc.) and queues inline elements for processing.

Class: Lexer

class Lexer {
  constructor(options?: MarkedOptions);

  /**
   * Tokenize markdown string into tokens
   * Static method - creates new Lexer internally
   * @param src - Markdown source. Empty strings return empty token list.
   * @param options - Lexer options
   * @returns TokensList with all tokens and link references
   */
  static lex(src: string, options?: MarkedOptions): TokensList;

  /**
   * Tokenize inline markdown only
   * Static method - creates new Lexer internally
   * Does not process block-level elements
   * @param src - Inline markdown source
   * @param options - Lexer options
   * @returns Array of inline tokens (no TokensList wrapper)
   */
  static lexInline(src: string, options?: MarkedOptions): Token[];

  /**
   * Regular expression rules for tokenization
   * Rules vary based on options (GFM, pedantic, etc.)
   * Contains block and inline rule sets
   */
  static readonly rules: {
    block: BlockRules;
    inline: InlineRules;
  };

  /**
   * Tokenize markdown source (instance method)
   * Processes both block and inline tokens
   * @param src - Markdown string
   * @returns TokensList with tokens and link references
   */
  lex(src: string): TokensList;

  /**
   * Tokenize block-level markdown
   * Processes structural elements (headings, lists, paragraphs, etc.)
   * @param src - Markdown source
   * @param tokens - Token array to append to (optional, creates new if not provided)
   * @param lastParagraphClipped - Internal state flag for list parsing
   * @returns Array of tokens
   */
  blockTokens(src: string, tokens?: Token[], lastParagraphClipped?: boolean): Token[];

  /**
   * Queue inline tokens for processing
   * Queues rather than immediately processing for efficiency
   * @param src - Inline markdown source
   * @param tokens - Token array to populate
   * @returns Token array reference
   */
  inline(src: string, tokens?: Token[]): Token[];

  /**
   * Tokenize inline markdown
   * Processes inline elements (emphasis, links, code, etc.)
   * @param src - Inline markdown source
   * @param tokens - Token array to append to (optional)
   * @returns Array of inline tokens
   */
  inlineTokens(src: string, tokens?: Token[]): Token[];

  /**
   * Current token list being built
   * Includes link references in TokensList.links
   */
  tokens: TokensList;

  /**
   * Lexer options
   * Affects tokenization rules and behavior
   */
  options: MarkedOptions;

  /**
   * Current lexer state (managed automatically)
   * Used to handle context-dependent parsing
   */
  state: {
    /**
     * Whether currently inside a link (prevents nested links)
     * Links cannot contain other links per markdown spec
     */
    inLink: boolean;
    /**
     * Whether currently inside a raw HTML block
     * Affects how certain markdown is interpreted
     */
    inRawBlock: boolean;
    /**
     * Whether at top level (not in a nested block)
     * Affects what block-level elements are allowed
     */
    top: boolean;
  };

  /**
   * Queue of inline content to process
   * Used for deferred inline tokenization
   */
  inlineQueue: Array<{ src: string; tokens: Token[] }>;
}

Usage

Basic Lexing

import { Lexer } from "marked";

const markdown = `
# Hello World

This is a **paragraph** with inline elements.

- List item 1
- List item 2
`;

const tokens = Lexer.lex(markdown);

console.log(tokens);
// [
//   { type: 'heading', depth: 1, text: 'Hello World', tokens: [...], ... },
//   { type: 'space', raw: '\n', ... },
//   { type: 'paragraph', text: 'This is a **paragraph**...', tokens: [...], ... },
//   { type: 'list', items: [...], ordered: false, ... }
// ]

console.log(tokens.links);
// {} - plain object of link reference definitions (empty here)

Token List Properties:

  • tokens array contains all block-level tokens
  • tokens.links object contains link reference definitions
  • Empty input returns empty token list with empty links object

Inline Lexing

import { Lexer } from "marked";

const inline = Lexer.lexInline('This is **bold** and *italic*');

console.log(inline);
// [
//   { type: 'text', text: 'This is ', raw: 'This is ' },
//   { type: 'strong', text: 'bold', tokens: [...], raw: '**bold**' },
//   { type: 'text', text: ' and ', raw: ' and ' },
//   { type: 'em', text: 'italic', tokens: [...], raw: '*italic*' }
// ]

Inline vs Block:

  • lex(): Full processing (block + inline)
  • lexInline(): Only inline elements, no paragraphs or block structures
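
A minimal comparison of the two entry points on the same source (the exact token breakdown may vary by options):

import { Lexer } from "marked";

const src = 'Some **bold** text';

// lex(): the block pass wraps the content in a paragraph token,
// with the inline tokens nested inside it
const block = Lexer.lex(src);
console.log(block[0].type); // 'paragraph'
console.log(block[0].tokens.map(t => t.type)); // e.g. ['text', 'strong', 'text']

// lexInline(): no block pass, so the inline tokens are returned directly
const inline = Lexer.lexInline(src);
console.log(inline.map(t => t.type)); // e.g. ['text', 'strong', 'text']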

Instance Usage

import { Lexer } from "marked";

const lexer = new Lexer({
  gfm: true,
  breaks: true
});

const tokens = lexer.lex('# Markdown');

// Access state
console.log(lexer.state.top); // true - at document top level
console.log(lexer.state.inLink); // false - not inside link
console.log(lexer.tokens); // TokensList with tokens and links

Instance vs Static:

  • Static methods create new Lexer internally
  • Instance methods reuse existing Lexer (more efficient for multiple calls)
  • Instance maintains state across calls (usually reset between documents)
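
A small sketch of the two approaches; here a fresh Lexer is created per document so no tokens or state carry over between calls:

import { Lexer } from "marked";

// One-off parsing: the static method creates a throwaway Lexer internally
const once = Lexer.lex('# One document');

// Repeated parsing with shared options: a Lexer per document keeps
// tokens and state from one document out of the next
const options = { gfm: true, breaks: true };
const documents = ['# Doc one', '# Doc two'];
const results = documents.map((doc) => new Lexer(options).lex(doc));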

Custom Lexer with Extensions

import { marked, Lexer } from "marked";

// Extensions via marked.use() affect lexer automatically
marked.use({
  extensions: [{
    name: 'customBlock',
    level: 'block',
    tokenizer(src) {
      const match = src.match(/^@\[(\w+)\]\((.*?)\)/);
      if (match) {
        return {
          type: 'customBlock',
          raw: match[0],
          name: match[1],
          value: match[2]
        };
      }
    }
  }]
});

// Custom tokenizer is automatically used by lexer
const tokens = Lexer.lex('@[youtube](abc123)');
console.log(tokens[0].type); // 'customBlock'

Block Tokenization Process

The lexer processes block-level markdown in this specific order:

  1. Extension block tokenizers (if any)
  2. Space/newlines - Whitespace between blocks
  3. Indented code blocks - 4+ spaces indentation
  4. Fenced code blocks - ``` or ~~~
  5. Headings (ATX style) - # Heading
  6. Horizontal rules - ---, ***, ___
  7. Blockquotes - > quote
  8. Lists - Ordered and unordered
  9. HTML blocks - Block-level HTML
  10. Link definitions - [label]: url "title"
  11. Tables (GFM) - Pipe-delimited tables
  12. Headings (Setext style) - Underlined headings
  13. Paragraphs - Regular text blocks
  14. Text - Fallback for unmatched content

Order Matters: Earlier patterns take precedence. Custom extensions are checked first.

import { Lexer } from "marked";

const markdown = `
# ATX Heading
---
> Blockquote
- List item
`;

const lexer = new Lexer();
const tokens = lexer.blockTokens(markdown, []);

// Tokens will be in document order:
// 1. heading (ATX processed before HR)
// 2. hr
// 3. blockquote
// 4. list

Inline Tokenization Process

The lexer processes inline markdown in this order:

  1. Extension inline tokenizers (if any)
  2. Escape sequences - \*, \_, etc.
  3. HTML tags - <tag>, </tag>
  4. Links and images - [text](url), ![alt](url)
  5. Reference links - [text][ref]
  6. Emphasis and strong - *em*, **strong**
  7. Inline code - `code`
  8. Line breaks - Two spaces + newline
  9. Strikethrough (GFM) - ~~deleted~~
  10. Autolinks - <url>, <email>
  11. URL autolinks (GFM) - Raw URLs
  12. Text - Fallback for plain text

import { Lexer } from "marked";

const inline = '**bold** and *italic* and `code`';
const tokens = Lexer.lexInline(inline);

// Tokens processed in order:
// 1. strong (**bold**)
// 2. text ( and )
// 3. em (*italic*)
// 4. text ( and )
// 5. codespan (`code`)

Greedy Matching: Lexer uses greedy matching - longest match wins.

State Management

The lexer maintains state to handle context-dependent parsing:

import { Lexer } from "marked";

const lexer = new Lexer();

// Initial state
console.log(lexer.state);
// {
//   inLink: false,    // Prevents nested links
//   inRawBlock: false, // Affects HTML parsing
//   top: true          // Whether at document top level
// }

lexer.lex('# Heading\n\n[Link](url)');

// State is managed automatically during tokenization
// After processing, state is typically reset

State Purposes:

  • inLink: Prevents nested links (markdown spec violation)
  • inRawBlock: Affects HTML parsing in certain contexts
  • top: Some block elements only allowed at top level

State Management Example:

import { Lexer } from "marked";

const lexer = new Lexer();

// Process markdown with nested link attempt
const markdown = '[Outer [nested](url)](url)';
const tokens = lexer.lex(markdown);

// Nested link will be treated as text due to inLink state
// Output: [Outer [nested](url)](url) as single link with text "Outer [nested](url)"

Link References

Link reference definitions are stored separately from main tokens:

import { Lexer } from "marked";

const markdown = `
[Google]: https://google.com "Google Search"
[GitHub]: https://github.com

This is a [link][Google] and another [link][GitHub].
`;

const tokens = Lexer.lex(markdown);

console.log(tokens.links);
// {
//   'google': { href: 'https://google.com', title: 'Google Search' },
//   'github': { href: 'https://github.com', title: '' }
// }

// Link definitions don't appear in main token list
// They're used when resolving reference-style links

Link Reference Notes:

  • Keys are normalized to lowercase
  • Stored in TokensList.links object
  • Not included in main token array
  • Used by parser to resolve [text][ref] syntax

Link Reference Resolution:

import { Lexer } from "marked";

const markdown = `
[ref]: https://example.com "Example"

This is a [reference link][ref].
This is a [shortcut reference link].

[shortcut reference link]: https://example.org
`;

const tokens = Lexer.lex(markdown);

// tokens.links contains both definitions:
// 'ref' and 'shortcut reference link'

Inline Queue

The lexer queues inline content for deferred processing:

import { Lexer } from "marked";

const lexer = new Lexer();

// blockTokens() queues inline content
lexer.blockTokens('# Heading\n\nParagraph', lexer.tokens);

console.log(lexer.inlineQueue);
// [
//   { src: 'Heading', tokens: [...] },
//   { src: 'Paragraph', tokens: [...] }
// ]

// Process queued inline tokens
for (const item of lexer.inlineQueue) {
  lexer.inlineTokens(item.src, item.tokens);
}

Why Queue?

  • Performance: Process all block tokens first
  • Efficiency: Batch inline processing
  • State: Maintain proper context

Automatic Processing: The lex() method automatically processes the inline queue. Only relevant when using blockTokens() directly.

import { Lexer } from "marked";

// Using lex() - inline queue processed automatically
const tokens1 = Lexer.lex('# Heading');
// Inline tokens already processed

// Using blockTokens() - manual inline processing needed
const lexer = new Lexer();
const tokens2 = lexer.blockTokens('# Heading', []);
// Inline queue not yet processed
for (const item of lexer.inlineQueue) {
  lexer.inlineTokens(item.src, item.tokens);
}

Accessing Lexer Rules

import { Lexer } from "marked";

// Access tokenization rules
const rules = Lexer.rules;

console.log(rules.block); // Block-level regex rules
console.log(rules.inline); // Inline-level regex rules

// Example: heading rule
console.log(rules.block.heading);
// Regular expression used to match ATX headings (exact pattern depends on version and options)

// Rules vary based on options
const lexerGfm = new Lexer({ gfm: true });
const lexerPedantic = new Lexer({ pedantic: true });
// Each uses different rule set

Rule Sets:

  • Different rule sets for GFM vs CommonMark
  • Pedantic mode uses original markdown.pl rules
  • Rules are optimized regex patterns
  • Not typically modified directly (use extensions instead)
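
The effect of the different rule sets is easiest to see through output rather than by inspecting the regexes; for example, pipe tables are only recognized when GFM rules are active (a small illustration, assuming the default tokenizers are unmodified):

import { Lexer } from "marked";

const table = '| a | b |\n| - | - |\n| 1 | 2 |';

console.log(Lexer.lex(table, { gfm: true })[0].type);  // 'table'
console.log(Lexer.lex(table, { gfm: false })[0].type); // 'paragraph'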

Advanced Example

import { Lexer, Parser } from "marked";

const markdown = `# Title

See the [example site](https://example.com) for details.`;

// Custom two-pass processing
const lexer = new Lexer({ gfm: true });
const tokens = lexer.lex(markdown);

// First pass: Modify tokens before parsing
tokens.forEach(token => {
  if (token.type === 'heading') {
    token.depth += 1; // Demote all headings
  }
  // Link tokens are inline, so they sit inside block tokens such as paragraphs
  for (const inline of token.tokens ?? []) {
    if (inline.type === 'link' && inline.href.startsWith('http')) {
      // Mark external links; a custom renderer can read this flag
      // (the default renderer ignores unknown token properties)
      inline.rel = 'nofollow';
    }
  }
});

// Second pass: Parse modified tokens
const html = Parser.parse(tokens);

Advanced Use Cases:

  • Token inspection before rendering
  • Multi-pass processing
  • Token transformation
  • Custom output formats
  • Token statistics gathering
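
For example, token statistics can be gathered with a small recursive walk over the token tree (a sketch; exact counts depend on how marked splits text and space tokens):

import { Lexer } from "marked";

// Count how often each token type appears, descending into nested
// tokens and list items as well as top-level blocks
function countTokenTypes(tokens, counts = {}) {
  for (const token of tokens) {
    counts[token.type] = (counts[token.type] ?? 0) + 1;
    if (token.tokens) countTokenTypes(token.tokens, counts);
    if (token.items) countTokenTypes(token.items, counts);
  }
  return counts;
}

const tokens = Lexer.lex('# Title\n\nSome **bold** text\n\n- item one\n- item two');
console.log(countTokenTypes(tokens));
// e.g. { heading: 1, paragraph: 1, strong: 1, list: 1, list_item: 2, text: ..., ... }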

Error Handling

The lexer handles errors based on the silent option:

import { Lexer } from "marked";

// `malformedMarkdown` below stands in for input that triggers a tokenizer error

// Throw on error (default behavior)
const lexer1 = new Lexer({ silent: false });
try {
  lexer1.lex(malformedMarkdown);
} catch (err) {
  console.error('Lexer error:', err.message);
}

// Silent mode (logs error, continues with partial tokens)
const lexer2 = new Lexer({ silent: true });
const tokens = lexer2.lex(malformedMarkdown);
// Errors logged to console, partial tokens returned

Error Types:

  • Malformed tokens: Invalid syntax
  • Infinite loops: Regex matches zero-length (prevented internally)
  • Stack overflow: Deeply nested structures (rare)

Handling Errors:

import { Lexer } from "marked";

function safeLex(markdown, options = {}) {
  try {
    return Lexer.lex(markdown, { ...options, silent: false });
  } catch (err) {
    console.error('Lexing failed:', err);
    // Return a fallback token list containing a single error paragraph token
    return Object.assign([{
      type: 'paragraph',
      raw: markdown,
      text: 'Error: Failed to parse markdown',
      tokens: []
    }], { links: {} });
  }
}

Performance Considerations

  • The lexer uses regex-based tokenization for speed
  • It processes markdown in a single pass for block elements
  • Inline processing is deferred and batched for efficiency
  • No caching or blocking operations
  • Suitable for real-time parsing

import { Lexer } from "marked";

// Performance measurement
console.time('lex');
const tokens = Lexer.lex(largeMarkdownDocument);
console.timeEnd('lex');
// Typical: milliseconds for documents < 1MB

// For very large documents
const hugeMarkdown = '.'.repeat(10000000); // 10MB
console.time('huge-lex');
const hugeTokens = Lexer.lex(hugeMarkdown);
console.timeEnd('huge-lex');
// Still typically < 1 second

Performance Tips:

  • Reuse Lexer instance for multiple documents
  • Use static methods for one-off parsing
  • Profile with actual document sizes
  • Consider streaming for very large documents (requires custom implementation)

Memory Considerations:

import { Lexer } from "marked";

// Tokens are kept in memory
const markdown = 'x'.repeat(1000000); // 1MB of text
const tokens = Lexer.lex(markdown);

// Memory usage ≈ input size + token overhead
// For 1MB input, expect ~2-3MB total memory usage

Debugging Lexer Output

import { Lexer } from "marked";

const markdown = `
# Heading
**Bold** text
`;

const tokens = Lexer.lex(markdown);

// Pretty print tokens
console.log(JSON.stringify(tokens, null, 2));

// Inspect specific token
console.log('First token:', tokens[0]);
console.log('Type:', tokens[0].type);
console.log('Raw:', tokens[0].raw);
console.log('Depth:', tokens[0].depth);

// Inspect nested tokens
if (tokens[0].tokens) {
  console.log('Nested tokens:', tokens[0].tokens);
}

Custom Tokenization

import { marked, Lexer } from "marked";

// Add custom block-level syntax
marked.use({
  extensions: [{
    name: 'note',
    level: 'block',
    start(src) {
      // Not anchored: return the index of the next potential :::note block
      // so the lexer knows where paragraph text should stop
      return src.match(/:::note/)?.index;
    },
    tokenizer(src) {
      const match = src.match(/^:::note\n([\s\S]*?)\n:::/);
      if (match) {
        return {
          type: 'note',
          raw: match[0],
          text: match[1],
          tokens: this.lexer.blockTokens(match[1])
        };
      }
    }
  }]
});

// Now lexer recognizes :::note blocks
const tokens = Lexer.lex(':::note\nImportant\n:::');
console.log(tokens[0].type); // 'note'

Edge Cases

import { Lexer } from "marked";

// Empty input
const empty = Lexer.lex('');
console.log(empty); // []
console.log(empty.links); // {}

// Only whitespace
const whitespace = Lexer.lex('   \n  \n  ');
console.log(whitespace); // [{ type: 'space', raw: '   \n  \n  ' }]

// Only link definitions
const onlyRefs = Lexer.lex('[ref]: url');
console.log(onlyRefs); // []
console.log(onlyRefs.links); // { ref: { href: 'url', title: '' } }

// Deeply nested structures
const nested = '> '.repeat(100) + 'text';
const nestedTokens = Lexer.lex(nested); // Handles deep nesting

// Very long lines
const longLine = 'a'.repeat(100000);
const longTokens = Lexer.lex(longLine); // Handles long lines