```
tessl install tessl/npm-marked@17.0.0
```

A markdown parser built for speed.
The Lexer class tokenizes markdown text into a structured token tree. It handles block-level elements (paragraphs, headings, lists, etc.) and queues inline elements for processing.
```typescript
class Lexer {
  constructor(options?: MarkedOptions);

  /**
   * Tokenize markdown string into tokens
   * Static method - creates a new Lexer internally
   * @param src - Markdown source. Empty strings return an empty token list.
   * @param options - Lexer options
   * @returns TokensList with all tokens and link references
   */
  static lex(src: string, options?: MarkedOptions): TokensList;

  /**
   * Tokenize inline markdown only
   * Static method - creates a new Lexer internally
   * Does not process block-level elements
   * @param src - Inline markdown source
   * @param options - Lexer options
   * @returns Array of inline tokens (no TokensList wrapper)
   */
  static lexInline(src: string, options?: MarkedOptions): Token[];

  /**
   * Regular expression rules for tokenization
   * Rules vary based on options (GFM, pedantic, etc.)
   * Contains block and inline rule sets
   */
  static readonly rules: {
    block: BlockRules;
    inline: InlineRules;
  };

  /**
   * Tokenize markdown source (instance method)
   * Processes both block and inline tokens
   * @param src - Markdown string
   * @returns TokensList with tokens and link references
   */
  lex(src: string): TokensList;

  /**
   * Tokenize block-level markdown
   * Processes structural elements (headings, lists, paragraphs, etc.)
   * @param src - Markdown source
   * @param tokens - Token array to append to (optional; creates a new one if not provided)
   * @param lastParagraphClipped - Internal state flag for list parsing
   * @returns Array of tokens
   */
  blockTokens(src: string, tokens?: Token[], lastParagraphClipped?: boolean): Token[];

  /**
   * Queue inline tokens for processing
   * Queues rather than immediately processing for efficiency
   * @param src - Inline markdown source
   * @param tokens - Token array to populate
   * @returns Token array reference
   */
  inline(src: string, tokens?: Token[]): Token[];

  /**
   * Tokenize inline markdown
   * Processes inline elements (emphasis, links, code, etc.)
   * @param src - Inline markdown source
   * @param tokens - Token array to append to (optional)
   * @returns Array of inline tokens
   */
  inlineTokens(src: string, tokens?: Token[]): Token[];

  /**
   * Current token list being built
   * Includes link references in TokensList.links
   */
  tokens: TokensList;

  /**
   * Lexer options
   * Affects tokenization rules and behavior
   */
  options: MarkedOptions;

  /**
   * Current lexer state (managed automatically)
   * Used to handle context-dependent parsing
   */
  state: {
    /** Whether currently inside a link; links cannot contain other links per the markdown spec */
    inLink: boolean;
    /** Whether currently inside a raw HTML block; affects how certain markdown is interpreted */
    inRawBlock: boolean;
    /** Whether at top level (not in a nested block); affects which block-level elements are allowed */
    top: boolean;
  };

  /**
   * Queue of inline content to process
   * Used for deferred inline tokenization
   */
  inlineQueue: Array<{ src: string; tokens: Token[] }>;
}
```

```js
import { Lexer } from "marked";
const markdown = `
# Hello World
This is a **paragraph** with inline elements.
- List item 1
- List item 2
`;
const tokens = Lexer.lex(markdown);
console.log(tokens);
// [
// { type: 'heading', depth: 1, text: 'Hello World', tokens: [...], ... },
// { type: 'space', raw: '\n', ... },
// { type: 'paragraph', text: 'This is a **paragraph**...', tokens: [...], ... },
// { type: 'list', items: [...], ordered: false, ... }
// ]
console.log(tokens.links);
// {} - Map of link reference definitions
```

**Token List Properties:**
- The `tokens` array contains all block-level tokens
- The `tokens.links` object contains link reference definitions

```js
import { Lexer } from "marked";
const inline = Lexer.lexInline('This is **bold** and *italic*');
console.log(inline);
// [
// { type: 'text', text: 'This is ', raw: 'This is ' },
// { type: 'strong', text: 'bold', tokens: [...], raw: '**bold**' },
// { type: 'text', text: ' and ', raw: ' and ' },
// { type: 'em', text: 'italic', tokens: [...], raw: '*italic*' }
// ]
```

**Inline vs Block:**
- `lex()`: full processing (block + inline)
- `lexInline()`: only inline elements, no paragraphs or block structures (contrasted in the sketch below)
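A minimal sketch contrasting the two on the same source (the input string is illustrative; token shapes follow the examples above):

```js
import { Lexer } from "marked";

const src = "This is **bold**";

// lex() wraps inline content in block-level tokens
console.log(Lexer.lex(src)[0].type); // 'paragraph'

// lexInline() returns the inline tokens directly, with no paragraph wrapper
console.log(Lexer.lexInline(src).map((t) => t.type)); // ['text', 'strong']
```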
```js
import { Lexer } from "marked";

const lexer = new Lexer({
  gfm: true,
  breaks: true
});
const tokens = lexer.lex('# Markdown');
// Access state
console.log(lexer.state.top); // true - at document top level
console.log(lexer.state.inLink); // false - not inside link
console.log(lexer.tokens); // TokensList with tokens and links
```

**Instance vs Static:**
- Static `Lexer.lex()` / `Lexer.lexInline()` create a new Lexer internally: convenient for one-off calls
- Instance methods reuse the lexer's options and expose its `tokens`, `state`, and `inlineQueue` (equivalence sketched below)
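A minimal sketch of the equivalence, assuming only what the class docs above state (the static form creates a lexer internally):

```js
import { Lexer } from "marked";

// These two calls produce the same token list;
// the static form just creates a Lexer internally
const viaStatic = Lexer.lex("# Hi", { gfm: true });
const viaInstance = new Lexer({ gfm: true }).lex("# Hi");

console.log(JSON.stringify(viaStatic) === JSON.stringify(viaInstance)); // true
```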
```js
import { marked, Lexer } from "marked";
// Extensions via marked.use() affect lexer automatically
marked.use({
  extensions: [{
    name: 'customBlock',
    level: 'block',
    tokenizer(src) {
      const match = src.match(/^@\[(\w+)\]\((.*?)\)/);
      if (match) {
        return {
          type: 'customBlock',
          raw: match[0],
          name: match[1],
          value: match[2]
        };
      }
    }
  }]
});
// Custom tokenizer is automatically used by lexer
const tokens = Lexer.lex('@[youtube](abc123)');
console.log(tokens[0].type); // 'customBlock'
```

The lexer processes block-level markdown in this specific order:

1. Custom block-level extensions
2. Fenced code blocks (`` ``` `` or `~~~`)
3. ATX headings (`# Heading`)
4. Horizontal rules (`---`, `***`, `___`)
5. Blockquotes (`> quote`)
6. Link reference definitions (`[label]: url "title"`)

**Order Matters:** Earlier patterns take precedence, and custom extensions are checked first.

```js
import { Lexer } from "marked";
const markdown = `
# ATX Heading
---
> Blockquote
- List item
`;
const lexer = new Lexer();
const tokens = lexer.blockTokens(markdown, []);
// Tokens will be in document order:
// 1. heading (ATX processed before HR)
// 2. hr
// 3. blockquote
// 4. list
```

The lexer processes inline markdown in this order:

1. Escapes (`\*`, `\_`, etc.)
2. Inline HTML (`<tag>`, `</tag>`)
3. Links (`[text](url)`, `[text][ref]`)
4. Emphasis (`*em*`, `**strong**`)
5. Code spans (`` `code` ``)
6. Strikethrough (`~~deleted~~`)
7. Autolinks (`<url>`, `<email>`)

```js
import { Lexer } from "marked";
const inline = '**bold** and *italic* and `code`';
const tokens = Lexer.lexInline(inline);
// Tokens processed in order:
// 1. strong (**bold**)
// 2. text ( and )
// 3. em (*italic*)
// 4. text ( and )
// 5. codespan (`code`)
```

**Greedy Matching:** The lexer uses greedy matching: the longest match wins.
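A quick illustration (input chosen for the example):

```js
import { Lexer } from "marked";

// "**bold**" could in principle be read as two single-asterisk (em) delimiters,
// but the longer two-asterisk (strong) match wins
const tokens = Lexer.lexInline("**bold**");
console.log(tokens[0].type); // 'strong', not 'em'
```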
The lexer maintains state to handle context-dependent parsing:
```js
import { Lexer } from "marked";
const lexer = new Lexer();
// Initial state
console.log(lexer.state);
// {
// inLink: false, // Prevents nested links
// inRawBlock: false, // Affects HTML parsing
// top: true // Whether at document top level
// }
lexer.lex('# Heading\n\n[Link](url)');
// State is managed automatically during tokenization
// After processing, state is typically reset
```

**State Purposes:**
- `inLink`: prevents nested links (a markdown spec violation)
- `inRawBlock`: affects HTML parsing in certain contexts
- `top`: some block elements are only allowed at the top level

**State Management Example:**

```js
import { Lexer } from "marked";
const lexer = new Lexer();
// Process markdown with nested link attempt
const markdown = '[Outer [nested](url)](url)';
const tokens = lexer.lex(markdown);
// Nested link will be treated as text due to inLink state
// Output: a single link token with text "Outer [nested](url)"
```

Link reference definitions are stored separately from the main token list:

```js
import { Lexer } from "marked";
const markdown = `
[Google]: https://google.com "Google Search"
[GitHub]: https://github.com
This is a [link][Google] and another [link][GitHub].
`;
const tokens = Lexer.lex(markdown);
console.log(tokens.links);
// {
// 'google': { href: 'https://google.com', title: 'Google Search' },
// 'github': { href: 'https://github.com', title: '' }
// }
// Link definitions don't appear in main token list
// They're used when resolving reference-style links
```

**Link Reference Notes:**
- Definitions are stored in the `TokensList.links` object (keys are lowercased)
- References use the `[text][ref]` syntax

**Link Reference Resolution:**

```js
import { Lexer } from "marked";
const markdown = `
[ref]: https://example.com "Example"
This is a [reference link][ref].
This is a [shortcut reference link].
[shortcut reference link]: https://example.org
`;
const tokens = Lexer.lex(markdown);
// tokens.links contains both definitions:
// 'ref' and 'shortcut reference link'
```

The lexer queues inline content for deferred processing:

```js
import { Lexer } from "marked";
const lexer = new Lexer();
// blockTokens() queues inline content
lexer.blockTokens('# Heading\n\nParagraph', lexer.tokens);
console.log(lexer.inlineQueue);
// [
// { src: 'Heading', tokens: [...] },
// { src: 'Paragraph', tokens: [...] }
// ]
// Process queued inline tokens
for (const item of lexer.inlineQueue) {
  lexer.inlineTokens(item.src, item.tokens);
}
```

**Why Queue?** Deferring inline tokenization means every link reference definition in the document has been collected before reference-style links are resolved, even when a definition appears after its first use.
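A short sketch of why the deferral matters (input string is illustrative):

```js
import { Lexer } from "marked";

// The definition is on the last line, after the reference that uses it.
// Because inline tokens are processed only after all block tokens
// (including definitions) are collected, the reference still resolves.
const tokens = Lexer.lex("See [the docs][ref].\n\n[ref]: https://example.com");
console.log("ref" in tokens.links); // true
```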
**Automatic Processing:** The `lex()` method processes the inline queue automatically; manual processing is only needed when calling `blockTokens()` directly.

```js
import { Lexer } from "marked";
// Using lex() - inline queue processed automatically
const tokens1 = Lexer.lex('# Heading');
// Inline tokens already processed
// Using blockTokens() - manual inline processing needed
const lexer = new Lexer();
const tokens2 = lexer.blockTokens('# Heading', []);
// Inline queue not yet processed
for (const item of lexer.inlineQueue) {
  lexer.inlineTokens(item.src, item.tokens);
}
```

The static `rules` property exposes the regular expressions used for tokenization:

```js
import { Lexer } from "marked";
// Access tokenization rules
const rules = Lexer.rules;
console.log(rules.block); // Block-level regex rules
console.log(rules.inline); // Inline-level regex rules
// Example: heading rule
console.log(rules.block.heading);
// Regex for matching ATX headings: /^(#{1,6})(?=\s|$)(.*)(?:\n+|$)/
// Rules vary based on options
const lexerGfm = new Lexer({ gfm: true });
const lexerPedantic = new Lexer({ pedantic: true });
// Each uses a different rule set
```

**Rule Sets:**
- `normal`: default markdown rules
- `gfm`: GitHub Flavored Markdown rules (tables, strikethrough, etc.)
- `pedantic`: rules matching the original markdown.pl behavior (see the sketch below)
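As a sketch of how the chosen rule set changes tokenization (the table input and exact token types here are illustrative assumptions):

```js
import { Lexer } from "marked";

const table = "| a | b |\n| --- | --- |\n| 1 | 2 |";

// GFM rules recognize tables...
console.log(new Lexer({ gfm: true }).lex(table)[0].type); // 'table'

// ...pedantic rules do not, so the same text stays a paragraph
console.log(new Lexer({ pedantic: true }).lex(table)[0].type); // 'paragraph'
```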
```js
import { Lexer, Parser } from "marked";
// Custom two-pass processing
const lexer = new Lexer({ gfm: true });
const markdown = '# Title\n\n[Docs](https://example.com)';
const tokens = lexer.lex(markdown);
// First pass: Modify tokens before parsing
// Inline tokens (like links) are nested inside block tokens, so walk recursively
const walk = (list) => {
  for (const token of list) {
    if (token.type === 'heading') {
      token.depth += 1; // Demote all headings
    }
    if (token.type === 'link' && token.href.startsWith('http')) {
      // Mark external links; a custom renderer can emit rel="nofollow"
      token.rel = 'nofollow';
    }
    if (token.tokens) walk(token.tokens);
  }
};
walk(tokens);
// Second pass: Parse modified tokens
const html = Parser.parse(tokens);
```

**Advanced Use Cases:**
- Transforming tokens between lexing and parsing (e.g., demoting headings)
- Annotating tokens with extra data for a custom renderer (e.g., marking external links)
The lexer handles errors based on the `silent` option:

```js
import { Lexer } from "marked";
// Throw on error (default behavior)
const lexer1 = new Lexer({ silent: false });
// malformedMarkdown stands in for input that triggers a tokenizer error
try {
  lexer1.lex(malformedMarkdown);
} catch (err) {
  console.error('Lexer error:', err.message);
}
// Silent mode (logs error, continues with partial tokens)
const lexer2 = new Lexer({ silent: true });
const tokens = lexer2.lex(malformedMarkdown);
// Errors logged to console, partial tokens returned
```

**Handling Errors:**

```js
import { Lexer } from "marked";
function safeLex(markdown, options = {}) {
  try {
    return Lexer.lex(markdown, { ...options, silent: false });
  } catch (err) {
    console.error('Lexing failed:', err);
    // Return a fallback token list containing a single error paragraph
    return Object.assign([{
      type: 'paragraph',
      raw: markdown,
      text: 'Error: Failed to parse markdown',
      tokens: []
    }], { links: {} });
  }
}
```

Lexing is typically fast, even for large documents:

```js
import { Lexer } from "marked";
// Performance measurement
console.time('lex');
const tokens = Lexer.lex(largeMarkdownDocument);
console.timeEnd('lex');
// Typical: milliseconds for documents < 1MB
// For very large documents
const hugeMarkdown = '.'.repeat(10000000); // 10MB
console.time('huge-lex');
const hugeTokens = Lexer.lex(hugeMarkdown);
console.timeEnd('huge-lex');
// Still typically < 1 second
```

**Memory Considerations:**

```js
import { Lexer } from "marked";
// Tokens are kept in memory
const markdown = 'x'.repeat(1000000); // 1MB of text
const tokens = Lexer.lex(markdown);
// Memory usage ≈ input size + token overhead
// For 1MB input, expect roughly 2-3MB total memory usage
```

To debug tokenization, inspect the token tree directly:

```js
import { Lexer } from "marked";
const markdown = `
# Heading
**Bold** text
`;
const tokens = Lexer.lex(markdown);
// Pretty print tokens
console.log(JSON.stringify(tokens, null, 2));
// Inspect specific token
console.log('First token:', tokens[0]);
console.log('Type:', tokens[0].type);
console.log('Raw:', tokens[0].raw);
console.log('Depth:', tokens[0].depth);
// Inspect nested tokens
if (tokens[0].tokens) {
  console.log('Nested tokens:', tokens[0].tokens);
}
```

Extensions can add entirely new syntax for the lexer to recognize:

```js
import { marked, Lexer } from "marked";
// Add custom block-level syntax
marked.use({
  extensions: [{
    name: 'note',
    level: 'block',
    start(src) {
      return src.match(/^:::note/)?.index;
    },
    tokenizer(src) {
      const match = src.match(/^:::note\n([\s\S]*?)\n:::/);
      if (match) {
        return {
          type: 'note',
          raw: match[0],
          text: match[1],
          tokens: this.lexer.blockTokens(match[1])
        };
      }
    }
  }]
});
// Now lexer recognizes :::note blocks
const tokens = Lexer.lex(':::note\nImportant\n:::');
console.log(tokens[0].type); // 'note'
```

Edge cases the lexer handles:

```js
import { Lexer } from "marked";
// Empty input
const empty = Lexer.lex('');
console.log(empty); // []
console.log(empty.links); // {}
// Only whitespace
const whitespace = Lexer.lex(' \n \n ');
console.log(whitespace); // [{ type: 'space', raw: ' \n \n ' }]
// Only link definitions
const onlyRefs = Lexer.lex('[ref]: url');
console.log(onlyRefs); // []
console.log(onlyRefs.links); // { ref: { href: 'url', title: '' } }
// Deeply nested structures
const nested = '> '.repeat(100) + 'text';
const nestedTokens = Lexer.lex(nested); // Handles deep nesting
// Very long lines
const longLine = 'a'.repeat(100000);
const longTokens = Lexer.lex(longLine); // Handles long lines
```