A sane and fast Markdown parser with useful plugins and renderers
—
Low-level parsing components that handle the conversion of Markdown text into structured tokens. The parsing system is split into block-level elements (paragraphs, headings, lists) and inline elements (bold, italic, links), with state management for tracking parsing progress and context.
Handles block-level Markdown elements like headings, paragraphs, lists, code blocks, and blockquotes.
class BlockParser(Parser[BlockState]):
"""
Parser for block-level Markdown elements.
Handles elements that form document structure: headings, paragraphs,
lists, code blocks, blockquotes, tables, etc.
"""
def __init__(self):
"""Initialize block parser with default rules."""
def parse(self, state: BlockState, rules: Optional[List[str]] = None) -> None:
"""
Parse state source and populate with block tokens.
Parameters:
- state: BlockState to parse and populate with tokens
- rules: Optional list of rules to use for parsing
"""Processes inline Markdown elements within block content like emphasis, links, code spans, and images.
class InlineParser(Parser[InlineState]):
"""
Parser for inline-level Markdown elements.
Handles elements within block content: bold, italic, links,
images, code spans, line breaks, etc.
"""
def __init__(self, hard_wrap: bool = False):
"""
Initialize inline parser.
Parameters:
- hard_wrap: Whether to convert line breaks to <br> tags
"""
def __call__(self, text: str, env: MutableMapping[str, Any]) -> List[Dict[str, Any]]:
"""
Process text and return inline tokens.
Parameters:
- text: Text to process
- env: Environment mapping for parsing context
Returns:
List of inline tokens
"""State management for block-level parsing including cursor position, token accumulation, and parsing environment.
class BlockState:
"""
State management for block-level parsing.
Tracks parsing progress, accumulated tokens, and contextual information
during the block parsing process.
Attributes:
- src: str - Source text being parsed
- tokens: List[Dict[str, Any]] - Accumulated parsed tokens
- cursor: int - Current position in source text
- cursor_max: int - Maximum position (length of source)
- list_tight: bool - Whether current list is tight formatting
- parent: Any - Parent parsing context
- env: MutableMapping[str, Any] - Environment variables and data
"""
def __init__(self, parent: Optional[Any] = None):
"""
Initialize block parsing state.
Parameters:
- parent: Parent state context
"""
def child_state(self, src: str) -> Self:
"""
Create child state for nested parsing.
Parameters:
- src: Source text for child state
Returns:
New BlockState instance with this state as parent
"""
def process(self, text: str) -> Self:
"""
Process text and return populated state.
Parameters:
- text: Text to process
Returns:
Self with populated tokens and updated cursor
"""State management for inline-level parsing within block elements.
class InlineState:
"""
State management for inline-level parsing.
Tracks parsing of inline elements within block content including
position tracking and environment data.
Attributes:
- src: str - Source text being parsed
- tokens: List[Dict[str, Any]] - Accumulated inline tokens
- pos: int - Current position in source text
- env: MutableMapping[str, Any] - Environment variables and data
"""
def __init__(self):
"""Initialize inline parsing state."""
def append_token(self, token: Dict[str, Any]) -> None:
"""
Add token to the token list.
Parameters:
- token: Token to add
"""Abstract base class providing common parsing functionality.
ST = TypeVar('ST', bound=Union[BlockState, InlineState])
class Parser(Generic[ST]):
"""
Base parser class with common parsing functionality.
Provides rule registration, method dispatch, and parsing utilities
for both block and inline parsers.
"""
def register(
self,
name: str,
pattern: Union[str, None],
func: Callable,
before: Optional[str] = None
) -> None:
"""
Register a new parsing rule.
Parameters:
- name: Rule name
- pattern: Regex pattern string or None
- func: Function to handle matches
- before: Insert rule before this existing rule
"""Adding a custom block-level element:
from mistune import create_markdown, BlockParser
import re
def custom_block_plugin(md):
    """Add support for custom block syntax: :::type content :::

    Parameters:
    - md: Markdown instance whose block parser and renderer are extended
    """

    def parse_custom_block(block, m, state):
        """Emit a ``custom_block`` token for one match.

        Parameters:
        - block: The block parser that matched the rule
        - m: Regex match for the custom block
        - state: Current BlockState; the new token is appended to it

        Returns:
        Position in the source at which block parsing should resume
        """
        # Named groups: mistune merges every rule pattern into one combined
        # regex, so positional group numbers are not reliable here.
        block_type = m.group('custom_block_type')
        content = m.group('custom_block_text').strip()

        # Parse content as nested blocks. The child state already holds the
        # text, so parse() receives only the state (see BlockParser.parse).
        child = state.child_state(content)
        block.parse(child)

        state.append_token({
            'type': 'custom_block',
            'attrs': {'block_type': block_type},
            'children': child.tokens,
        })
        # Resume after the newline that follows the closing ":::".
        return m.end() + 1

    # Register rule with block parser. [\s\S] lets the content span
    # several lines ("." does not match a newline).
    md.block.register(
        'custom_block',
        r'^:::(?P<custom_block_type>\w+)\n(?P<custom_block_text>[\s\S]*?)\n:::$',
        parse_custom_block,
    )

    # Add renderer method. Registered render functions receive the renderer
    # as their first argument; token attrs arrive as keyword arguments.
    def render_custom_block(renderer, text, block_type):
        return f'<div class="custom-{block_type}">{text}</div>\n'

    md.renderer.register('custom_block', render_custom_block)
# Use custom plugin
md = create_markdown()
md.use(custom_block_plugin)
result = md("""
:::warning
This is a **warning** block.
:::
""")Adding a custom inline element:
from mistune import create_markdown
import re
def emoji_plugin(md):
    """Add support for emoji syntax: :emoji_name:

    Parameters:
    - md: Markdown instance whose inline parser and renderer are extended
    """
    # Built once per plugin installation rather than on every render call.
    emoji_map = {
        'smile': '😊',
        'heart': '❤️',
        'thumbsup': '👍',
    }

    def parse_emoji(inline, m, state):
        """Emit an ``emoji`` token and return where inline parsing resumes."""
        # Named group: rule patterns are merged into one combined regex, so
        # positional group numbers are not reliable.
        state.append_token({'type': 'emoji', 'raw': m.group('emoji_name')})
        return m.end()

    # Register with inline parser
    md.inline.register('emoji', r':(?P<emoji_name>\w+):', parse_emoji)

    # Add renderer method. Registered render functions receive the renderer
    # as their first argument; the token's raw text follows.
    def render_emoji(renderer, emoji_name):
        # Unknown names are rendered back in their original :name: form.
        return emoji_map.get(emoji_name, f':{emoji_name}:')

    md.renderer.register('emoji', render_emoji)
# Use emoji plugin
md = create_markdown()
md.use(emoji_plugin)
result = md('Hello :smile: world :heart:!')
# Output: Hello 😊 world ❤️!Accessing parsing state for analysis:
from mistune import create_markdown
# Sample document, assembled line by line
text = (
    "\n"
    "# Heading 1\n"
    "This is a paragraph with **bold** text.\n"
    "## Heading 2\n"
    "- List item 1\n"
    "- List item 2\n"
)

# Parse with state access: parse() hands back the output together with
# the state object that carries the tokens.
md = create_markdown()
output, state = md.parse(text)
# Analyze tokens
def analyze_tokens(tokens, level=0):
    """Print an indented outline of a token tree.

    Parameters:
    - tokens: Iterable of token dicts; each must have a 'type' key and may
      have 'attrs' and 'children' keys
    - level: Current nesting depth, used to compute the indentation
    """
    indent = " " * level
    for token in tokens:
        print(f"{indent}Token: {token['type']}")
        if 'attrs' in token:
            print(f"{indent} Attrs: {token['attrs']}")
        if 'children' in token:
            # Recurse one level deeper for container tokens.
            analyze_tokens(token['children'], level + 1)
analyze_tokens(state.tokens)
# Access environment data
print(f"Environment: {state.env}")Customizing parser behavior:
from mistune import BlockParser, InlineParser, Markdown, HTMLRenderer
# Create custom parsers
block = BlockParser()
inline = InlineParser(hard_wrap=True) # Convert line breaks to <br>
# Remove specific rules by modifying rules list
block.rules.remove('block_quote') # Disable blockquotes
inline.rules.remove('emphasis') # Disable italic text
# Create parser with custom components
renderer = HTMLRenderer(escape=False)
md = Markdown(renderer=renderer, block=block, inline=inline)
result = md('This is *not italic*\nThis is a line break.')Understanding the token format for custom processing:
# Block token structure
block_token = {
    # Token type
    'type': 'heading',
    # Element attributes
    'attrs': {'level': 1},
    # Child tokens (for container elements)
    'children': [
        {'type': 'text', 'raw': 'Heading Text'},
    ],
}

# Inline token structure
inline_token = {
    # Token type
    'type': 'strong',
    # Child tokens
    'children': [
        {'type': 'text', 'raw': 'Bold Text'},
    ],
}
# Leaf token structure
text_token = {
'type': 'text', # Token type
'raw': 'Plain text content' # Raw text content
}This parsing architecture provides the flexibility to extend mistune with custom syntax while maintaining high performance through optimized parsing algorithms and clear separation between block and inline processing stages.
Install with Tessl CLI
npx tessl i tessl/pypi-mistune