Python port of markdown-it providing CommonMark-compliant markdown parsing with configurable syntax and pluggable architecture
—
Tree representation utilities for converting linear token streams into hierarchical structures for advanced document analysis and manipulation. This module is unique to the Python implementation and not part of the original JavaScript markdown-it.
Hierarchical representation of markdown document structure.
class SyntaxTreeNode:
"""
A Markdown syntax tree node representing either:
- Root of the document
- Single unnested token
- Token pair (open/close) with nested content
"""
def __init__(self, tokens: list[Token] = (), *, create_root: bool = True):
"""
Initialize syntax tree from token stream.
Parameters:
- tokens: token stream to convert to tree
- create_root: whether to create a root node for the document
"""
# Properties
token: Token | None # Associated token (for leaf nodes)
nester_tokens: tuple[Token, Token] | None # Opening/closing token pair (for containers)
parent: SyntaxTreeNode | None # Parent node
children: list[SyntaxTreeNode] # Child nodes

Build tree structures from token streams:
# Class methods for tree creation
@classmethod
def from_tokens(cls, tokens: list[Token]) -> SyntaxTreeNode:
"""
Create syntax tree from token list.
Parameters:
- tokens: list of tokens to convert
Returns:
- SyntaxTreeNode: root node of constructed tree
"""

Usage Example:
from markdown_it import MarkdownIt
from markdown_it.tree import SyntaxTreeNode
md = MarkdownIt()
tokens = md.parse("""
# Heading
Paragraph with **bold** text.
- Item 1
- Item 2
""")
# Create syntax tree
tree = SyntaxTreeNode(tokens)
# Access tree structure
print(f"Root has {len(tree.children)} children")
for child in tree.children:
print(f"Child type: {child.token.type if child.token else 'container'}")

Navigate and inspect tree structure:
def walk(self, filter: callable = None) -> Generator[SyntaxTreeNode, None, None]:
"""
Walk the tree depth-first, yielding nodes.
Parameters:
- filter: optional function to filter nodes
Yields:
- SyntaxTreeNode: tree nodes in depth-first order
"""
@property
def is_root(self) -> bool:
"""True if this is the root node."""
@property
def is_leaf(self) -> bool:
"""True if this node has no children."""
@property
def is_container(self) -> bool:
"""True if this node represents a token pair container."""

Usage Example:
from markdown_it.tree import SyntaxTreeNode
# Tree traversal
for node in tree.walk():
if node.token and node.token.type == "heading_open":
level = int(node.token.tag[1]) # h1->1, h2->2, etc.
print(f"Found heading level {level}")
# Filter specific node types
def is_paragraph(node):
return node.token and node.token.type == "paragraph_open"
for para_node in tree.walk(filter=is_paragraph):
print("Found paragraph")
# Check node types
for node in tree.children:
if node.is_container:
print(f"Container with {len(node.children)} children")
elif node.is_leaf:
print(f"Leaf node: {node.token.type}")

Modify tree structure and content:
def remove_child(self, child: SyntaxTreeNode) -> None:
"""
Remove child node from this node.
Parameters:
- child: child node to remove
"""
def add_child(self, child: SyntaxTreeNode) -> None:
"""
Add child node to this node.
Parameters:
- child: child node to add
"""
def replace_child(self, old_child: SyntaxTreeNode, new_child: SyntaxTreeNode) -> None:
"""
Replace existing child with new child.
Parameters:
- old_child: child to replace
- new_child: replacement child
"""

Usage Example:
from markdown_it.tree import SyntaxTreeNode
from markdown_it.token import Token
# Create new nodes
new_token = Token("div_open", "div", 1)
new_node = SyntaxTreeNode()
new_node.token = new_token
# Add to tree
tree.add_child(new_node)
# Remove nodes
for node in list(tree.children): # Copy list since we're modifying
if node.token and node.token.type == "hr":
tree.remove_child(node)

Convert between tree and token representations:
def to_tokens(self) -> list[Token]:
"""
Convert tree back to linear token stream.
Returns:
- list[Token]: linearized token representation
"""
def to_pretty(self, *, indent: int = 2, show_text: bool = False) -> str:
"""
Generate pretty-printed tree representation.
Parameters:
- indent: indentation spaces per level
- show_text: whether to show text content
Returns:
- str: formatted tree structure
"""

Usage Example:
from markdown_it import MarkdownIt
from markdown_it.tree import SyntaxTreeNode
md = MarkdownIt()
tokens = md.parse("# Title\n\nParagraph text.")
# Token stream -> Tree -> Token stream
tree = SyntaxTreeNode(tokens)
reconstructed_tokens = tree.to_tokens()
# Verify round-trip consistency
original_html = md.renderer.render(tokens, md.options, {})
reconstructed_html = md.renderer.render(reconstructed_tokens, md.options, {})
assert original_html == reconstructed_html
# Pretty print tree structure
print(tree.to_pretty(show_text=True))

Extract specific content from tree structure:
def extract_headings(tree):
    """Collect every heading in the tree.

    Returns a list of dicts with keys 'level' (int from the tag, h1 -> 1),
    'text' (content of the first inline child, or ""), and 'node'.
    """
    found = []
    for node in tree.walk():
        # Only container nodes carry an open/close token pair
        if not (node.is_container and node.nester_tokens):
            continue
        opener = node.nester_tokens[0]
        if opener.type != "heading_open":
            continue
        # Heading text lives in the first "inline" child, when present
        content = next(
            (c.token.content for c in node.children
             if c.token and c.token.type == "inline"),
            "",
        )
        found.append({
            'level': int(opener.tag[1]),  # "h2" -> 2, etc.
            'text': content,
            'node': node
        })
    return found
def extract_links(tree):
    """Collect every link in the tree.

    Returns a list of dicts with keys 'url' (the opening token's href
    attribute), 'text' (content of the first plain-text child, or ""),
    and 'node'.
    """
    collected = []
    for node in tree.walk():
        # Links are container nodes delimited by link_open/link_close
        if not (node.is_container and node.nester_tokens):
            continue
        opener = node.nester_tokens[0]
        if opener.type != "link_open":
            continue
        # Link label is the first "text" child, when present
        label = ""
        for inner in node.children:
            if inner.token and inner.token.type == "text":
                label = inner.token.content
                break
        collected.append({
            'url': opener.attrGet("href"),
            'text': label,
            'node': node
        })
    return collected

# Transform tree structure for custom processing:
def wrap_paragraphs_in_divs(tree):
    """Wrap every top-level paragraph node of *tree* in a div container."""
    from markdown_it.token import Token

    # Iterate over a snapshot: replace_child mutates tree.children
    for child in list(tree.children):
        is_paragraph = (
            child.is_container
            and child.nester_tokens
            and child.nester_tokens[0].type == "paragraph_open"
        )
        if not is_paragraph:
            continue
        # Build the wrapping div's open/close token pair
        opening = Token("div_open", "div", 1)
        opening.attrSet("class", "paragraph-wrapper")
        closing = Token("div_close", "div", -1)
        # New container node holding the paragraph as its only child
        wrapper = SyntaxTreeNode()
        wrapper.parent = tree
        wrapper.nester_tokens = (opening, closing)
        wrapper.children = [child]
        # Re-parent the paragraph and swap it for the wrapper in the tree
        child.parent = wrapper
        tree.replace_child(child, wrapper)
def add_table_of_contents(tree):
    """Insert a linked table of contents as the first child of *tree*.

    Builds one list item per heading found by extract_headings(); each
    item links to an anchor derived from the heading text (lower-cased,
    spaces replaced with hyphens). Does nothing when there are no headings.
    """
    # Import locally so the example is self-contained, matching
    # wrap_paragraphs_in_divs above (Token is otherwise undefined here).
    from markdown_it.token import Token

    headings = extract_headings(tree)
    if not headings:
        return
    # Create TOC tokens: a div wrapper, a heading, then a bullet list
    toc_tokens = [
        Token("div_open", "div", 1, attrs={"class": "table-of-contents"}),
        Token("heading_open", "h2", 1),
        Token("inline", "", 0, content="Table of Contents"),
        Token("heading_close", "h2", -1),
        Token("bullet_list_open", "ul", 1)
    ]
    for heading in headings:
        # Anchor slug: assumes renderer/plugin emits matching heading ids
        anchor = heading['text'].lower().replace(' ', '-')
        toc_tokens.extend([
            Token("list_item_open", "li", 1),
            Token("paragraph_open", "p", 1),
            Token("link_open", "a", 1, attrs={"href": f"#{anchor}"}),
            Token("inline", "", 0, content=heading['text']),
            Token("link_close", "a", -1),
            Token("paragraph_close", "p", -1),
            Token("list_item_close", "li", -1)
        ])
    toc_tokens.extend([
        Token("bullet_list_close", "ul", -1),
        Token("div_close", "div", -1)
    ])
    # Create TOC tree node (create_root=False: attach the div node directly)
    toc_tree = SyntaxTreeNode(toc_tokens, create_root=False)
    # Insert at beginning and fix the parent link
    tree.children.insert(0, toc_tree)
    toc_tree.parent = tree

# Analyze document structure using tree representation:
def analyze_document_structure(tree):
    """Walk the tree depth-first and return a dict of document statistics.

    Keys: total_nodes, headings (list of int levels), paragraphs, lists,
    code_blocks, links, images, max_nesting_level.
    """
    stats = {
        'total_nodes': 0,
        'headings': [],
        'paragraphs': 0,
        'lists': 0,
        'code_blocks': 0,
        'links': 0,
        'images': 0,
        'max_nesting_level': 0
    }

    # Map token types onto the counter key they increment
    counted = {
        "paragraph_open": 'paragraphs',
        "bullet_list_open": 'lists',
        "ordered_list_open": 'lists',
        "code_block": 'code_blocks',
        "fence": 'code_blocks',
        "link_open": 'links',
        "image": 'images',
    }

    def visit(node, depth=0):
        stats['total_nodes'] += 1
        if depth > stats['max_nesting_level']:
            stats['max_nesting_level'] = depth
        if node.token:
            kind = node.token.type
            if kind == "heading_open":
                # Record the numeric level from the tag ("h1" -> 1, ...)
                stats['headings'].append(int(node.token.tag[1]))
            elif kind in counted:
                stats[counted[kind]] += 1
        for sub in node.children:
            visit(sub, depth + 1)

    for top in tree.children:
        visit(top)
    return stats
# Usage
stats = analyze_document_structure(tree)
print(f"Document has {stats['paragraphs']} paragraphs")
print(f"Heading levels: {set(stats['headings'])}")
print(f"Maximum nesting: {stats['max_nesting_level']}")

Install with Tessl CLI
npx tessl i tessl/pypi-markdown-it-py