Python port of markdown-it providing CommonMark-compliant markdown parsing with configurable syntax and pluggable architecture
—
Tree representation utilities for converting linear token streams into hierarchical structures for advanced document analysis and manipulation. This module is unique to the Python implementation and not part of the original JavaScript markdown-it.
Hierarchical representation of markdown document structure.
class SyntaxTreeNode:
"""
A Markdown syntax tree node representing either:
- Root of the document
- Single unnested token
- Token pair (open/close) with nested content
"""
def __init__(self, tokens: list[Token] = (), *, create_root: bool = True):
"""
Initialize syntax tree from token stream.
Parameters:
- tokens: token stream to convert to tree
- create_root: whether to create a root node for the document
"""
# Properties
token: Token | None # Associated token (for leaf nodes)
nester_tokens: tuple[Token, Token] | None # Opening/closing token pair (for containers)
parent: SyntaxTreeNode | None # Parent node
children: list[SyntaxTreeNode] # Child nodes

Build tree structures from token streams:
# Class methods for tree creation
@classmethod
def from_tokens(cls, tokens: list[Token]) -> SyntaxTreeNode:
"""
Create syntax tree from token list.
Parameters:
- tokens: list of tokens to convert
Returns:
- SyntaxTreeNode: root node of constructed tree
"""

Usage Example:
from markdown_it import MarkdownIt
from markdown_it.tree import SyntaxTreeNode
md = MarkdownIt()
tokens = md.parse("""
# Heading
Paragraph with **bold** text.
- Item 1
- Item 2
""")
# Create syntax tree
tree = SyntaxTreeNode(tokens)
# Access tree structure
print(f"Root has {len(tree.children)} children")
for child in tree.children:
print(f"Child type: {child.token.type if child.token else 'container'}")

Navigate and inspect tree structure:
def walk(self, filter: callable = None) -> Generator[SyntaxTreeNode, None, None]:
"""
Walk the tree depth-first, yielding nodes.
Parameters:
- filter: optional function to filter nodes
Yields:
- SyntaxTreeNode: tree nodes in depth-first order
"""
@property
def is_root(self) -> bool:
"""True if this is the root node."""
@property
def is_leaf(self) -> bool:
"""True if this node has no children."""
@property
def is_container(self) -> bool:
"""True if this node represents a token pair container."""

Usage Example:
from markdown_it.tree import SyntaxTreeNode
# Tree traversal
for node in tree.walk():
if node.token and node.token.type == "heading_open":
level = int(node.token.tag[1]) # h1->1, h2->2, etc.
print(f"Found heading level {level}")
# Filter specific node types
def is_paragraph(node):
return node.token and node.token.type == "paragraph_open"
for para_node in tree.walk(filter=is_paragraph):
print("Found paragraph")
# Check node types
for node in tree.children:
if node.is_container:
print(f"Container with {len(node.children)} children")
elif node.is_leaf:
print(f"Leaf node: {node.token.type}")

Modify tree structure and content:
def remove_child(self, child: SyntaxTreeNode) -> None:
"""
Remove child node from this node.
Parameters:
- child: child node to remove
"""
def add_child(self, child: SyntaxTreeNode) -> None:
"""
Add child node to this node.
Parameters:
- child: child node to add
"""
def replace_child(self, old_child: SyntaxTreeNode, new_child: SyntaxTreeNode) -> None:
"""
Replace existing child with new child.
Parameters:
- old_child: child to replace
- new_child: replacement child
"""

Usage Example:
from markdown_it.tree import SyntaxTreeNode
from markdown_it.token import Token
# Create new nodes
new_token = Token("div_open", "div", 1)
new_node = SyntaxTreeNode()
new_node.token = new_token
# Add to tree
tree.add_child(new_node)
# Remove nodes
for node in list(tree.children): # Copy list since we're modifying
if node.token and node.token.type == "hr":
tree.remove_child(node)

Convert between tree and token representations:
def to_tokens(self) -> list[Token]:
"""
Convert tree back to linear token stream.
Returns:
- list[Token]: linearized token representation
"""
def to_pretty(self, *, indent: int = 2, show_text: bool = False) -> str:
"""
Generate pretty-printed tree representation.
Parameters:
- indent: indentation spaces per level
- show_text: whether to show text content
Returns:
- str: formatted tree structure
"""

Usage Example:
from markdown_it import MarkdownIt
from markdown_it.tree import SyntaxTreeNode
md = MarkdownIt()
tokens = md.parse("# Title\n\nParagraph text.")
# Token stream -> Tree -> Token stream
tree = SyntaxTreeNode(tokens)
reconstructed_tokens = tree.to_tokens()
# Verify round-trip consistency
original_html = md.renderer.render(tokens, md.options, {})
reconstructed_html = md.renderer.render(reconstructed_tokens, md.options, {})
assert original_html == reconstructed_html
# Pretty print tree structure
print(tree.to_pretty(show_text=True))

Extract specific content from tree structure:
def extract_headings(tree):
    """Collect every heading in the tree.

    Returns a list of dicts with keys 'level' (int from the tag, h1 -> 1),
    'text' (content of the first inline child, or ""), and 'node'.
    """
    found = []
    for node in tree.walk():
        # Only container nodes carry an open/close token pair
        if not (node.is_container and node.nester_tokens):
            continue
        opener = node.nester_tokens[0]
        if opener.type != "heading_open":
            continue
        # Heading text lives in the first "inline" child, when present
        content = next(
            (c.token.content for c in node.children
             if c.token and c.token.type == "inline"),
            "",
        )
        found.append({
            'level': int(opener.tag[1]),  # "h2" -> 2, etc.
            'text': content,
            'node': node
        })
    return found
def extract_links(tree):
    """Collect every link in the tree.

    Returns a list of dicts with keys 'url' (the opening token's href
    attribute), 'text' (content of the first plain-text child, or ""),
    and 'node'.
    """
    collected = []
    for node in tree.walk():
        # Links are container nodes delimited by link_open/link_close
        if not (node.is_container and node.nester_tokens):
            continue
        opener = node.nester_tokens[0]
        if opener.type != "link_open":
            continue
        # Link label is the first "text" child, when present
        label = ""
        for inner in node.children:
            if inner.token and inner.token.type == "text":
                label = inner.token.content
                break
        collected.append({
            'url': opener.attrGet("href"),
            'text': label,
            'node': node
        })
    return collected

# Transform tree structure for custom processing:
def wrap_paragraphs_in_divs(tree):
    """Wrap every top-level paragraph node of *tree* in a div container."""
    from markdown_it.token import Token

    # Iterate over a snapshot: replace_child mutates tree.children
    for child in list(tree.children):
        is_paragraph = (
            child.is_container
            and child.nester_tokens
            and child.nester_tokens[0].type == "paragraph_open"
        )
        if not is_paragraph:
            continue
        # Build the wrapping div's open/close token pair
        opening = Token("div_open", "div", 1)
        opening.attrSet("class", "paragraph-wrapper")
        closing = Token("div_close", "div", -1)
        # New container node holding the paragraph as its only child
        wrapper = SyntaxTreeNode()
        wrapper.parent = tree
        wrapper.nester_tokens = (opening, closing)
        wrapper.children = [child]
        # Re-parent the paragraph and swap it for the wrapper in the tree
        child.parent = wrapper
        tree.replace_child(child, wrapper)
def add_table_of_contents(tree):
    """Insert a linked table of contents as the first child of *tree*.

    Builds one list item per heading found by extract_headings(); each
    item links to an anchor derived from the heading text (lower-cased,
    spaces replaced with hyphens). Does nothing when there are no headings.
    """
    # Import locally so the example is self-contained, matching
    # wrap_paragraphs_in_divs above (Token is otherwise undefined here).
    from markdown_it.token import Token

    headings = extract_headings(tree)
    if not headings:
        return
    # Create TOC tokens: a div wrapper, a heading, then a bullet list
    toc_tokens = [
        Token("div_open", "div", 1, attrs={"class": "table-of-contents"}),
        Token("heading_open", "h2", 1),
        Token("inline", "", 0, content="Table of Contents"),
        Token("heading_close", "h2", -1),
        Token("bullet_list_open", "ul", 1)
    ]
    for heading in headings:
        # Anchor slug: assumes renderer/plugin emits matching heading ids
        anchor = heading['text'].lower().replace(' ', '-')
        toc_tokens.extend([
            Token("list_item_open", "li", 1),
            Token("paragraph_open", "p", 1),
            Token("link_open", "a", 1, attrs={"href": f"#{anchor}"}),
            Token("inline", "", 0, content=heading['text']),
            Token("link_close", "a", -1),
            Token("paragraph_close", "p", -1),
            Token("list_item_close", "li", -1)
        ])
    toc_tokens.extend([
        Token("bullet_list_close", "ul", -1),
        Token("div_close", "div", -1)
    ])
    # Create TOC tree node (create_root=False: attach the div node directly)
    toc_tree = SyntaxTreeNode(toc_tokens, create_root=False)
    # Insert at beginning and fix the parent link
    tree.children.insert(0, toc_tree)
    toc_tree.parent = tree

# Analyze document structure using tree representation:
def analyze_document_structure(tree):
    """Walk the tree depth-first and return a dict of document statistics.

    Keys: total_nodes, headings (list of int levels), paragraphs, lists,
    code_blocks, links, images, max_nesting_level.
    """
    stats = {
        'total_nodes': 0,
        'headings': [],
        'paragraphs': 0,
        'lists': 0,
        'code_blocks': 0,
        'links': 0,
        'images': 0,
        'max_nesting_level': 0
    }

    # Map token types onto the counter key they increment
    counted = {
        "paragraph_open": 'paragraphs',
        "bullet_list_open": 'lists',
        "ordered_list_open": 'lists',
        "code_block": 'code_blocks',
        "fence": 'code_blocks',
        "link_open": 'links',
        "image": 'images',
    }

    def visit(node, depth=0):
        stats['total_nodes'] += 1
        if depth > stats['max_nesting_level']:
            stats['max_nesting_level'] = depth
        if node.token:
            kind = node.token.type
            if kind == "heading_open":
                # Record the numeric level from the tag ("h1" -> 1, ...)
                stats['headings'].append(int(node.token.tag[1]))
            elif kind in counted:
                stats[counted[kind]] += 1
        for sub in node.children:
            visit(sub, depth + 1)

    for top in tree.children:
        visit(top)
    return stats
# Usage
stats = analyze_document_structure(tree)
print(f"Document has {stats['paragraphs']} paragraphs")
print(f"Heading levels: {set(stats['headings'])}")
print(f"Maximum nesting: {stats['max_nesting_level']}")

Install with Tessl CLI
npx tessl i tessl/pypi-markdown-it-py