CtrlK
Blog · Docs · Log in · Get started
Tessl Logo

tessl/pypi-markdown-it-py

Python port of markdown-it providing CommonMark-compliant markdown parsing with configurable syntax and pluggable architecture

Pending
Overview
Eval results
Files

token-system.mddocs/

Token System

Structured representation of parsed markdown elements with metadata, attributes, and hierarchical relationships for advanced processing and custom rendering.

Capabilities

Token Class

Core data structure representing parsed markdown elements.

class Token:
    """Represents a parsed markdown element with metadata and attributes."""
    
    # Core properties
    type: str                           # Token type (e.g., "paragraph_open")
    tag: str                           # HTML tag name (e.g., "p")
    nesting: int                       # Level change: 1 (opening), 0 (self-closing), -1 (closing)
    attrs: dict[str, str | int | float] # HTML attributes
    map: list[int] | None              # Source map [line_begin, line_end]
    level: int                         # Nesting level
    children: list[Token] | None       # Child tokens (for inline and img tokens)
    content: str                       # Inner content
    markup: str                        # Markup characters ('*', '_', fence string, etc.)
    info: str                          # Additional info (fence language, autolink flag, etc.)
    meta: dict[Any, Any]               # Plugin storage
    block: bool                        # True for block-level tokens
    hidden: bool                       # Skip when rendering (tight lists)

Token Creation

Create tokens programmatically or from dictionaries.

def __init__(
    self,
    type: str,
    tag: str,
    nesting: int,
    attrs: dict[str, str | int | float] | None = None,
    map: list[int] | None = None,
    level: int = 0,
    children: list["Token"] | None = None,
    content: str = "",
    markup: str = "",
    info: str = "",
    meta: dict | None = None,
    block: bool = False,
    hidden: bool = False
):
    """Initialize a new token.

    Parameters mirror the Token attributes: ``type`` is the token type
    (e.g. "paragraph_open"), ``tag`` the HTML tag name, ``nesting`` the
    level change (1 opening, 0 self-closing, -1 closing).  ``attrs``,
    ``map``, ``children`` and ``meta`` accept ``None`` and are replaced
    by empty containers internally; the remaining arguments default to
    empty/zero values.
    """

@classmethod
def from_dict(cls, dct: dict[str, Any]) -> Token:
    """
    Create token from dictionary representation.
    
    Parameters:
    - dct: dictionary with token data
    
    Returns:
    - Token: new token instance
    """

Usage Example:

from markdown_it.token import Token

# Create token manually
token = Token(
    type="paragraph_open",
    tag="p", 
    nesting=1,
    attrs={"class": "custom"},
    level=0,
    block=True
)

# Create from dictionary
token_dict = {
    "type": "strong_open",
    "tag": "strong",
    "nesting": 1,
    "markup": "**"
}
token = Token.from_dict(token_dict)

Attribute Management

Methods for managing HTML attributes on tokens.

def attrItems(self) -> list[tuple[str, str | int | float]]:
    """
    Get (key, value) list of attributes.
    
    Returns:
    - list: attribute key-value pairs
    """

def attrPush(self, attrData: tuple[str, str | int | float]) -> None:
    """
    Add [name, value] attribute to list.
    
    Parameters:
    - attrData: (name, value) tuple to add
    """

def attrSet(self, name: str, value: str | int | float) -> None:
    """
    Set attribute value, overriding if exists.
    
    Parameters:
    - name: attribute name
    - value: attribute value
    """

def attrGet(self, name: str) -> str | int | float | None:
    """
    Get attribute value.
    
    Parameters:
    - name: attribute name
    
    Returns:
    - str | int | float | None: attribute value or None if not found
    """

def attrJoin(self, name: str, value: str) -> None:
    """
    Join value to existing attribute via space, or create new.
    
    Parameters:
    - name: attribute name
    - value: value to join
    """

Usage Example:

from markdown_it.token import Token

token = Token("div_open", "div", 1)

# Set attributes
token.attrSet("class", "container")
token.attrSet("id", "main")

# Join to existing attribute (useful for CSS classes)
token.attrJoin("class", "highlight")

# Get attribute value
class_value = token.attrGet("class")  # "container highlight"

# List all attributes
attrs = token.attrItems()  # [("class", "container highlight"), ("id", "main")]

Token Manipulation

Methods for copying and converting tokens.

def copy(self, **changes: Any) -> Token:
    """
    Create shallow copy with optional changes.
    
    Parameters:
    - changes: keyword arguments for properties to change
    
    Returns:
    - Token: new token instance with changes applied
    """

def as_dict(
    self,
    *,
    children: bool = True,
    as_upstream: bool = True,
    meta_serializer: callable = None,
    filter: callable = None,
    dict_factory: callable = dict,
) -> dict[str, Any]:
    """
    Convert token to dictionary representation.
    
    Parameters:
    - children: also convert children to dicts
    - as_upstream: ensure compatibility with markdown-it format
    - meta_serializer: hook for serializing Token.meta
    - filter: callable to filter attributes
    - dict_factory: function to create dictionaries
    
    Returns:
    - dict: token as dictionary
    """

Usage Example:

from markdown_it.token import Token

# Original token
token = Token("paragraph_open", "p", 1, level=0)

# Create modified copy
modified = token.copy(
    attrs={"class": "highlight"},
    level=1
)

# Convert to dictionary
token_dict = token.as_dict()
print(token_dict)

# Convert with filtering
def filter_func(key, value):
    return key in ['type', 'tag', 'attrs']

filtered_dict = token.as_dict(filter=filter_func)

Common Token Types

Standard token types produced by markdown-it-py:

Block Tokens

# Structural block elements
"paragraph_open" / "paragraph_close"    # <p> tags
"heading_open" / "heading_close"        # <h1>-<h6> tags  
"blockquote_open" / "blockquote_close"  # <blockquote> tags
"list_item_open" / "list_item_close"    # <li> tags
"bullet_list_open" / "bullet_list_close" # <ul> tags
"ordered_list_open" / "ordered_list_close" # <ol> tags

# Content blocks
"code_block"                            # <pre><code> blocks
"fence"                                 # Fenced code blocks
"hr"                                    # <hr> horizontal rules
"html_block"                            # Raw HTML blocks
"table_open" / "table_close"            # <table> tags
"tr_open" / "tr_close"                  # <tr> tags
"td_open" / "td_close"                  # <td> tags
"th_open" / "th_close"                  # <th> tags

Inline Tokens

# Text formatting
"inline"                                # Container for inline content
"text"                                  # Plain text
"code_inline"                           # `code` spans
"em_open" / "em_close"                  # <em> emphasis
"strong_open" / "strong_close"          # <strong> tags
"s_open" / "s_close"                    # <s> strikethrough

# Links and media  
"link_open" / "link_close"              # <a> links
"image"                                 # <img> images
"autolink_open" / "autolink_close"      # Auto-detected links

# Special
"softbreak"                             # Soft line breaks
"hardbreak"                             # Hard line breaks <br>
"html_inline"                           # Inline HTML
"entity"                                # HTML entities

Token Inspection

from markdown_it import MarkdownIt

md = MarkdownIt()
tokens = md.parse("""
# Heading

Paragraph with **bold** and *italic* text.

- List item 1
- List item 2
""")

# Inspect token structure
for i, token in enumerate(tokens):
    print(f"{i}: {token.type} | {token.tag} | level={token.level}")
    if token.children:
        for j, child in enumerate(token.children):
            print(f"  {j}: {child.type} | content='{child.content}'")

Advanced Token Processing

Modifying Token Stream

from markdown_it import MarkdownIt

def add_custom_class(tokens):
    """Attach a fixed CSS class to every paragraph-opening token.

    Mutates the tokens in place and returns the same list so the call
    can be chained into a render pipeline.
    """
    paragraph_openers = (tok for tok in tokens if tok.type == "paragraph_open")
    for opener in paragraph_openers:
        opener.attrSet("class", "custom-paragraph")
    return tokens

md = MarkdownIt()
tokens = md.parse("# Title\n\nParagraph text.")
modified_tokens = add_custom_class(tokens)
html = md.renderer.render(modified_tokens, md.options, {})

Token Filtering

def filter_html_tokens(tokens):
    """Drop raw-HTML tokens (``html_block``/``html_inline``) for security."""
    safe = []
    for tok in tokens:
        if tok.type.startswith('html_'):
            continue
        safe.append(tok)
    return safe

def extract_headings(tokens):
    """Extract heading levels and text from a token stream.

    Parameters:
    - tokens: sequence of Token objects from MarkdownIt.parse()

    Returns:
    - list of {'level': int, 'text': str} dicts, one per heading,
      in document order
    """
    headings = []
    for i, token in enumerate(tokens):
        if token.type != "heading_open":
            continue
        # The inline token holding the heading text immediately follows
        # heading_open. Index with i+1 rather than tokens.index(token):
        # index() is O(n) per heading and returns the FIRST token that
        # compares equal, which picks the wrong neighbor when two
        # identical headings appear. Bounds check guards a truncated
        # stream where heading_open is the last token.
        if i + 1 < len(tokens) and tokens[i + 1].type == "inline":
            headings.append({
                'level': int(token.tag[1]),  # h1->1, h2->2, etc.
                'text': tokens[i + 1].content
            })
    return headings

Custom Token Creation

def create_custom_block(content, css_class=None):
    """Build a token sequence for a paragraph wrapped in a div.

    Returns five tokens in order: div_open, paragraph_open, inline
    (carrying *content*), paragraph_close, div_close. When *css_class*
    is truthy it is set as the wrapper div's ``class`` attribute.
    """
    wrapper_open = Token("div_open", "div", 1)
    if css_class:
        wrapper_open.attrSet("class", css_class)

    paragraph = [
        Token("paragraph_open", "p", 1, level=1),
        Token("inline", "", 0, content=content, level=1),
        Token("paragraph_close", "p", -1, level=1),
    ]

    wrapper_close = Token("div_close", "div", -1)
    return [wrapper_open, *paragraph, wrapper_close]

Install with Tessl CLI

npx tessl i tessl/pypi-markdown-it-py

docs

cli.md

configuration.md

core-parsing.md

index.md

link-processing.md

rendering.md

syntax-tree.md

token-system.md

tile.json