CtrlK
Blog · Docs · Log in · Get started
Tessl Logo

tessl/pypi-markdown-it-py

Python port of markdown-it providing CommonMark-compliant markdown parsing with configurable syntax and pluggable architecture

Pending
Overview
Eval results
Files

docs/link-processing.md

Link Processing and Security

URL validation, normalization, and link processing utilities with built-in security features to prevent XSS attacks and ensure safe link handling in markdown documents.

Capabilities

URL Validation

Security-focused URL validation to prevent malicious links.

def validateLink(self, url: str) -> bool:
    """
    Validate whether a URL is allowed to appear in rendered output.

    This validator deliberately prohibits more than strictly necessary to
    prevent XSS; that is a tradeoff to keep the code simple and secure by
    default. Override it on the MarkdownIt instance to tighten or relax
    the policy.

    Parameters:
    - url: URL to validate (should already be normalized, with entities decoded)

    Returns:
    - bool: True if the URL is considered safe to emit
    """

Usage Example:

from markdown_it import MarkdownIt

md = MarkdownIt()

# URLs the default validator accepts
safe_urls = [
    "https://example.com",
    "http://example.com/path",
    "mailto:user@example.com",
    "/relative/path",
    "#anchor",
]

# URLs the default validator rejects
unsafe_urls = [
    "javascript:alert('xss')",
    "data:text/html,<script>alert('xss')</script>",
    "vbscript:msgbox('xss')",
]

# safe group prints True, unsafe group prints False
for group in (safe_urls, unsafe_urls):
    for url in group:
        print(f"{url}: {md.validateLink(url)}")

URL Normalization

Normalize URLs for consistency and security.

def normalizeLink(self, url: str) -> str:
    """
    Normalize a link destination URL (e.g. percent-encoding unsafe
    characters such as spaces).

    Used for link destinations like:
    [label]: destination   'title'
             ^^^^^^^^^^^

    Parameters:
    - url: raw URL to normalize

    Returns:
    - str: normalized URL suitable for use in an href attribute
    """

def normalizeLinkText(self, link: str) -> str:
    """
    Normalize autolink content for display (the text shown to the reader,
    as opposed to the href produced by normalizeLink).

    Used for autolink content like:
    <destination>
     ~~~~~~~~~~~

    Parameters:
    - link: raw link text to normalize

    Returns:
    - str: normalized link text
    """

Usage Example:

from markdown_it import MarkdownIt

md = MarkdownIt()

# Destination normalization: scheme/host lowercased, spaces percent-encoded
print(md.normalizeLink("HTTP://EXAMPLE.COM/Path With Spaces"))
# -> "http://example.com/Path%20With%20Spaces"

# Autolink text normalization (display form)
print(md.normalizeLinkText("www.example.com/path"))

Link Helper Functions

Low-level utilities for parsing link components.

from markdown_it.helpers import parseLinkDestination, parseLinkLabel, parseLinkTitle

def parseLinkDestination(str: str, pos: int, max: int) -> dict:
    """
    Parse a link destination starting at ``pos`` in the input string.

    Parameters:
    - str: input string (parameter name shadows the builtin; kept for API parity)
    - pos: index at which parsing starts
    - max: index past which parsing must not continue

    Returns:
    - dict: {ok: bool, pos: int, str: str} - ``ok`` signals success, ``pos``
      is the position after the destination, ``str`` the parsed destination.
      (The installed library may return a small result object exposing these
      as attributes rather than a plain dict - verify against
      markdown_it.helpers before relying on dict access.)
    """

def parseLinkLabel(str: str, pos: int, max: int) -> dict:
    """
    Parse a bracketed link label starting at ``pos`` in the input string.

    Parameters:
    - str: input string (parameter name shadows the builtin; kept for API parity)
    - pos: index at which parsing starts
    - max: index past which parsing must not continue

    Returns:
    - dict: {ok: bool, pos: int, str: str} - ``ok`` signals success, ``pos``
      is the position after the label, ``str`` the label text.
      (The installed library's exact signature and return type may differ -
      verify against markdown_it.helpers before relying on this shape.)
    """

def parseLinkTitle(str: str, pos: int, max: int) -> dict:
    """
    Parse a quoted link title starting at ``pos`` in the input string.

    Parameters:
    - str: input string (parameter name shadows the builtin; kept for API parity)
    - pos: index at which parsing starts
    - max: index past which parsing must not continue

    Returns:
    - dict: {ok: bool, pos: int, str: str, marker: str} - ``marker`` is the
      quote character that delimited the title.
      (The installed library may return a result object exposing these as
      attributes rather than a plain dict - verify against markdown_it.helpers.)
    """

Usage Example:

from markdown_it.helpers import parseLinkDestination, parseLinkLabel, parseLinkTitle

# Destination: parsing starts just past the opening "<"
sample = '<https://example.com> "Title"'
print(parseLinkDestination(sample, 1, len(sample) - 1))
# ok=True, str='https://example.com'

# Label: the full bracketed span
sample = '[Link Text]'
print(parseLinkLabel(sample, 0, len(sample)))
# ok=True, str='Link Text'

# Title: quoted string, with the quote char reported as marker
sample = '"Title Here"'
print(parseLinkTitle(sample, 0, len(sample)))
# ok=True, str='Title Here', marker='"'

Security Features

XSS Prevention

Built-in protection against cross-site scripting attacks:

def custom_link_validator(url):
    """Validate *url*: built-in XSS check plus extra protocol/pattern bans."""
    from markdown_it.common.normalize_url import validateLink

    # Defer to the library's default, security-oriented validation first.
    if not validateLink(url):
        return False

    lowered = url.lower()

    # Reject protocols that the default validator may permit but we do not.
    for proto in ('file:', 'ftp:', 'news:', 'gopher:'):
        if lowered.startswith(proto):
            return False

    # Reject anything containing obviously dangerous substrings.
    for pattern in ('<script', 'javascript:', 'vbscript:', 'data:'):
        if pattern in lowered:
            return False

    return True

# Override validation in renderer
def secure_link_open(tokens, idx, options, env):
    """Secure link rendering with validation."""
    token = tokens[idx]
    href = token.attrGet("href")
    
    if href and not custom_link_validator(href):
        # Replace with safe placeholder
        token.attrSet("href", "#invalid-link")
        token.attrSet("class", "invalid-link")
        token.attrSet("title", "Invalid or potentially unsafe link")
    
    return default_link_open(tokens, idx, options, env)

Content Security

Sanitize and validate link content:

def sanitize_link_content(tokens):
    """Normalize and validate link/image URLs in the token list, in place.

    Unsafe links are demoted to plain text; unsafe images get a blank src.
    Returns the same token list.
    """
    from markdown_it.common.normalize_url import normalizeLink, validateLink

    for token in tokens:
        if token.type == "link_open":
            href = token.attrGet("href")
            if not href:
                continue
            normalized_href = normalizeLink(href)
            if validateLink(normalized_href):
                token.attrSet("href", normalized_href)
                # External links get hardening attributes.
                if normalized_href.startswith(('http://', 'https://')):
                    token.attrSet("rel", "noopener noreferrer")
                    token.attrSet("target", "_blank")
            else:
                # Demote the unsafe link to inert text.
                token.type = "text"
                token.tag = ""
                token.content = href
        elif token.type == "image":
            src = token.attrGet("src")
            if not src:
                continue
            normalized_src = normalizeLink(src)
            if validateLink(normalized_src):
                token.attrSet("src", normalized_src)
            else:
                # Blank the source and flag the removal in the alt text.
                token.attrSet("src", "")
                token.attrSet("alt", f"[Invalid image: {src}]")

    return tokens

Link Processing Utilities

Reference Link Handling

Process reference-style links and their definitions:

def extract_reference_links(env):
    """Return the reference-link definitions in *env* as a list of dicts.

    Each entry has 'label', 'href' and 'title' keys; missing href/title
    default to the empty string. An env without 'references' yields [].
    """
    return [
        {
            'label': label,
            'href': ref_data.get('href', ''),
            'title': ref_data.get('title', ''),
        }
        for label, ref_data in env.get('references', {}).items()
    ]

def add_reference_link(env, label, href, title=""):
    """Register a reference-link definition in *env* (label is lowercased)."""
    refs = env.setdefault('references', {})
    refs[label.lower()] = {'href': href, 'title': title}

# Usage
parser = MarkdownIt()
parse_env = {}

# Reference definitions are collected into the env during parsing.
source = """
[Link 1][ref1]
[Link 2][ref2]

[ref1]: https://example.com "Example"
[ref2]: https://another.com
"""

tokens = parser.parse(source, parse_env)

for ref in extract_reference_links(parse_env):
    print(f"Reference '{ref['label']}': {ref['href']}")

Autolink Processing

Handle automatic link detection and processing:

def extract_autolinks(tokens):
    """Collect hrefs of autolink tokens, descending into child tokens."""
    found = []
    for tok in tokens:
        # Autolinks are link_open tokens whose info field is "auto".
        if tok.type == "link_open" and tok.info == "auto":
            found.append(tok.attrGet("href"))
        elif tok.children:
            # Inline containers carry their links as children.
            found.extend(extract_autolinks(tok.children))
    return found

def disable_autolinks_for_domains(md, blocked_domains):
    """Disable autolink processing for specific domains.

    Wraps the core linkify rule so that, after linkification runs, autolinks
    whose href matches any entry in *blocked_domains* are converted back to
    plain text.
    """
    # NOTE(review): indexing the core rules by position is fragile - this
    # assumes linkify is the 4th rule; confirm, or look the rule up by name.
    original_linkify = md.core.ruler.getRules("")[3]  # linkify rule

    def filtered_linkify(state):
        # Run original linkify
        original_linkify(state)

        # Filter out blocked domains
        for token in state.tokens:
            if (token.type == "inline" and token.children):
                for child in token.children:
                    if (child.type == "link_open" and 
                        child.info == "auto"):
                        href = child.attrGet("href")
                        # Substring match: blocking "example.com" also blocks
                        # "notexample.com" - tighten if that matters.
                        if any(domain in href for domain in blocked_domains):
                            # Convert back to text.
                            # NOTE(review): the matching link_close token is
                            # left in place - confirm the renderer tolerates
                            # the orphaned close token.
                            child.type = "text"
                            child.content = href
    
    # Replace linkify rule
    md.core.ruler.at("linkify", filtered_linkify)

Link Analysis

Analyze and report on links in documents:

def analyze_links(tokens):
    """Analyze all links in a token stream.

    Walks the tokens (recursing into inline children) and tallies link and
    image statistics.

    Parameters:
    - tokens: token list produced by MarkdownIt.parse()

    Returns:
    - dict with integer counts ('total_links', 'external_links',
      'internal_links', 'reference_links', 'autolinks', 'images'), a
      'broken_links' list of link_open tokens whose href is empty/missing,
      and a 'domains' list of distinct external hostnames.
    """
    from urllib.parse import urlparse  # hoisted out of the per-token loops

    analysis = {
        'total_links': 0,
        'external_links': 0,
        'internal_links': 0,
        'reference_links': 0,
        'autolinks': 0,
        'images': 0,
        'broken_links': [],
        'domains': set()
    }

    def analyze_token_links(token_list):
        for token in token_list:
            if token.type == "link_open":
                analysis['total_links'] += 1
                href = token.attrGet("href")

                if token.info == "auto":
                    analysis['autolinks'] += 1

                if not href:
                    # Bug fix: this case was previously nested under
                    # `if href:` and therefore unreachable, so broken
                    # links were never recorded.
                    analysis['broken_links'].append(token)
                elif href.startswith(('http://', 'https://')):
                    analysis['external_links'] += 1
                    analysis['domains'].add(urlparse(href).netloc)
                elif href.startswith('#'):
                    analysis['internal_links'] += 1

            elif token.type == "image":
                analysis['images'] += 1
                src = token.attrGet("src")
                if src and src.startswith(('http://', 'https://')):
                    analysis['domains'].add(urlparse(src).netloc)

            elif token.children:
                analyze_token_links(token.children)

    analyze_token_links(tokens)
    # Convert for JSON-friendliness / stable public shape.
    analysis['domains'] = list(analysis['domains'])

    return analysis

# Usage
md = MarkdownIt('gfm-like')

document = """
# Document

[External link](https://example.com)
[Internal link](#section)
https://auto.link.com
![Image](https://images.example.com/pic.jpg)
"""

tokens = md.parse(document)
link_analysis = analyze_links(tokens)
print(f"Found {link_analysis['total_links']} links")
print(f"External domains: {link_analysis['domains']}")

Install with Tessl CLI

npx tessl i tessl/pypi-markdown-it-py

docs

cli.md

configuration.md

core-parsing.md

index.md

link-processing.md

rendering.md

syntax-tree.md

token-system.md

tile.json