markdown-it-py: a Python port of markdown-it providing CommonMark-compliant markdown parsing with configurable syntax and a pluggable architecture.

This module covers URL validation, normalization, and link-processing utilities with built-in security features to prevent XSS attacks and to ensure safe link handling in markdown documents.

Security-focused URL validation to prevent malicious links:
def validateLink(self, url: str) -> bool:
    """Validate whether a URL link is allowed in rendered output.

    This validator can prohibit more than strictly necessary to prevent
    XSS; that is a deliberate tradeoff to keep the code simple and to be
    secure by default.

    Parameters:
    - url: URL to validate (should be normalized and entities decoded)

    Returns:
    - bool: True if the URL is considered safe

    Usage example::

        from markdown_it import MarkdownIt

        md = MarkdownIt()

        # Test URL validation
        safe_urls = [
            "https://example.com",
            "http://example.com/path",
            "mailto:user@example.com",
            "/relative/path",
            "#anchor",
        ]
        unsafe_urls = [
            "javascript:alert('xss')",
            "data:text/html,<script>alert('xss')</script>",
            "vbscript:msgbox('xss')",
        ]
        for url in safe_urls:
            print(f"{url}: {md.validateLink(url)}")  # Should be True
        for url in unsafe_urls:
            print(f"{url}: {md.validateLink(url)}")  # Should be False
    """

# Normalize URLs for consistency and security.
def normalizeLink(self, url: str) -> str:
    """Normalize destination URLs in links.

    Used for link destinations like::

        [label]: destination 'title'
                 ^^^^^^^^^^^

    Parameters:
    - url: raw URL to normalize

    Returns:
    - str: normalized URL
    """
def normalizeLinkText(self, link: str) -> str:
    """Normalize autolink content.

    Used for autolink content like::

        <destination>
         ~~~~~~~~~~~

    Parameters:
    - link: raw link text to normalize

    Returns:
    - str: normalized link text

    Usage example::

        from markdown_it import MarkdownIt

        md = MarkdownIt()

        # URL normalization
        raw_url = "HTTP://EXAMPLE.COM/Path With Spaces"
        normalized = md.normalizeLink(raw_url)
        print(normalized)  # "http://example.com/Path%20With%20Spaces"

        # Link text normalization
        raw_link = "www.example.com/path"
        normalized_text = md.normalizeLinkText(raw_link)
        print(normalized_text)  # Normalized for display
    """

# Low-level utilities for parsing link components.
from markdown_it.helpers import parseLinkDestination, parseLinkLabel, parseLinkTitle
# NOTE: parameter names mirror the markdown-it JS API and shadow the
# `str`/`max` builtins; kept for API compatibility.
def parseLinkDestination(str: str, pos: int, max: int) -> dict:
    """Parse a link destination from the input string.

    Parameters:
    - str: input string
    - pos: starting position
    - max: maximum position (exclusive scan limit)

    Returns:
    - dict: {ok: bool, pos: int, str: str} - parse result
    """
# NOTE: parameter names mirror the markdown-it JS API and shadow the
# `str`/`max` builtins; kept for API compatibility.
def parseLinkLabel(str: str, pos: int, max: int) -> dict:
    """Parse a link label from the input string.

    Parameters:
    - str: input string
    - pos: starting position
    - max: maximum position (exclusive scan limit)

    Returns:
    - dict: {ok: bool, pos: int, str: str} - parse result
    """
def parseLinkTitle(str: str, pos: int, max: int) -> dict:
    """Parse a link title from the input string.

    Parameters:
    - str: input string
    - pos: starting position
    - max: maximum position (exclusive scan limit)

    Returns:
    - dict: {ok: bool, pos: int, str: str, marker: str} - parse result

    Usage example::

        from markdown_it.helpers import (
            parseLinkDestination, parseLinkLabel, parseLinkTitle,
        )

        # Parse link destination
        text = '<https://example.com> "Title"'
        result = parseLinkDestination(text, 1, len(text) - 1)
        print(result)  # {ok: True, pos: 19, str: 'https://example.com'}

        # Parse link label
        text = '[Link Text]'
        result = parseLinkLabel(text, 0, len(text))
        print(result)  # {ok: True, pos: 11, str: 'Link Text'}

        # Parse link title
        text = '"Title Here"'
        result = parseLinkTitle(text, 0, len(text))
        print(result)  # {ok: True, pos: 12, str: 'Title Here', marker: '"'}
    """

# Built-in protection against cross-site scripting attacks:
def custom_link_validator(url):
    """Validate *url*, layering extra security checks on top of the
    built-in markdown-it validator.

    Returns False for URLs rejected by the built-in validator, for a
    handful of additional protocols, or when a suspicious substring
    appears anywhere in the URL.
    """
    from markdown_it.common.normalize_url import validateLink

    # The built-in validator must approve the URL first.
    if not validateLink(url):
        return False

    lowered = url.lower()

    # Block additional dangerous protocols
    if lowered.startswith(('file:', 'ftp:', 'news:', 'gopher:')):
        return False

    # Block URLs with suspicious patterns
    for marker in ('<script', 'javascript:', 'vbscript:', 'data:'):
        if marker in lowered:
            return False

    return True
# Override validation in renderer
def secure_link_open(tokens, idx, options, env):
    """Render a ``link_open`` token, neutralizing unsafe hrefs first.

    Unsafe hrefs (per ``custom_link_validator``) are replaced by a
    placeholder anchor plus marker class/title attributes before
    delegating to the default renderer.

    NOTE(review): ``default_link_open`` is not defined in this snippet —
    presumably it must be captured from the renderer (e.g.
    ``md.renderer.rules``) before installing this override; confirm.
    """
    token = tokens[idx]
    href = token.attrGet("href")
    if href and not custom_link_validator(href):
        # Replace with safe placeholder
        token.attrSet("href", "#invalid-link")
        token.attrSet("class", "invalid-link")
        token.attrSet("title", "Invalid or potentially unsafe link")
    return default_link_open(tokens, idx, options, env)

# Sanitize and validate link content:
def sanitize_link_content(tokens):
    """Sanitize link and image tokens in place for security.

    For each ``link_open`` token, the href is normalized and validated;
    safe external links gain ``rel="noopener noreferrer"`` and
    ``target="_blank"``, while unsafe links are demoted to plain text.
    For each ``image`` token, an unsafe src is blanked and the alt text
    flags the removal.

    Parameters:
    - tokens: token list to sanitize (mutated in place)

    Returns:
    - the same token list, for convenience
    """
    # Import once up front instead of re-importing inside the loop.
    from markdown_it.common.normalize_url import normalizeLink, validateLink

    for token in tokens:
        if token.type == "link_open":
            href = token.attrGet("href")
            if href:
                normalized_href = normalizeLink(href)
                if validateLink(normalized_href):
                    token.attrSet("href", normalized_href)
                    # Add security attributes to external links.
                    if normalized_href.startswith(('http://', 'https://')):
                        token.attrSet("rel", "noopener noreferrer")
                        token.attrSet("target", "_blank")
                else:
                    # Remove unsafe link: demote the token to plain text.
                    token.type = "text"
                    token.tag = ""
                    token.content = href
        elif token.type == "image":
            src = token.attrGet("src")
            if src:
                normalized_src = normalizeLink(src)
                if validateLink(normalized_src):
                    token.attrSet("src", normalized_src)
                else:
                    # Remove unsafe image source, flag it in the alt text.
                    token.attrSet("src", "")
                    token.attrSet("alt", f"[Invalid image: {src}]")
    return tokens

# Process reference-style links and their definitions:
def extract_reference_links(env):
    """Collect reference-link definitions stored in *env*.

    Returns a list of {'label', 'href', 'title'} dicts, one per entry in
    ``env['references']`` (empty list when none are present).
    """
    return [
        {
            'label': label,
            'href': definition.get('href', ''),
            'title': definition.get('title', ''),
        }
        for label, definition in env.get('references', {}).items()
    ]
def add_reference_link(env, label, href, title=""):
    """Register a reference-link definition in *env*.

    The label is lowercased before being used as the key; the
    ``references`` mapping is created on first use.
    """
    refs = env.setdefault('references', {})
    refs[label.lower()] = {'href': href, 'title': title}
# Usage
from markdown_it import MarkdownIt  # required for the example below

md = MarkdownIt()
env = {}

# Parse markdown with reference links
text = """
[Link 1][ref1]
[Link 2][ref2]
[ref1]: https://example.com "Example"
[ref2]: https://another.com
"""
tokens = md.parse(text, env)
references = extract_reference_links(env)
for ref in references:
    print(f"Reference '{ref['label']}': {ref['href']}")

# Handle automatic link detection and processing:
def extract_autolinks(tokens):
    """Return the href of every autolink found in *tokens*.

    Autolinks are ``link_open`` tokens whose ``info`` is ``"auto"``;
    container tokens with children are searched recursively.
    """
    found = []
    for tok in tokens:
        if tok.type == "link_open" and tok.info == "auto":
            # This is an autolink
            found.append(tok.attrGet("href"))
        elif tok.children:
            # Recursively check children
            found.extend(extract_autolinks(tok.children))
    return found
def disable_autolinks_for_domains(md, blocked_domains):
    """Replace the core ``linkify`` rule so autolinks pointing at any of
    *blocked_domains* are demoted back to plain text.

    Parameters:
    - md: MarkdownIt instance to modify
    - blocked_domains: iterable of domain substrings to block
    """
    # Look the rule up by name instead of relying on a fragile hard-coded
    # positional index (the original used getRules("")[3], which breaks
    # whenever rules are enabled/disabled or reordered).
    original_linkify = next(
        rule for rule in md.core.ruler.getRules("")
        if getattr(rule, "__name__", "") == "linkify"
    )

    def filtered_linkify(state):
        # Run original linkify
        original_linkify(state)
        # Filter out blocked domains
        for token in state.tokens:
            if token.type == "inline" and token.children:
                for child in token.children:
                    if child.type == "link_open" and child.info == "auto":
                        href = child.attrGet("href")
                        if any(domain in href for domain in blocked_domains):
                            # Convert back to text
                            child.type = "text"
                            child.content = href

    # Replace linkify rule
    md.core.ruler.at("linkify", filtered_linkify)

# Analyze and report on links in documents:
def analyze_links(tokens):
    """Analyze all links in a token stream.

    Walks *tokens* (recursing into children) and tallies link/image
    statistics.

    Returns:
    - dict with keys: total_links, external_links, internal_links,
      reference_links, autolinks, images, broken_links (tokens whose
      href is missing/empty), and domains (list of netlocs seen).
    """
    from urllib.parse import urlparse  # hoisted: was re-imported per link

    analysis = {
        'total_links': 0,
        'external_links': 0,
        'internal_links': 0,
        'reference_links': 0,
        'autolinks': 0,
        'images': 0,
        'broken_links': [],
        'domains': set(),
    }

    def analyze_token_links(token_list):
        for token in token_list:
            if token.type == "link_open":
                analysis['total_links'] += 1
                href = token.attrGet("href")
                if token.info == "auto":
                    analysis['autolinks'] += 1
                if href:
                    if href.startswith(('http://', 'https://')):
                        analysis['external_links'] += 1
                        # Extract domain
                        analysis['domains'].add(urlparse(href).netloc)
                    elif href.startswith('#'):
                        analysis['internal_links'] += 1
                else:
                    # BUG FIX: the original wrote `elif not href:` chained
                    # under `if href:`, so this branch could never run and
                    # broken_links was always empty.
                    analysis['broken_links'].append(token)
            elif token.type == "image":
                analysis['images'] += 1
                src = token.attrGet("src")
                if src and src.startswith(('http://', 'https://')):
                    analysis['domains'].add(urlparse(src).netloc)
            elif token.children:
                analyze_token_links(token.children)

    analyze_token_links(tokens)
    analysis['domains'] = list(analysis['domains'])
    return analysis
# Usage
from markdown_it import MarkdownIt  # required for the example below

md = MarkdownIt('gfm-like')
tokens = md.parse("""
# Document
[External link](https://example.com)
[Internal link](#section)
https://auto.link.com

""")
link_analysis = analyze_links(tokens)
print(f"Found {link_analysis['total_links']} links")
print(f"External domains: {link_analysis['domains']}")

# Install with Tessl CLI
npx tessl i tessl/pypi-markdown-it-py