High-performance HTML sanitization library with Python bindings to Rust ammonia crate
npx @tessl/cli install tessl/pypi-nh3@0.3.0

High-performance HTML sanitization library providing Python bindings to the Rust ammonia crate. NH3 delivers fast, secure HTML cleaning with comprehensive configuration options, and is approximately 20x faster than alternatives like bleach while maintaining security and flexibility.
pip install nh3

import nh3

All functionality is available at the module level. For type hints:
from typing import Callable, Dict, Optional, Set

import nh3
# Basic HTML sanitization
html = '<script>alert("xss")</script><p>Safe <b>content</b></p>'
clean_html = nh3.clean(html)
print(clean_html) # Output: '<p>Safe <b>content</b></p>'
# Text escaping
text = 'User input with <dangerous> characters & symbols'
escaped = nh3.clean_text(text)
print(escaped) # Output: 'User&#32;input&#32;with&#32;&lt;dangerous&gt;&#32;characters&#32;&amp;&#32;symbols'
# Check if string contains HTML
has_html = nh3.is_html('<p>HTML content</p>') # True
has_no_html = nh3.is_html('Plain text') # False
# Using a reusable cleaner with custom configuration
cleaner = nh3.Cleaner(
tags={'p', 'b', 'i', 'strong', 'em'},
attributes={'*': {'class', 'id'}},
strip_comments=True
)
result = cleaner.clean('<p class="text">Safe <script>evil()</script> content</p>')
print(result) # Output: '<p class="text">Safe content</p>'

Primary function for cleaning HTML content with extensive configuration options for allowed tags, attributes, URL schemes, and content filtering.
def clean(
html: str,
tags: Optional[Set[str]] = None,
clean_content_tags: Optional[Set[str]] = None,
attributes: Optional[Dict[str, Set[str]]] = None,
attribute_filter: Optional[Callable[[str, str, str], Optional[str]]] = None,
strip_comments: bool = True,
link_rel: Optional[str] = "noopener noreferrer",
generic_attribute_prefixes: Optional[Set[str]] = None,
tag_attribute_values: Optional[Dict[str, Dict[str, Set[str]]]] = None,
set_tag_attribute_values: Optional[Dict[str, Dict[str, str]]] = None,
url_schemes: Optional[Set[str]] = None,
allowed_classes: Optional[Dict[str, Set[str]]] = None,
filter_style_properties: Optional[Set[str]] = None
) -> str:
"""
Sanitize an HTML fragment according to the given options.
Parameters:
- html: Input HTML fragment to sanitize
- tags: Set of allowed HTML tags (defaults to ALLOWED_TAGS)
- clean_content_tags: Tags whose contents are completely removed
- attributes: Allowed attributes per tag ('*' key for any tag)
- attribute_filter: Callback for custom attribute processing
- strip_comments: Whether to remove HTML comments
- link_rel: Rel attribute value added to links
- generic_attribute_prefixes: Attribute prefixes allowed on any tag
- tag_attribute_values: Allowed attribute values per tag
- set_tag_attribute_values: Required attribute values per tag
- url_schemes: Permitted URL schemes for href/src attributes
- allowed_classes: Allowed CSS classes per tag
- filter_style_properties: Allowed CSS properties in style attributes
Returns:
Sanitized HTML fragment as string
"""

Usage Examples:
# Allow only specific tags
nh3.clean('<div><p>Text</p><script>evil()</script></div>', tags={'p'})
# Result: '<p>Text</p>'
# Remove script/style content completely
nh3.clean('<style>body{}</style><p>Text</p>', clean_content_tags={'style'})
# Result: '<p>Text</p>'
# Custom attribute filtering
def filter_classes(tag, attr, value):
    """Keep only whitelisted CSS classes on ``div`` elements.

    Parameters:
    - tag: Name of the element the attribute belongs to
    - attr: Name of the attribute being filtered
    - value: Current value of the attribute

    Returns:
    For the ``class`` attribute of a ``div``: the allowed classes joined by
    single spaces, or None (which removes the attribute) when none of them
    is allowed. Every other attribute value is returned unchanged.
    """
    if tag == 'div' and attr == 'class':
        allowed = {'container', 'wrapper'}
        filtered = allowed.intersection(value.split())
        # Sets are unordered; sort so the emitted class list is deterministic.
        return ' '.join(sorted(filtered)) if filtered else None
    return value
nh3.clean('<div class="container evil">text</div>',
attributes={'div': {'class'}},
attribute_filter=filter_classes)
# Result: '<div class="container">text</div>'
# Allow data attributes with prefixes
nh3.clean('<div data-id="123" onclick="evil()">text</div>',
generic_attribute_prefixes={'data-'})
# Result: '<div data-id="123">text</div>'
# Control URL schemes
nh3.clean('<a href="javascript:alert()">link</a>', url_schemes={'https', 'http'})
# Result: '<a rel="noopener noreferrer">link</a>' (href removed; the default link_rel is still added)
# Filter CSS properties
nh3.clean('<p style="color:red;display:none">text</p>',
attributes={'p': {'style'}},
filter_style_properties={'color'})
# Result: '<p style="color:red">text</p>'

Converts arbitrary strings to HTML-safe text by escaping special characters. This is similar to html.escape(), but escapes more aggressively for maximum security.
def clean_text(html: str) -> str:
"""
Turn an arbitrary string into unformatted HTML by escaping special characters.
Parameters:
- html: Input string to escape
Returns:
HTML-escaped string safe for display in HTML context
"""

Usage Examples:
# Basic text escaping
nh3.clean_text('Price: $5 & up')
# Result: 'Price:&#32;$5&#32;&amp;&#32;up'
# JavaScript injection prevention
nh3.clean_text('"); alert("xss");//')
# Result: '&quot;);&#32;alert(&quot;xss&quot;);&#47;&#47;'
# HTML tag neutralization
nh3.clean_text('<script>alert("hello")</script>')
# Result: '&lt;script&gt;alert(&quot;hello&quot;)&lt;&#47;script&gt;'

Determines whether a string contains HTML syntax through full parsing, useful for conditional processing of user input.
def is_html(html: str) -> bool:
"""
Determine if a given string contains HTML syntax.
Parameters:
- html: Input string to analyze
Returns:
True if string contains HTML syntax (including invalid HTML), False otherwise
"""

Usage Examples:
# Valid HTML detection
nh3.is_html('<p>Hello world</p>') # True
nh3.is_html('<br>') # True
# Invalid HTML still detected
nh3.is_html('<invalid-tag>') # True
nh3.is_html('Vec::<u8>::new()') # True (angle brackets detected)
# Plain text
nh3.is_html('Hello world') # False
nh3.is_html('Price: $5 & up') # False

Class-based interface for creating configured sanitizers that can be reused multiple times, providing better performance for repeated sanitization with the same settings.
class Cleaner:
def __init__(
self,
tags: Optional[Set[str]] = None,
clean_content_tags: Optional[Set[str]] = None,
attributes: Optional[Dict[str, Set[str]]] = None,
attribute_filter: Optional[Callable[[str, str, str], Optional[str]]] = None,
strip_comments: bool = True,
link_rel: Optional[str] = "noopener noreferrer",
generic_attribute_prefixes: Optional[Set[str]] = None,
tag_attribute_values: Optional[Dict[str, Dict[str, Set[str]]]] = None,
set_tag_attribute_values: Optional[Dict[str, Dict[str, str]]] = None,
url_schemes: Optional[Set[str]] = None,
allowed_classes: Optional[Dict[str, Set[str]]] = None,
filter_style_properties: Optional[Set[str]] = None
) -> None:
"""
Create a reusable sanitizer with the given configuration.
Parameters: Same as clean() function parameters
"""
def clean(self, html: str) -> str:
"""
Sanitize HTML using the configured options.
Parameters:
- html: Input HTML fragment to sanitize
Returns:
Sanitized HTML fragment as string
"""

Usage Examples:
# Create a cleaner for blog content
# Note: 'class' is intentionally NOT listed in `attributes`. `allowed_classes`
# already permits the class attribute (restricted to the listed values) on the
# given tags, and ammonia treats allowing 'class' in both places as a
# contradictory configuration that fails when clean() runs.
blog_cleaner = nh3.Cleaner(
    tags={'p', 'br', 'strong', 'em', 'a', 'ul', 'ol', 'li'},
    attributes={
        'a': {'href', 'title'}
    },
    allowed_classes={
        'p': {'highlight', 'quote'},
        'a': {'external-link'}
    },
    url_schemes={'http', 'https', 'mailto'}
)
# Reuse the cleaner for multiple inputs
user_content1 = blog_cleaner.clean('<p class="highlight">Safe content</p>')
user_content2 = blog_cleaner.clean('<script>evil()</script><p>More content</p>')
# Create a strict cleaner for user comments
comment_cleaner = nh3.Cleaner(
tags={'p', 'br'},
attributes={},
strip_comments=True,
link_rel=None
)
safe_comment = comment_cleaner.clean('<p>User comment with <a>no links</a></p>')
# Result: '<p>User comment with no links</p>'

Pre-configured sets of allowed tags, attributes, and URL schemes based on secure defaults from the ammonia library.
ALLOWED_TAGS: Set[str]
# Default set of allowed HTML tags including: a, abbr, acronym, area, article, aside,
# b, bdi, bdo, blockquote, br, button, caption, center, cite, code, col, colgroup,
# data, datalist, dd, del, details, dfn, div, dl, dt, em, fieldset, figcaption,
# figure, footer, form, h1, h2, h3, h4, h5, h6, header, hgroup, hr, i, img, input,
# ins, kbd, keygen, label, legend, li, main, map, mark, meter, nav, ol, optgroup,
# option, output, p, pre, progress, q, rp, rt, ruby, s, samp, section, select,
# small, span, strong, sub, summary, sup, table, tbody, td, textarea, tfoot, th,
# thead, time, tr, u, ul, var, wbr
ALLOWED_ATTRIBUTES: Dict[str, Set[str]]
# Default mapping of allowed attributes per tag, includes common safe attributes
# like href for links, src for images, type for inputs, etc.
ALLOWED_URL_SCHEMES: Set[str]
# Default set of allowed URL schemes, including http, https, mailto, and tel

Usage Examples:
# Inspect default allowed tags
print('p' in nh3.ALLOWED_TAGS) # True
print('script' in nh3.ALLOWED_TAGS) # False
# Extend default attributes
from copy import deepcopy
# Deep-copy so the module-level defaults (and their inner sets) are not mutated
custom_attributes = deepcopy(nh3.ALLOWED_ATTRIBUTES)
# The defaults may have no entry for 'div'; create one instead of risking a KeyError
custom_attributes.setdefault('div', set()).add('data-id')
custom_attributes['*'] = {'class', 'id'}
# Use extended configuration
result = nh3.clean('<div class="box" data-id="123">content</div>',
                   attributes=custom_attributes)
# Remove tags using set operations
restricted_tags = nh3.ALLOWED_TAGS - {'b', 'i'}
nh3.clean('<b><i>text</i></b><p>paragraph</p>', tags=restricted_tags)
# Result: 'text<p>paragraph</p>'
# Remove URL schemes using set operations
safe_schemes = nh3.ALLOWED_URL_SCHEMES - {'tel'}
nh3.clean('<a href="tel:+1">Call</a> or <a href="mailto:me">email</a>',
url_schemes=safe_schemes)
# Result: '<a rel="noopener noreferrer">Call</a> or <a href="mailto:me" rel="noopener noreferrer">email</a>'
# Check default URL schemes
print('https' in nh3.ALLOWED_URL_SCHEMES) # True
print('javascript' in nh3.ALLOWED_URL_SCHEMES) # False

The attribute_filter parameter accepts a callable that receives three string parameters (tag, attribute, value) and can return a modified value or None to remove the attribute entirely.
def smart_class_filter(tag, attr, value):
    """Example: Only allow specific CSS classes.

    Parameters:
    - tag: Name of the element the attribute belongs to
    - attr: Name of the attribute being filtered
    - value: Current value of the attribute

    Returns:
    For a ``class`` attribute on a tag that has a whitelist: the allowed
    classes, sorted and joined by single spaces, or None (which removes the
    attribute) when none of them is allowed. Any other attribute, and
    ``class`` on a tag without a whitelist, is returned unchanged.
    """
    if attr == 'class':
        allowed_classes = {
            'p': {'intro', 'highlight', 'quote'},
            'div': {'container', 'wrapper', 'sidebar'},
            'a': {'external', 'internal'}
        }
        if tag in allowed_classes:
            classes = set(value.split())
            filtered = classes.intersection(allowed_classes[tag])
            # Sorted so the emitted class list is deterministic
            return ' '.join(sorted(filtered)) if filtered else None
    return value
# Apply the filter
result = nh3.clean(
'<div class="container evil"><p class="intro spam">Text</p></div>',
attributes={'div': {'class'}, 'p': {'class'}},
attribute_filter=smart_class_filter
)
# Result: '<div class="container"><p class="intro">Text</p></div>'

Control which specific values are allowed for attributes on specific tags.
# Only allow specific form input types
result = nh3.clean(
'<input type="text"><input type="password"><input type="file">',
tags={'input'},
tag_attribute_values={
'input': {
'type': {'text', 'email', 'password', 'number'}
}
}
)
# Result: '<input type="text"><input type="password"><input>'

Automatically add or override attribute values on specific tags.
# Always add target="_blank" to external links
result = nh3.clean(
'<a href="https://example.com">Link</a>',
tags={'a'},
attributes={'a': {'href', 'target'}},
set_tag_attribute_values={
'a': {'target': '_blank'}
},
link_rel='noopener noreferrer'
)
# Result: '<a href="https://example.com" target="_blank" rel="noopener noreferrer">Link</a>'

NH3 follows Python conventions for error handling:
A TypeError is raised if the provided attribute_filter callback is not callable. Exceptions raised within the callback are handled as unraisable exceptions and logged, allowing processing to continue.

__version__: str
# Package version string (e.g., "0.3.0")