An easy safelist-based HTML-sanitizing tool for untrusted content.

Install: npx @tessl/cli install tessl/pypi-bleach@6.2.0

An easy safelist-based HTML-sanitizing tool that escapes or strips markup and attributes from untrusted HTML content. Bleach uses an allowlist approach to remove malicious content while preserving safe, intended HTML elements. It can also safely linkify text, applying more comprehensive filters than Django's urlize filter.

Install: pip install bleach
Install with CSS sanitization support (tinycss2): pip install bleach[css]

Import: import bleach

For main functions:
from bleach import clean, linkifyFor classes:
from bleach.sanitizer import Cleaner, BleachSanitizerFilter, attribute_filter_factory
from bleach.linkifier import Linker, LinkifyFilter
from bleach.css_sanitizer import CSSSanitizer

For callbacks:

from bleach.callbacks import nofollow, target_blank

For constants and utilities:
from bleach.sanitizer import ALLOWED_TAGS, ALLOWED_ATTRIBUTES, ALLOWED_PROTOCOLS
from bleach.sanitizer import INVISIBLE_CHARACTERS, INVISIBLE_CHARACTERS_RE, INVISIBLE_REPLACEMENT_CHAR
from bleach.linkifier import DEFAULT_CALLBACKS, build_url_re, build_email_re, TLDS, URL_RE, EMAIL_RE, PROTO_RE
from bleach.css_sanitizer import ALLOWED_CSS_PROPERTIES, ALLOWED_SVG_PROPERTIES
from bleach import html5lib_shim # For HTML_TAGS constant
from bleach import __version__, __releasedate__

Example - basic usage:

import bleach
# Basic HTML sanitization - removes unsafe tags and attributes
unsafe_html = '<script>alert("XSS")</script><p onclick="evil()">Hello <b>world</b></p>'
safe_html = bleach.clean(unsafe_html)
# Result: disallowed markup is escaped rather than passed through, e.g.
# '&lt;script&gt;alert("XSS")&lt;/script&gt;&lt;p onclick="evil()"&gt;Hello <b>world</b>&lt;/p&gt;'
# (note: <p> is not in the default allowlist; pass tags= to allow it)
# Linkification - converts URLs to clickable links
text_with_urls = 'Visit https://example.com for more info!'
linked_text = bleach.linkify(text_with_urls)
# Result: 'Visit <a href="https://example.com" rel="nofollow">https://example.com</a> for more info!'
# Combined cleaning and linkifying
unsafe_text = 'Check out http://evil.com<script>alert("bad")</script>'
safe_linked = bleach.linkify(bleach.clean(unsafe_text))

bleach.clean: Cleans HTML fragments by removing or escaping malicious content using an allowlist-based approach.
def clean(
text: str,
tags: frozenset = ALLOWED_TAGS,
attributes: dict = ALLOWED_ATTRIBUTES,
protocols: frozenset = ALLOWED_PROTOCOLS,
strip: bool = False,
strip_comments: bool = True,
css_sanitizer: CSSSanitizer = None
) -> str:
"""
Clean an HTML fragment of malicious content and return it.
Parameters:
- text: the HTML text to clean
- tags: set of allowed tags; defaults to ALLOWED_TAGS
- attributes: allowed attributes; can be callable, list or dict; defaults to ALLOWED_ATTRIBUTES
- protocols: allowed list of protocols for links; defaults to ALLOWED_PROTOCOLS
- strip: whether to strip disallowed elements instead of escaping
- strip_comments: whether to strip HTML comments
- css_sanitizer: instance with sanitize_css method for style attributes
Returns:
Cleaned text as unicode string
"""Converts URL-like strings in HTML fragments to clickable links while preserving existing links and structure.
def linkify(
text: str,
callbacks: list = DEFAULT_CALLBACKS,
skip_tags: set = None,
parse_email: bool = False
) -> str:
"""
Convert URL-like strings in an HTML fragment to links.
Parameters:
- text: the text to linkify
- callbacks: list of callbacks to run when adjusting tag attributes
- skip_tags: set of tags to skip linkifying contents of
- parse_email: whether to linkify email addresses
Returns:
Linkified text as unicode string
"""Configurable HTML cleaner for repeated use with consistent settings.
class Cleaner:
"""
Cleaner for cleaning HTML fragments of malicious content.
Not thread-safe - create separate instances per thread.
"""
def __init__(
self,
tags: frozenset = ALLOWED_TAGS,
attributes: dict = ALLOWED_ATTRIBUTES,
protocols: frozenset = ALLOWED_PROTOCOLS,
strip: bool = False,
strip_comments: bool = True,
filters: list = None,
css_sanitizer: CSSSanitizer = None
):
"""
Initialize a Cleaner instance.
Parameters:
- tags: set of allowed tags
- attributes: allowed attributes configuration
- protocols: allowed protocols for links
- strip: whether to strip disallowed elements
- strip_comments: whether to strip HTML comments
- filters: list of additional html5lib filters
- css_sanitizer: CSS sanitizer instance
"""
def clean(self, text: str) -> str:
"""
Clean the specified HTML text.
Parameters:
- text: HTML text to clean
Returns:
Cleaned HTML text
"""Configurable URL linkifier for repeated use with consistent settings.
class Linker:
"""
Convert URL-like strings in HTML fragments to links with configuration.
"""
def __init__(
self,
callbacks: list = DEFAULT_CALLBACKS,
skip_tags: set = None,
parse_email: bool = False,
url_re = URL_RE,
email_re = EMAIL_RE,
recognized_tags = html5lib_shim.HTML_TAGS
):
"""
Create a Linker instance.
Parameters:
- callbacks: list of callbacks for adjusting tag attributes
- skip_tags: set of tags to skip linkifying contents of
- parse_email: whether to linkify email addresses
- url_re: custom URL matching regex
- email_re: custom email matching regex
- recognized_tags: set of recognized HTML tags
"""
def linkify(self, text: str) -> str:
"""
Linkify the specified text.
Parameters:
- text: text to linkify
Returns:
Linkified text
Raises:
TypeError: if text is not a string type
"""HTML filter for linkifying during html5lib parsing, commonly used with Cleaner filters.
class LinkifyFilter(html5lib_shim.Filter):
"""
HTML filter that linkifies text during html5lib parsing.
Can be used with Cleaner filters for combined cleaning and linkification.
"""
def __init__(
self,
source,
callbacks: list = DEFAULT_CALLBACKS,
skip_tags: set = None,
parse_email: bool = False,
url_re = URL_RE,
email_re = EMAIL_RE
):
"""
Create a LinkifyFilter instance.
Parameters:
- source: html5lib TreeWalker stream
- callbacks: list of callbacks for adjusting tag attributes
- skip_tags: set of tags to skip linkifying contents of
- parse_email: whether to linkify email addresses
- url_re: custom URL matching regex
- email_re: custom email matching regex
"""HTML filter for sanitizing content during html5lib parsing, commonly used with other filters.
class BleachSanitizerFilter(html5lib_shim.SanitizerFilter):
"""
HTML filter that sanitizes HTML during html5lib parsing.
Can be used with other html5lib filters for custom processing.
"""
def __init__(
self,
source,
allowed_tags: frozenset = ALLOWED_TAGS,
attributes = ALLOWED_ATTRIBUTES,
allowed_protocols: frozenset = ALLOWED_PROTOCOLS,
attr_val_is_uri = html5lib_shim.attr_val_is_uri,
svg_attr_val_allows_ref = html5lib_shim.svg_attr_val_allows_ref,
svg_allow_local_href = html5lib_shim.svg_allow_local_href,
strip_disallowed_tags: bool = False,
strip_html_comments: bool = True,
css_sanitizer: CSSSanitizer = None
):
"""
Create a BleachSanitizerFilter instance.
Parameters:
- source: html5lib TreeWalker stream
- allowed_tags: set of allowed tags
- attributes: allowed attributes configuration
- allowed_protocols: allowed protocols for links
- attr_val_is_uri: set of attributes that have URI values
- svg_attr_val_allows_ref: set of SVG attributes that can have references
- svg_allow_local_href: set of SVG elements that can have local hrefs
- strip_disallowed_tags: whether to strip disallowed tags
- strip_html_comments: whether to strip HTML comments
- css_sanitizer: CSS sanitizer instance
"""Sanitizes CSS declarations in style attributes and style elements.
class CSSSanitizer:
"""
CSS sanitizer for cleaning style attributes and style text.
"""
def __init__(
self,
allowed_css_properties: frozenset = ALLOWED_CSS_PROPERTIES,
allowed_svg_properties: frozenset = ALLOWED_SVG_PROPERTIES
):
"""
Initialize CSS sanitizer.
Parameters:
- allowed_css_properties: set of allowed CSS properties
- allowed_svg_properties: set of allowed SVG properties
"""
def sanitize_css(self, style: str) -> str:
"""
Sanitize CSS declarations.
Parameters:
- style: CSS declarations string
Returns:
Sanitized CSS string
"""Callback functions for customizing link attributes during linkification.
def nofollow(attrs: dict, new: bool = False) -> dict:
    """
    Linkify callback that adds rel="nofollow" to links.

    mailto: links are left untouched.

    Parameters:
    - attrs: dict of the link's attributes
    - new: True when the link was newly created by linkify

    Returns:
    The (possibly modified) attributes dictionary
    """
def target_blank(attrs: dict, new: bool = False) -> dict:
"""
Add target="_blank" to links (except mailto links).
Parameters:
- attrs: link attributes dictionary
- new: whether this is a new link
Returns:
Modified attributes dictionary
"""Utility function for creating attribute filter functions from various attribute configurations.
def attribute_filter_factory(attributes) -> callable:
"""
Generate attribute filter function for the given attributes configuration.
The attributes value can be a callable, dict, or list. This returns a filter
function appropriate to the attributes value.
Parameters:
- attributes: attribute configuration (callable, dict, or list)
Returns:
Filter function that takes (tag, attr, value) and returns bool
Raises:
ValueError: if attributes is not a callable, list, or dict
"""Functions for creating custom URL and email matching patterns.
def build_url_re(
    tlds: list = TLDS,
    protocols = html5lib_shim.allowed_protocols
) -> re.Pattern:
    """
    Build a URL-matching regex from custom TLDs and protocols.

    Parameters:
    - tlds: list of top-level domains to recognize
    - protocols: set of allowed protocols

    Returns:
    Compiled regex pattern that matches URLs
    """
def build_email_re(tlds: list = TLDS) -> re.Pattern:
    """
    Build an email-matching regex from custom TLDs.

    Parameters:
    - tlds: list of top-level domains to recognize

    Returns:
    Compiled regex pattern that matches email addresses
    """

# Default allowed HTML tags
# Default allowed HTML tags
ALLOWED_TAGS: frozenset = frozenset(
    "a abbr acronym b blockquote code em i li ol strong ul".split()
)
# Default allowed attributes by tag
ALLOWED_ATTRIBUTES: dict = {
    "a": ["href", "title"],
    "abbr": ["title"],
    "acronym": ["title"],
}
# Default allowed protocols for links
ALLOWED_PROTOCOLS: frozenset = frozenset("http https mailto".split())
# Invisible character handling (requires: from itertools import chain).
# Covers the C0 control characters except tab (9), newline (10), and
# carriage return (13).
INVISIBLE_CHARACTERS: str = "".join(
    map(chr, chain(range(0, 9), range(11, 13), range(14, 32)))
)
INVISIBLE_CHARACTERS_RE: re.Pattern = re.compile(
    "[" + INVISIBLE_CHARACTERS + "]", re.UNICODE
)
# Marker substituted for each invisible character
INVISIBLE_REPLACEMENT_CHAR: str = "?"
# Default linkification callbacks
# Callbacks applied by linkify()/Linker when none are passed explicitly
DEFAULT_CALLBACKS: list = [nofollow]
# Top-level domains for URL detection
# NOTE(review): fixed legacy snapshot; pass a custom tlds list to
# build_url_re()/build_email_re() to recognize newer TLDs
TLDS: list = [
"ac", "ad", "ae", "aero", "af", "ag", "ai", "al", "am", "an", "ao", "aq", "ar", "arpa", "as", "asia", "at", "au", "aw", "ax", "az",
"ba", "bb", "bd", "be", "bf", "bg", "bh", "bi", "biz", "bj", "bm", "bn", "bo", "br", "bs", "bt", "bv", "bw", "by", "bz",
"ca", "cat", "cc", "cd", "cf", "cg", "ch", "ci", "ck", "cl", "cm", "cn", "co", "com", "coop", "cr", "cu", "cv", "cx", "cy", "cz",
"de", "dj", "dk", "dm", "do", "dz", "ec", "edu", "ee", "eg", "er", "es", "et", "eu", "fi", "fj", "fk", "fm", "fo", "fr",
"ga", "gb", "gd", "ge", "gf", "gg", "gh", "gi", "gl", "gm", "gn", "gov", "gp", "gq", "gr", "gs", "gt", "gu", "gw", "gy",
"hk", "hm", "hn", "hr", "ht", "hu", "id", "ie", "il", "im", "in", "info", "int", "io", "iq", "ir", "is", "it",
"je", "jm", "jo", "jobs", "jp", "ke", "kg", "kh", "ki", "km", "kn", "kp", "kr", "kw", "ky", "kz",
"la", "lb", "lc", "li", "lk", "lr", "ls", "lt", "lu", "lv", "ly", "ma", "mc", "md", "me", "mg", "mh", "mil", "mk", "ml", "mm", "mn", "mo", "mobi", "mp", "mq", "mr", "ms", "mt", "mu", "museum", "mv", "mw", "mx", "my", "mz",
"na", "name", "nc", "ne", "net", "nf", "ng", "ni", "nl", "no", "np", "nr", "nu", "nz", "om", "org",
"pa", "pe", "pf", "pg", "ph", "pk", "pl", "pm", "pn", "post", "pr", "pro", "ps", "pt", "pw", "py",
"qa", "re", "ro", "rs", "ru", "rw", "sa", "sb", "sc", "sd", "se", "sg", "sh", "si", "sj", "sk", "sl", "sm", "sn", "so", "sr", "ss", "st", "su", "sv", "sx", "sy", "sz",
"tc", "td", "tel", "tf", "tg", "th", "tj", "tk", "tl", "tm", "tn", "to", "tp", "tr", "travel", "tt", "tv", "tw", "tz",
"ua", "ug", "uk", "us", "uy", "uz", "va", "vc", "ve", "vg", "vi", "vn", "vu", "wf", "ws", "xn", "xxx", "ye", "yt", "yu", "za", "zm", "zw"
]
# Default URL matching regex
URL_RE: re.Pattern = build_url_re()
# Default email matching regex
EMAIL_RE: re.Pattern = build_email_re()
# Protocol matching regex for URL detection
# (matches a scheme prefix such as "http://" or "mailto:" at string start)
PROTO_RE: re.Pattern = re.compile(r"^[\w-]+:/{0,3}", re.IGNORECASE)
# Allowed CSS properties
# Allowed CSS properties (visual, layout, and aural properties considered safe)
ALLOWED_CSS_PROPERTIES: frozenset = frozenset((
    "azimuth",
    "background-color",
    "border-bottom-color",
    "border-collapse",
    "border-color",
    "border-left-color",
    "border-right-color",
    "border-top-color",
    "clear",
    "color",
    "cursor",
    "direction",
    "display",
    "elevation",
    "float",
    "font",
    "font-family",
    "font-size",
    "font-style",
    "font-variant",
    "font-weight",
    "height",
    "letter-spacing",
    "line-height",
    "overflow",
    "pause",
    "pause-after",
    "pause-before",
    "pitch",
    "pitch-range",
    "richness",
    "speak",
    "speak-header",
    "speak-numeral",
    "speak-punctuation",
    "speech-rate",
    "stress",
    "text-align",
    "text-decoration",
    "text-indent",
    "unicode-bidi",
    "vertical-align",
    "voice-family",
    "volume",
    "white-space",
    "width",
))
# Allowed SVG properties (fill and stroke presentation attributes)
ALLOWED_SVG_PROPERTIES: frozenset = frozenset((
    "fill",
    "fill-opacity",
    "fill-rule",
    "stroke",
    "stroke-width",
    "stroke-linecap",
    "stroke-linejoin",
    "stroke-opacity",
))
# Package version string
__version__: str = "6.2.0"
# Release date in YYYYMMDD format
__releasedate__: str = "20241029"class NoCssSanitizerWarning(UserWarning):
"""
Warning raised when CSS sanitization is needed but no CSS sanitizer is configured.
"""import bleach
from bleach.sanitizer import Cleaner
# Custom allowed tags and attributes
custom_tags = ['p', 'strong', 'em', 'a', 'img']
custom_attributes = {
'a': ['href', 'title'],
'img': ['src', 'alt', 'width', 'height']
}
# Create reusable cleaner
cleaner = Cleaner(
tags=custom_tags,
attributes=custom_attributes,
strip=True # Remove disallowed tags entirely
)
# Clean multiple texts with same rules
safe_text1 = cleaner.clean(untrusted_html1)
safe_text2 = cleaner.clean(untrusted_html2)

Example - CSS sanitization:

import bleach
from bleach.css_sanitizer import CSSSanitizer
# Create CSS sanitizer
css_sanitizer = CSSSanitizer(
allowed_css_properties=bleach.css_sanitizer.ALLOWED_CSS_PROPERTIES
)
# Clean HTML with CSS sanitization
html_with_styles = '<p style="color: red; background: javascript:alert();">Text</p>'
safe_html = bleach.clean(
html_with_styles,
tags=['p'],
attributes={'p': ['style']},
css_sanitizer=css_sanitizer
)
# Result: '<p style="color: red;">Text</p>'

Example - custom linkifier:

import bleach
from bleach.linkifier import Linker
from bleach.callbacks import target_blank, nofollow
# Custom linkifier with multiple callbacks
linker = Linker(
callbacks=[nofollow, target_blank],
skip_tags={'pre', 'code'}, # Don't linkify in code blocks
parse_email=True
)
text = 'Email me at user@example.com or visit https://example.org'
linked = linker.linkify(text)
# Result includes both rel="nofollow" and target="_blank"import bleach
from bleach.sanitizer import Cleaner
from bleach.linkifier import Linker, LinkifyFilter
# Clean and linkify in single pass using LinkifyFilter
cleaner = Cleaner(
tags=['p', 'a', 'strong'],
attributes={'a': ['href', 'rel', 'target']},
filters=[LinkifyFilter()] # Linkify during cleaning
)
unsafe_text = '<script>alert("xss")</script><p>Visit https://example.com</p>'
result = cleaner.clean(unsafe_text)