tessl/pypi-bleach

An easy safelist-based HTML-sanitizing tool for untrusted content

Workspace: tessl
Visibility: Public
Describes: pypipkg:pypi/bleach@6.2.x

To install, run

npx @tessl/cli install tessl/pypi-bleach@6.2.0

Bleach

An easy safelist-based HTML-sanitizing tool that escapes or strips markup and attributes from untrusted HTML content. Bleach uses an allowlist approach to remove malicious content while preserving safe, intended HTML elements. It can also safely linkify text, applying more comprehensive filters than Django's urlize filter.

Package Information

  • Package Name: bleach
  • Language: Python
  • Installation: pip install bleach
  • Optional Dependencies: pip install bleach[css] (for CSS sanitization with tinycss2)

Core Imports

import bleach

For main functions:

from bleach import clean, linkify

For classes:

from bleach.sanitizer import Cleaner, BleachSanitizerFilter, attribute_filter_factory
from bleach.linkifier import Linker, LinkifyFilter
from bleach.css_sanitizer import CSSSanitizer

For callbacks:

from bleach.callbacks import nofollow, target_blank

For constants and utilities:

from bleach.sanitizer import ALLOWED_TAGS, ALLOWED_ATTRIBUTES, ALLOWED_PROTOCOLS
from bleach.sanitizer import INVISIBLE_CHARACTERS, INVISIBLE_CHARACTERS_RE, INVISIBLE_REPLACEMENT_CHAR
from bleach.linkifier import DEFAULT_CALLBACKS, build_url_re, build_email_re, TLDS, URL_RE, EMAIL_RE, PROTO_RE
from bleach.css_sanitizer import ALLOWED_CSS_PROPERTIES, ALLOWED_SVG_PROPERTIES
from bleach import html5lib_shim  # For HTML_TAGS constant
from bleach import __version__, __releasedate__

Basic Usage

import bleach

# Basic HTML sanitization - disallowed tags are escaped, disallowed attributes removed.
# Note: <p> is not in bleach's default ALLOWED_TAGS, so it is allowed explicitly here.
unsafe_html = '<script>alert("XSS")</script><p onclick="evil()">Hello <b>world</b></p>'
safe_html = bleach.clean(unsafe_html, tags={'p', 'b'})
# Result: '&lt;script&gt;alert("XSS")&lt;/script&gt;<p>Hello <b>world</b></p>'

# Linkification - converts URLs to clickable links
text_with_urls = 'Visit https://example.com for more info!'
linked_text = bleach.linkify(text_with_urls)
# Result: 'Visit <a href="https://example.com" rel="nofollow">https://example.com</a> for more info!'

# Combined cleaning and linkifying
unsafe_text = 'Check out http://evil.com<script>alert("bad")</script>'
safe_linked = bleach.linkify(bleach.clean(unsafe_text))

Capabilities

HTML Sanitization

Cleans HTML fragments by removing or escaping malicious content using an allowlist-based approach.

def clean(
    text: str,
    tags: frozenset = ALLOWED_TAGS,
    attributes: dict = ALLOWED_ATTRIBUTES,
    protocols: frozenset = ALLOWED_PROTOCOLS,
    strip: bool = False,
    strip_comments: bool = True,
    css_sanitizer: CSSSanitizer = None
) -> str:
    """
    Clean an HTML fragment of malicious content and return it.
    
    Parameters:
    - text: the HTML text to clean
    - tags: set of allowed tags; defaults to ALLOWED_TAGS
    - attributes: allowed attributes; can be callable, list or dict; defaults to ALLOWED_ATTRIBUTES
    - protocols: allowed list of protocols for links; defaults to ALLOWED_PROTOCOLS
    - strip: whether to strip disallowed elements instead of escaping
    - strip_comments: whether to strip HTML comments
    - css_sanitizer: instance with sanitize_css method for style attributes
    
    Returns:
    Cleaned text as unicode string
    """

URL Linkification

Converts URL-like strings in HTML fragments to clickable links while preserving existing links and structure.

def linkify(
    text: str,
    callbacks: list = DEFAULT_CALLBACKS,
    skip_tags: set = None,
    parse_email: bool = False
) -> str:
    """
    Convert URL-like strings in an HTML fragment to links.
    
    Parameters:
    - text: the text to linkify
    - callbacks: list of callbacks to run when adjusting tag attributes
    - skip_tags: set of tags to skip linkifying contents of
    - parse_email: whether to linkify email addresses
    
    Returns:
    Linkified text as unicode string
    """

Advanced HTML Cleaning

Configurable HTML cleaner for repeated use with consistent settings.

class Cleaner:
    """
    Cleaner for cleaning HTML fragments of malicious content.
    Not thread-safe - create separate instances per thread.
    """
    
    def __init__(
        self,
        tags: frozenset = ALLOWED_TAGS,
        attributes: dict = ALLOWED_ATTRIBUTES,
        protocols: frozenset = ALLOWED_PROTOCOLS,
        strip: bool = False,
        strip_comments: bool = True,
        filters: list = None,
        css_sanitizer: CSSSanitizer = None
    ):
        """
        Initialize a Cleaner instance.
        
        Parameters:
        - tags: set of allowed tags
        - attributes: allowed attributes configuration
        - protocols: allowed protocols for links  
        - strip: whether to strip disallowed elements
        - strip_comments: whether to strip HTML comments
        - filters: list of additional html5lib filters
        - css_sanitizer: CSS sanitizer instance
        """
    
    def clean(self, text: str) -> str:
        """
        Clean the specified HTML text.
        
        Parameters:
        - text: HTML text to clean
        
        Returns:
        Cleaned HTML text
        """

Advanced URL Linkification

Configurable URL linkifier for repeated use with consistent settings.

class Linker:
    """
    Convert URL-like strings in HTML fragments to links with configuration.
    """
    
    def __init__(
        self,
        callbacks: list = DEFAULT_CALLBACKS,
        skip_tags: set = None,
        parse_email: bool = False,
        url_re = URL_RE,
        email_re = EMAIL_RE,
        recognized_tags = html5lib_shim.HTML_TAGS
    ):
        """
        Create a Linker instance.
        
        Parameters:
        - callbacks: list of callbacks for adjusting tag attributes
        - skip_tags: set of tags to skip linkifying contents of
        - parse_email: whether to linkify email addresses
        - url_re: custom URL matching regex
        - email_re: custom email matching regex
        - recognized_tags: set of recognized HTML tags
        """
    
    def linkify(self, text: str) -> str:
        """
        Linkify the specified text.
        
        Parameters:
        - text: text to linkify
        
        Returns:
        Linkified text
        
        Raises:
        TypeError: if text is not a string type
        """

Advanced Linkification Filter

HTML filter for linkifying during html5lib parsing, commonly used with Cleaner filters.

class LinkifyFilter(html5lib_shim.Filter):
    """
    HTML filter that linkifies text during html5lib parsing.
    Can be used with Cleaner filters for combined cleaning and linkification.
    """
    
    def __init__(
        self,
        source,
        callbacks: list = DEFAULT_CALLBACKS,
        skip_tags: set = None,
        parse_email: bool = False,
        url_re = URL_RE,
        email_re = EMAIL_RE
    ):
        """
        Create a LinkifyFilter instance.
        
        Parameters:
        - source: html5lib TreeWalker stream  
        - callbacks: list of callbacks for adjusting tag attributes
        - skip_tags: set of tags to skip linkifying contents of
        - parse_email: whether to linkify email addresses
        - url_re: custom URL matching regex
        - email_re: custom email matching regex
        """

HTML Sanitization Filter

HTML filter for sanitizing content during html5lib parsing, commonly used with other filters.

class BleachSanitizerFilter(html5lib_shim.SanitizerFilter):
    """
    HTML filter that sanitizes HTML during html5lib parsing.
    Can be used with other html5lib filters for custom processing.
    """
    
    def __init__(
        self,
        source,
        allowed_tags: frozenset = ALLOWED_TAGS,
        attributes = ALLOWED_ATTRIBUTES,
        allowed_protocols: frozenset = ALLOWED_PROTOCOLS,
        attr_val_is_uri = html5lib_shim.attr_val_is_uri,
        svg_attr_val_allows_ref = html5lib_shim.svg_attr_val_allows_ref,
        svg_allow_local_href = html5lib_shim.svg_allow_local_href,
        strip_disallowed_tags: bool = False,
        strip_html_comments: bool = True,
        css_sanitizer: CSSSanitizer = None
    ):
        """
        Create a BleachSanitizerFilter instance.
        
        Parameters:
        - source: html5lib TreeWalker stream
        - allowed_tags: set of allowed tags
        - attributes: allowed attributes configuration
        - allowed_protocols: allowed protocols for links
        - attr_val_is_uri: set of attributes that have URI values
        - svg_attr_val_allows_ref: set of SVG attributes that can have references
        - svg_allow_local_href: set of SVG elements that can have local hrefs
        - strip_disallowed_tags: whether to strip disallowed tags
        - strip_html_comments: whether to strip HTML comments
        - css_sanitizer: CSS sanitizer instance
        """

CSS Sanitization

Sanitizes CSS declarations in style attributes and style elements.

class CSSSanitizer:
    """
    CSS sanitizer for cleaning style attributes and style text.
    """
    
    def __init__(
        self,
        allowed_css_properties: frozenset = ALLOWED_CSS_PROPERTIES,
        allowed_svg_properties: frozenset = ALLOWED_SVG_PROPERTIES
    ):
        """
        Initialize CSS sanitizer.
        
        Parameters:
        - allowed_css_properties: set of allowed CSS properties
        - allowed_svg_properties: set of allowed SVG properties
        """
    
    def sanitize_css(self, style: str) -> str:
        """
        Sanitize CSS declarations.
        
        Parameters:
        - style: CSS declarations string
        
        Returns:
        Sanitized CSS string
        """

Linkification Callbacks

Callback functions for customizing link attributes during linkification.

def nofollow(attrs: dict, new: bool = False) -> dict:
    """
    Add rel="nofollow" to links (except mailto links).
    
    Parameters:
    - attrs: link attributes dictionary
    - new: whether this is a new link
    
    Returns:
    Modified attributes dictionary
    """

def target_blank(attrs: dict, new: bool = False) -> dict:
    """
    Add target="_blank" to links (except mailto links).
    
    Parameters:
    - attrs: link attributes dictionary  
    - new: whether this is a new link
    
    Returns:
    Modified attributes dictionary
    """

Attribute Filter Factory

Utility function for creating attribute filter functions from various attribute configurations.

def attribute_filter_factory(attributes) -> callable:
    """
    Generate attribute filter function for the given attributes configuration.
    
    The attributes value can be a callable, dict, or list. This returns a filter
    function appropriate to the attributes value.
    
    Parameters:
    - attributes: attribute configuration (callable, dict, or list)
    
    Returns:
    Filter function that takes (tag, attr, value) and returns bool
    
    Raises:
    ValueError: if attributes is not a callable, list, or dict
    """

URL and Email Pattern Building

Functions for creating custom URL and email matching patterns.

def build_url_re(
    tlds: list = TLDS,
    protocols = html5lib_shim.allowed_protocols
) -> re.Pattern:
    """
    Build URL regex with custom TLDs and protocols.
    
    Parameters:
    - tlds: list of top-level domains
    - protocols: set of allowed protocols
    
    Returns:
    Compiled regex pattern for URL matching
    """

def build_email_re(tlds: list = TLDS) -> re.Pattern:
    """
    Build email regex with custom TLDs.
    
    Parameters:
    - tlds: list of top-level domains
    
    Returns:
    Compiled regex pattern for email matching
    """

Constants

Default Sanitization Settings

# Default allowed HTML tags
ALLOWED_TAGS: frozenset = frozenset((
    "a", "abbr", "acronym", "b", "blockquote", "code", 
    "em", "i", "li", "ol", "strong", "ul"
))

# Default allowed attributes by tag
ALLOWED_ATTRIBUTES: dict = {
    "a": ["href", "title"],
    "abbr": ["title"],
    "acronym": ["title"]
}

# Default allowed protocols for links
ALLOWED_PROTOCOLS: frozenset = frozenset(("http", "https", "mailto"))

# Invisible character handling (requires: from itertools import chain)
INVISIBLE_CHARACTERS: str = "".join([chr(c) for c in chain(range(0, 9), range(11, 13), range(14, 32))])
INVISIBLE_CHARACTERS_RE: re.Pattern = re.compile("[" + INVISIBLE_CHARACTERS + "]", re.UNICODE)
INVISIBLE_REPLACEMENT_CHAR: str = "?"

Default Linkification Settings

# Default linkification callbacks
DEFAULT_CALLBACKS: list = [nofollow]

# Top-level domains for URL detection
TLDS: list = [
    "ac", "ad", "ae", "aero", "af", "ag", "ai", "al", "am", "an", "ao", "aq", "ar", "arpa", "as", "asia", "at", "au", "aw", "ax", "az",
    "ba", "bb", "bd", "be", "bf", "bg", "bh", "bi", "biz", "bj", "bm", "bn", "bo", "br", "bs", "bt", "bv", "bw", "by", "bz",
    "ca", "cat", "cc", "cd", "cf", "cg", "ch", "ci", "ck", "cl", "cm", "cn", "co", "com", "coop", "cr", "cu", "cv", "cx", "cy", "cz",
    "de", "dj", "dk", "dm", "do", "dz", "ec", "edu", "ee", "eg", "er", "es", "et", "eu", "fi", "fj", "fk", "fm", "fo", "fr",
    "ga", "gb", "gd", "ge", "gf", "gg", "gh", "gi", "gl", "gm", "gn", "gov", "gp", "gq", "gr", "gs", "gt", "gu", "gw", "gy",
    "hk", "hm", "hn", "hr", "ht", "hu", "id", "ie", "il", "im", "in", "info", "int", "io", "iq", "ir", "is", "it",
    "je", "jm", "jo", "jobs", "jp", "ke", "kg", "kh", "ki", "km", "kn", "kp", "kr", "kw", "ky", "kz",
    "la", "lb", "lc", "li", "lk", "lr", "ls", "lt", "lu", "lv", "ly", "ma", "mc", "md", "me", "mg", "mh", "mil", "mk", "ml", "mm", "mn", "mo", "mobi", "mp", "mq", "mr", "ms", "mt", "mu", "museum", "mv", "mw", "mx", "my", "mz",
    "na", "name", "nc", "ne", "net", "nf", "ng", "ni", "nl", "no", "np", "nr", "nu", "nz", "om", "org",
    "pa", "pe", "pf", "pg", "ph", "pk", "pl", "pm", "pn", "post", "pr", "pro", "ps", "pt", "pw", "py",
    "qa", "re", "ro", "rs", "ru", "rw", "sa", "sb", "sc", "sd", "se", "sg", "sh", "si", "sj", "sk", "sl", "sm", "sn", "so", "sr", "ss", "st", "su", "sv", "sx", "sy", "sz",
    "tc", "td", "tel", "tf", "tg", "th", "tj", "tk", "tl", "tm", "tn", "to", "tp", "tr", "travel", "tt", "tv", "tw", "tz",
    "ua", "ug", "uk", "us", "uy", "uz", "va", "vc", "ve", "vg", "vi", "vn", "vu", "wf", "ws", "xn", "xxx", "ye", "yt", "yu", "za", "zm", "zw"
]

# Default URL matching regex
URL_RE: re.Pattern = build_url_re()

# Default email matching regex  
EMAIL_RE: re.Pattern = build_email_re()

# Protocol matching regex for URL detection
PROTO_RE: re.Pattern = re.compile(r"^[\w-]+:/{0,3}", re.IGNORECASE)

CSS Sanitization Settings

# Allowed CSS properties
ALLOWED_CSS_PROPERTIES: frozenset = frozenset((
    "azimuth", "background-color", "border-bottom-color", "border-collapse",
    "border-color", "border-left-color", "border-right-color", "border-top-color",
    "clear", "color", "cursor", "direction", "display", "elevation", "float",
    "font", "font-family", "font-size", "font-style", "font-variant", "font-weight",
    "height", "letter-spacing", "line-height", "overflow", "pause", "pause-after",
    "pause-before", "pitch", "pitch-range", "richness", "speak", "speak-header",
    "speak-numeral", "speak-punctuation", "speech-rate", "stress", "text-align",
    "text-decoration", "text-indent", "unicode-bidi", "vertical-align",
    "voice-family", "volume", "white-space", "width"
))

# Allowed SVG properties  
ALLOWED_SVG_PROPERTIES: frozenset = frozenset((
    "fill", "fill-opacity", "fill-rule", "stroke", "stroke-width",
    "stroke-linecap", "stroke-linejoin", "stroke-opacity"
))

Package Version Information

# Package version string
__version__: str = "6.2.0"

# Release date in YYYYMMDD format
__releasedate__: str = "20241029"

Warning Classes

class NoCssSanitizerWarning(UserWarning):
    """
    Warning raised when CSS sanitization is needed but no CSS sanitizer is configured.
    """

Usage Examples

Custom Sanitization Rules

import bleach
from bleach.sanitizer import Cleaner

# Custom allowed tags and attributes
custom_tags = ['p', 'strong', 'em', 'a', 'img']
custom_attributes = {
    'a': ['href', 'title'],
    'img': ['src', 'alt', 'width', 'height']
}

# Create reusable cleaner
cleaner = Cleaner(
    tags=custom_tags,
    attributes=custom_attributes,
    strip=True  # Remove disallowed tags entirely
)

# Clean multiple texts with the same rules
untrusted_html1 = '<p onclick="evil()">First <em>comment</em></p>'
untrusted_html2 = '<script>bad()</script><p>Second comment</p>'
safe_text1 = cleaner.clean(untrusted_html1)
safe_text2 = cleaner.clean(untrusted_html2)

CSS Sanitization

import bleach
from bleach.css_sanitizer import CSSSanitizer, ALLOWED_CSS_PROPERTIES

# Create a CSS sanitizer (the default property allowlist is passed explicitly here)
css_sanitizer = CSSSanitizer(
    allowed_css_properties=ALLOWED_CSS_PROPERTIES
)

# Clean HTML with CSS sanitization
html_with_styles = '<p style="color: red; background: javascript:alert();">Text</p>'
safe_html = bleach.clean(
    html_with_styles,
    tags=['p'],
    attributes={'p': ['style']}, 
    css_sanitizer=css_sanitizer
)
# Result: '<p style="color: red;">Text</p>'

Custom Linkification

import bleach
from bleach.linkifier import Linker
from bleach.callbacks import target_blank, nofollow

# Custom linkifier with multiple callbacks
linker = Linker(
    callbacks=[nofollow, target_blank],
    skip_tags={'pre', 'code'},  # Don't linkify in code blocks
    parse_email=True
)

text = 'Email me at user@example.com or visit https://example.org'
linked = linker.linkify(text)
# Result includes both rel="nofollow" and target="_blank"

Combined Operations

from bleach.sanitizer import Cleaner
from bleach.linkifier import LinkifyFilter

# Clean and linkify in a single pass using LinkifyFilter.
# Pass the filter class itself (or a functools.partial for custom settings);
# Cleaner instantiates each filter with the parsed token stream.
cleaner = Cleaner(
    tags=['p', 'a', 'strong'],
    attributes={'a': ['href', 'rel', 'target']},
    filters=[LinkifyFilter]
)

unsafe_text = '<script>alert("xss")</script><p>Visit https://example.com</p>'
result = cleaner.clean(unsafe_text)
# The <script> tag is escaped and the URL becomes a rel="nofollow" link