Library of web-related functions for HTML manipulation, HTTP processing, URL handling, and encoding detection
84
Comprehensive HTML manipulation functions for cleaning, parsing, and extracting information from HTML content. All functions handle both string and bytes input with robust encoding support.
Convert HTML entities to their corresponding Unicode characters, with options to preserve specific entities and handle malformed content.
def replace_entities(text, keep=(), remove_illegal=True, encoding='utf-8'):
"""
Remove entities from the given text by converting them to Unicode characters.
Args:
text (str|bytes): Text containing HTML entities
keep (Iterable[str]): Entity names to preserve (default: empty)
remove_illegal (bool): Remove non-convertible entities (default: True)
encoding (str): Text encoding for bytes input (default: 'utf-8')
Returns:
str: Text with entities converted to Unicode characters
"""
def has_entities(text, encoding=None):
"""
Check if text contains HTML entities.
Args:
text (str|bytes): Text to check for entities
encoding (str|None): Text encoding for bytes input
Returns:
bool: True if entities are found
"""
Usage Examples:
from w3lib.html import replace_entities, has_entities
# Basic entity conversion
html = 'Price: &pound;100 &amp; free shipping!'
result = replace_entities(html)
# Returns: 'Price: £100 & free shipping!'
# Keep specific entities
html = 'Low &lt; High &amp; Medium &pound; six'
result = replace_entities(html, keep=['lt', 'amp'])
# Returns: 'Low &lt; High &amp; Medium £ six'
# Check for entities
if has_entities('Price: &pound;100'):
    print("Contains entities")
Remove or replace HTML tags with extensive filtering options.
def replace_tags(text, token='', encoding=None):
"""
Replace all HTML tags with a token (removes tags by default).
Args:
text (str|bytes): HTML content
token (str): Replacement token (default: empty string removes tags)
encoding (str|None): Text encoding for bytes input
Returns:
str: Text with tags replaced
"""
def remove_tags(text, which_ones=(), keep=(), encoding=None):
"""
Remove specific HTML tags or keep only specified tags.
Args:
text (str|bytes): HTML content
which_ones (Iterable[str]): Tags to remove (mutually exclusive with keep)
keep (Iterable[str]): Tags to preserve (mutually exclusive with which_ones)
encoding (str|None): Text encoding for bytes input
Returns:
str: Text with specified tags removed
Raises:
ValueError: If both which_ones and keep are specified
"""
def remove_tags_with_content(text, which_ones=(), encoding=None):
"""
Remove tags and their content.
Args:
text (str|bytes): HTML content
which_ones (Iterable[str]): Tags to remove including content
encoding (str|None): Text encoding for bytes input
Returns:
str: Text with tags and their content removed
"""
Usage Examples:
from w3lib.html import replace_tags, remove_tags, remove_tags_with_content
doc = '<div><p><b>Bold text</b> and <a href="url">link</a></p></div>'
# Replace all tags with spaces
replace_tags(doc, ' ') # '   Bold text  and  link   '
# Remove all tags
remove_tags(doc) # 'Bold text and link'
# Keep only specific tags
remove_tags(doc, keep=['div']) # '<div>Bold text and link</div>'
# Remove specific tags
remove_tags(doc, which_ones=['a', 'b']) # '<div><p>Bold text and link</p></div>'
# Remove tags with content
remove_tags_with_content(doc, which_ones=['b']) # '<div><p> and <a href="url">link</a></p></div>'
Strip HTML comments from content.
def remove_comments(text, encoding=None):
"""
Remove HTML comments from text.
Args:
text (str|bytes): HTML content
encoding (str|None): Text encoding for bytes input
Returns:
str: Text without HTML comments
"""
Usage Example:
from w3lib.html import remove_comments
html = 'Text <!-- this is a comment --> more text'
result = remove_comments(html) # 'Text  more text'
Remove or replace escape characters in text content.
def replace_escape_chars(text, which_ones=('\n', '\t', '\r'), replace_by='', encoding=None):
"""
Remove or replace escape characters.
Args:
text (str|bytes): Text containing escape characters
which_ones (Iterable[str]): Escape characters to replace (default: newline, tab, carriage return)
replace_by (str|bytes): Replacement string (default: empty string)
encoding (str|None): Text encoding for bytes input
Returns:
str: Text with escape characters replaced
"""
Process markup while preserving CDATA sections and handling complex entity scenarios.
def unquote_markup(text, keep=(), remove_illegal=True, encoding=None):
"""
Remove entities from markup while preserving CDATA sections.
Args:
text (str|bytes): HTML markup
keep (Iterable[str]): Entity names to preserve
remove_illegal (bool): Remove non-convertible entities
encoding (str|None): Text encoding for bytes input
Returns:
str: Processed markup with entities converted except in CDATA sections
"""
Extract base URL from HTML <base> tags for resolving relative URLs.
def get_base_url(text, baseurl='', encoding='utf-8'):
"""
Extract base URL from HTML <base> tag, relative to given base URL.
Args:
text (str|bytes): HTML content
baseurl (str|bytes): Fallback base URL (default: empty string)
encoding (str): Text encoding for bytes input (default: 'utf-8')
Returns:
str: Base URL found in <base> tag or fallback baseurl
"""
Usage Example:
from w3lib.html import get_base_url
html = '<html><head><base href="/app/"></head><body>...</body></html>'
base = get_base_url(html, 'https://example.com')
# Returns: 'https://example.com/app/'
Extract meta refresh redirect information from HTML.
def get_meta_refresh(text, baseurl='', encoding='utf-8', ignore_tags=('script', 'noscript')):
"""
Extract meta refresh redirect information from HTML.
Args:
text (str|bytes): HTML content
baseurl (str): Base URL for resolving relative redirects (default: empty)
encoding (str): Text encoding for bytes input (default: 'utf-8')
ignore_tags (Iterable[str]): Tags to ignore during parsing (default: script, noscript)
Returns:
tuple: (interval, url) where interval is delay in seconds, url is redirect target
Returns (None, None) if no meta refresh found
"""
Usage Example:
from w3lib.html import get_meta_refresh
html = '<meta http-equiv="refresh" content="5;url=https://example.com/new">'
interval, url = get_meta_refresh(html, 'https://example.com')
# Returns: (5.0, 'https://example.com/new')
Strip HTML5-defined whitespace characters from strings.
def strip_html5_whitespace(text):
"""
Strip HTML5-defined whitespace characters from text.
Args:
text (str): Text to strip
Returns:
str: Text with HTML5 whitespace characters removed from ends
"""
The HTML5 whitespace characters are defined as: space, tab, newline, carriage return, and form feed.
Usage Example:
from w3lib.html import strip_html5_whitespace
text = ' \t\n\r\x0c hello world \t\n\r\x0c '
result = strip_html5_whitespace(text) # 'hello world'
HTML5_WHITESPACE = " \t\n\r\x0c" # HTML5-defined whitespace characters
\ufffd)remove_illegal parameter
Install with Tessl CLI
npx tessl i tessl/pypi-w3lib
evals
scenario-1
scenario-2
scenario-3
scenario-4
scenario-5
scenario-6
scenario-7
scenario-8
scenario-9
scenario-10