Library of web-related functions for HTML manipulation, HTTP processing, URL handling, and encoding detection
84
Core utility functions for converting between string and bytes representations with robust encoding support and error handling. These foundational functions are used throughout the w3lib library and provide consistent text processing behavior.
Convert various text inputs to Unicode strings with flexible encoding and error handling.
def to_unicode(text, encoding=None, errors='strict'):
"""
Convert text to Unicode string representation.
Args:
text (str|bytes): Text to convert to Unicode
encoding (str|None): Character encoding for bytes input (default: 'utf-8')
errors (str): Error handling mode - 'strict', 'ignore', 'replace', etc. (default: 'strict')
Returns:
str: Unicode string representation
Raises:
TypeError: If text is neither str nor bytes
UnicodeDecodeError: If decoding fails and errors='strict'
"""
Usage Examples:
from w3lib.util import to_unicode
# Convert bytes to Unicode
data = b'Hello, world!'
result = to_unicode(data) # 'Hello, world!'
# Handle different encodings
latin1_data = b'Caf\xe9' # 'é' in Latin-1
result = to_unicode(latin1_data, encoding='latin-1') # 'Café'
# String input returns unchanged
text = 'Already Unicode'
result = to_unicode(text) # 'Already Unicode'
# Error handling modes
invalid_utf8 = b'Invalid \xff UTF-8'
result = to_unicode(invalid_utf8, errors='replace') # 'Invalid � UTF-8'
result = to_unicode(invalid_utf8, errors='ignore') # 'Invalid  UTF-8' (invalid byte dropped; both surrounding spaces remain)
# Custom encoding with fallback
try:
result = to_unicode(data, encoding='ascii')
except UnicodeDecodeError:
result = to_unicode(data, encoding='utf-8', errors='replace')
Convert various text inputs to bytes representation with encoding control and error handling.
def to_bytes(text, encoding=None, errors='strict'):
"""
Convert text to bytes representation.
Args:
text (str|bytes): Text to convert to bytes
encoding (str|None): Character encoding for string input (default: 'utf-8')
errors (str): Error handling mode - 'strict', 'ignore', 'replace', etc. (default: 'strict')
Returns:
bytes: Bytes representation
Raises:
TypeError: If text is neither str nor bytes
UnicodeEncodeError: If encoding fails and errors='strict'
"""
Usage Examples:
from w3lib.util import to_bytes
# Convert Unicode to bytes
text = 'Hello, world!'
result = to_bytes(text) # b'Hello, world!'
# Handle different encodings
unicode_text = 'Café'
result = to_bytes(unicode_text, encoding='latin-1') # b'Caf\xe9'
result = to_bytes(unicode_text, encoding='ascii', errors='ignore') # b'Caf'
# Bytes input returns unchanged
data = b'Already bytes'
result = to_bytes(data) # b'Already bytes'
# Error handling for non-encodable characters
emoji_text = 'Hello 🌍'
result = to_bytes(emoji_text, encoding='ascii', errors='replace') # b'Hello ?'
result = to_bytes(emoji_text, encoding='ascii', errors='ignore') # b'Hello '
# Ensure proper encoding for HTTP
http_header = 'Content-Type: text/html; charset=utf-8'
header_bytes = to_bytes(http_header, encoding='ascii') # For HTTP headers
Handle mixed string/bytes input safely:
from w3lib.util import to_unicode, to_bytes
def process_text(text):
# Ensure Unicode for processing
unicode_text = to_unicode(text)
# Process the text
processed = unicode_text.upper().strip()
# Return in desired format
return processed
# Works with both string and bytes input
result1 = process_text('hello world') # 'HELLO WORLD'
result2 = process_text(b'hello world') # 'HELLO WORLD'
Create encoding conversion pipelines:
from w3lib.util import to_unicode, to_bytes
def convert_encoding(data, from_encoding, to_encoding):
"""Convert data from one encoding to another."""
# First convert to Unicode
unicode_text = to_unicode(data, encoding=from_encoding, errors='replace')
# Then convert to target encoding
return to_bytes(unicode_text, encoding=to_encoding, errors='replace')
# Convert from Latin-1 to UTF-8
latin1_data = b'Caf\xe9'
utf8_data = convert_encoding(latin1_data, 'latin-1', 'utf-8')
# Returns: b'Caf\xc3\xa9'
Process web content with unknown encoding:
from w3lib.util import to_unicode
def process_web_content(raw_content, declared_encoding=None):
"""Process web content with fallback encoding detection."""
encodings_to_try = [
declared_encoding,
'utf-8',
'latin-1',
'cp1252'
]
for encoding in encodings_to_try:
if encoding is None:
continue
try:
return to_unicode(raw_content, encoding=encoding)
except UnicodeDecodeError:
continue
# Last resort - replace invalid characters
return to_unicode(raw_content, encoding='utf-8', errors='replace')
Both functions support Python's standard error handling modes:
- 'strict' (default): Raise an exception on encoding/decoding errors
- 'ignore': Skip invalid characters silently
- 'replace': Replace invalid characters with a replacement character (� for Unicode, ? for bytes)
- 'xmlcharrefreplace': Replace with XML character references (encoding only)
- 'backslashreplace': Replace with backslash escape sequences
- Custom handlers: use codecs.register_error() to define custom behavior
from w3lib.util import to_unicode, to_bytes
# Type checking
def safe_to_unicode(text):
if isinstance(text, str):
return text
elif isinstance(text, bytes):
return to_unicode(text)
else:
raise TypeError(f"Expected str or bytes, got {type(text)}")
# The functions will raise TypeError for invalid input types
try:
to_unicode(123) # TypeError: to_unicode must receive bytes or str, got int
except TypeError as e:
print(f"Type error: {e}")
- encoding=None falls back to the default encoding (UTF-8)
- 'strict' mode by default ensures data integrity
These utility functions are used extensively throughout w3lib:
# Used in HTML processing
from w3lib.html import replace_entities
from w3lib.util import to_unicode
# Functions automatically handle mixed input
html_bytes = b'<Hello>'
html_str = 'World & Universe'
result1 = replace_entities(html_bytes) # Uses to_unicode internally
result2 = replace_entities(html_str) # Direct string processing
# Used in URL processing
from w3lib.url import safe_url_string
from w3lib.util import to_unicode
# URL functions accept both string and bytes
url_bytes = b'http://example.com/caf\xc3\xa9'
safe_url = safe_url_string(url_bytes) # Uses to_unicode internally
- 'strict' mode has the lowest overhead; 'replace' mode is slightly higher
These utilities provide the foundation for robust text processing throughout w3lib, ensuring consistent behavior across all modules while maintaining high performance and reliability.
Install with Tessl CLI
npx tessl i tessl/pypi-w3lib
evals
scenario-1
scenario-2
scenario-3
scenario-4
scenario-5
scenario-6
scenario-7
scenario-8
scenario-9
scenario-10