Library of web-related functions for HTML manipulation, HTTP processing, URL handling, and encoding detection

Character encoding detection and conversion functions for web content, supporting HTTP Content-Type headers, HTML meta tags, XML declarations, byte order marks (BOMs), and smart fallback handling with encoding alias resolution.
Detect character encoding from multiple sources with intelligent fallback logic following browser behavior.
def html_to_unicode(content_type_header, html_body_str, default_encoding='utf8', auto_detect_fun=None):
"""
Convert raw HTML bytes to Unicode with smart encoding detection.
Detection priority order:
1. Byte Order Mark (BOM)
2. HTTP Content-Type header
3. HTML meta tags or XML declaration
4. Auto-detection function (if provided)
5. Default encoding
Args:
content_type_header (str|None): HTTP Content-Type header value
html_body_str (bytes): Raw HTML content as bytes
default_encoding (str): Fallback encoding (default: 'utf8')
auto_detect_fun (Callable|None): Optional encoding detection function
Returns:
tuple[str, str]: (detected_encoding, unicode_content)
"""Usage Examples:
from w3lib.encoding import html_to_unicode
# Detect from meta tag
html_bytes = b'<meta charset="utf-8"><body>Caf\xc3\xa9</body>'
encoding, content = html_to_unicode(None, html_bytes)
# Returns: ('utf-8', '<meta charset="utf-8"><body>Café</body>')
# Use Content-Type header
content_type = 'text/html; charset=iso-8859-1'
html_bytes = b'<body>Caf\xe9</body>' # ISO-8859-1 encoded
encoding, content = html_to_unicode(content_type, html_bytes)
# Returns: ('cp1252', '<body>Café</body>') # Upgraded to cp1252
# With auto-detection (using chardet)
import chardet
def auto_detect(data):
return chardet.detect(data).get('encoding')
encoding, content = html_to_unicode(None, html_bytes, auto_detect_fun=auto_detect)
# BOM detection takes priority
bom_html = b'\xff\xfe<\x00m\x00e\x00t\x00a\x00>\x00' # UTF-16LE with BOM
encoding, content = html_to_unicode(None, bom_html)
# Returns: ('utf-16-le', '<meta>')

Extract character encoding from HTTP Content-Type headers.
def http_content_type_encoding(content_type):
"""
Extract encoding from HTTP Content-Type header.
Args:
content_type (str|None): Content-Type header value
Returns:
str|None: Detected encoding name or None if not found
"""Usage Examples:
from w3lib.encoding import http_content_type_encoding
# Standard Content-Type header
encoding = http_content_type_encoding('text/html; charset=UTF-8')
# Returns: 'utf-8'
# Case-insensitive parsing
encoding = http_content_type_encoding('Text/HTML; CHARSET=ISO-8859-1')
# Returns: 'cp1252' # Upgraded to Windows-1252
# No charset specified
encoding = http_content_type_encoding('text/html')
# Returns: None
# Invalid header
encoding = http_content_type_encoding(None)
# Returns: None

Extract encoding declarations from HTML meta tags and XML declarations.
def html_body_declared_encoding(html_body_str):
"""
Extract encoding from HTML meta tags or XML declarations.
Supports multiple formats:
- <meta charset="utf-8">
- <meta http-equiv="content-type" content="text/html; charset=utf-8">
- <?xml encoding="utf-8"?>
Args:
html_body_str (str|bytes): HTML content (only first 4096 bytes checked)
Returns:
str|None: Declared encoding or None if not found
"""Usage Examples:
from w3lib.encoding import html_body_declared_encoding
# HTML5 meta charset
html = '<meta charset="utf-8">'
encoding = html_body_declared_encoding(html) # 'utf-8'
# HTTP-equiv format
html = '<meta http-equiv="content-type" content="text/html; charset=iso-8859-1">'
encoding = html_body_declared_encoding(html) # 'cp1252'
# XML declaration
xml = '<?xml version="1.0" encoding="utf-8"?>'
encoding = html_body_declared_encoding(xml) # 'utf-8'
# No encoding found
html = '<meta name="description" content="No encoding here">'
encoding = html_body_declared_encoding(html) # None

Resolve encoding aliases to canonical names with web-specific translations.
def resolve_encoding(encoding_alias):
"""
Resolve encoding alias to canonical encoding name.
Applies web-specific encoding translations:
- ASCII -> Windows-1252 (common web default)
- Latin-1 -> Windows-1252 (web compatibility)
- GB2312/GBK -> GB18030 (superset encoding)
- Various Japanese encodings -> CP932
Args:
encoding_alias (str): Encoding name or alias
Returns:
str|None: Canonical encoding name or None if invalid
"""Usage Examples:
from w3lib.encoding import resolve_encoding
# Common aliases
resolve_encoding('latin1') # 'cp1252'
resolve_encoding('ascii') # 'cp1252'
resolve_encoding('gb2312') # 'gb18030'
resolve_encoding('shift_jis') # 'cp932'
# Already canonical
resolve_encoding('utf-8') # 'utf-8'
resolve_encoding('cp1252') # 'cp1252'
# Invalid encoding
resolve_encoding('invalid') # None

Detect and parse byte order marks (BOMs) in binary data.
def read_bom(data):
"""
Detect byte order mark in data and return encoding and BOM bytes.
Supports:
- UTF-32 BE/LE
- UTF-16 BE/LE
- UTF-8
Args:
data (bytes): Binary data to check for BOM
Returns:
tuple: (encoding, bom_bytes) or (None, None) if no BOM found
"""Usage Examples:
from w3lib.encoding import read_bom
# UTF-8 BOM
data = b'\xef\xbb\xbfHello World'
encoding, bom = read_bom(data)
# Returns: ('utf-8', b'\xef\xbb\xbf')
# UTF-16 LE BOM
data = b'\xff\xfeH\x00e\x00l\x00l\x00o\x00'
encoding, bom = read_bom(data)
# Returns: ('utf-16-le', b'\xff\xfe')
# No BOM
data = b'Hello World'
encoding, bom = read_bom(data)
# Returns: (None, None)

Convert bytes to Unicode with error handling.
def to_unicode(data_str, encoding):
"""
Convert bytes to Unicode string with replacement error handling.
Args:
data_str (bytes): Data to convert
encoding (str): Character encoding
Returns:
str: Unicode string with invalid bytes replaced by U+FFFD
"""Usage Example:
from w3lib.encoding import to_unicode
# Convert with invalid bytes
data = b'Hello \xff World' # Invalid UTF-8 byte
result = to_unicode(data, 'utf-8')
# Returns: 'Hello \ufffd World' # \ufffd is the replacement character

DEFAULT_ENCODING_TRANSLATION = {
'ascii': 'cp1252', # Web compatibility
'big5': 'big5hkscs', # Extended Big5
'euc_kr': 'cp949', # Extended EUC-KR
'gb2312': 'gb18030', # Superset encoding
'gb_2312_80': 'gb18030', # Alternative name
'gbk': 'gb18030', # Superset encoding
'iso8859_11': 'cp874', # Thai encoding
'iso8859_9': 'cp1254', # Turkish encoding
'latin_1': 'cp1252', # Web compatibility
'macintosh': 'mac_roman', # Mac encoding
'shift_jis': 'cp932', # Japanese encoding
'tis_620': 'cp874', # Thai encoding
'win_1251': 'cp1251', # Windows Cyrillic
'windows_31j': 'cp932', # Japanese encoding
'win_31j': 'cp932', # Japanese encoding
'windows_874': 'cp874', # Thai encoding
'win_874': 'cp874', # Thai encoding
'x_sjis': 'cp932', # Japanese encoding
'zh_cn': 'gb18030', # Chinese encoding
}

For advanced encoding detection, integrate with external libraries:
# Using chardet
import chardet
def chardet_detector(data):
result = chardet.detect(data)
return result.get('encoding') if result['confidence'] > 0.7 else None
# Using BeautifulSoup's UnicodeDammit
from bs4 import UnicodeDammit
def bs4_detector(data):
return UnicodeDammit(data).original_encoding
# Use with html_to_unicode
encoding, content = html_to_unicode(
content_type_header=None,
html_body_str=raw_html,
auto_detect_fun=chardet_detector
)

Notes:
- Functions handle None input gracefully, returning None (assumed from the surrounding examples — confirm against the w3lib documentation)
- Body scanning checks only content before the <body> tag for performance (reconstructed from a garbled fragment — confirm)

Install with Tessl CLI
npx tessl i tessl/pypi-w3lib

Evals
scenario-1
scenario-2
scenario-3
scenario-4
scenario-5
scenario-6
scenario-7
scenario-8
scenario-9
scenario-10