Library of web-related functions for HTML manipulation, HTTP processing, URL handling, and encoding detection
```
npx @tessl/cli install tessl/pypi-w3lib@2.3.0
```

A comprehensive Python library providing essential web-related utility functions for HTML manipulation, HTTP header processing, URL handling, and character encoding detection. Originally developed as a foundational component of the Scrapy web scraping framework, w3lib offers production-tested utilities for web crawlers, data extraction tools, and content processing pipelines.
```
pip install w3lib
```

```python
import w3lib
```

Module-specific imports:

```python
from w3lib.html import replace_entities, remove_tags, get_base_url
from w3lib.http import basic_auth_header, headers_raw_to_dict
from w3lib.url import safe_url_string, url_query_parameter, canonicalize_url
from w3lib.encoding import html_to_unicode, resolve_encoding
from w3lib.util import to_unicode, to_bytes
```

Quick-start example:

```python
from w3lib.html import replace_entities, remove_tags, get_base_url
from w3lib.url import safe_url_string, url_query_parameter
from w3lib.http import basic_auth_header
from w3lib.encoding import html_to_unicode
# HTML processing - clean up HTML content
html = '<p>Price: &pound;100 <b>only!</b></p>'
clean_text = replace_entities(html)  # '<p>Price: £100 <b>only!</b></p>'
text_only = remove_tags(clean_text)  # 'Price: £100 only!'
# URL handling - make URLs safe and extract parameters
unsafe_url = 'http://example.com/search?q=hello world&price=£100'
safe_url = safe_url_string(unsafe_url)  # 'http://example.com/search?q=hello%20world&price=%C2%A3100'
query_param = url_query_parameter(safe_url, 'q') # 'hello world'
# HTTP utilities - create authentication headers
auth_header = basic_auth_header('user', 'password') # b'Basic dXNlcjpwYXNzd29yZA=='
# Encoding detection - convert HTML to Unicode
raw_html = b'<html><meta charset="utf-8"><body>Caf\xc3\xa9</body></html>'
encoding, unicode_html = html_to_unicode(None, raw_html)  # ('utf-8', '<html>...')
```

w3lib is organized into focused modules, each handling a specific web-processing task.
This modular design allows developers to import only the functionality they need while maintaining consistent interfaces and error handling across all components.
The `w3lib.html` module provides comprehensive HTML manipulation: entity replacement, tag removal, comment stripping, base URL extraction, and meta refresh parsing. It handles both string and bytes input with robust encoding support.

```python
def replace_entities(text, keep=(), remove_illegal=True, encoding='utf-8'): ...
def remove_tags(text, which_ones=(), keep=(), encoding=None): ...
def remove_comments(text, encoding=None): ...
def get_base_url(text, baseurl='', encoding='utf-8'): ...
def get_meta_refresh(text, baseurl='', encoding='utf-8', ignore_tags=('script', 'noscript')): ...
```
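A minimal sketch of these helpers on illustrative markup (expected results shown in comments):

```python
from w3lib.html import get_base_url, get_meta_refresh, remove_comments

html = '<html><head><!-- tracker --><base href="http://example.com/"></head></html>'
remove_comments(html)  # same markup with '<!-- tracker -->' stripped
get_base_url(html)     # 'http://example.com/' (taken from the <base> tag)

refresh = '<html><head><meta http-equiv="refresh" content="5;url=/next"></head></html>'
get_meta_refresh(refresh, baseurl='http://example.com/')
# (5.0, 'http://example.com/next'): delay in seconds plus the absolute redirect URL
```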
The `w3lib.http` module provides HTTP header processing utilities for converting between raw header formats and dictionaries, plus HTTP Basic Authentication header generation.

```python
def headers_raw_to_dict(headers_raw): ...
def headers_dict_to_raw(headers_dict): ...
def basic_auth_header(username, password, encoding='ISO-8859-1'): ...
```
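A brief sketch of header round-tripping and auth header generation (illustrative values; expected results in comments):

```python
from w3lib.http import basic_auth_header, headers_dict_to_raw, headers_raw_to_dict

raw = b'Content-Type: text/html\r\nSet-Cookie: a=1\r\nSet-Cookie: b=2'
headers = headers_raw_to_dict(raw)
# {b'Content-Type': [b'text/html'], b'Set-Cookie': [b'a=1', b'b=2']}

headers_dict_to_raw(headers)       # back to the raw bytes form above
basic_auth_header('user', 'pass')  # b'Basic dXNlcjpwYXNz'
```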
The `w3lib.url` module provides comprehensive URL processing: browser-compatible URL sanitization, query parameter manipulation, data URI parsing, and canonicalization, with support for various URL standards.

```python
def safe_url_string(url, encoding='utf8', path_encoding='utf8', quote_path=True): ...
def url_query_parameter(url, parameter, default=None, keep_blank_values=0): ...
def url_query_cleaner(url, parameterlist=(), sep='&', kvsep='=', remove=False, unique=True, keep_fragments=False): ...
def canonicalize_url(url, keep_blank_values=True, keep_fragments=False, encoding=None): ...
def parse_data_uri(uri): ...
```
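A short sketch of query cleaning, canonicalization, and data URI parsing (illustrative URLs; expected results in comments):

```python
from w3lib.url import canonicalize_url, parse_data_uri, url_query_cleaner

url = 'http://www.example.com/do?c=3&b=2&a=1&a=1'
url_query_cleaner(url, ('a', 'b'))  # 'http://www.example.com/do?b=2&a=1' (duplicates dropped)
canonicalize_url(url)               # 'http://www.example.com/do?a=1&a=1&b=2&c=3' (keys sorted)

parse_data_uri('data:text/plain;charset=utf-8,Hello')
# ParseDataURIResult(media_type='text/plain',
#                    media_type_parameters={'charset': 'utf-8'}, data=b'Hello')
```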
The `w3lib.encoding` module detects character encodings from HTTP Content-Type headers, HTML meta tags, XML declarations, and byte order marks, with smart fallback handling and encoding alias resolution.

```python
def html_to_unicode(content_type_header, html_body_str, default_encoding='utf8', auto_detect_fun=None): ...
def http_content_type_encoding(content_type): ...
def html_body_declared_encoding(html_body_str): ...
def resolve_encoding(encoding_alias): ...
```
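A sketch of the lower-level detection helpers; note that aliases resolve following the HTML5 rules, so latin-1 maps to windows-1252 (expected results in comments):

```python
from w3lib.encoding import (
    html_body_declared_encoding,
    http_content_type_encoding,
    resolve_encoding,
)

http_content_type_encoding('text/html; charset=utf-8')  # 'utf-8'
html_body_declared_encoding('<meta charset="utf-8">')   # 'utf-8'
resolve_encoding('latin1')  # 'cp1252' (HTML5 treats latin-1 as windows-1252)
```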
The `w3lib.util` module provides core utility functions for converting between string and bytes representations with robust encoding support and error handling.

```python
def to_unicode(text, encoding=None, errors='strict'): ...
def to_bytes(text, encoding=None, errors='strict'): ...
```
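A minimal sketch; both helpers default to UTF-8 and pass through values already of the target type:

```python
from w3lib.util import to_bytes, to_unicode

to_unicode(b'caf\xc3\xa9')   # 'café' (decoded as UTF-8 by default)
to_unicode('already a str')  # returned unchanged
to_bytes('café')             # b'caf\xc3\xa9'
```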
Type aliases used across the library:

```python
from typing import Any, Mapping, MutableMapping, NamedTuple, Sequence, Union

StrOrBytes = Union[str, bytes]
# HTTP header types
HeadersDictInput = Mapping[bytes, Union[Any, Sequence[bytes]]]
HeadersDictOutput = MutableMapping[bytes, list[bytes]]
# Data URI parsing result
class ParseDataURIResult(NamedTuple):
    media_type: str
    media_type_parameters: dict[str, str]
    data: bytes
```

w3lib functions follow consistent error handling patterns:
- `TypeError` for invalid input types (e.g., values that are neither `str` nor `bytes`)
- `ValueError` for malformed input
- Graceful fallbacks (such as returning a default value or `None`) rather than raising exceptions for recoverable conditions
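A short illustration of these patterns (illustrative inputs; expected behaviour in comments):

```python
from w3lib.url import url_query_parameter
from w3lib.util import to_unicode

# Lookup-style helpers fall back to a default instead of raising
url_query_parameter('http://example.com/?a=1', 'b')               # None
url_query_parameter('http://example.com/?a=1', 'b', default='0')  # '0'

# Unsupported input types raise TypeError eagerly
try:
    to_unicode(123)  # neither str nor bytes
except TypeError as exc:
    print(exc)
```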