Library of web-related functions for HTML manipulation, HTTP processing, URL handling, and encoding detection.

Comprehensive URL processing and manipulation functions supporting browser-compatible URL sanitization, query parameter handling, data URI parsing, and canonicalization according to multiple web standards.
Make URLs safe for browsers by applying proper encoding and normalization according to web standards.
def safe_url_string(url, encoding='utf8', path_encoding='utf8', quote_path=True):
"""
Make URL safe for browsers by applying proper encoding and normalization.
Args:
url (str|bytes): URL to make safe
encoding (str): URL encoding for query parameters (default: 'utf8')
path_encoding (str): Path component encoding (default: 'utf8')
quote_path (bool): Whether to quote path component (default: True)
Returns:
str: Browser-safe URL string
"""
def safe_download_url(url, encoding='utf8', path_encoding='utf8'):
"""
Make URL safe for downloading by removing fragments and normalizing path.
Args:
url (str|bytes): URL to process
encoding (str): URL encoding (default: 'utf8')
path_encoding (str): Path encoding (default: 'utf8')
Returns:
str: Safe download URL without fragments
"""

Usage Examples:
from w3lib.url import safe_url_string, safe_download_url
# Make URL browser-safe with proper encoding
unsafe_url = 'http://example.com/café/über?q=hello world'
safe_url = safe_url_string(unsafe_url)
# Returns: 'http://example.com/caf%C3%A9/%C3%BCber?q=hello%20world'
# Prepare URL for downloading (removes fragments, normalizes path)
download_url = safe_download_url('http://example.com/path/../file.pdf#section1')
# Returns: 'http://example.com/file.pdf'

Check if a string represents a valid URL.
def is_url(text):
"""
Check if text is a URL (file, http, or https scheme).
Args:
text (str): Text to check
Returns:
bool: True if text is a URL with supported scheme
"""

Extract and manipulate URL query parameters with extensive filtering options.
def url_query_parameter(url, parameter, default=None, keep_blank_values=0):
"""
Extract parameter value from URL query string.
Args:
url (str|bytes): URL with query parameters
parameter (str): Parameter name to extract
default (str|None): Default value if parameter not found
keep_blank_values (bool|int): Whether to preserve blank values (default: False)
Returns:
str|None: Parameter value or default
"""
def url_query_cleaner(url, parameterlist=(), sep='&', kvsep='=', remove=False, unique=True, keep_fragments=False):
"""
Clean URL query parameters, keeping or removing specified ones.
Args:
url (str|bytes): URL to clean
parameterlist (str|bytes|Sequence): Parameters to keep/remove
sep (str): Parameter separator (default: '&')
kvsep (str): Key-value separator (default: '=')
remove (bool): Remove specified parameters instead of keeping (default: False)
unique (bool): Remove duplicate parameters (default: True)
keep_fragments (bool): Preserve URL fragments (default: False)
Returns:
str: Cleaned URL with filtered parameters
"""

Usage Examples:
from w3lib.url import url_query_parameter, url_query_cleaner
url = 'http://example.com/search?q=python&category=web&sort=date&q=scrapy'
# Extract parameter value
query = url_query_parameter(url, 'q') # 'python' (first occurrence)
category = url_query_parameter(url, 'category') # 'web'
missing = url_query_parameter(url, 'missing', 'default') # 'default'
# Keep only specific parameters
clean_url = url_query_cleaner(url, ['q', 'sort'])
# Returns: 'http://example.com/search?q=python&sort=date'
# Remove specific parameters
without_sort = url_query_cleaner(url, ['sort'], remove=True)
# Returns: 'http://example.com/search?q=python&category=web'
# Keep duplicates
with_dups = url_query_cleaner(url, ['q'], unique=False)
# Returns: 'http://example.com/search?q=python&q=scrapy'

Add or modify URL parameters while preserving existing ones.
def add_or_replace_parameter(url, name, new_value):
"""
Add or replace a single URL parameter.
Args:
url (str): URL to modify
name (str): Parameter name
new_value (str): Parameter value
Returns:
str: Modified URL with parameter added/replaced
"""
def add_or_replace_parameters(url, new_parameters):
"""
Add or replace multiple URL parameters.
Args:
url (str): URL to modify
new_parameters (dict[str, str]): Parameters to add/replace
Returns:
str: Modified URL with parameters added/replaced
"""

Usage Examples:
from w3lib.url import add_or_replace_parameter, add_or_replace_parameters
base_url = 'http://example.com/search?q=python'
# Add single parameter
with_sort = add_or_replace_parameter(base_url, 'sort', 'date')
# Returns: 'http://example.com/search?q=python&sort=date'
# Replace existing parameter
new_query = add_or_replace_parameter(base_url, 'q', 'scrapy')
# Returns: 'http://example.com/search?q=scrapy'
# Add multiple parameters
params = {'sort': 'date', 'limit': '10', 'q': 'web-scraping'}
full_url = add_or_replace_parameters(base_url, params)
# Returns: 'http://example.com/search?q=web-scraping&sort=date&limit=10'

Convert between file system paths and file:// URIs.
def path_to_file_uri(path):
"""
Convert local filesystem path to file:// URI.
Args:
path (str): Filesystem path
Returns:
str: File URI following RFC standards
"""
def file_uri_to_path(uri):
"""
Convert file:// URI to local filesystem path.
Args:
uri (str): File URI
Returns:
str: Local filesystem path
"""
def any_to_uri(uri_or_path):
"""
Convert path to file URI or return URI unchanged.
Args:
uri_or_path (str): URI or filesystem path
Returns:
str: URI (file:// for paths, unchanged for URIs)
"""

Usage Examples:
from w3lib.url import path_to_file_uri, file_uri_to_path, any_to_uri
# Convert path to URI
file_uri = path_to_file_uri('/home/user/document.pdf')
# Returns: 'file:///home/user/document.pdf'
# Convert URI to path
local_path = file_uri_to_path('file:///home/user/document.pdf')
# Returns: '/home/user/document.pdf'
# Smart conversion
uri1 = any_to_uri('/home/user/file.txt') # 'file:///home/user/file.txt'
uri2 = any_to_uri('http://example.com')  # 'http://example.com'

Parse data: URIs into their components according to RFC 2397.
def parse_data_uri(uri):
"""
Parse data: URI into components.
Args:
uri (str|bytes): Data URI to parse
Returns:
ParseDataURIResult: Named tuple with media_type, media_type_parameters, data
Raises:
ValueError: If URI is malformed or not a data URI
"""

Usage Example:
from w3lib.url import parse_data_uri
# Parse data URI
data_uri = 'data:text/plain;charset=utf-8;base64,SGVsbG8gV29ybGQ='
result = parse_data_uri(data_uri)
print(result.media_type) # 'text/plain'
print(result.media_type_parameters) # {'charset': 'utf-8'}
print(result.data) # b'Hello World'
# Parse simple data URI
simple_uri = 'data:,Hello%20World'
result = parse_data_uri(simple_uri)
print(result.media_type) # 'text/plain'
print(result.data)  # b'Hello World'

Normalize URLs for consistent comparison and caching.
def canonicalize_url(url, keep_blank_values=True, keep_fragments=False, encoding=None):
"""
Canonicalize URL by sorting parameters and normalizing encoding.
Args:
url (str|bytes|ParseResult): URL to canonicalize
keep_blank_values (bool): Preserve blank parameter values (default: True)
keep_fragments (bool): Preserve URL fragments (default: False)
encoding (str|None): Character encoding (default: UTF-8)
Returns:
str: Canonicalized URL with sorted parameters and normalized encoding
"""

Usage Example:
from w3lib.url import canonicalize_url
# Canonicalize URL with sorted parameters and normalized encoding
messy_url = 'http://example.com/search?c=3&b=5&b=2&a=50&blank='
canonical = canonicalize_url(messy_url)
# Returns: 'http://example.com/search?a=50&b=2&b=5&blank=&c=3'
# Remove fragments
with_fragment = 'http://example.com/page?q=test#section1'
no_fragment = canonicalize_url(with_fragment, keep_fragments=False)
# Returns: 'http://example.com/page?q=test'

class ParseDataURIResult(NamedTuple):
"""Result of parsing a data: URI."""
media_type: str # MIME type (e.g., 'text/plain')
media_type_parameters: dict[str, str] # MIME parameters (e.g., {'charset': 'utf-8'})
data: bytes  # Decoded data content

These functions are used internally but may be useful for advanced use cases:
def parse_url(url, encoding=None):
"""
Parse URL into ParseResult components.
Args:
url (str|bytes|ParseResult): URL to parse
encoding (str|None): Character encoding
Returns:
ParseResult: Parsed URL components (scheme, netloc, path, params, query, fragment)
"""
def parse_qsl_to_bytes(qs, keep_blank_values=False):
"""
Parse query string returning bytes pairs.
Args:
qs (str): Query string to parse
keep_blank_values (bool): Preserve blank values (default: False)
Returns:
list[tuple[bytes, bytes]]: Query parameter pairs as bytes
"""

Raises:
ValueError: if the input is malformed
ValueError: if the format is invalid

The safe_url_string function ensures compatibility with:
This multi-standard approach ensures URLs work correctly across different browsers, servers, and URL processing libraries.
Install with Tessl CLI
npx tessl i tessl/pypi-w3lib

Evals
scenario-1
scenario-2
scenario-3
scenario-4
scenario-5
scenario-6
scenario-7
scenario-8
scenario-9
scenario-10