Character encoding aliases for legacy web content implementing the WHATWG Encoding standard
Streaming interfaces for processing large amounts of data incrementally. Provides both "pull"-based (iterator) and "push"-based (incremental) processing patterns for efficient handling of large files or data streams.
Iterator-based decoder that consumes input on-demand and yields Unicode strings.
def iter_decode(input: Iterable[bytes], fallback_encoding: Encoding | str, errors: str = 'replace') -> tuple[Iterator[str], Encoding]:
"""
Pull-based decoder for iterables of byte strings.
Args:
input: Iterable of byte strings (consumed on-demand)
fallback_encoding: Encoding object or label string if no BOM detected
errors: Error handling strategy ('replace', 'strict', 'ignore', etc.)
Returns:
Tuple of (output_iterator, encoding_used)
The output iterator yields Unicode strings as input is consumed
Raises:
LookupError: If fallback_encoding label is unknown
"""

Iterator-based encoder that consumes Unicode strings and yields bytes.
def iter_encode(input: Iterable[str], encoding: Encoding | str = UTF8, errors: str = 'strict') -> Iterator[bytes]:
"""
Pull-based encoder for iterables of Unicode strings.
Args:
input: Iterable of Unicode strings
encoding: Encoding object or label string (defaults to UTF-8)
errors: Error handling strategy ('strict', 'replace', 'ignore', etc.)
Returns:
Iterator yielding byte strings
Raises:
LookupError: If encoding label is unknown
"""

Stateful decoder for incremental processing where data is fed in chunks.
class IncrementalDecoder:
"""
Push-based decoder for incremental processing.
Attributes:
encoding: The detected/used Encoding object, or None if not yet determined
"""
def __init__(self, fallback_encoding: Encoding | str, errors: str = 'replace') -> None:
"""
Initialize incremental decoder.
Args:
fallback_encoding: Encoding object or label string if no BOM detected
errors: Error handling strategy ('replace', 'strict', 'ignore', etc.)
Raises:
LookupError: If fallback_encoding label is unknown
"""
def decode(self, input: bytes, final: bool = False) -> str:
"""
Decode one chunk of input.
Args:
input: Byte string chunk to decode
final: True if this is the last chunk (flushes any buffered data)
Returns:
Decoded Unicode string for this chunk
"""

Stateful encoder for incremental processing where data is fed in chunks.
class IncrementalEncoder:
"""Push-based encoder for incremental processing."""
def __init__(self, encoding: Encoding | str = UTF8, errors: str = 'strict') -> None:
"""
Initialize incremental encoder.
Args:
encoding: Encoding object or label string (defaults to UTF-8)
errors: Error handling strategy ('strict', 'replace', 'ignore', etc.)
Raises:
LookupError: If encoding label is unknown
"""
def encode(self, input: str, final: bool = False) -> bytes:
"""
Encode one chunk of input.
Args:
input: Unicode string chunk to encode
final: True if this is the last chunk (flushes any buffered data)
Returns:
Encoded byte string for this chunk
"""

import webencodings
# Pull-based decoding with iterator
data_chunks = [b'\xef\xbb\xbf', b'Hello ', b'World']
output_iter, encoding = webencodings.iter_decode(data_chunks, 'utf-8')
print(f"Detected encoding: {encoding.name}") # 'utf-8'
for text_chunk in output_iter:
print(repr(text_chunk)) # 'Hello ', 'World'
# Pull-based encoding
text_chunks = ['Hello ', 'World', '!']
byte_iter = webencodings.iter_encode(text_chunks, 'utf-8')
for byte_chunk in byte_iter:
print(repr(byte_chunk)) # b'Hello ', b'World', b'!'
# Push-based incremental decoding
decoder = webencodings.IncrementalDecoder('utf-8')
# Feed data in chunks
result1 = decoder.decode(b'\xef\xbb\xbfHel')
print(repr(result1)) # 'Hel'
print(decoder.encoding.name) # 'utf-8'
result2 = decoder.decode(b'lo Wor')
print(repr(result2)) # 'lo Wor'
result3 = decoder.decode(b'ld', final=True)
print(repr(result3)) # 'ld'
# Push-based incremental encoding
encoder = webencodings.IncrementalEncoder('utf-8')
data1 = encoder.encode('Hello ')
print(repr(data1)) # b'Hello '
data2 = encoder.encode('World', final=True)
print(repr(data2)) # b'World'
# Handle BOM detection with streaming
decoder = webencodings.IncrementalDecoder('iso-8859-1')
# Feed just the BOM first
result1 = decoder.decode(b'\xff\xfe')
print(repr(result1)) # ''
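print(decoder.encoding) # <Encoding utf-16le> (a complete 2-byte BOM is enough to decide; it is only None while fewer than 3 bytes with no recognizable BOM have been fed)
# Feed the UTF-16LE payload that follows the BOM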
result2 = decoder.decode(b'H\x00e\x00')
print(repr(result2)) # 'He'
print(decoder.encoding.name) # 'utf-16le'

Install with Tessl CLI
npx tessl i tessl/pypi-webencodings