tessl/pypi-webencodings

Character encoding aliases for legacy web content, implementing the WHATWG Encoding standard

Overview

Eval results

Files

Streaming Processing

Name: tessl/pypi-webencodings
Author: tessl

Streaming interfaces for processing large amounts of data incrementally. Provides both "pull"-based (iterator) and "push"-based (incremental) processing patterns for efficient handling of large files or data streams.

Capabilities

Pull-based Decoding

Iterator-based decoder that consumes input on-demand and yields Unicode strings.

def iter_decode(input: Iterable[bytes], fallback_encoding: Encoding | str, errors: str = 'replace') -> tuple[Iterator[str], Encoding]:
    """
    Decode an iterable of byte strings lazily ("pull"-based).

    A BOM at the start of the input takes precedence over
    ``fallback_encoding`` and is not part of the decoded output.

    Args:
        input: Iterable of byte strings (consumed on demand)
        fallback_encoding: Encoding object or label string, used only if
            no BOM is detected
        errors: Error handling strategy ('replace', 'strict', 'ignore', etc.)

    Returns:
        Tuple of (output_iterator, encoding_used).
        The output iterator yields Unicode strings as input is consumed;
        encoding_used is the Encoding object selected for the stream.

    Raises:
        LookupError: If fallback_encoding label is unknown
    """

Pull-based Encoding

Iterator-based encoder that consumes Unicode strings and yields bytes.

def iter_encode(input: Iterable[str], encoding: Encoding | str = UTF8, errors: str = 'strict') -> Iterator[bytes]:
    """
    Encode an iterable of Unicode strings lazily ("pull"-based).

    Args:
        input: Iterable of Unicode strings
        encoding: Encoding object or label string (defaults to UTF-8)
        errors: Error handling strategy ('strict', 'replace', 'ignore', etc.)

    Returns:
        Iterator yielding the encoded byte strings

    Raises:
        LookupError: If encoding label is unknown
    """

Push-based Decoding

Stateful decoder for incremental processing where data is fed in chunks.

class IncrementalDecoder:
    """
    Push-based decoder for incremental processing.

    Feed byte chunks to ``decode`` as they arrive. A BOM at the start of
    the stream selects the encoding; otherwise the fallback encoding given
    to the constructor is used.

    Attributes:
        encoding: The detected/used Encoding object, or None if not yet
            determined (too little input has been seen so far to decide
            whether the stream starts with a BOM)
    """
    
    def __init__(self, fallback_encoding: Encoding | str, errors: str = 'replace') -> None:
        """
        Initialize incremental decoder.

        Args:
            fallback_encoding: Encoding object or label string, used only
                if no BOM is detected
            errors: Error handling strategy ('replace', 'strict', 'ignore', etc.)

        Raises:
            LookupError: If fallback_encoding label is unknown
        """
    
    def decode(self, input: bytes, final: bool = False) -> str:
        """
        Decode one chunk of input.

        Args:
            input: Byte string chunk to decode
            final: True if this is the last chunk (flushes any buffered data)

        Returns:
            Decoded Unicode string for this chunk. May be an empty string
            while input is still being buffered (e.g. during BOM detection).
        """

Push-based Encoding

Stateful encoder for incremental processing where data is fed in chunks.

class IncrementalEncoder:
    """
    Push-based encoder for incremental processing.

    Feed Unicode string chunks to ``encode`` as they become available and
    pass ``final=True`` with the last chunk.
    """
    
    def __init__(self, encoding: Encoding | str = UTF8, errors: str = 'strict') -> None:
        """
        Initialize incremental encoder.

        Args:
            encoding: Encoding object or label string (defaults to UTF-8)
            errors: Error handling strategy ('strict', 'replace', 'ignore', etc.)

        Raises:
            LookupError: If encoding label is unknown
        """
    
    def encode(self, input: str, final: bool = False) -> bytes:
        """
        Encode one chunk of input.

        Args:
            input: Unicode string chunk to encode
            final: True if this is the last chunk (flushes any buffered data)

        Returns:
            Encoded byte string for this chunk
        """

Usage Examples

import webencodings

# Pull-based decoding with iterator
data_chunks = [b'\xef\xbb\xbf', b'Hello ', b'World']
output_iter, encoding = webencodings.iter_decode(data_chunks, 'utf-8')

print(f"Detected encoding: {encoding.name}")  # 'utf-8'
for text_chunk in output_iter:
    print(repr(text_chunk))  # 'Hello ', 'World'

# Pull-based encoding
text_chunks = ['Hello ', 'World', '!']
byte_iter = webencodings.iter_encode(text_chunks, 'utf-8')

for byte_chunk in byte_iter:
    print(repr(byte_chunk))  # b'Hello ', b'World', b'!'

# Push-based incremental decoding
decoder = webencodings.IncrementalDecoder('utf-8')

# Feed data in chunks
result1 = decoder.decode(b'\xef\xbb\xbfHel')
print(repr(result1))  # 'Hel'
print(decoder.encoding.name)  # 'utf-8'

result2 = decoder.decode(b'lo Wor')
print(repr(result2))  # 'lo Wor'

result3 = decoder.decode(b'ld', final=True)
print(repr(result3))  # 'ld'

# Push-based incremental encoding
encoder = webencodings.IncrementalEncoder('utf-8')

data1 = encoder.encode('Hello ')
print(repr(data1))  # b'Hello '

data2 = encoder.encode('World', final=True)
print(repr(data2))  # b'World'

# Handle BOM detection with streaming
decoder = webencodings.IncrementalDecoder('iso-8859-1')

# Feed only the first byte of the two-byte UTF-16LE BOM (FF FE).
# A complete BOM would settle the encoding immediately, so the BOM has to
# be split across chunks to observe the "not yet determined" state.
result1 = decoder.decode(b'\xff')
print(repr(result1))  # ''
print(decoder.encoding)  # None (not enough data yet)

# Feed the rest of the BOM plus payload to complete BOM detection;
# the BOM overrides the 'iso-8859-1' fallback.
result2 = decoder.decode(b'\xfeH\x00e\x00')
print(repr(result2))  # 'He'
print(decoder.encoding.name)  # 'utf-16le'

Install with Tessl CLI