Fixes mojibake and other problems with Unicode, after the fact
---
Functions for processing files and handling bytes of unknown encoding, including streaming file processing and encoding detection utilities.
Process text files with automatic encoding detection and line-by-line text fixing.
def fix_file(
input_file: TextIO | BinaryIO,
encoding: str | None = None,
config: TextFixerConfig | None = None,
**kwargs: Any
) -> Iterator[str]:
"""
Fix text found in a file with streaming processing.
Processes file line by line, applying text fixes to each line.
Handles both text and binary file objects, with encoding detection
for binary files when encoding is not specified.
Args:
input_file: File object opened in text or binary mode
encoding: Encoding name for binary files, None for detection
config: Configuration object, or None for defaults
**kwargs: Individual config options
Yields:
Fixed lines of text as strings
Examples:
>>> with open('messy.txt', 'r') as f:
... for line in fix_file(f):
... print(line, end='')
>>> with open('unknown.txt', 'rb') as f:
... for line in fix_file(f, encoding='utf-8'):
... print(line, end='')
"""Attempt to decode bytes of unknown encoding using heuristic detection.
def guess_bytes(bstring: bytes) -> tuple[str, str]:
"""
Guess reasonable strategy for decoding bytes in unknown encoding.
WARNING: This is not the recommended way to use ftfy. ftfy is not
designed as an encoding detector. Use only when encoding is truly
unknowable and you need a best-effort decode.
Tries encodings in order: UTF-16 with BOM, UTF-8, utf-8-variants,
MacRoman (if CR line breaks), sloppy-windows-1252.
Args:
bstring: Bytes to decode
Returns:
Tuple of (decoded_string, detected_encoding)
Raises:
UnicodeError: If input is already a string
Examples:
>>> text, encoding = guess_bytes(b'caf\\xc3\\xa9')
>>> print(f"Text: {text}, Encoding: {encoding}")
Text: café, Encoding: utf-8
>>> text, encoding = guess_bytes(b'\\xff\\xfecafe') # UTF-16 BOM
>>> print(f"Encoding: {encoding}")
Encoding: utf-16
"""from ftfy import fix_file, TextFixerConfig
import sys
# Process file with default settings
with open('input.txt', 'r', encoding='utf-8') as infile:
with open('output.txt', 'w', encoding='utf-8') as outfile:
for line in fix_file(infile):
outfile.write(line)
# Process with custom configuration
config = TextFixerConfig(uncurl_quotes=False, fix_encoding=True)
with open('input.txt', 'r') as infile:
for line in fix_file(infile, config=config):
print(line, end='')
from ftfy import fix_file
# Process binary file with known encoding
with open('data.txt', 'rb') as binfile:
for line in fix_file(binfile, encoding='latin-1'):
print(line, end='')
# Process binary file with encoding detection (risky)
with open('unknown.txt', 'rb') as binfile:
for line in fix_file(binfile, encoding=None): # Will use guess_bytes
print(line, end='')
import sys
from ftfy import fix_file
# Process stdin to stdout
for line in fix_file(sys.stdin):
sys.stdout.write(line)
# Process stdin as binary with encoding detection
for line in fix_file(sys.stdin.buffer, encoding=None):
sys.stdout.write(line)
import os
from ftfy import fix_file, TextFixerConfig
def process_directory(input_dir, output_dir, config=None):
    """Fix all .txt files in *input_dir*, writing repaired copies to *output_dir*.

    Args:
        input_dir: Directory containing the input .txt files.
        output_dir: Destination directory; created if it does not exist.
        config: Optional TextFixerConfig; defaults to TextFixerConfig().
    """
    if config is None:
        config = TextFixerConfig()
    # Creating the destination up front prevents open(..., 'w') from
    # failing with FileNotFoundError when output_dir is missing.
    os.makedirs(output_dir, exist_ok=True)
    for filename in os.listdir(input_dir):
        if filename.endswith('.txt'):
            input_path = os.path.join(input_dir, filename)
            output_path = os.path.join(output_dir, filename)
            # Open the input in binary mode so fix_file can detect the
            # encoding itself; output is always written as UTF-8.
            with open(input_path, 'rb') as infile:
                with open(output_path, 'w', encoding='utf-8') as outfile:
                    for line in fix_file(infile, config=config):
                        outfile.write(line)
# Process with conservative settings
conservative = TextFixerConfig(
fix_encoding=True,
unescape_html=False,
restore_byte_a0=False
)
process_directory('input/', 'output/', conservative)
from ftfy import guess_bytes
# Detect UTF-8
utf8_bytes = "café".encode('utf-8')
text, encoding = guess_bytes(utf8_bytes)
print(f"Detected: {encoding}, Text: {text}") # utf-8, café
# Detect UTF-16 with BOM
utf16_bytes = "hello".encode('utf-16')
text, encoding = guess_bytes(utf16_bytes)
print(f"Detected: {encoding}") # utf-16
# Detect MacRoman (by CR line breaks)
macroman_bytes = "line1\rline2".encode('macroman')
text, encoding = guess_bytes(macroman_bytes)
print(f"Detected: {encoding}") # macroman
# Default to sloppy-windows-1252
mystery_bytes = bytes([0x80, 0x81, 0x82]) # C1 controls
text, encoding = guess_bytes(mystery_bytes)
print(f"Detected: {encoding}") # sloppy-windows-1252from ftfy import fix_file, guess_bytes
# Handle encoding detection errors
def safe_guess_bytes(data):
    """Guess the encoding of *data* without letting a decode error escape.

    Delegates to ftfy.guess_bytes; if that raises UnicodeDecodeError,
    falls back to ftfy's lenient 'sloppy-windows-1252' codec, replacing
    any bytes that still cannot be mapped.

    Returns:
        Tuple of (decoded_string, encoding_name).
    """
    try:
        return guess_bytes(data)
    except UnicodeDecodeError:
        # Fallback to sloppy decoding
        fallback = 'sloppy-windows-1252'
        return data.decode(fallback, errors='replace'), fallback
# Handle file processing errors
def safe_fix_file(filepath, output_path):
    """Fix *filepath* into *output_path*, falling back to encoding detection.

    First tries to read the file as UTF-8 text; if a UnicodeDecodeError is
    raised (typically while fix_file iterates the lines), the file is
    reopened in binary mode and ftfy's encoding detection takes over.

    Args:
        filepath: Path of the input file.
        output_path: Path of the UTF-8 output file to write.
    """
    try:
        # Try UTF-8 first. Using 'with' (instead of a bare open() that was
        # never closed) guarantees the handle is released even when
        # decoding fails partway through the file.
        with open(filepath, 'r', encoding='utf-8') as infile:
            with open(output_path, 'w', encoding='utf-8') as outfile:
                for line in fix_file(infile):
                    outfile.write(line)
    except UnicodeDecodeError:
        # Fall back to binary mode with detection
        with open(filepath, 'rb') as infile:
            with open(output_path, 'w', encoding='utf-8') as outfile:
                for line in fix_file(infile, encoding=None):
                    outfile.write(line)
from ftfy import fix_file, TextFixerConfig
# Process large files with memory efficiency
def process_large_file(input_path, output_path, chunk_size=1024*1024):
    """Stream *input_path* through ftfy into *output_path*.

    Caps ftfy's decode window at *chunk_size* bytes via
    TextFixerConfig(max_decode_length=...) so memory use stays bounded
    on very large files.
    """
    cfg = TextFixerConfig(max_decode_length=chunk_size)
    # Binary input lets fix_file handle encoding; output is UTF-8 text.
    with open(input_path, 'rb') as src, \
         open(output_path, 'w', encoding='utf-8') as dest:
        dest.writelines(fix_file(src, config=cfg))
# Disable explanations for performance on large files
fast_config = TextFixerConfig(explain=False, max_decode_length=500000)
with open('huge_file.txt', 'rb') as infile:
for line in fix_file(infile, config=fast_config):
# Process line...
pass
from ftfy import fix_file, TextFixerConfig
import gzip
import json
def process_jsonl_gz(input_path, output_path):
    """Fix the text of a gzipped JSON-lines file, record by record.

    HTML unescaping is disabled so JSON escape sequences survive intact.
    Each fixed line is parsed and re-serialized to guarantee valid JSON;
    lines that fail to parse are copied through unchanged.
    """
    cfg = TextFixerConfig(unescape_html=False)  # Preserve JSON
    with gzip.open(input_path, 'rt', encoding='utf-8') as source:
        with gzip.open(output_path, 'wt', encoding='utf-8') as sink:
            for fixed in fix_file(source, config=cfg):
                try:
                    record = json.loads(fixed.strip())
                except json.JSONDecodeError:
                    # Write problematic line as-is
                    sink.write(fixed)
                else:
                    json.dump(record, sink, ensure_ascii=False)
                    sink.write('\n')
# Process log files with terminal escapes
def clean_log_file(input_path, output_path):
"""Clean log file by removing terminal escapes."""
config = TextFixerConfig(
remove_terminal_escapes=True,
fix_encoding=True,
unescape_html=False, # Logs may contain < >
uncurl_quotes=False # Preserve original quotes in logs
)
with open(input_path, 'rb') as infile:
with open(output_path, 'w', encoding='utf-8') as outfile:
for line in fix_file(infile, config=config):
outfile.write(line)
Install with Tessl CLI
npx tessl i tessl/pypi-ftfy