Fixes mojibake and other problems with Unicode, after the fact
---
Functions for processing files and handling bytes of unknown encoding, including streaming file processing and encoding detection utilities.
Process text files with automatic encoding detection and line-by-line text fixing.
def fix_file(
input_file: TextIO | BinaryIO,
encoding: str | None = None,
config: TextFixerConfig | None = None,
**kwargs: Any
) -> Iterator[str]:
"""
Fix text found in a file with streaming processing.
Processes file line by line, applying text fixes to each line.
Handles both text and binary file objects, with encoding detection
for binary files when encoding is not specified.
Args:
input_file: File object opened in text or binary mode
encoding: Encoding name for binary files, None for detection
config: Configuration object, or None for defaults
**kwargs: Individual config options
Yields:
Fixed lines of text as strings
Examples:
>>> with open('messy.txt', 'r') as f:
... for line in fix_file(f):
... print(line, end='')
>>> with open('unknown.txt', 'rb') as f:
... for line in fix_file(f, encoding='utf-8'):
... print(line, end='')
"""Attempt to decode bytes of unknown encoding using heuristic detection.
def guess_bytes(bstring: bytes) -> tuple[str, str]:
"""
Guess reasonable strategy for decoding bytes in unknown encoding.
WARNING: This is not the recommended way to use ftfy. ftfy is not
designed as an encoding detector. Use only when encoding is truly
unknowable and you need a best-effort decode.
Tries encodings in order: UTF-16 with BOM, UTF-8, utf-8-variants,
MacRoman (if CR line breaks), sloppy-windows-1252.
Args:
bstring: Bytes to decode
Returns:
Tuple of (decoded_string, detected_encoding)
Raises:
UnicodeError: If input is already a string
Examples:
>>> text, encoding = guess_bytes(b'caf\\xc3\\xa9')
>>> print(f"Text: {text}, Encoding: {encoding}")
Text: café, Encoding: utf-8
>>> text, encoding = guess_bytes(b'\\xff\\xfecafe') # UTF-16 BOM
>>> print(f"Encoding: {encoding}")
Encoding: utf-16
"""from ftfy import fix_file, TextFixerConfig
import sys
# Process file with default settings
with open('input.txt', 'r', encoding='utf-8') as infile:
with open('output.txt', 'w', encoding='utf-8') as outfile:
for line in fix_file(infile):
outfile.write(line)
# Process with custom configuration
config = TextFixerConfig(uncurl_quotes=False, fix_encoding=True)
with open('input.txt', 'r') as infile:
for line in fix_file(infile, config=config):
print(line, end='')
from ftfy import fix_file
# Process binary file with known encoding
with open('data.txt', 'rb') as binfile:
for line in fix_file(binfile, encoding='latin-1'):
print(line, end='')
# Process binary file with encoding detection (risky)
with open('unknown.txt', 'rb') as binfile:
for line in fix_file(binfile, encoding=None): # Will use guess_bytes
print(line, end='')
import sys
from ftfy import fix_file
# Process stdin to stdout
for line in fix_file(sys.stdin):
sys.stdout.write(line)
# Process stdin as binary with encoding detection
for line in fix_file(sys.stdin.buffer, encoding=None):
sys.stdout.write(line)
import os
from ftfy import fix_file, TextFixerConfig
def process_directory(input_dir, output_dir, config=None):
    """Fix all .txt files in *input_dir*, writing repaired copies to *output_dir*.

    Args:
        input_dir: Directory containing the input .txt files.
        output_dir: Destination directory; created if it does not exist.
        config: Optional TextFixerConfig; defaults to TextFixerConfig().
    """
    if config is None:
        config = TextFixerConfig()
    # Creating the destination up front prevents open(..., 'w') from
    # failing with FileNotFoundError when output_dir is missing.
    os.makedirs(output_dir, exist_ok=True)
    for filename in os.listdir(input_dir):
        if filename.endswith('.txt'):
            input_path = os.path.join(input_dir, filename)
            output_path = os.path.join(output_dir, filename)
            # Open the input in binary mode so fix_file can detect the
            # encoding itself; output is always written as UTF-8.
            with open(input_path, 'rb') as infile:
                with open(output_path, 'w', encoding='utf-8') as outfile:
                    for line in fix_file(infile, config=config):
                        outfile.write(line)
# Process with conservative settings
conservative = TextFixerConfig(
fix_encoding=True,
unescape_html=False,
restore_byte_a0=False
)
process_directory('input/', 'output/', conservative)
from ftfy import guess_bytes
# Detect UTF-8
utf8_bytes = "café".encode('utf-8')
text, encoding = guess_bytes(utf8_bytes)
print(f"Detected: {encoding}, Text: {text}") # utf-8, café
# Detect UTF-16 with BOM
utf16_bytes = "hello".encode('utf-16')
text, encoding = guess_bytes(utf16_bytes)
print(f"Detected: {encoding}") # utf-16
# Detect MacRoman (by CR line breaks)
macroman_bytes = "line1\rline2".encode('macroman')
text, encoding = guess_bytes(macroman_bytes)
print(f"Detected: {encoding}") # macroman
# Default to sloppy-windows-1252
mystery_bytes = bytes([0x80, 0x81, 0x82]) # C1 controls
text, encoding = guess_bytes(mystery_bytes)
print(f"Detected: {encoding}") # sloppy-windows-1252from ftfy import fix_file, guess_bytes
# Handle encoding detection errors
def safe_guess_bytes(data):
    """Guess the encoding of *data* without letting a decode error escape.

    Delegates to ftfy.guess_bytes; if that raises UnicodeDecodeError,
    falls back to ftfy's lenient 'sloppy-windows-1252' codec, replacing
    any bytes that still cannot be mapped.

    Returns:
        Tuple of (decoded_string, encoding_name).
    """
    try:
        return guess_bytes(data)
    except UnicodeDecodeError:
        # Fallback to sloppy decoding
        fallback = 'sloppy-windows-1252'
        return data.decode(fallback, errors='replace'), fallback
# Handle file processing errors
def safe_fix_file(filepath, output_path):
    """Fix *filepath* into *output_path*, falling back to encoding detection.

    First tries to read the file as UTF-8 text; if a UnicodeDecodeError is
    raised (typically while fix_file iterates the lines), the file is
    reopened in binary mode and ftfy's encoding detection takes over.

    Args:
        filepath: Path of the input file.
        output_path: Path of the UTF-8 output file to write.
    """
    try:
        # Try UTF-8 first. Using 'with' (instead of a bare open() that was
        # never closed) guarantees the handle is released even when
        # decoding fails partway through the file.
        with open(filepath, 'r', encoding='utf-8') as infile:
            with open(output_path, 'w', encoding='utf-8') as outfile:
                for line in fix_file(infile):
                    outfile.write(line)
    except UnicodeDecodeError:
        # Fall back to binary mode with detection
        with open(filepath, 'rb') as infile:
            with open(output_path, 'w', encoding='utf-8') as outfile:
                for line in fix_file(infile, encoding=None):
                    outfile.write(line)
from ftfy import fix_file, TextFixerConfig
# Process large files with memory efficiency
def process_large_file(input_path, output_path, chunk_size=1024*1024):
    """Stream *input_path* through ftfy into *output_path*.

    Caps ftfy's decode window at *chunk_size* bytes via
    TextFixerConfig(max_decode_length=...) so memory use stays bounded
    on very large files.
    """
    cfg = TextFixerConfig(max_decode_length=chunk_size)
    # Binary input lets fix_file handle encoding; output is UTF-8 text.
    with open(input_path, 'rb') as src, \
         open(output_path, 'w', encoding='utf-8') as dest:
        dest.writelines(fix_file(src, config=cfg))
# Disable explanations for performance on large files
fast_config = TextFixerConfig(explain=False, max_decode_length=500000)
with open('huge_file.txt', 'rb') as infile:
for line in fix_file(infile, config=fast_config):
# Process line...
pass
from ftfy import fix_file, TextFixerConfig
import gzip
import json
def process_jsonl_gz(input_path, output_path):
    """Fix the text of a gzipped JSON-lines file, record by record.

    HTML unescaping is disabled so JSON escape sequences survive intact.
    Each fixed line is parsed and re-serialized to guarantee valid JSON;
    lines that fail to parse are copied through unchanged.
    """
    cfg = TextFixerConfig(unescape_html=False)  # Preserve JSON
    with gzip.open(input_path, 'rt', encoding='utf-8') as source:
        with gzip.open(output_path, 'wt', encoding='utf-8') as sink:
            for fixed in fix_file(source, config=cfg):
                try:
                    record = json.loads(fixed.strip())
                except json.JSONDecodeError:
                    # Write problematic line as-is
                    sink.write(fixed)
                else:
                    json.dump(record, sink, ensure_ascii=False)
                    sink.write('\n')
# Process log files with terminal escapes
def clean_log_file(input_path, output_path):
"""Clean log file by removing terminal escapes."""
config = TextFixerConfig(
remove_terminal_escapes=True,
fix_encoding=True,
unescape_html=False, # Logs may contain < >
uncurl_quotes=False # Preserve original quotes in logs
)
with open(input_path, 'rb') as infile:
with open(output_path, 'w', encoding='utf-8') as outfile:
for line in fix_file(infile, config=config):
outfile.write(line)
Install with Tessl CLI
npx tessl i tessl/pypi-ftfy