Fixes mojibake and other problems with Unicode, after the fact
—
Configuration classes and types for controlling ftfy behavior, including comprehensive options for each fix step and explanation data structures.
Comprehensive configuration class controlling all aspects of ftfy text processing through named tuple with sensible defaults.
class TextFixerConfig(NamedTuple):
    """
    Configuration for all ftfy text processing options.

    Implemented as a NamedTuple with defaults; instantiate with keyword
    arguments for the values you want to change from the defaults.

    Attributes:
        unescape_html: HTML entity handling ("auto" | True | False)
        remove_terminal_escapes: Remove ANSI escape sequences (bool)
        fix_encoding: Detect and fix mojibake (bool)
        restore_byte_a0: Allow space as non-breaking space in mojibake (bool)
        replace_lossy_sequences: Fix partial mojibake with � or ? (bool)
        decode_inconsistent_utf8: Fix inconsistent UTF-8 sequences (bool)
        fix_c1_controls: Replace C1 controls with Windows-1252 (bool)
        fix_latin_ligatures: Replace ligatures with letters (bool)
        fix_character_width: Normalize fullwidth/halfwidth chars (bool)
        uncurl_quotes: Convert curly quotes to straight quotes (bool)
        fix_line_breaks: Standardize line breaks to \\n (bool)
        fix_surrogates: Fix UTF-16 surrogate sequences (bool)
        remove_control_chars: Remove unnecessary control chars (bool)
        normalization: Unicode normalization form (str | None)
        max_decode_length: Maximum segment size for processing (int)
        explain: Whether to compute explanations (bool)
    """

    unescape_html: str | bool = "auto"
    remove_terminal_escapes: bool = True
    fix_encoding: bool = True
    restore_byte_a0: bool = True
    replace_lossy_sequences: bool = True
    decode_inconsistent_utf8: bool = True
    fix_c1_controls: bool = True
    fix_latin_ligatures: bool = True
    fix_character_width: bool = True
    uncurl_quotes: bool = True
    fix_line_breaks: bool = True
    fix_surrogates: bool = True
    remove_control_chars: bool = True
    normalization: Literal["NFC", "NFD", "NFKC", "NFKD"] | None = "NFC"
    max_decode_length: int = 1000000
    explain: bool = True


# Data structures for representing text transformation explanations and
# individual transformation steps.
class ExplainedText(NamedTuple):
"""
Return type for ftfy functions that provide explanations.
Contains both the fixed text result and optional explanation of
transformations applied. When explain=False, explanation is None.
Attributes:
text: The processed text result (str)
explanation: List of transformation steps or None (list[ExplanationStep]|None)
"""
text: str
explanation: list[ExplanationStep] | None
class ExplanationStep(NamedTuple):
    """
    Single step in a text transformation explanation.

    Describes one transformation applied during text processing, with an
    action type and a parameter specifying the operation performed.

    Attributes:
        action: Type of transformation (str)
        parameter: Encoding name or function name (str)

    Actions:
        "encode": Convert string to bytes with specified encoding
        "decode": Convert bytes to string with specified encoding
        "transcode": Convert bytes to bytes with named function
        "apply": Convert string to string with named function
        "normalize": Apply Unicode normalization
    """

    action: str
    parameter: str


# The unescape_html option controls HTML entity processing:
"auto" (default): Decode entities unless literal < appears (indicating HTML)True: Always decode HTML entities like & → &False: Never decode HTML entitiesfrom ftfy import TextFixerConfig, fix_text
# Auto mode - detects HTML context
config = TextFixerConfig(unescape_html="auto")
fix_text("& text") # → "& text" (no < detected)
fix_text("<p>&</p>") # → "<p>&</p>" (< detected, preserve entities)
# Always decode entities
config = TextFixerConfig(unescape_html=True)
fix_text("<p>&</p>") # → "<p>&</p>"
# Never decode entities
config = TextFixerConfig(unescape_html=False)
fix_text("& text") # → "& text"Several options control encoding detection and mojibake fixing:
# Conservative encoding fixing - fewer false positives
conservative = TextFixerConfig(
restore_byte_a0=False, # Don't interpret spaces as non-breaking spaces
replace_lossy_sequences=False, # Don't fix partial mojibake
decode_inconsistent_utf8=False # Don't fix inconsistent UTF-8
)
# Aggressive encoding fixing - more corrections
aggressive = TextFixerConfig(
restore_byte_a0=True, # Allow space → non-breaking space
replace_lossy_sequences=True, # Fix sequences with � or ?
decode_inconsistent_utf8=True # Fix inconsistent UTF-8 patterns
)

Control various character formatting fixes:
# Minimal character normalization
minimal = TextFixerConfig(
fix_latin_ligatures=False, # Keep ligatures like fi
fix_character_width=False, # Keep fullwidth characters
uncurl_quotes=False, # Keep curly quotes
fix_line_breaks=False # Keep original line endings
)
# Text cleaning for terminal display
terminal = TextFixerConfig(
remove_terminal_escapes=True, # Remove ANSI escapes
remove_control_chars=True, # Remove control characters
fix_character_width=True, # Normalize character widths
normalization="NFC" # Canonical Unicode form
)

The normalization option controls Unicode canonical forms:
# NFC - Canonical decomposed + composed (default)
nfc_config = TextFixerConfig(normalization="NFC")
fix_text("café", nfc_config) # Combines é into single character
# NFD - Canonical decomposed
nfd_config = TextFixerConfig(normalization="NFD")
fix_text("café", nfd_config) # Separates é into e + ´
# NFKC - Compatibility normalization (changes meaning)
nfkc_config = TextFixerConfig(normalization="NFKC")
fix_text("10³", nfkc_config) # → "103" (loses superscript)
# No normalization
no_norm = TextFixerConfig(normalization=None)

from ftfy import TextFixerConfig, fix_text
# Use defaults
config = TextFixerConfig()
# Change specific options
config = TextFixerConfig(uncurl_quotes=False, fix_encoding=True)
# Create variations
no_html = config._replace(unescape_html=False)
conservative = config._replace(restore_byte_a0=False, replace_lossy_sequences=False)

from ftfy import fix_text
# Pass config options as kwargs (equivalent to config object)
result = fix_text(text, uncurl_quotes=False, normalization="NFD")
# Mix config object and kwargs (kwargs override config)
config = TextFixerConfig(uncurl_quotes=False)
result = fix_text(text, config, normalization="NFD")  # NFD overrides config

from ftfy import fix_and_explain
# Get detailed explanations
result = fix_and_explain("só")
print(f"Text: {result.text}")
print(f"Steps: {len(result.explanation)} transformations")
for step in result.explanation:
print(f" {step.action}: {step.parameter}")
# Disable explanations for performance
from ftfy import TextFixerConfig
config = TextFixerConfig(explain=False)
result = fix_and_explain(text, config)
print(result.explanation)  # None

# Process large texts in smaller segments
large_text_config = TextFixerConfig(max_decode_length=500000)
# Skip expensive operations for simple cleaning
fast_config = TextFixerConfig(
fix_encoding=False, # Skip mojibake detection
unescape_html=False, # Skip HTML processing
explain=False # Skip explanation generation
)
# Text-only cleaning (no encoding fixes)
text_only = TextFixerConfig(
fix_encoding=False,
unescape_html=False,
remove_terminal_escapes=True,
fix_character_width=True,
uncurl_quotes=True,
fix_line_breaks=True
)

Install with Tessl CLI
npx tessl i tessl/pypi-ftfy