Fixes mojibake and other problems with Unicode, after the fact
—
Core functions for detecting and fixing text encoding problems, including the main fix_text function and variants that provide explanations of applied transformations.
Detects and fixes Unicode text problems including mojibake, HTML entities, character formatting issues, and other common text corruptions.
def fix_text(text: str, config: TextFixerConfig | None = None, **kwargs) -> str:
"""
Fix inconsistencies and glitches in Unicode text.
Applies multiple text fixes in sequence, processing text in segments
for performance. Handles mojibake, HTML entities, character width,
quotes, line breaks, and other common text problems.
Args:
text: Unicode string to fix
config: Configuration object, or None for defaults
**kwargs: Individual config options (e.g., uncurl_quotes=False)
Returns:
Fixed Unicode string
Examples:
>>> fix_text('âœ” No problems')
'✔ No problems'
>>> fix_text('LOUD NOISES')
'LOUD NOISES'
"""

Fixes text and provides a detailed explanation of the transformations applied, useful for debugging and understanding the fixes.
def fix_and_explain(text: str, config: TextFixerConfig | None = None, **kwargs) -> ExplainedText:
"""
Fix text as single segment and return explanation of changes.
Processes text with consistent sequence of fixes and returns both
the fixed text and list of transformation steps applied.
Args:
text: Unicode string to fix
config: Configuration object, or None for defaults
**kwargs: Individual config options
Returns:
ExplainedText with fixed text and explanation steps
Examples:
>>> result = fix_and_explain("sÃ³")
>>> result.text
'só'
>>> result.explanation
[ExplanationStep(action='encode', parameter='latin-1'),
ExplanationStep(action='decode', parameter='utf-8')]
"""

Applies only the encoding detection and correction steps, skipping character formatting and normalization fixes.
def fix_encoding(text: str, config: TextFixerConfig | None = None, **kwargs) -> str:
"""
Apply only encoding-fixing steps of ftfy.
Detects mojibake and attempts to fix it by re-decoding the text in a
different encoding standard, without applying character formatting fixes.
Args:
text: Unicode string to fix
config: Configuration object, or None for defaults
**kwargs: Individual config options
Returns:
Text with encoding problems fixed
Examples:
>>> fix_encoding("Ã³")
'ó'
>>> fix_encoding("&Atilde;&sup3;") # HTML entities not fixed
'&Atilde;&sup3;'
"""
def fix_encoding_and_explain(text: str, config: TextFixerConfig | None = None, **kwargs) -> ExplainedText:
"""
Apply encoding fixes and return explanation.
Detects and fixes mojibake with detailed explanation of encoding
transformations applied including subordinate fixes.
Args:
text: Unicode string to fix
config: Configuration object, or None for defaults
**kwargs: Individual config options
Returns:
ExplainedText with encoding fixes and explanation
Examples:
>>> result = fix_encoding_and_explain("voilÃ le travail")
>>> result.text
'voilà le travail'
>>> result.explanation
[ExplanationStep(action='encode', parameter='latin-1'),
ExplanationStep(action='transcode', parameter='restore_byte_a0'),
ExplanationStep(action='decode', parameter='utf-8')]
"""

Fixes text as a single segment with a consistent transformation sequence, useful when segment boundaries matter.
def fix_text_segment(text: str, config: TextFixerConfig | None = None, **kwargs) -> str:
"""
Fix text as single segment with consistent sequence of steps.
Unlike fix_text which may process in multiple segments, this applies
a single consistent sequence of transformations to entire text.
Args:
text: Unicode string to fix
config: Configuration object, or None for defaults
**kwargs: Individual config options
Returns:
Fixed text processed as single segment
"""

import ftfy
# Fix common mojibake
broken = "âœ” No problems"
fixed = ftfy.fix_text(broken)
print(fixed) # "✔ No problems"
# Fix multiple encoding layers
multilayer = "The Mona Lisa doesnÃƒÂ¢Ã¢â€šÂ¬Ã¢â€žÂ¢t have eyebrows."
fixed = ftfy.fix_text(multilayer)
print(fixed) # "The Mona Lisa doesn't have eyebrows."

from ftfy import fix_text, TextFixerConfig
# Disable quote uncurling
config = TextFixerConfig(uncurl_quotes=False)
text_with_quotes = "It’s “quoted” text"
result = fix_text(text_with_quotes, config)
# Use keyword arguments
result = fix_text(text_with_quotes, uncurl_quotes=False)
# Disable HTML entity decoding
result = fix_text("&amp; symbols", unescape_html=False)

from ftfy import fix_and_explain
# Understand what was fixed
text, explanation = fix_and_explain("Ã¡Ã©Ã\xadÃ³Ãº")
print(f"Fixed: {text}")
print(f"Steps: {explanation}")
# Check if any fixes were applied
result = fix_and_explain("normal text")
if result.explanation:
    print("Fixes applied:", result.explanation)
else:
    print("No fixes needed")

from ftfy import fix_encoding, fix_encoding_and_explain
# Fix only encoding problems
mojibake = "cafÃ©" # appears as mojibake
fixed = fix_encoding(mojibake)
# Get encoding fix explanation
result = fix_encoding_and_explain(mojibake)
print(f"Encoding steps: {result.explanation}")

Install with Tessl CLI
npx tessl i tessl/pypi-ftfy