# ftfy — fixes mojibake and other problems with Unicode, after the fact.
#
# Debugging and utility functions for understanding Unicode text and for
# applying transformation plans manually.

# Debugging utility for analyzing Unicode text character by character.
def explain_unicode(text: str) -> None:
    """
    Debug utility showing detailed Unicode information for each character.

    Prints a character-by-character breakdown showing Unicode codepoint,
    glyph, category, and name for debugging mysterious Unicode text.
    Output goes to stdout and is intended for interactive debugging.

    Args:
        text: Unicode string to analyze

    Returns:
        None (prints to stdout)

    Examples:
        >>> explain_unicode('café')
        U+0063 c [Ll] LATIN SMALL LETTER C
        U+0061 a [Ll] LATIN SMALL LETTER A
        U+0066 f [Ll] LATIN SMALL LETTER F
        U+00E9 é [Ll] LATIN SMALL LETTER E WITH ACUTE
        >>> explain_unicode('😀🎉')
        U+1F600 😀 [So] GRINNING FACE
        U+1F389 🎉 [So] PARTY POPPER
    """
    # Local import keeps this debugging helper self-contained; the file has
    # no shared import block visible here.
    import unicodedata

    for char in text:
        # Show unprintable characters as backslash escapes so the output
        # stays one line per character.
        if char.isprintable():
            display = char
        else:
            display = char.encode("unicode-escape").decode("ascii")
        category = unicodedata.category(char)
        # Some codepoints (e.g. controls) have no name; fall back gracefully.
        name = unicodedata.name(char, "<unknown>")
        print(f"U+{ord(char):04X} {display} [{category}] {name}")


# Function for manually applying transformation plans generated by ftfy's
# explanation system.
# Registry mapping fixer names to the corresponding functions in ftfy.fixes,
# so apply_plan can look transformations up by name.
FIXERS: dict[str, Callable] = {
    fixer_name: getattr(fixes, fixer_name)
    for fixer_name in (
        "unescape_html",
        "remove_terminal_escapes",
        "restore_byte_a0",
        "replace_lossy_sequences",
        "decode_inconsistent_utf8",
        "fix_c1_controls",
        "fix_latin_ligatures",
        "fix_character_width",
        "uncurl_quotes",
        "fix_line_breaks",
        "fix_surrogates",
        "remove_control_chars",
    )
}
def apply_plan(text: str, plan: list[tuple[str, str]]) -> str:
    """
    Apply sequence of text transformations from explanation plan.

    Takes a transformation plan (list of operation/parameter tuples) and
    applies each step in sequence. Useful for replaying ftfy fixes or
    applying custom transformation sequences.

    Args:
        text: Initial text or bytes to transform
        plan: List of (operation, parameter) tuples

    Returns:
        Final transformed text

    Operations:
        "encode": Convert string to bytes using parameter as encoding
        "decode": Convert bytes to string using parameter as encoding
        "transcode": Apply bytes→bytes function named in parameter
        "apply": Apply string→string function named in parameter

    Raises:
        ValueError: If an operation or named fixer is not recognized

    Examples:
        >>> plan = [('encode', 'latin-1'), ('decode', 'utf-8')]
        >>> apply_plan('só', plan)
        'só'
        >>> plan = [('apply', 'uncurl_quotes'), ('apply', 'fix_line_breaks')]
        >>> apply_plan('"curly quotes"\\r\\n', plan)
        '"curly quotes"\\n'
    """
    obj = text
    for operation, parameter in plan:
        if operation == "encode":
            obj = obj.encode(parameter)
        elif operation == "decode":
            obj = obj.decode(parameter)
        elif operation in ("transcode", "apply"):
            # Named transformations are looked up in the FIXERS registry.
            if parameter not in FIXERS:
                raise ValueError(f"Unknown function to apply: {parameter}")
            obj = FIXERS[parameter](obj)
        else:
            raise ValueError(f"Unknown plan operation: {operation}")
    return obj


# Functions for detecting whether text contains mojibake or other problems.
def badness(text: str) -> int:
    """
    Count the number of unlikely character sequences in text.

    Returns a numerical badness score by counting mojibake patterns.
    Higher scores indicate more likely encoding problems. A score > 0
    indicates the text likely contains mojibake.

    Args:
        text: Unicode string to analyze

    Returns:
        Number of unlikely character sequences found

    Examples:
        >>> from ftfy.badness import badness
        >>> badness("normal text")
        0
        >>> badness('âœ" broken')  # Multiple mojibake patterns
        2
    """
    # NOTE(review): API stub for documentation purposes only — the real
    # implementation lives in ftfy.badness.
def is_bad(text: str) -> bool:
    """
    Heuristic detection of likely mojibake in text.

    Uses statistical analysis of Unicode character patterns to detect
    text that likely contains encoding problems. Designed to minimize
    false positives while catching common mojibake patterns.

    Args:
        text: Unicode string to analyze

    Returns:
        True if text likely contains mojibake, False otherwise

    Examples:
        >>> from ftfy.badness import is_bad
        >>> is_bad("normal text")
        False
        >>> is_bad('âœ" broken')  # Mojibake pattern
        True
    """
    # NOTE(review): API stub for documentation purposes only — the real
    # implementation lives in ftfy.badness.


from ftfy import explain_unicode
from ftfy import explain_unicode

# Walk several sample strings through explain_unicode to see their
# character-by-character Unicode breakdown.
samples = [
    # Debug mysterious characters
    ("Analyzing mysterious text:", "Weird chars: \u00a0\u200b\u2019"),
    # Debug emoji and special characters
    ("\nAnalyzing emoji:", "🎉🔥💯"),
    # Debug potential mojibake
    ("\nAnalyzing potential mojibake:", "café"),  # this might be mojibake
]
for label, sample in samples:
    print(label)
    explain_unicode(sample)

from ftfy import fix_and_explain, apply_plan
from ftfy import fix_and_explain, apply_plan

# Ask ftfy to repair a mojibake string and report the plan it used.
broken_text = "só"
result = fix_and_explain(broken_text)
print(f"Original: {broken_text}")
print(f"Fixed: {result.text}")
print(f"Plan: {result.explanation}")

# apply_plan expects (action, parameter) tuples, so unpack the
# ExplanationStep objects first.
plan_tuples = [(s.action, s.parameter) for s in result.explanation]

# Replay the identical transformation on text with the same kind of damage.
similar_text = "José"
replayed = apply_plan(similar_text, plan_tuples)
print(f"Replayed fix: {similar_text} → {replayed}")

from ftfy import apply_plan
from ftfy import apply_plan
from ftfy.fixes import FIXERS

# List the named transformations that plans can refer to.
print("Available fixers:", list(FIXERS.keys()))

# Chain several 'apply' steps into one custom cleanup sequence.
custom_plan = [
    ('apply', 'remove_terminal_escapes'),
    ('apply', 'unescape_html'),
    ('apply', 'uncurl_quotes'),
    ('apply', 'fix_character_width'),
]

# Terminal colour codes plus HTML entities plus quotes, all in one string.
messy_text = '\x1b[31m<"curly">\x1b[0m WIDE'
cleaned = apply_plan(messy_text, custom_plan)
print(f"Custom clean: {messy_text} → {cleaned}")

from ftfy import apply_plan
from ftfy import apply_plan

# Spell out the classic mojibake repair by hand: reinterpret the string's
# latin-1 bytes as UTF-8.
encoding_plan = [
    ('encode', 'latin-1'),  # String → bytes as latin-1
    ('decode', 'utf-8'),    # Bytes → string as utf-8
]

for text in ['café', 'naïve', 'résumé']:
    try:
        fixed = apply_plan(text, encoding_plan)
    except UnicodeError as e:
        # Not every string round-trips; report failures instead of crashing.
        print(f"{text} → Error: {e}")
    else:
        print(f"{text} → {fixed}")

from ftfy.badness import is_bad, badness
from ftfy.badness import is_bad, badness
from ftfy import fix_text

# Run the badness heuristics over a mix of clean and broken strings.
test_strings = [
    "Normal English text",
    "Regular café",
    'âœ" mojibake pattern',  # single-quoted: the text itself contains a double quote
    "Broken text™ with weird chars",
    "Standard Unicode: 你好世界",
    "Currency symbols: €£¥",
    "só definite mojibake",
]

print("Mojibake detection results:")
for text in test_strings:
    bad = is_bad(text)
    score = badness(text)
    if bad:
        # Show the repaired form alongside anything flagged as mojibake.
        fixed = fix_text(text)
        print(f"😱 BAD (score {score}): '{text}' → '{fixed}'")
    else:
        print(f"✅ OK (score {score}): '{text}'")

from ftfy import fix_and_explain, apply_plan, explain_unicode
from ftfy import fix_and_explain, apply_plan, explain_unicode
from ftfy.badness import is_bad, badness


def debug_text_processing(text):
    """Comprehensive text debugging pipeline.

    Scores the text with the badness heuristics, prints a per-character
    Unicode breakdown, fixes the text with an explanation, and verifies
    that replaying the recorded plan reproduces the fix.
    """
    print(f"=== Debugging: '{text}' ===")
    # Check if text looks problematic
    bad_score = badness(text)
    print(f"Looks bad: {is_bad(text)} (badness score: {bad_score})")
    # Show character details
    print("\nCharacter analysis:")
    explain_unicode(text)
    # Try fixing and get explanation
    result = fix_and_explain(text)
    print(f"\nFixed: '{result.text}'")
    if result.explanation:
        print(f"Transformations applied: {len(result.explanation)}")
        for i, step in enumerate(result.explanation, 1):
            print(f" {i}. {step.action}: {step.parameter}")
        # Test plan replay: the recorded plan should reproduce the fix.
        plan_tuples = [(s.action, s.parameter) for s in result.explanation]
        replayed = apply_plan(text, plan_tuples)
        print(f"Plan replay result: '{replayed}'")
        print(f"Replay matches: {replayed == result.text}")
    else:
        print("No transformations needed")
    print()


# Debug various problematic texts
debug_texts = [
    'âœ" Check mark mojibake',  # single-quoted: contains a literal double quote
    "Normal text",
    "só encoding issue",
    '\x1b[31mTerminal\x1b[0m escapes',
]
for text in debug_texts:
    debug_text_processing(text)

from ftfy import apply_plan, fix_and_explain
from ftfy import apply_plan, fix_and_explain


def analyze_transformation_effects(text, individual_plans):
    """Test individual transformations vs combined effect."""
    print(f"Original: '{text}'")

    # Run each named plan in turn, feeding the output of one into the next.
    print("\nIndividual transformations:")
    current = text
    for plan_name, plan in individual_plans.items():
        try:
            transformed = apply_plan(current, plan)
        except Exception as e:
            print(f" {plan_name}: ERROR {e}")
            continue
        if transformed == current:
            print(f" {plan_name}: no change")
        else:
            print(f" {plan_name}: '{current}' → '{transformed}'")
            current = transformed
    print(f"Sequential result: '{current}'")

    # Compare the manual pipeline with ftfy's automatic processing.
    auto_result = fix_and_explain(text)
    print(f"ftfy result: '{auto_result.text}'")
    print(f"Results match: {current == auto_result.text}")


# Test transformation composition: individual plans vs ftfy's one shot.
plans = {
    'html_unescape': [('apply', 'unescape_html')],
    'terminal_clean': [('apply', 'remove_terminal_escapes')],
    'quote_fix': [('apply', 'uncurl_quotes')],
    'encoding_fix': [('encode', 'latin-1'), ('decode', 'utf-8')],
}
complex_text = '\x1b[32m<"Problematic">\x1b[0m text with sóme issues'
analyze_transformation_effects(complex_text, plans)

# Install with the Tessl CLI
#   npx tessl i tessl/pypi-ftfy