# ftfy — fixes mojibake and other problems with Unicode, after the fact.
#
# Debugging and utility functions for understanding Unicode text and for
# applying transformation plans manually.

# Debugging utility for analyzing Unicode text character by character.
def explain_unicode(text: str) -> None:
    """
    Debug utility showing detailed Unicode information for each character.

    Prints a character-by-character breakdown showing Unicode codepoint,
    glyph, category, and name for debugging mysterious Unicode text.
    Output goes to stdout and is intended for interactive debugging.

    Args:
        text: Unicode string to analyze

    Returns:
        None (prints to stdout)

    Examples:
        >>> explain_unicode('café')
        U+0063 c [Ll] LATIN SMALL LETTER C
        U+0061 a [Ll] LATIN SMALL LETTER A
        U+0066 f [Ll] LATIN SMALL LETTER F
        U+00E9 é [Ll] LATIN SMALL LETTER E WITH ACUTE
        >>> explain_unicode('😀🎉')
        U+1F600 😀 [So] GRINNING FACE
        U+1F389 🎉 [So] PARTY POPPER
    """
    # Local import keeps this debugging helper self-contained; the file has
    # no shared import block visible here.
    import unicodedata

    for char in text:
        # Show unprintable characters as backslash escapes so the output
        # stays one line per character.
        if char.isprintable():
            display = char
        else:
            display = char.encode("unicode-escape").decode("ascii")
        category = unicodedata.category(char)
        # Some codepoints (e.g. controls) have no name; fall back gracefully.
        name = unicodedata.name(char, "<unknown>")
        print(f"U+{ord(char):04X} {display} [{category}] {name}")


# Function for manually applying transformation plans generated by ftfy's
# explanation system.
# Registry mapping fixer names to the corresponding functions in ftfy.fixes,
# so apply_plan can look transformations up by name.
FIXERS: dict[str, Callable] = {
    fixer_name: getattr(fixes, fixer_name)
    for fixer_name in (
        "unescape_html",
        "remove_terminal_escapes",
        "restore_byte_a0",
        "replace_lossy_sequences",
        "decode_inconsistent_utf8",
        "fix_c1_controls",
        "fix_latin_ligatures",
        "fix_character_width",
        "uncurl_quotes",
        "fix_line_breaks",
        "fix_surrogates",
        "remove_control_chars",
    )
}
def apply_plan(text: str, plan: list[tuple[str, str]]) -> str:
    """
    Apply sequence of text transformations from explanation plan.

    Takes a transformation plan (list of operation/parameter tuples) and
    applies each step in sequence. Useful for replaying ftfy fixes or
    applying custom transformation sequences.

    Args:
        text: Initial text or bytes to transform
        plan: List of (operation, parameter) tuples

    Returns:
        Final transformed text

    Operations:
        "encode": Convert string to bytes using parameter as encoding
        "decode": Convert bytes to string using parameter as encoding
        "transcode": Apply bytes→bytes function named in parameter
        "apply": Apply string→string function named in parameter

    Raises:
        ValueError: If an operation or named fixer is not recognized

    Examples:
        >>> plan = [('encode', 'latin-1'), ('decode', 'utf-8')]
        >>> apply_plan('só', plan)
        'só'
        >>> plan = [('apply', 'uncurl_quotes'), ('apply', 'fix_line_breaks')]
        >>> apply_plan('"curly quotes"\\r\\n', plan)
        '"curly quotes"\\n'
    """
    obj = text
    for operation, parameter in plan:
        if operation == "encode":
            obj = obj.encode(parameter)
        elif operation == "decode":
            obj = obj.decode(parameter)
        elif operation in ("transcode", "apply"):
            # Named transformations are looked up in the FIXERS registry.
            if parameter not in FIXERS:
                raise ValueError(f"Unknown function to apply: {parameter}")
            obj = FIXERS[parameter](obj)
        else:
            raise ValueError(f"Unknown plan operation: {operation}")
    return obj


# Functions for detecting whether text contains mojibake or other problems.
def badness(text: str) -> int:
    """
    Count the number of unlikely character sequences in text.

    Returns a numerical badness score by counting mojibake patterns.
    Higher scores indicate more likely encoding problems. A score > 0
    indicates the text likely contains mojibake.

    Args:
        text: Unicode string to analyze

    Returns:
        Number of unlikely character sequences found

    Examples:
        >>> from ftfy.badness import badness
        >>> badness("normal text")
        0
        >>> badness('âœ" broken')  # Multiple mojibake patterns
        2
    """
    # NOTE(review): API stub for documentation purposes only — the real
    # implementation lives in ftfy.badness.
def is_bad(text: str) -> bool:
    """
    Heuristic detection of likely mojibake in text.

    Uses statistical analysis of Unicode character patterns to detect
    text that likely contains encoding problems. Designed to minimize
    false positives while catching common mojibake patterns.

    Args:
        text: Unicode string to analyze

    Returns:
        True if text likely contains mojibake, False otherwise

    Examples:
        >>> from ftfy.badness import is_bad
        >>> is_bad("normal text")
        False
        >>> is_bad('âœ" broken')  # Mojibake pattern
        True
    """
    # NOTE(review): API stub for documentation purposes only — the real
    # implementation lives in ftfy.badness.


from ftfy import explain_unicode
from ftfy import explain_unicode

# Walk several sample strings through explain_unicode to see their
# character-by-character Unicode breakdown.
samples = [
    # Debug mysterious characters
    ("Analyzing mysterious text:", "Weird chars: \u00a0\u200b\u2019"),
    # Debug emoji and special characters
    ("\nAnalyzing emoji:", "🎉🔥💯"),
    # Debug potential mojibake
    ("\nAnalyzing potential mojibake:", "café"),  # this might be mojibake
]
for label, sample in samples:
    print(label)
    explain_unicode(sample)

from ftfy import fix_and_explain, apply_plan
from ftfy import fix_and_explain, apply_plan

# Ask ftfy to repair a mojibake string and report the plan it used.
broken_text = "só"
result = fix_and_explain(broken_text)
print(f"Original: {broken_text}")
print(f"Fixed: {result.text}")
print(f"Plan: {result.explanation}")

# apply_plan expects (action, parameter) tuples, so unpack the
# ExplanationStep objects first.
plan_tuples = [(s.action, s.parameter) for s in result.explanation]

# Replay the identical transformation on text with the same kind of damage.
similar_text = "José"
replayed = apply_plan(similar_text, plan_tuples)
print(f"Replayed fix: {similar_text} → {replayed}")

from ftfy import apply_plan
from ftfy import apply_plan
from ftfy.fixes import FIXERS

# List the named transformations that plans can refer to.
print("Available fixers:", list(FIXERS.keys()))

# Chain several 'apply' steps into one custom cleanup sequence.
custom_plan = [
    ('apply', 'remove_terminal_escapes'),
    ('apply', 'unescape_html'),
    ('apply', 'uncurl_quotes'),
    ('apply', 'fix_character_width'),
]

# Terminal colour codes plus HTML entities plus quotes, all in one string.
messy_text = '\x1b[31m<"curly">\x1b[0m WIDE'
cleaned = apply_plan(messy_text, custom_plan)
print(f"Custom clean: {messy_text} → {cleaned}")

from ftfy import apply_plan
from ftfy import apply_plan

# Spell out the classic mojibake repair by hand: reinterpret the string's
# latin-1 bytes as UTF-8.
encoding_plan = [
    ('encode', 'latin-1'),  # String → bytes as latin-1
    ('decode', 'utf-8'),    # Bytes → string as utf-8
]

for text in ['café', 'naïve', 'résumé']:
    try:
        fixed = apply_plan(text, encoding_plan)
    except UnicodeError as e:
        # Not every string round-trips; report failures instead of crashing.
        print(f"{text} → Error: {e}")
    else:
        print(f"{text} → {fixed}")

from ftfy.badness import is_bad, badness
from ftfy.badness import is_bad, badness
from ftfy import fix_text

# Run the badness heuristics over a mix of clean and broken strings.
test_strings = [
    "Normal English text",
    "Regular café",
    'âœ" mojibake pattern',  # single-quoted: the text itself contains a double quote
    "Broken text™ with weird chars",
    "Standard Unicode: 你好世界",
    "Currency symbols: €£¥",
    "só definite mojibake",
]

print("Mojibake detection results:")
for text in test_strings:
    bad = is_bad(text)
    score = badness(text)
    if bad:
        # Show the repaired form alongside anything flagged as mojibake.
        fixed = fix_text(text)
        print(f"😱 BAD (score {score}): '{text}' → '{fixed}'")
    else:
        print(f"✅ OK (score {score}): '{text}'")

from ftfy import fix_and_explain, apply_plan, explain_unicode
from ftfy import fix_and_explain, apply_plan, explain_unicode
from ftfy.badness import is_bad, badness


def debug_text_processing(text):
    """Comprehensive text debugging pipeline.

    Scores the text with the badness heuristics, prints a per-character
    Unicode breakdown, fixes the text with an explanation, and verifies
    that replaying the recorded plan reproduces the fix.
    """
    print(f"=== Debugging: '{text}' ===")
    # Check if text looks problematic
    bad_score = badness(text)
    print(f"Looks bad: {is_bad(text)} (badness score: {bad_score})")
    # Show character details
    print("\nCharacter analysis:")
    explain_unicode(text)
    # Try fixing and get explanation
    result = fix_and_explain(text)
    print(f"\nFixed: '{result.text}'")
    if result.explanation:
        print(f"Transformations applied: {len(result.explanation)}")
        for i, step in enumerate(result.explanation, 1):
            print(f" {i}. {step.action}: {step.parameter}")
        # Test plan replay: the recorded plan should reproduce the fix.
        plan_tuples = [(s.action, s.parameter) for s in result.explanation]
        replayed = apply_plan(text, plan_tuples)
        print(f"Plan replay result: '{replayed}'")
        print(f"Replay matches: {replayed == result.text}")
    else:
        print("No transformations needed")
    print()


# Debug various problematic texts
debug_texts = [
    'âœ" Check mark mojibake',  # single-quoted: contains a literal double quote
    "Normal text",
    "só encoding issue",
    '\x1b[31mTerminal\x1b[0m escapes',
]
for text in debug_texts:
    debug_text_processing(text)

from ftfy import apply_plan, fix_and_explain
from ftfy import apply_plan, fix_and_explain


def analyze_transformation_effects(text, individual_plans):
    """Test individual transformations vs combined effect."""
    print(f"Original: '{text}'")

    # Run each named plan in turn, feeding the output of one into the next.
    print("\nIndividual transformations:")
    current = text
    for plan_name, plan in individual_plans.items():
        try:
            transformed = apply_plan(current, plan)
        except Exception as e:
            print(f" {plan_name}: ERROR {e}")
            continue
        if transformed == current:
            print(f" {plan_name}: no change")
        else:
            print(f" {plan_name}: '{current}' → '{transformed}'")
            current = transformed
    print(f"Sequential result: '{current}'")

    # Compare the manual pipeline with ftfy's automatic processing.
    auto_result = fix_and_explain(text)
    print(f"ftfy result: '{auto_result.text}'")
    print(f"Results match: {current == auto_result.text}")


# Test transformation composition: individual plans vs ftfy's one shot.
plans = {
    'html_unescape': [('apply', 'unescape_html')],
    'terminal_clean': [('apply', 'remove_terminal_escapes')],
    'quote_fix': [('apply', 'uncurl_quotes')],
    'encoding_fix': [('encode', 'latin-1'), ('decode', 'utf-8')],
}
complex_text = '\x1b[32m<"Problematic">\x1b[0m text with sóme issues'
analyze_transformation_effects(complex_text, plans)

# Install with the Tessl CLI
#   npx tessl i tessl/pypi-ftfy