Fixes mojibake and other problems with Unicode, after the fact
---
Individual transformation functions for specific text problems like HTML entities, terminal escapes, character width, quotes, and line breaks. These functions can be used independently or are applied automatically by the main text fixing functions.
Functions for handling HTML entities and markup-related text issues.
def unescape_html(text: str) -> str:
"""
Convert HTML entities to Unicode characters.
Robust replacement for html.unescape that handles malformed entities
and common entity mistakes. Converts entities like &amp; → &, &lt; → <.
Args:
text: String potentially containing HTML entities
Returns:
String with HTML entities converted to Unicode characters
Examples:
>>> unescape_html("&amp; &lt;tag&gt;")
'& <tag>'
>>> unescape_html("&EACUTE;") # Handles incorrect capitalization
'É'
"""

Functions for cleaning terminal escapes and control characters.
def remove_terminal_escapes(text: str) -> str:
    """
    Remove ANSI terminal escape sequences.

    Strips color codes, cursor positioning, and other ANSI escape
    sequences commonly found in terminal output or log files.

    Args:
        text: String potentially containing ANSI escape sequences

    Returns:
        String with terminal escapes removed

    Examples:
        >>> remove_terminal_escapes("\\x1b[31mRed text\\x1b[0m")
        'Red text'
        >>> remove_terminal_escapes("\\x1b[2J\\x1b[HClear screen")
        'Clear screen'
    """
def remove_control_chars(text: str) -> str:
    """
    Remove unnecessary Unicode control characters.

    Removes control characters that have no visual effect and are
    typically unwanted artifacts in text processing.

    NOTE(review): useful whitespace controls such as tab and newline are
    presumably preserved — confirm against the implementation.

    Args:
        text: String potentially containing control characters

    Returns:
        String with control characters removed
    """
def remove_bom(text: str) -> str:
"""
Remove byte order marks (BOM) from text.
Strips Unicode BOM characters that sometimes appear at the
beginning of text files or strings.
Args:
text: String potentially starting with BOM
Returns:
String with BOM removed
"""

Functions for normalizing quotes and punctuation characters.
def uncurl_quotes(text: str) -> str:
"""
Convert curly quotes to straight ASCII quotes.
Replaces Unicode quotation marks with ASCII equivalents:
‘ ’ → ', “ ” → ". Useful for systems requiring ASCII-only text.
Args:
text: String containing curly quotes
Returns:
String with straight ASCII quotes
Examples:
>>> uncurl_quotes("It’s “quoted” text")
'It\\'s "quoted" text'
>>> uncurl_quotes("‘single’ and “double” quotes")
'\\'single\\' and "double" quotes'
"""

Functions for normalizing character width and typographic elements.
def fix_character_width(text: str) -> str:
    """
    Normalize fullwidth and halfwidth characters.

    Converts fullwidth Latin characters to normal width and halfwidth
    Katakana to normal width for consistent display and processing.

    Args:
        text: String containing width-variant characters

    Returns:
        String with normalized character widths

    Examples:
        >>> fix_character_width("ＬＯＵＤ　ＮＯＩＳＥＳ")  # Fullwidth Latin
        'LOUD NOISES'
        >>> fix_character_width("ﾊﾝｶｸ")  # Halfwidth Katakana
        'ハンカク'
    """
def fix_latin_ligatures(text: str) -> str:
"""
Replace Latin ligatures with individual letters.
Converts typographic ligatures like ﬁ, ﬂ back to individual
characters (fi, fl) for searchability and processing.
Args:
text: String containing Latin ligatures
Returns:
String with ligatures replaced by letter sequences
Examples:
>>> fix_latin_ligatures("ﬁle and ﬂower")
'file and flower'
>>> fix_latin_ligatures("oﬃce")
'office'
"""

Functions for standardizing line breaks and whitespace.
def fix_line_breaks(text: str) -> str:
"""
Standardize line breaks to Unix format (\\n).
Converts Windows (\\r\\n), Mac (\\r), and other line ending
variations to standard Unix newlines. Handles Unicode line
separators and paragraph separators.
Args:
text: String with various line break formats
Returns:
String with standardized \\n line breaks
Examples:
>>> fix_line_breaks("line1\\r\\nline2\\rline3")
'line1\\nline2\\nline3'
>>> fix_line_breaks("para1\\u2029para2") # Unicode paragraph sep
'para1\\npara2'
"""

Functions for handling complex Unicode issues.
def fix_surrogates(text: str) -> str:
    """
    Fix UTF-16 surrogate pair sequences.

    Converts UTF-16 surrogate codepoints back to the original high-
    numbered Unicode characters like emoji. Fixes text decoded with the
    obsolete UCS-2 standard.

    Args:
        text: String containing UTF-16 surrogates

    Returns:
        String with surrogates converted to proper characters

    Examples:
        >>> fix_surrogates("\\ud83d\\ude00")  # Surrogate pair
        '😀'
    """
def fix_c1_controls(text: str) -> str:
"""
Replace C1 control characters with Windows-1252 equivalents.
Converts Latin-1 control characters (U+80-U+9F) to their
Windows-1252 interpretations following HTML5 standard.
Args:
text: String containing C1 control characters
Returns:
String with C1 controls replaced
Examples:
>>> fix_c1_controls("\\x80") # C1 control
'€' # Windows-1252 Euro sign
"""

Functions for processing byte sequences during encoding correction.
def restore_byte_a0(byts: bytes) -> bytes:
    """
    Restore byte 0xA0 in potential UTF-8 mojibake.

    Replaces literal space (0x20) with non-breaking space (0xA0)
    when it would make the bytes valid UTF-8. Used during encoding
    detection to handle common mojibake patterns.

    Args:
        byts: Byte sequence potentially containing altered UTF-8

    Returns:
        Byte sequence with 0xA0 restored where appropriate
    """
def replace_lossy_sequences(byts: bytes) -> bytes:
    """
    Replace lossy byte sequences in mojibake correction.

    Identifies and replaces sequences where information was lost
    during encoding/decoding, typically involving � or ? characters.

    Args:
        byts: Byte sequence from encoding detection

    Returns:
        Byte sequence with lossy sequences replaced
    """
def decode_inconsistent_utf8(text: str) -> str:
"""
Handle inconsistent UTF-8 sequences in text.
Fixes text where UTF-8 mojibake patterns exist but there's no
consistent way to reinterpret the string in a single encoding.
Replaces problematic sequences with proper UTF-8.
Args:
text: String with inconsistent UTF-8 sequences
Returns:
String with UTF-8 sequences corrected
"""

Additional text processing utilities.
def decode_escapes(text: str) -> str:
"""
Decode backslash escape sequences in text.
More robust version of string decode that handles various escape
sequence formats including \\n, \\t, \\uXXXX, \\xXX patterns.
Args:
text: String containing escape sequences
Returns:
String with escape sequences decoded
Examples:
>>> decode_escapes("Hello\\nWorld\\t!")
'Hello\\nWorld\\t!'
>>> decode_escapes("Unicode: \\u00e9")
'Unicode: é'
"""

from ftfy.fixes import unescape_html, remove_terminal_escapes, uncurl_quotes
# Apply individual fixes
html_text = "&lt;p&gt;Hello &amp; goodbye&lt;/p&gt;"
clean_html = unescape_html(html_text)
print(clean_html) # "<p>Hello & goodbye</p>"
# Clean terminal output
terminal_output = "\x1b[31mError:\x1b[0m File not found"
clean_output = remove_terminal_escapes(terminal_output)
print(clean_output) # "Error: File not found"
# Normalize quotes for ASCII systems
curly_text = "It’s “perfectly” fine"
straight_quotes = uncurl_quotes(curly_text)
print(straight_quotes) # 'It\'s "perfectly" fine'

from ftfy.fixes import fix_character_width, fix_latin_ligatures
# Fix fullwidth characters
wide_text = "ＨＥＬＬＯ　ＷＯＲＬＤ"
normal_text = fix_character_width(wide_text)
print(normal_text) # "HELLO WORLD"
# Decompose ligatures
ligature_text = "The oﬃce ﬁle"
decomposed = fix_latin_ligatures(ligature_text)
print(decomposed) # "The office file"

from ftfy.fixes import fix_line_breaks
# Standardize mixed line endings
mixed_lines = "Line 1\r\nLine 2\rLine 3\nLine 4"
unix_lines = fix_line_breaks(mixed_lines)
print(repr(unix_lines)) # 'Line 1\nLine 2\nLine 3\nLine 4'
# Handle Unicode line separators
unicode_lines = "Para 1\u2029Para 2\u2028Line break"
standard_lines = fix_line_breaks(unicode_lines)
print(repr(standard_lines)) # 'Para 1\nPara 2\nLine break'

from ftfy.fixes import fix_surrogates, fix_c1_controls
# Fix emoji from surrogate pairs
surrogate_emoji = "\ud83d\ude00\ud83d\ude01" # Encoded emoji
real_emoji = fix_surrogates(surrogate_emoji)
print(real_emoji) # "😀😁"
# Fix C1 control characters
latin1_controls = "\x80\x85\x91\x92" # C1 controls
windows1252 = fix_c1_controls(latin1_controls)
print(windows1252) # "€…‘’"

from ftfy.fixes import (
unescape_html, remove_terminal_escapes,
uncurl_quotes, fix_character_width, fix_line_breaks
)
def custom_clean(text):
    """Run a fixed sequence of ftfy fixes over *text* and return the result."""
    # Order matters: strip escapes before decoding entities, then
    # normalize quotes, width variants, and finally line breaks.
    pipeline = (
        remove_terminal_escapes,
        unescape_html,
        uncurl_quotes,
        fix_character_width,
        fix_line_breaks,
    )
    for fix in pipeline:
        text = fix(text)
    return text
# Apply custom cleaning
messy_text = "\x1b[32m&lt;ＨＥＬＬＯ&gt;\x1b[0m “world”\r\n"
clean_text = custom_clean(messy_text)
print(clean_text) # '<HELLO> "world"\n'

Install with Tessl CLI
npx tessl i tessl/pypi-ftfy