Fixes mojibake and other problems with Unicode, after the fact
---
Individual transformation functions for specific text problems like HTML entities, terminal escapes, character width, quotes, and line breaks. These functions can be used independently or are applied automatically by the main text fixing functions.
Functions for handling HTML entities and markup-related text issues.
def unescape_html(text: str) -> str:
"""
Convert HTML entities to Unicode characters.
Robust replacement for html.unescape that handles malformed entities
and common entity mistakes. Converts entities like &amp; → &, &lt; → <.
Args:
text: String potentially containing HTML entities
Returns:
String with HTML entities converted to Unicode characters
Examples:
>>> unescape_html("&amp; &lt;tag&gt;")
'& <tag>'
>>> unescape_html("&EACUTE;") # Handles incorrect capitalization
'É'
"""

Functions for cleaning terminal escapes and control characters.
def remove_terminal_escapes(text: str) -> str:
    """
    Remove ANSI terminal escape sequences.

    Strips color codes, cursor positioning, and other ANSI escape
    sequences commonly found in terminal output or log files.

    Args:
        text: String potentially containing ANSI escape sequences

    Returns:
        String with terminal escapes removed

    Examples:
        >>> remove_terminal_escapes("\\x1b[31mRed text\\x1b[0m")
        'Red text'
        >>> remove_terminal_escapes("\\x1b[2J\\x1b[HClear screen")
        'Clear screen'
    """
def remove_control_chars(text: str) -> str:
    """
    Remove unnecessary Unicode control characters.

    Removes control characters that have no visual effect and are
    typically unwanted artifacts in text processing.

    NOTE(review): useful whitespace controls such as tab and newline are
    presumably preserved — confirm against the implementation.

    Args:
        text: String potentially containing control characters

    Returns:
        String with control characters removed
    """
def remove_bom(text: str) -> str:
"""
Remove byte order marks (BOM) from text.
Strips Unicode BOM characters that sometimes appear at the
beginning of text files or strings.
Args:
text: String potentially starting with BOM
Returns:
String with BOM removed
"""

Functions for normalizing quotes and punctuation characters.
def uncurl_quotes(text: str) -> str:
"""
Convert curly quotes to straight ASCII quotes.
Replaces Unicode quotation marks with ASCII equivalents:
‘ ’ → ', “ ” → ". Useful for systems requiring ASCII-only text.
Args:
text: String containing curly quotes
Returns:
String with straight ASCII quotes
Examples:
>>> uncurl_quotes("It’s “quoted” text")
'It\\'s "quoted" text'
>>> uncurl_quotes("‘single’ and “double” quotes")
'\\'single\\' and "double" quotes'
"""

Functions for normalizing character width and typographic elements.
def fix_character_width(text: str) -> str:
    """
    Normalize fullwidth and halfwidth characters.

    Converts fullwidth Latin characters to normal width and halfwidth
    Katakana to normal width for consistent display and processing.

    Args:
        text: String containing width-variant characters

    Returns:
        String with normalized character widths

    Examples:
        >>> fix_character_width("ＬＯＵＤ　ＮＯＩＳＥＳ")  # Fullwidth Latin
        'LOUD NOISES'
        >>> fix_character_width("ﾊﾝｶｸ")  # Halfwidth Katakana
        'ハンカク'
    """
def fix_latin_ligatures(text: str) -> str:
"""
Replace Latin ligatures with individual letters.
Converts typographic ligatures like ﬁ, ﬂ back to individual
characters (fi, fl) for searchability and processing.
Args:
text: String containing Latin ligatures
Returns:
String with ligatures replaced by letter sequences
Examples:
>>> fix_latin_ligatures("ﬁle and ﬂower")
'file and flower'
>>> fix_latin_ligatures("oﬃce")
'office'
"""

Functions for standardizing line breaks and whitespace.
def fix_line_breaks(text: str) -> str:
"""
Standardize line breaks to Unix format (\\n).
Converts Windows (\\r\\n), Mac (\\r), and other line ending
variations to standard Unix newlines. Handles Unicode line
separators and paragraph separators.
Args:
text: String with various line break formats
Returns:
String with standardized \\n line breaks
Examples:
>>> fix_line_breaks("line1\\r\\nline2\\rline3")
'line1\\nline2\\nline3'
>>> fix_line_breaks("para1\\u2029para2") # Unicode paragraph sep
'para1\\npara2'
"""

Functions for handling complex Unicode issues.
def fix_surrogates(text: str) -> str:
    """
    Fix UTF-16 surrogate pair sequences.

    Converts UTF-16 surrogate codepoints back to the original high-
    numbered Unicode characters like emoji. Fixes text decoded with the
    obsolete UCS-2 standard.

    Args:
        text: String containing UTF-16 surrogates

    Returns:
        String with surrogates converted to proper characters

    Examples:
        >>> fix_surrogates("\\ud83d\\ude00")  # Surrogate pair
        '😀'
    """
def fix_c1_controls(text: str) -> str:
"""
Replace C1 control characters with Windows-1252 equivalents.
Converts Latin-1 control characters (U+80-U+9F) to their
Windows-1252 interpretations following HTML5 standard.
Args:
text: String containing C1 control characters
Returns:
String with C1 controls replaced
Examples:
>>> fix_c1_controls("\\x80") # C1 control
'€' # Windows-1252 Euro sign
"""

Functions for processing byte sequences during encoding correction.
def restore_byte_a0(byts: bytes) -> bytes:
    """
    Restore byte 0xA0 in potential UTF-8 mojibake.

    Replaces literal space (0x20) with non-breaking space (0xA0)
    when it would make the bytes valid UTF-8. Used during encoding
    detection to handle common mojibake patterns.

    Args:
        byts: Byte sequence potentially containing altered UTF-8

    Returns:
        Byte sequence with 0xA0 restored where appropriate
    """
def replace_lossy_sequences(byts: bytes) -> bytes:
    """
    Replace lossy byte sequences in mojibake correction.

    Identifies and replaces sequences where information was lost
    during encoding/decoding, typically involving � or ? characters.

    Args:
        byts: Byte sequence from encoding detection

    Returns:
        Byte sequence with lossy sequences replaced
    """
def decode_inconsistent_utf8(text: str) -> str:
"""
Handle inconsistent UTF-8 sequences in text.
Fixes text where UTF-8 mojibake patterns exist but there's no
consistent way to reinterpret the string in a single encoding.
Replaces problematic sequences with proper UTF-8.
Args:
text: String with inconsistent UTF-8 sequences
Returns:
String with UTF-8 sequences corrected
"""

Additional text processing utilities.
def decode_escapes(text: str) -> str:
"""
Decode backslash escape sequences in text.
More robust version of string decode that handles various escape
sequence formats including \\n, \\t, \\uXXXX, \\xXX patterns.
Args:
text: String containing escape sequences
Returns:
String with escape sequences decoded
Examples:
>>> decode_escapes("Hello\\nWorld\\t!")
'Hello\\nWorld\\t!'
>>> decode_escapes("Unicode: \\u00e9")
'Unicode: é'
"""

from ftfy.fixes import unescape_html, remove_terminal_escapes, uncurl_quotes
# Apply individual fixes
html_text = "&lt;p&gt;Hello &amp; goodbye&lt;/p&gt;"
clean_html = unescape_html(html_text)
print(clean_html) # "<p>Hello & goodbye</p>"
# Clean terminal output
terminal_output = "\x1b[31mError:\x1b[0m File not found"
clean_output = remove_terminal_escapes(terminal_output)
print(clean_output) # "Error: File not found"
# Normalize quotes for ASCII systems
curly_text = "It’s “perfectly” fine"
straight_quotes = uncurl_quotes(curly_text)
print(straight_quotes) # 'It\'s "perfectly" fine'

from ftfy.fixes import fix_character_width, fix_latin_ligatures
# Fix fullwidth characters
wide_text = "ＨＥＬＬＯ　ＷＯＲＬＤ"
normal_text = fix_character_width(wide_text)
print(normal_text) # "HELLO WORLD"
# Decompose ligatures
ligature_text = "The oﬃce ﬁle"
decomposed = fix_latin_ligatures(ligature_text)
print(decomposed) # "The office file"

from ftfy.fixes import fix_line_breaks
# Standardize mixed line endings
mixed_lines = "Line 1\r\nLine 2\rLine 3\nLine 4"
unix_lines = fix_line_breaks(mixed_lines)
print(repr(unix_lines)) # 'Line 1\nLine 2\nLine 3\nLine 4'
# Handle Unicode line separators
unicode_lines = "Para 1\u2029Para 2\u2028Line break"
standard_lines = fix_line_breaks(unicode_lines)
print(repr(standard_lines)) # 'Para 1\nPara 2\nLine break'

from ftfy.fixes import fix_surrogates, fix_c1_controls
# Fix emoji from surrogate pairs
surrogate_emoji = "\ud83d\ude00\ud83d\ude01" # Encoded emoji
real_emoji = fix_surrogates(surrogate_emoji)
print(real_emoji) # "😀😁"
# Fix C1 control characters
latin1_controls = "\x80\x85\x91\x92" # C1 controls
windows1252 = fix_c1_controls(latin1_controls)
print(windows1252) # "€…‘’"

from ftfy.fixes import (
unescape_html, remove_terminal_escapes,
uncurl_quotes, fix_character_width, fix_line_breaks
)
def custom_clean(text):
    """Run a fixed sequence of ftfy fixes over *text* and return the result."""
    # Order matters: strip escapes before decoding entities, then
    # normalize quotes, width variants, and finally line breaks.
    pipeline = (
        remove_terminal_escapes,
        unescape_html,
        uncurl_quotes,
        fix_character_width,
        fix_line_breaks,
    )
    for fix in pipeline:
        text = fix(text)
    return text
# Apply custom cleaning
messy_text = "\x1b[32m&lt;ＨＥＬＬＯ&gt;\x1b[0m “world”\r\n"
clean_text = custom_clean(messy_text)
print(clean_text) # '<HELLO> "world"\n'

Install with Tessl CLI
npx tessl i tessl/pypi-ftfy