Fixes mojibake and other problems with Unicode, after the fact
npx @tessl/cli install tessl/pypi-ftfy@6.3.0
Fixes mojibake and other problems with Unicode text after the fact. Detects and corrects common encoding issues, normalizes character formatting, and provides robust text cleaning utilities for handling text from unreliable sources with mixed or unknown encodings.
pip install ftfy
import ftfy
Common import patterns:
from ftfy import fix_text, fix_and_explain, TextFixerConfig
For individual text fixers:
from ftfy.fixes import unescape_html, remove_terminal_escapes, uncurl_quotes
For formatting utilities:
from ftfy.formatting import display_ljust, character_width
import ftfy
# Fix encoding problems (mojibake)
broken_text = "âœ” No problems"
fixed = ftfy.fix_text(broken_text)
print(fixed) # "✔ No problems"
# Fix multiple layers of mojibake
broken = "The Mona Lisa doesnÃƒÂ¢Ã¢â€šÂ¬Ã¢â€žÂ¢t have eyebrows."
fixed = ftfy.fix_text(broken)
print(fixed) # "The Mona Lisa doesn't have eyebrows."
# Get explanation of what was fixed
text, explanation = ftfy.fix_and_explain("sÃ³")
print(text) # "só"
print(explanation) # [('encode', 'latin-1'), ('decode', 'utf-8')]
# Configure specific fixes
from ftfy import TextFixerConfig
config = TextFixerConfig(uncurl_quotes=False)
result = ftfy.fix_text(text, config)
ftfy operates through a multi-step pipeline that detects and corrects text problems:
This design enables ftfy to safely process text from unknown sources while avoiding overcorrection of correctly-encoded text.
Core functions for detecting and fixing text encoding problems, including the main fix_text function and variants that provide explanations of applied transformations.
def fix_text(text: str, config: TextFixerConfig | None = None, **kwargs) -> str: ...
def fix_and_explain(text: str, config: TextFixerConfig | None = None, **kwargs) -> ExplainedText: ...
def fix_encoding(text: str, config: TextFixerConfig | None = None, **kwargs) -> str: ...
def fix_encoding_and_explain(text: str, config: TextFixerConfig | None = None, **kwargs) -> ExplainedText: ...
# Alias for fix_text
ftfy = fix_text
Configuration classes and types for controlling ftfy behavior, including comprehensive options for each fix step and explanation data structures.
class TextFixerConfig(NamedTuple): ...
class ExplainedText(NamedTuple): ...
class ExplanationStep(NamedTuple): ...
Individual transformation functions for specific text problems like HTML entities, terminal escapes, character width, quotes, and line breaks.
def unescape_html(text: str) -> str: ...
def remove_terminal_escapes(text: str) -> str: ...
def uncurl_quotes(text: str) -> str: ...
def fix_character_width(text: str) -> str: ...
def fix_line_breaks(text: str) -> str: ...
Functions for processing files and handling bytes of unknown encoding, including streaming file processing and encoding detection utilities.
def fix_file(input_file, encoding: str | None = None, config: TextFixerConfig | None = None, **kwargs) -> Iterator[str]: ...
def guess_bytes(bstring: bytes) -> tuple[str, str]: ...
Unicode-aware text formatting for terminal display, including width calculation and justification functions that handle fullwidth characters and zero-width characters correctly.
def character_width(char: str) -> int: ...
def display_ljust(text: str, width: int, fillchar: str = " ") -> str: ...
def display_center(text: str, width: int, fillchar: str = " ") -> str: ...
Debugging and utility functions for understanding Unicode text and applying transformation plans manually.
def explain_unicode(text: str) -> None: ...
def apply_plan(text: str, plan: list[tuple[str, str]]) -> str: ...
def badness(text: str) -> int: ...
def is_bad(text: str) -> bool: ...
Command-line tool for batch text processing with configurable options for encoding, normalization, and entity handling.
def main() -> None: ...
__version__ = "6.3.1" # Package version string
class TextFixerConfig(NamedTuple):
"""Configuration for all ftfy text processing options."""
unescape_html: str | bool = "auto"
remove_terminal_escapes: bool = True
fix_encoding: bool = True
restore_byte_a0: bool = True
replace_lossy_sequences: bool = True
decode_inconsistent_utf8: bool = True
fix_c1_controls: bool = True
fix_latin_ligatures: bool = True
fix_character_width: bool = True
uncurl_quotes: bool = True
fix_line_breaks: bool = True
fix_surrogates: bool = True
remove_control_chars: bool = True
normalization: Literal["NFC", "NFD", "NFKC", "NFKD"] | None = "NFC"
max_decode_length: int = 1000000
explain: bool = True
class ExplainedText(NamedTuple):
"""Result containing fixed text and explanation of changes."""
text: str
explanation: list[ExplanationStep] | None
class ExplanationStep(NamedTuple):
"""Single step in text transformation explanation."""
action: str # "encode", "decode", "transcode", "apply", "normalize"
parameter: str # encoding name or function name