Fuzzy string matching library using Levenshtein Distance algorithms for approximate text comparison
Utility functions for string preprocessing and normalization. These functions prepare strings for fuzzy matching by cleaning and standardizing their format.
Comprehensive string preprocessing that normalizes text for optimal fuzzy matching performance.
def full_process(s: str, force_ascii: bool = False) -> str:
    """
    Process string for fuzzy matching by normalizing format.

    Processing steps:
    1. Convert to string if not already (None becomes "")
    2. Optionally convert to ASCII (removes accented characters)
    3. Remove all non-alphanumeric characters (replaced with spaces)
    4. Trim leading/trailing whitespace
    5. Convert to lowercase
    6. Normalize internal whitespace

    Args:
        s: String to process
        force_ascii: If True, strip non-ASCII characters before processing

    Returns:
        str: Processed and normalized string
    """
    import re  # local import: this file has no top-level import block

    if s is None:
        return ""
    s = str(s)
    if force_ascii:
        # Drop non-ASCII characters before further normalization
        s = ascii_only(s)
    # Collapse every run of non-alphanumeric characters (punctuation,
    # underscores, and whitespace alike) into a single space; Unicode
    # letters such as "é" count as alphanumeric and are preserved.
    s = re.sub(r"[\W_]+", " ", s)
    # Trim edge whitespace left by the substitution and lowercase.
    return s.strip().lower()

# Convert strings to ASCII-only by removing non-ASCII characters, useful for standardizing international text.
def ascii_only(s: str) -> str:
    """
    Convert string to ASCII by removing non-ASCII Latin-1 characters.

    Removes characters with code points 128-255, effectively stripping
    accented characters and other Latin-1 non-ASCII content. Characters
    above code point 255 (e.g. emoji) are outside this table and are
    left untouched.

    Args:
        s: String to convert

    Returns:
        str: ASCII-only version of the string
    """
    # Translation table for ASCII conversion (removes chars 128-255):
    # mapping each code point to None makes str.translate() delete it.
    translation_table: dict = dict.fromkeys(range(128, 256))
    return s.translate(translation_table)
from thefuzz import utils
# Standard text normalization
text = " Hello, World! "
processed = utils.full_process(text)
print(processed) # "hello world"
# Handle special characters
text = "New York Mets vs. Atlanta Braves"
processed = utils.full_process(text)
print(processed) # "new york mets vs atlanta braves"

from thefuzz import utils
# Convert accented characters
text = "Café Münchën"
ascii_text = utils.ascii_only(text)
print(ascii_text) # "Caf Mnchn"
# Full processing with ASCII conversion
processed = utils.full_process("Café Münchën", force_ascii=True)
print(processed) # "caf mnchn"

from thefuzz import fuzz, utils
# Manual preprocessing before comparison
s1 = utils.full_process("New York Mets!")
s2 = utils.full_process("new york mets")
score = fuzz.ratio(s1, s2)
print(score) # 100 (perfect match after processing)
# Compare with and without processing
raw_score = fuzz.ratio("New York Mets!", "new york mets")
processed_score = fuzz.ratio(
utils.full_process("New York Mets!"),
utils.full_process("new york mets")
)
print(f"Raw: {raw_score}, Processed: {processed_score}")

from thefuzz import utils
def custom_processor(text):
    """Custom processing for specific use case."""
    # Start from the library's standard normalization (lowercase,
    # punctuation stripped, ASCII-folded).
    result = utils.full_process(text, force_ascii=True)
    # Add custom logic: normalize common street-suffix abbreviations.
    for long_form, abbrev in (
        ("street", "st"),
        ("avenue", "ave"),
        ("boulevard", "blvd"),
    ):
        result = result.replace(long_form, abbrev)
    return result
# Use with fuzzy matching
from thefuzz import process
addresses = ["123 Main Street", "456 Oak Avenue", "789 First Boulevard"]
result = process.extractOne("main st", addresses, processor=custom_processor)

from thefuzz import utils
# For batch processing, consider preprocessing once
texts = ["Text 1", "Text 2", "Text 3", ...]
processed_texts = [utils.full_process(text) for text in texts]
# Then use the processed texts for multiple comparisons
# This avoids repeated preprocessing in fuzzy matching functions

from thefuzz import utils
examples = [
"Hello, World!", # → "hello world"
" Multiple Spaces ", # → "multiple spaces"
"New York Mets vs. ATL", # → "new york mets vs atl"
"Café Münchën", # → "café münchën" (or "caf mnchen" with force_ascii=True)
"user@email.com", # → "user email com"
"1st & 2nd Avenue", # → "1st 2nd avenue"
]
for text in examples:
processed = utils.full_process(text)
processed_ascii = utils.full_process(text, force_ascii=True)
print(f"'{text}' → '{processed}' → '{processed_ascii}'")

Install with the Tessl CLI:
npx tessl i tessl/pypi-thefuzz