rapid fuzzy string matching
—
Quality
Pending
Does it follow best practices?
Impact
Pending
No eval scenarios have been run
Utilities for normalizing and preprocessing strings before comparison. Proper string preprocessing can significantly improve matching accuracy by handling case differences, punctuation, and whitespace variations.
Standard preprocessing function that normalizes strings for comparison by converting to lowercase, removing non-alphanumeric characters, and trimming whitespace.
def default_process(sentence: str) -> str
Parameters:
sentence: Input string to preprocess
Returns: Normalized string with only lowercase alphanumeric characters and spaces
Processing Steps:
Usage Example:
from rapidfuzz import utils
# Basic preprocessing
text = "Hello, World! 123"
processed = utils.default_process(text)
print(processed) # "hello world 123"
# Handling punctuation and case
text = "Don't worry, BE HAPPY!!!"
processed = utils.default_process(text)
print(processed) # "dont worry be happy"
# Normalizing whitespace
text = " multiple spaces "
processed = utils.default_process(text)
print(processed) # "multiple spaces"
# Unicode and special characters
text = "Café & Restaurant — $50"
processed = utils.default_process(text)
print(processed) # "cafe restaurant 50"All fuzz functions accept an optional processor parameter:
from rapidfuzz import fuzz, utils
s1 = "New York Jets"
s2 = "new york jets!!!"
# Without preprocessing
score1 = fuzz.ratio(s1, s2)
print(f"Raw: {score1:.1f}") # Lower score due to case/punctuation differences
# With preprocessing
score2 = fuzz.ratio(s1, s2, processor=utils.default_process)
print(f"Processed: {score2:.1f}") # 100.0 (perfect match after normalization)
# Works with all fuzz functions
score3 = fuzz.WRatio(s1, s2, processor=utils.default_process)
score4 = fuzz.token_sort_ratio(s1, s2, processor=utils.default_process)
score5 = fuzz.partial_ratio(s1, s2, processor=utils.default_process)
Process functions also support the processor parameter:
from rapidfuzz import process, utils
choices = ["Atlanta Falcons", "New York Jets", "New York Giants", "Dallas Cowboys"]
query = "NEW YORK jets"
# Without preprocessing
match1 = process.extractOne(query, choices)
print(match1) # Lower score due to case differences
# With preprocessing
match2 = process.extractOne(query, choices, processor=utils.default_process)
print(match2) # Perfect match: ('New York Jets', 100.0, 1)
# Batch processing with preprocessing
matches = process.extract(query, choices,
processor=utils.default_process,
limit=3)
print(matches) # All matches benefit from normalization
Distance metrics also support preprocessing:
from rapidfuzz.distance import Levenshtein
from rapidfuzz import utils
s1 = "Testing"
s2 = "TESTING!!!"
# Raw distance
dist1 = Levenshtein.distance(s1, s2)
print(f"Raw distance: {dist1}") # Higher due to case/punctuation
# With preprocessing
dist2 = Levenshtein.distance(s1, s2, processor=utils.default_process)
print(f"Processed distance: {dist2}") # 0 (identical after preprocessing)You can create custom preprocessing functions for specific needs:
from rapidfuzz import fuzz
import re
def custom_preprocess(text):
    """Custom preprocessing for specific use case."""
    # Lowercase first so every later step sees uniform case
    lowered = text.lower()
    # Strip punctuation; word characters, digits and whitespace survive
    depunctuated = re.sub(r'[^\w\s]', '', lowered)
    # Collapse whitespace runs into single spaces and trim the ends
    return ' '.join(depunctuated.split())
def phone_preprocess(phone):
    """Preprocessing for phone numbers: keep only the digits."""
    # Drop every non-digit character (parens, dots, dashes, spaces, ...)
    digits_only = re.sub(r'\D', '', phone)
    return digits_only
# Use custom preprocessor
score = fuzz.ratio("Call (555) 123-4567", "555.123.4567",
processor=phone_preprocess)
print(score) # 100.0 (identical after removing formatting)
def name_preprocess(name):
    """Preprocessing for person names.

    Strips accents, lowercases, removes punctuation, and drops common
    generational suffixes (jr, sr, ii, iii) so name variants compare equal.
    """
    import unicodedata
    # Normalize unicode: decompose accented chars into base + combining mark
    name = unicodedata.normalize('NFKD', name)
    name = ''.join(c for c in name if not unicodedata.combining(c))
    # Convert to lowercase and remove punctuation
    name = re.sub(r'[^\w\s]', '', name.lower())
    # Remove generational suffixes as WHOLE WORDS only. The original chained
    # str.replace calls deleted the letter sequences anywhere they occurred,
    # mangling names that merely contain them (e.g. "srinivasan" -> "inivasan").
    # 'iii' is listed before 'ii' so the longer suffix wins.
    name = re.sub(r'\b(?:jr|sr|iii|ii)\b', '', name)
    return ' '.join(name.split())
# Better name matching
score = fuzz.ratio("José Martinez Jr.", "jose martinez",
processor=name_preprocess)
print(score) # High score despite accent and suffix differences
Use preprocessing when:
Don't use preprocessing when:
from rapidfuzz import process, utils
choices = ["..."] * 10000 # Large choice list
# Preprocessing overhead for single comparisons
query = "test query"
match = process.extractOne(query, choices, processor=utils.default_process)
# For repeated queries, preprocess choices once
processed_choices = [utils.default_process(choice) for choice in choices]
processed_query = utils.default_process(query)
# Now use without processor (already processed)
match = process.extractOne(processed_query, processed_choices)
# Or use a custom processor that caches results
preprocessing_cache = {}
def cached_preprocess(text):
    """Memoize default_process so each distinct string is normalized once."""
    # EAFP: cache hit is the common case after warm-up
    try:
        return preprocessing_cache[text]
    except KeyError:
        result = utils.default_process(text)
        preprocessing_cache[text] = result
        return result
match = process.extractOne(query, choices, processor=cached_preprocess)
import re
from rapidfuzz import fuzz, utils
def address_preprocess(address):
    """Preprocessing for street addresses.

    Lowercases, standardizes common street-type abbreviations, drops
    apartment/suite designators, strips leftover punctuation, and
    normalizes whitespace.
    """
    address = address.lower()
    # Standardize common abbreviations. The optional trailing \.? sits
    # AFTER the word boundary so "st." collapses to "st" instead of
    # leaving a stray period behind.
    address = re.sub(r'\b(street|st)\b\.?', 'st', address)
    address = re.sub(r'\b(avenue|ave)\b\.?', 'ave', address)
    address = re.sub(r'\b(boulevard|blvd)\b\.?', 'blvd', address)
    address = re.sub(r'\b(drive|dr)\b\.?', 'dr', address)
    # Remove apartment/suite numbers
    address = re.sub(r'\b(apt|apartment|suite|ste|unit)\s*\w+', '', address)
    # Strip residual punctuation (commas, etc.) that would otherwise
    # survive and depress match scores
    address = re.sub(r'[^\w\s]', '', address)
    return ' '.join(address.split())
def product_code_preprocess(code):
    """Preprocessing for product codes: uppercase, drop separators."""
    normalized = code.upper()
    # Remove dashes, underscores and whitespace but keep the code itself
    return re.sub(r'[\s_-]', '', normalized)
# Usage examples
addr1 = "123 Main Street, Apt 4B"
addr2 = "123 main st"
score = fuzz.ratio(addr1, addr2, processor=address_preprocess)
print(f"Address match: {score}") # High similarity
code1 = "ABC-123-XYZ"
code2 = "abc123xyz"
score = fuzz.ratio(code1, code2, processor=product_code_preprocess)
print(f"Product code match: {score}") # Perfect match
import unicodedata
from rapidfuzz import fuzz, utils
def unicode_normalize_preprocess(text):
    """Strip accents via NFKD decomposition, then apply default_process."""
    # Decompose accented characters into base char + combining mark
    decomposed = unicodedata.normalize('NFKD', text)
    # Keep only non-combining characters, discarding the accent marks
    stripped = ''.join(ch for ch in decomposed if not unicodedata.combining(ch))
    # Lowercase / strip punctuation / trim via the library's standard processor
    return utils.default_process(stripped)
# International text matching
text1 = "Café résumé naïve"
text2 = "cafe resume naive"
score = fuzz.ratio(text1, text2, processor=unicode_normalize_preprocess)
print(f"Unicode normalized: {score}") # Perfect match
Install with Tessl CLI
npx tessl i tessl/pypi-rapidfuzz