Fuzzy string matching library using Levenshtein Distance algorithms for approximate text comparison
Functions for finding the best matches in collections of strings. These functions enable searching through lists or dictionaries of choices to find the closest matches to a query string.
Find the single best match from a collection of choices, with optional score thresholding.
def extractOne(query: str, choices, processor=None, scorer=None, score_cutoff: int = 0):
"""
Find the single best match above a score threshold.
Args:
query: String to match against
choices: List or dictionary of choices to search through
processor: Optional function to preprocess strings before matching
scorer: Optional scoring function (default: fuzz.WRatio)
score_cutoff: Minimum score threshold for matches (default: 0)
Returns:
Tuple of (match, score) for list choices, or (match, score, key) for
dictionary choices. Returns None if no match above score_cutoff.
"""

Extract multiple best matches from a collection, with configurable limits and score cutoffs.
def extract(query: str, choices, processor=None, scorer=None, limit: int = 5):
"""
Get a list of the best matches from choices.
Args:
query: String to match against
choices: List or dictionary of choices to search through
processor: Optional function to preprocess strings before matching
scorer: Optional scoring function (default: fuzz.WRatio)
limit: Maximum number of matches to return (default: 5)
Returns:
List of tuples: (match, score) for list choices, or
(match, score, key) for dictionary choices
"""
def extractBests(query: str, choices, processor=None, scorer=None, score_cutoff: int = 0, limit: int = 5):
"""
Get best matches with both score cutoff and limit controls.
Args:
query: String to match against
choices: List or dictionary of choices to search through
processor: Optional function to preprocess strings before matching
scorer: Optional scoring function (default: fuzz.WRatio)
score_cutoff: Minimum score threshold for matches (default: 0)
limit: Maximum number of matches to return (default: 5)
Returns:
List of tuples: (match, score) for list choices, or
(match, score, key) for dictionary choices
"""
def extractWithoutOrder(query: str, choices, processor=None, scorer=None, score_cutoff: int = 0):
"""
Extract all matches above threshold without ordering or limit.
Args:
query: String to match against
choices: List or dictionary of choices to search through
processor: Optional function to preprocess strings before matching
scorer: Optional scoring function (default: fuzz.WRatio)
score_cutoff: Minimum score threshold for matches (default: 0)
Returns:
Generator yielding tuples: (match, score) for list choices, or
(match, score, key) for dictionary choices
"""

Remove fuzzy duplicates from a list of strings using configurable similarity thresholds.
def dedupe(contains_dupes: list, threshold: int = 70, scorer=None):
"""
Remove fuzzy duplicates from a list of strings.
Uses fuzzy matching to identify duplicates above the threshold score,
then returns the longest string from each duplicate group.
Args:
contains_dupes: List of strings that may contain duplicates
threshold: Similarity threshold for considering strings duplicates (default: 70)
scorer: Optional scoring function (default: fuzz.token_set_ratio)
Returns:
List of deduplicated strings
"""

The process module provides sensible defaults for common use cases.
# Module-level constants
default_scorer = fuzz.WRatio
default_processor = utils.full_process

from thefuzz import process
choices = ["Atlanta Falcons", "New York Jets", "New York Giants", "Dallas Cowboys"]
# Find single best match
result = process.extractOne("new york jets", choices)
print(result) # ('New York Jets', 100)
# Find multiple matches
results = process.extract("new york", choices, limit=2)
print(results) # [('New York Jets', 90), ('New York Giants', 90)]

from thefuzz import process
# Dictionary choices return key information
team_info = {
"ATL": "Atlanta Falcons",
"NYJ": "New York Jets",
"NYG": "New York Giants",
"DAL": "Dallas Cowboys"
}
result = process.extractOne("new york jets", team_info)
print(result) # ('New York Jets', 100, 'NYJ')

from thefuzz import process, fuzz
choices = [" ATLANTA FALCONS ", "new york jets", "New York Giants"]
# Custom processor to handle case and whitespace
def clean_processor(s):
    return s.strip().lower()
# Use token-based scoring for better word order handling
results = process.extract(
    "new york",
    choices,
    processor=clean_processor,
    scorer=fuzz.token_sort_ratio,
    limit=2
)

from thefuzz import process
choices = ["Atlanta Falcons", "New York Jets", "New York Giants", "Dallas Cowboys"]
# Only return matches above 80% similarity
results = process.extractBests("york", choices, score_cutoff=80, limit=10)
print(results) # Only high-quality matches

from thefuzz import process
# List with fuzzy duplicates
names = [
"Frodo Baggins",
"Frodo Baggin",
"F. Baggins",
"Samwise Gamgee",
"Gandalf",
"Bilbo Baggins"
]
# Remove duplicates (default threshold: 70)
deduplicated = process.dedupe(names)
print(deduplicated) # ['Frodo Baggins', 'Samwise Gamgee', 'Gandalf', 'Bilbo Baggins']
# Use stricter threshold
strict_dedupe = process.dedupe(names, threshold=90)

from thefuzz import process
# For large datasets, use generator to avoid loading all results
choices = ["choice1", "choice2", ...] # Large list
for match, score in process.extractWithoutOrder("query", choices, score_cutoff=75):
    if score > 90:
        print(f"High confidence match: {match} ({score})")
    else:
        print(f"Moderate match: {match} ({score})")

Install with Tessl CLI
npx tessl i tessl/pypi-thefuzz