Fuzzy string matching library using Levenshtein Distance calculations for approximate string comparison and search
—
Functions for processing collections of strings and finding best matches using fuzzy string matching. These functions enable searching, ranking, and deduplication operations on lists or dictionaries of strings.
default_scorer = fuzz.WRatio # Default scoring function
default_processor = utils.full_process # Default string preprocessing function
Find the single best match above a score threshold in a collection of choices.
def extractOne(query: str, choices, processor=default_processor, scorer=default_scorer, score_cutoff: int = 0):
"""
Find the single best match above a score cutoff in a list or dict of choices.
Parameters:
query: String to match against
choices: List or dict of choices to search through
processor: Function to preprocess strings before matching
scorer: Function to score matches (default: fuzz.WRatio)
score_cutoff: Minimum score threshold (default: 0)
Returns:
tuple: (match, score) if found, None if no match above cutoff
tuple: (match, score, key) if choices is a dictionary
"""
Usage Example:
from fuzzywuzzy import process
choices = ["Atlanta Falcons", "New York Jets", "New York Giants", "Dallas Cowboys"]
result = process.extractOne("new york jets", choices)
print(result) # ("New York Jets", 100)
# With score cutoff
result = process.extractOne("new york", choices, score_cutoff=80)
print(result) # ("New York Jets", 90) or ("New York Giants", 90)
# With dictionary
choices_dict = {"team1": "Atlanta Falcons", "team2": "New York Jets"}
result = process.extractOne("jets", choices_dict)
print(result) # ("New York Jets", 90, "team2")
Extract multiple best matches from a collection with optional limits.
def extract(query: str, choices, processor=default_processor, scorer=default_scorer, limit: int = 5):
"""
Select the best matches in a list or dictionary of choices.
Parameters:
query: String to match against
choices: List or dict of choices to search through
processor: Function to preprocess strings before matching
scorer: Function to score matches (default: fuzz.WRatio)
limit: Maximum number of results to return (default: 5); pass None to return all matches
Returns:
list: List of (match, score) tuples sorted by score descending
list: List of (match, score, key) tuples if choices is a dictionary
"""
Usage Example:
from fuzzywuzzy import process
choices = ["Atlanta Falcons", "New York Jets", "New York Giants", "Dallas Cowboys"]
results = process.extract("new york", choices, limit=2)
print(results) # [("New York Jets", 90), ("New York Giants", 90)]
# Get all matches
all_results = process.extract("new", choices, limit=None)
print(all_results) # All matches sorted by score
Extract multiple matches above a score threshold with optional limits.
def extractBests(query: str, choices, processor=default_processor, scorer=default_scorer, score_cutoff: int = 0, limit: int = 5):
"""
Get a list of the best matches above a score threshold.
Parameters:
query: String to match against
choices: List or dict of choices to search through
processor: Function to preprocess strings before matching
scorer: Function to score matches (default: fuzz.WRatio)
score_cutoff: Minimum score threshold (default: 0)
limit: Maximum number of results to return (default: 5)
Returns:
list: List of (match, score) tuples above cutoff, sorted by score descending
"""
Usage Example:
from fuzzywuzzy import process
choices = ["Atlanta Falcons", "New York Jets", "New York Giants", "Dallas Cowboys"]
results = process.extractBests("new", choices, score_cutoff=50, limit=3)
print(results) # Only matches scoring 50 or higher
Generator function that yields matches without sorting, useful for large datasets.
def extractWithoutOrder(query: str, choices, processor=default_processor, scorer=default_scorer, score_cutoff: int = 0):
"""
Generator yielding best matches without ordering, for memory efficiency.
Parameters:
query: String to match against
choices: List or dict of choices to search through
processor: Function to preprocess strings before matching
scorer: Function to score matches (default: fuzz.WRatio)
score_cutoff: Minimum score threshold (default: 0)
Yields:
tuple: (match, score) for list choices
tuple: (match, score, key) for dictionary choices
"""
Usage Example:
from fuzzywuzzy import process
choices = ["Atlanta Falcons", "New York Jets", "New York Giants", "Dallas Cowboys"]
for match in process.extractWithoutOrder("new", choices, score_cutoff=60):
    print(match) # Yields matches as found, not sorted
Remove duplicates from a list using fuzzy matching to identify similar items.
def dedupe(contains_dupes: list, threshold: int = 70, scorer=fuzz.token_set_ratio):
"""
Remove duplicates from a list using fuzzy matching.
Uses fuzzy matching to identify duplicates that score above the threshold.
For each group of duplicates, returns the longest item (most information),
breaking ties alphabetically.
Parameters:
contains_dupes: List of strings that may contain duplicates
threshold: Score threshold for considering items duplicates (default: 70)
scorer: Function to score similarity (default: fuzz.token_set_ratio)
Returns:
list: Deduplicated list with longest representative from each group
"""
Usage Example:
from fuzzywuzzy import process
duplicates = [
'Frodo Baggin',
'Frodo Baggins',
'F. Baggins',
'Samwise G.',
'Gandalf',
'Bilbo Baggins'
]
deduped = process.dedupe(duplicates)
print(deduped) # ['Frodo Baggins', 'Samwise G.', 'Bilbo Baggins', 'Gandalf']
# Lower threshold finds more duplicates
deduped_aggressive = process.dedupe(duplicates, threshold=50)
print(deduped_aggressive) # Even fewer items returned
You can provide custom processing and scoring functions:
Usage Example:
from fuzzywuzzy import process, fuzz
# Custom processor that only looks at first word
def first_word_processor(s):
    return s.split()[0] if s else ""
# Use partial ratio as a custom scorer (passed via scorer= below)
choices = ["John Smith", "Jane Smith", "Bob Johnson"]
result = process.extractOne(
"John",
choices,
processor=first_word_processor,
scorer=fuzz.partial_ratio
)
print(result) # ("John Smith", 100)
# No processing
result = process.extractOne(
"JOHN SMITH",
choices,
processor=None, # No preprocessing
scorer=fuzz.ratio
)
print(result) # Lower score due to case mismatch
Install with Tessl CLI
npx tessl i tessl/pypi-fuzzywuzzy