tessl/pypi-thefuzz

Fuzzy string matching library using Levenshtein Distance algorithms for approximate text comparison

Overview

Eval results

Files

String Processing and Extraction

Name: tessl/pypi-thefuzz
Author: tessl

Functions for finding the best matches in collections of strings. These functions enable searching through lists or dictionaries of choices to find the closest matches to a query string.

Capabilities

Single Best Match Extraction

Find the single best match from a collection of choices, with optional score thresholding.

def extractOne(query: str, choices, processor=None, scorer=None, score_cutoff: int = 0):
    """
    Find the single best match that scores at or above a threshold.

    Args:
        query: String to match against
        choices: List or dictionary of choices to search through. For a
            dictionary, the values are matched and the key of the matched
            entry is returned alongside it.
        processor: Optional function to preprocess strings before matching
        scorer: Optional scoring function (default: fuzz.WRatio)
        score_cutoff: Minimum score a match must reach (default: 0)

    Returns:
        Tuple of (match, score) for list choices, or (match, score, key) for
        dictionary choices. Returns None if no choice scores at least
        score_cutoff.
    """

Multiple Match Extraction

Extract multiple best matches from a collection, with configurable limits and score cutoffs.

def extract(query: str, choices, processor=None, scorer=None, limit: int = 5):
    """
    Get a list of the best matches from choices.

    This function has no score cutoff; use extractBests when matches below
    a minimum score should be dropped as well.

    Args:
        query: String to match against
        choices: List or dictionary of choices to search through. For a
            dictionary, the values are matched and each result also
            carries the key of the matched entry.
        processor: Optional function to preprocess strings before matching
        scorer: Optional scoring function (default: fuzz.WRatio)
        limit: Maximum number of matches to return (default: 5)

    Returns:
        List of tuples: (match, score) for list choices, or
        (match, score, key) for dictionary choices
    """

def extractBests(query: str, choices, processor=None, scorer=None, score_cutoff: int = 0, limit: int = 5):
    """
    Get the best matches, bounded by both a minimum score and a maximum count.

    Args:
        query: String to match against
        choices: List or dictionary of choices to search through. For a
            dictionary, the values are matched and each result also
            carries the key of the matched entry.
        processor: Optional function to preprocess strings before matching
        scorer: Optional scoring function (default: fuzz.WRatio)
        score_cutoff: Minimum score a match must reach to be returned
            (default: 0)
        limit: Maximum number of matches to return (default: 5)

    Returns:
        List of tuples: (match, score) for list choices, or
        (match, score, key) for dictionary choices
    """

def extractWithoutOrder(query: str, choices, processor=None, scorer=None, score_cutoff: int = 0):
    """
    Yield every match scoring at or above the threshold, unsorted and unlimited.

    Results are produced through a generator, so callers can consume them
    one at a time instead of holding the complete result list.

    Args:
        query: String to match against
        choices: List or dictionary of choices to search through. For a
            dictionary, the values are matched and each result also
            carries the key of the matched entry.
        processor: Optional function to preprocess strings before matching
        scorer: Optional scoring function (default: fuzz.WRatio)
        score_cutoff: Minimum score a match must reach to be yielded
            (default: 0)

    Returns:
        Generator yielding tuples: (match, score) for list choices, or
        (match, score, key) for dictionary choices
    """

Duplicate Removal

Remove fuzzy duplicates from a list of strings using configurable similarity thresholds.

def dedupe(contains_dupes: list, threshold: int = 70, scorer=None):
    """
    Remove fuzzy duplicates from a list of strings.

    Strings whose similarity score reaches the threshold are grouped as
    duplicates, and the longest string of each group is kept as its
    representative.

    Args:
        contains_dupes: List of strings that may contain duplicates
        threshold: Similarity score at which two strings are treated as
            duplicates (default: 70). A higher value is stricter and
            merges fewer strings.
        scorer: Optional scoring function (default: fuzz.token_set_ratio)

    Returns:
        List of deduplicated strings
    """

Default Configuration

The process module provides sensible defaults for common use cases.

# Module-level constants: the default scoring function and the default
# string pre-processing function provided by the process module
default_scorer = fuzz.WRatio
default_processor = utils.full_process

Usage Examples

Basic Extraction

from thefuzz import process

choices = ["Atlanta Falcons", "New York Jets", "New York Giants", "Dallas Cowboys"]

# Find single best match; with list choices the result is a (match, score) tuple
result = process.extractOne("new york jets", choices)
print(result)  # ('New York Jets', 100)

# Find multiple matches; limit=2 caps the result list at two tuples
results = process.extract("new york", choices, limit=2)
print(results)  # [('New York Jets', 90), ('New York Giants', 90)]

Working with Dictionaries

from thefuzz import process

# With dictionary choices the values are matched, and the key of the
# matched entry is appended to the result tuple
team_info = {
    "ATL": "Atlanta Falcons",
    "NYJ": "New York Jets",
    "NYG": "New York Giants",
    "DAL": "Dallas Cowboys"
}

result = process.extractOne("new york jets", team_info)
print(result)  # ('New York Jets', 100, 'NYJ') -> (match, score, key)

Custom Scoring and Processing

from thefuzz import process, fuzz

choices = ["  ATLANTA FALCONS  ", "new york jets", "New York Giants"]

# Custom processor to handle case and whitespace
def clean_processor(s):
    """Return s lower-cased with leading and trailing whitespace removed."""
    return s.strip().lower()

# Use token-based scoring for better word order handling.
# results is a list of at most two (match, score) tuples because limit=2.
results = process.extract(
    "new york",
    choices,
    processor=clean_processor,
    scorer=fuzz.token_sort_ratio,
    limit=2
)

Score Thresholding

from thefuzz import process

choices = ["Atlanta Falcons", "New York Jets", "New York Giants", "Dallas Cowboys"]

# Only return matches scoring 80 or higher (score_cutoff is a minimum score),
# and at most 10 of them (limit)
results = process.extractBests("york", choices, score_cutoff=80, limit=10)
print(results)  # Only high-quality matches; may be an empty list if none reach 80

Duplicate Removal

from thefuzz import process

# List with fuzzy duplicates
names = [
    "Frodo Baggins",
    "Frodo Baggin",
    "F. Baggins",
    "Samwise Gamgee",
    "Gandalf",
    "Bilbo Baggins"
]

# Remove duplicates (default threshold: 70); the longest string of each
# duplicate group is kept, e.g. "Frodo Baggins" rather than "Frodo Baggin".
# NOTE(review): the printed output below is illustrative -- the order of the
# returned list and the handling of pairs scoring right at the threshold
# should be confirmed against the installed thefuzz version.
deduplicated = process.dedupe(names)
print(deduplicated)  # ['Frodo Baggins', 'Samwise Gamgee', 'Gandalf', 'Bilbo Baggins']

# Use stricter threshold: strings must be more similar to be merged
strict_dedupe = process.dedupe(names, threshold=90)

Generator-Based Processing

from thefuzz import process

# For large datasets, iterate over the generator so that the complete
# result list never has to be built at once
choices = ["choice1", "choice2", ...]  # Placeholder: "..." stands for the rest of a large list

# With list choices each yielded item is a (match, score) tuple; every item
# already scores at least 75 because of score_cutoff
for match, score in process.extractWithoutOrder("query", choices, score_cutoff=75):
    if score > 90:
        print(f"High confidence match: {match} ({score})")
    else:
        print(f"Moderate match: {match} ({score})")

Install with Tessl CLI