tessl/pypi-rapidfuzz

rapid fuzzy string matching

—

Pending

Quality

Pending

Does it follow best practices?

Impact

Pending

No eval scenarios have been run

Overview

Eval results

Files

Batch Processing

Name: tessl/pypi-rapidfuzz
Author: tessl

Efficient functions for comparing a query string against collections of candidate strings. These functions are optimized for performance when working with large lists and provide various output formats for different use cases.

Capabilities

Extract Single Best Match

Finds the single best match from a collection of choices.

def extractOne(
    query: Sequence[Hashable] | None,
    choices: Iterable[Sequence[Hashable] | None] | Mapping[Any, Sequence[Hashable] | None],
    *,
    scorer: Callable = WRatio,
    processor: Callable | None = None, 
    score_cutoff: float | None = None,
    score_hint: float | None = None,
    scorer_kwargs: dict[str, Any] | None = None
) -> tuple[Sequence[Hashable], float, int | Any] | None

Parameters:

query: String to find matches for
choices: Iterable of strings or mapping {key: string}
scorer: Scoring function (default: WRatio)
processor: String preprocessing function
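score_cutoff: Score threshold. For similarity scorers, choices scoring below it are dropped; for distance scorers, choices scoring above it are dropped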
score_hint: Expected score for optimization
scorer_kwargs: Additional arguments for scorer

Returns: (match, score, index_or_key) tuple or None if no match above cutoff

Usage Example:

from rapidfuzz import process, fuzz

choices = ["Atlanta Falcons", "New York Jets", "New York Giants", "Dallas Cowboys"]

# Find best match
match = process.extractOne("new york jets", choices)
print(match)  # ('New York Jets', 76.92, 1)

# With custom scorer (matching is case-sensitive unless a processor is supplied)
match = process.extractOne("Cowboys", choices, scorer=fuzz.partial_ratio)
print(match)  # ('Dallas Cowboys', 100.0, 3)

# With score cutoff
match = process.extractOne("chicago", choices, score_cutoff=50)
print(match)  # None (no match above 50%)

# With mapping (the third element is the key of the matched choice)
choices_dict = {"team1": "Atlanta Falcons", "team2": "New York Jets"}
match = process.extractOne("Jets", choices_dict)
print(match)  # ('New York Jets', 90.0, 'team2')

Extract Multiple Matches

Finds the top N matches from a collection, sorted from best to worst: descending score for similarity scorers, ascending for distance scorers.

def extract(
    query: Sequence[Hashable] | None,
    choices: Collection[Sequence[Hashable] | None] | Mapping[Any, Sequence[Hashable] | None],
    *,
    scorer: Callable = WRatio,
    processor: Callable | None = None,
    limit: int | None = 5, 
    score_cutoff: float | None = None,
    score_hint: float | None = None,
    scorer_kwargs: dict[str, Any] | None = None
) -> list[tuple[Sequence[Hashable], float, int | Any]]

Parameters:

limit: Maximum number of matches to return (default: 5)

Returns: List of (match, score, index_or_key) tuples, best match first (highest score for similarity scorers, lowest for distance scorers)

Usage Example:

from rapidfuzz import process, utils

choices = ["Atlanta Falcons", "New York Jets", "New York Giants", "Dallas Cowboys"]

# Get top 2 matches (ties are returned in the order of the choices)
matches = process.extract("New York", choices, limit=2)
print(matches)
# [('New York Jets', 90.0, 1), ('New York Giants', 90.0, 2)]

# With preprocessing for better matches
matches = process.extract("new york jets", choices, 
                         processor=utils.default_process, limit=3)
print(matches)
# [('New York Jets', 100.0, 1), ('New York Giants', 78.57, 2), ...]

# Get all matches above threshold
matches = process.extract("new", choices, score_cutoff=30, limit=None)
print(len(matches))  # All matches with score >= 30

Extract Iterator

Returns an iterator over all matches above the score cutoff, useful for memory-efficient processing of large choice sets.

def extract_iter(
    query: Sequence[Hashable] | None, 
    choices: Iterable[Sequence[Hashable] | None] | Mapping[Any, Sequence[Hashable] | None],
    *,
    scorer: Callable = WRatio,
    processor: Callable | None = None,
    score_cutoff: float | None = None,
    score_hint: float | None = None,
    scorer_kwargs: dict[str, Any] | None = None
) -> Generator[tuple[Sequence[Hashable], float, int | Any], None, None]

Returns: Generator yielding (match, score, index_or_key) tuples

Usage Example:

from rapidfuzz import process

choices = ["Atlanta Falcons", "New York Jets", "New York Giants", "Dallas Cowboys"]

# Process matches one at a time
for match, score, index in process.extract_iter("new", choices, score_cutoff=50):
    print(f"Match: {match}, Score: {score:.1f}, Index: {index}")
    
# Memory-efficient processing of large datasets
large_choices = [...] # Large list of strings
best_score = 0
best_match = None

for match, score, index in process.extract_iter("query", large_choices):
    if score > best_score:
        best_score = score
        best_match = (match, score, index)

Cross-Distance Matrix

Computes similarity/distance matrix between all queries and all choices. Requires NumPy.

def cdist(
    queries: Collection[Sequence[Hashable] | None],
    choices: Collection[Sequence[Hashable] | None], 
    *,
    scorer: Callable = ratio,
    processor: Callable | None = None,
    score_cutoff: float | None = None,
    score_hint: float | None = None,
    score_multiplier: float = 1,
    dtype: Any = None,
    workers: int = 1,
    scorer_kwargs: dict[str, Any] | None = None
) -> numpy.ndarray

Parameters:

queries: List of query strings
choices: List of choice strings
score_multiplier: Multiply scores by this factor
dtype: NumPy data type for result array
workers: Number of parallel workers

Returns: 2D NumPy array with shape (len(queries), len(choices))

Usage Example:

import numpy as np
from rapidfuzz import process

queries = ["apple", "orange"] 
choices = ["apples", "oranges", "banana"]

# Compute full distance matrix
matrix = process.cdist(queries, choices)
print(matrix.shape)  # (2, 3)
print(matrix)
# [[similarity(apple, apples), similarity(apple, oranges), similarity(apple, banana)],
#  [similarity(orange, apples), similarity(orange, oranges), similarity(orange, banana)]]

# Find best match for each query
best_indices = np.argmax(matrix, axis=1)
for i, query in enumerate(queries):
    best_choice = choices[best_indices[i]]
    best_score = matrix[i, best_indices[i]]
    print(f"{query} -> {best_choice} ({best_score:.1f})")

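Pairwise Distance (cpdist)

Computes the score between corresponding elements of queries and choices, i.e. queries[i] against choices[i], rather than every combination. Both collections must have the same length. Requires NumPy.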

def cpdist(
    queries: Collection[Sequence[Hashable] | None],
    choices: Collection[Sequence[Hashable] | None],
    *,
    scorer: Callable = ratio,
    processor: Callable | None = None,
    score_cutoff: float | None = None,
    score_hint: float | None = None, 
    score_multiplier: float = 1,
    dtype: Any = None,
    workers: int = 1,
    scorer_kwargs: dict[str, Any] | None = None
) -> numpy.ndarray

Returns: 1D NumPy array with len(queries) elements, where element i is the score of queries[i] against choices[i]

Usage Patterns

Choosing the Right Function

extractOne: Need single best match
extract: Need top N matches, known small result set
extract_iter: Large choice sets, memory-constrained, or streaming results
cdist: Need complete similarity matrix, multiple queries
cpdist: Need element-wise scores for two equally long lists (queries[i] vs choices[i]) as a flat array

Performance Optimization

from rapidfuzz import process, fuzz

choices = ["..."] * 10000  # Large choice list (10,000 entries)

# Use score_cutoff to filter weak matches early
matches = process.extract("query", choices, score_cutoff=80)

# Use score_hint if you know expected score range
matches = process.extract("query", choices, score_hint=85)

# Use faster scorer for approximate results
matches = process.extract("query", choices, scorer=fuzz.QRatio)

# Parallel processing for matrix operations
queries = ["query one", "query two"]
matrix = process.cdist(queries, choices, workers=4)

Handling Different Input Types

from rapidfuzz import process

# List of strings (most common)
choices = ["option1", "option2", "option3"]
match = process.extractOne("query", choices)
# Returns: (match_string, score, index)

# Dictionary mapping
choices = {"a": "option1", "b": "option2", "c": "option3"}  
match = process.extractOne("query", choices)
# Returns: (match_string, score, key)

# Pandas Series (if pandas available); a Series is treated like a mapping
import pandas as pd
choices = pd.Series(["option1", "option2", "option3"])
match = process.extractOne("query", choices)
# Returns: (match_string, score, index_label) - the Series index label, not the position

# Handle None values in choices
choices = ["option1", None, "option3"]
matches = process.extract("query", choices)  # None values ignored

Custom Scoring Functions

from rapidfuzz import process, distance

# Use distance metrics directly
matches = process.extract("query", choices, scorer=distance.Levenshtein.distance)
# Returns edit distance (lower = more similar)

# Custom scorer function
def custom_scorer(s1, s2, **kwargs):
    # Custom scoring logic
    return some_similarity_score

matches = process.extract("query", choices, scorer=custom_scorer)

# Pass additional arguments to scorer
matches = process.extract("query", choices, 
                         scorer=distance.Levenshtein.distance,
                         scorer_kwargs={"weights": (1, 2, 1)})

Install with Tessl CLI