rapid fuzzy string matching
—
Quality
Pending
Does it follow best practices?
Impact
Pending
No eval scenarios have been run
Efficient functions for comparing a query string against collections of candidate strings. These functions are optimized for performance when working with large lists and provide various output formats for different use cases.
Finds the single best match from a collection of choices.
def extractOne(
query: Sequence[Hashable] | None,
choices: Iterable[Sequence[Hashable] | None] | Mapping[Any, Sequence[Hashable] | None],
*,
scorer: Callable = WRatio,
processor: Callable | None = None,
score_cutoff: float | None = None,
score_hint: float | None = None,
scorer_kwargs: dict[str, Any] | None = None
) -> tuple[Sequence[Hashable], float, int | Any] | None

Parameters:
query: String to find matches for
choices: Iterable of strings or mapping {key: string}
scorer: Scoring function (default: WRatio)
processor: String preprocessing function
score_cutoff: Minimum score threshold
score_hint: Expected score for optimization
scorer_kwargs: Additional arguments for scorer

Returns: (match, score, index_or_key) tuple, or None if no match scores above the cutoff
Usage Example:
from rapidfuzz import process, fuzz
choices = ["Atlanta Falcons", "New York Jets", "New York Giants", "Dallas Cowboys"]
# Find best match
match = process.extractOne("new york jets", choices)
print(match) # ('New York Jets', 76.92, 1)
# With custom scorer
match = process.extractOne("cowboys", choices, scorer=fuzz.partial_ratio)
print(match) # ('Dallas Cowboys', 100.0, 3)
# With score cutoff
match = process.extractOne("chicago", choices, score_cutoff=50)
print(match) # None (no match above 50%)
# With mapping
choices_dict = {"team1": "Atlanta Falcons", "team2": "New York Jets"}
match = process.extractOne("jets", choices_dict)
print(match) # ('New York Jets', 90.0, 'team2')

Finds the top N matches from a collection, sorted by score in descending order.
def extract(
query: Sequence[Hashable] | None,
choices: Collection[Sequence[Hashable] | None] | Mapping[Any, Sequence[Hashable] | None],
*,
scorer: Callable = WRatio,
processor: Callable | None = None,
limit: int | None = 5,
score_cutoff: float | None = None,
score_hint: float | None = None,
scorer_kwargs: dict[str, Any] | None = None
) -> list[tuple[Sequence[Hashable], float, int | Any]]

Parameters:
limit: Maximum number of matches to return (default: 5)

Returns: List of (match, score, index_or_key) tuples, sorted by score descending
Usage Example:
from rapidfuzz import process, utils
choices = ["Atlanta Falcons", "New York Jets", "New York Giants", "Dallas Cowboys"]
# Get top 2 matches
matches = process.extract("new york", choices, limit=2)
print(matches)
# [('New York Jets', 76.92, 1), ('New York Giants', 64.29, 2)]
# With preprocessing for better matches
matches = process.extract("new york jets", choices,
processor=utils.default_process, limit=3)
print(matches)
# [('New York Jets', 100.0, 1), ('New York Giants', 78.57, 2), ...]
# Get all matches above threshold
matches = process.extract("new", choices, score_cutoff=30, limit=None)
print(len(matches)) # All matches with score >= 30

Returns an iterator over all matches above the score cutoff, useful for memory-efficient processing of large choice sets.
def extract_iter(
query: Sequence[Hashable] | None,
choices: Iterable[Sequence[Hashable] | None] | Mapping[Any, Sequence[Hashable] | None],
*,
scorer: Callable = WRatio,
processor: Callable | None = None,
score_cutoff: float | None = None,
score_hint: float | None = None,
scorer_kwargs: dict[str, Any] | None = None
) -> Generator[tuple[Sequence[Hashable], float, int | Any], None, None]

Returns: Generator yielding (match, score, index_or_key) tuples
Usage Example:
from rapidfuzz import process
choices = ["Atlanta Falcons", "New York Jets", "New York Giants", "Dallas Cowboys"]
# Process matches one at a time
for match, score, index in process.extract_iter("new", choices, score_cutoff=50):
print(f"Match: {match}, Score: {score:.1f}, Index: {index}")
# Memory-efficient processing of large datasets
large_choices = [...] # Large list of strings
best_score = 0
best_match = None
for match, score, index in process.extract_iter("query", large_choices):
if score > best_score:
best_score = score
        best_match = (match, score, index)

Computes a similarity/distance matrix between all queries and all choices. Requires NumPy.
def cdist(
queries: Collection[Sequence[Hashable] | None],
choices: Collection[Sequence[Hashable] | None],
*,
scorer: Callable = ratio,
processor: Callable | None = None,
score_cutoff: float | None = None,
score_hint: float | None = None,
score_multiplier: float = 1,
dtype: Any = None,
workers: int = 1,
scorer_kwargs: dict[str, Any] | None = None
) -> numpy.ndarray

Parameters:
queries: List of query strings
choices: List of choice strings
score_multiplier: Multiply scores by this factor
dtype: NumPy data type for result array
workers: Number of parallel workers

Returns: 2D NumPy array with shape (len(queries), len(choices))
Usage Example:
import numpy as np
from rapidfuzz import process
queries = ["apple", "orange"]
choices = ["apples", "oranges", "banana"]
# Compute full distance matrix
matrix = process.cdist(queries, choices)
print(matrix.shape) # (2, 3)
print(matrix)
# [[similarity(apple, apples), similarity(apple, oranges), similarity(apple, banana)],
# [similarity(orange, apples), similarity(orange, oranges), similarity(orange, banana)]]
# Find best match for each query
best_indices = np.argmax(matrix, axis=1)
for i, query in enumerate(queries):
best_choice = choices[best_indices[i]]
best_score = matrix[i, best_indices[i]]
    print(f"{query} -> {best_choice} ({best_score:.1f})")

Computes distances for all possible pairs. Requires NumPy.
def cpdist(
queries: Collection[Sequence[Hashable] | None],
choices: Collection[Sequence[Hashable] | None],
*,
scorer: Callable = ratio,
processor: Callable | None = None,
score_cutoff: float | None = None,
score_hint: float | None = None,
score_multiplier: float = 1,
dtype: Any = None,
workers: int = 1,
scorer_kwargs: dict[str, Any] | None = None
) -> numpy.ndarray

Returns: 1D NumPy array with len(queries) * len(choices) elements

Choosing a function:
extractOne: Need the single best match
extract: Need top N matches, known small result set
extract_iter: Large choice sets, memory-constrained, or streaming results
cdist: Need a complete similarity matrix for multiple queries
cpdist: Need all pairwise comparisons in flat array format

from rapidfuzz import process, fuzz
choices = ["..." * 10000] # Large choice list
# Use score_cutoff to filter weak matches early
matches = process.extract("query", choices, score_cutoff=80)
# Use score_hint if you know expected score range
matches = process.extract("query", choices, score_hint=85)
# Use faster scorer for approximate results
matches = process.extract("query", choices, scorer=fuzz.QRatio)
# Parallel processing for matrix operations
matrix = process.cdist(queries, choices, workers=4)

from rapidfuzz import process
# List of strings (most common)
choices = ["option1", "option2", "option3"]
match = process.extractOne("query", choices)
# Returns: (match_string, score, index)
# Dictionary mapping
choices = {"a": "option1", "b": "option2", "c": "option3"}
match = process.extractOne("query", choices)
# Returns: (match_string, score, key)
# Pandas Series (if pandas available)
import pandas as pd
choices = pd.Series(["option1", "option2", "option3"])
match = process.extractOne("query", choices)
# Returns: (match_string, score, index)
# Handle None values in choices
choices = ["option1", None, "option3"]
matches = process.extract("query", choices) # None values ignored

from rapidfuzz import process, distance
# Use distance metrics directly
matches = process.extract("query", choices, scorer=distance.Levenshtein.distance)
# Returns edit distance (lower = more similar)
# Custom scorer function
def custom_scorer(s1, s2, **kwargs):
# Custom scoring logic
return some_similarity_score
matches = process.extract("query", choices, scorer=custom_scorer)
# Pass additional arguments to scorer
matches = process.extract("query", choices,
scorer=distance.Levenshtein.distance,
                        scorer_kwargs={"weights": (1, 2, 1)})

Install with Tessl CLI
npx tessl i tessl/pypi-rapidfuzz