rapid fuzzy string matching
npx @tessl/cli install tessl/pypi-rapidfuzz@3.14.0

A high-performance Python library for rapid fuzzy string matching that provides string similarity calculations using advanced algorithms including Levenshtein distance, Hamming distance, and Jaro-Winkler metrics. Built with C++ extensions for optimal performance, it offers a comprehensive set of string matching functions and efficient batch processing capabilities.
pip install rapidfuzz

import rapidfuzz

Common patterns for specific functionality:
from rapidfuzz import fuzz, process, distance, utils

Import specific functions:
from rapidfuzz.fuzz import ratio, partial_ratio, partial_ratio_alignment, token_ratio, WRatio, QRatio
from rapidfuzz.process import extractOne, extract, extract_iter, cdist, cpdist
from rapidfuzz.distance import Levenshtein, Hamming, Jaro, JaroWinkler, DamerauLevenshtein
from rapidfuzz.distance import OSA, Indel, LCSseq, Prefix, Postfix
from rapidfuzz.utils import default_process

from rapidfuzz import fuzz, process
# Basic string similarity
score = fuzz.ratio("this is a test", "this is a test!")
print(f"Similarity: {score}") # 96.55
# Partial matching (substring matching)
score = fuzz.partial_ratio("this is a test", "this is a test!")
print(f"Partial similarity: {score}") # 100.0
# Find best match from a list
choices = ["Atlanta Falcons", "New York Jets", "New York Giants", "Dallas Cowboys"]
match = process.extractOne("new york jets", choices)
print(f"Best match: {match}") # ('New York Jets', 76.92, 1)
# Find multiple matches
matches = process.extract("new york", choices, limit=2)
print(f"Top matches: {matches}")
# [('New York Jets', 76.92, 1), ('New York Giants', 64.29, 2)]
# With string preprocessing
from rapidfuzz import utils
match = process.extractOne("new york jets", choices, processor=utils.default_process)
print(f"Preprocessed match: {match}") # ('New York Jets', 100.0, 1)

RapidFuzz is organized into four main modules, each serving distinct purposes:
The library automatically selects optimized C++ implementations (AVX2, SSE2) when available, falling back to Python implementations for compatibility.
def get_include() -> str

Returns the directory containing RapidFuzz header files for building C++ extensions that use RapidFuzz functionality.
Usage Example:
import rapidfuzz
include_dir = rapidfuzz.get_include()
print(f"Header files located at: {include_dir}")
# Use in setup.py for C++ extensions
from setuptools import Extension
ext = Extension(
'my_extension',
sources=['my_extension.cpp'],
include_dirs=[rapidfuzz.get_include()]
)

High-level string similarity functions including basic ratios, partial matching, token-based comparisons, and weighted algorithms optimized for different use cases.
def ratio(s1, s2, *, processor=None, score_cutoff=0) -> float: ...
def partial_ratio(s1, s2, *, processor=None, score_cutoff=0) -> float: ...
def partial_ratio_alignment(s1, s2, *, processor=None, score_cutoff=0) -> ScoreAlignment | None: ...
def token_sort_ratio(s1, s2, *, processor=None, score_cutoff=0) -> float: ...
def token_set_ratio(s1, s2, *, processor=None, score_cutoff=0) -> float: ...
def token_ratio(s1, s2, *, processor=None, score_cutoff=0) -> float: ...
def partial_token_sort_ratio(s1, s2, *, processor=None, score_cutoff=0) -> float: ...
def partial_token_set_ratio(s1, s2, *, processor=None, score_cutoff=0) -> float: ...
def partial_token_ratio(s1, s2, *, processor=None, score_cutoff=0) -> float: ...
def WRatio(s1, s2, *, processor=None, score_cutoff=0) -> float: ...
def QRatio(s1, s2, *, processor=None, score_cutoff=0) -> float: ...

Efficient functions for comparing a query string against lists or collections of candidate strings, with support for finding single best matches, top-N matches, and distance matrices.
def extractOne(query, choices, *, scorer=WRatio, processor=None, score_cutoff=None) -> tuple | None: ...
def extract(query, choices, *, scorer=WRatio, processor=None, limit=5, score_cutoff=None) -> list: ...
def extract_iter(query, choices, *, scorer=WRatio, processor=None, score_cutoff=None) -> Generator: ...
def cdist(queries, choices, *, scorer=ratio, processor=None, workers=1) -> numpy.ndarray: ...
def cpdist(queries, choices, *, scorer=ratio, processor=None, workers=1) -> numpy.ndarray: ...

Low-level distance algorithms providing raw distance calculations, similarity scores, normalized metrics, and edit operation sequences for advanced string analysis.
class Levenshtein:
@staticmethod
def distance(s1, s2, *, score_cutoff=None) -> int: ...
@staticmethod
def similarity(s1, s2, *, score_cutoff=None) -> int: ...
@staticmethod
def normalized_distance(s1, s2, *, score_cutoff=None) -> float: ...
@staticmethod
def normalized_similarity(s1, s2, *, score_cutoff=None) -> float: ...

Utilities for normalizing and preprocessing strings before comparison, including case normalization, whitespace handling, and non-alphanumeric character removal.
def default_process(sentence: str) -> str: ...

from typing import Sequence, Hashable, Callable, Iterable, Mapping, Any
from collections.abc import Generator
import numpy
# Core types for string inputs
StringType = Sequence[Hashable] # Accepts strings, lists, tuples of hashable items
# Edit operation types
class Editop:
def __init__(self, tag: str, src_pos: int, dest_pos: int) -> None: ...
tag: str # 'replace', 'delete', 'insert'
src_pos: int # Position in source string
dest_pos: int # Position in destination string
class Editops:
# List-like container of Editop objects
def __init__(self, editops: list | None = None, src_len: int = 0, dest_len: int = 0) -> None: ...
def __len__(self) -> int: ...
def __getitem__(self, index: int) -> Editop: ...
def as_opcodes(self) -> Opcodes: ...
def as_matching_blocks(self) -> list[MatchingBlock]: ...
def as_list(self) -> list[tuple[str, int, int]]: ...
def copy(self) -> Editops: ...
def inverse(self) -> Editops: ...
def remove_subsequence(self, subsequence: Editops) -> Editops: ...
def apply(self, source_string: str | bytes, destination_string: str | bytes) -> str: ...
@classmethod
def from_opcodes(cls, opcodes: Opcodes) -> Editops: ...
src_len: int
dest_len: int
class Opcode:
def __init__(self, tag: str, a1: int, a2: int, b1: int, b2: int) -> None: ...
tag: str # 'replace', 'delete', 'insert', 'equal'
a1: int # Start position in first string
a2: int # End position in first string
b1: int # Start position in second string
b2: int # End position in second string
class Opcodes:
# List-like container of Opcode objects
def __init__(self, opcodes: list | None = None, src_len: int = 0, dest_len: int = 0) -> None: ...
def __len__(self) -> int: ...
def __getitem__(self, index: int) -> Opcode: ...
def as_editops(self) -> Editops: ...
def as_matching_blocks(self) -> list[MatchingBlock]: ...
def as_list(self) -> list[tuple[str, int, int, int, int]]: ...
def copy(self) -> Opcodes: ...
def inverse(self) -> Opcodes: ...
def apply(self, source_string: str | bytes, destination_string: str | bytes) -> str: ...
@classmethod
def from_editops(cls, editops: Editops) -> Opcodes: ...
src_len: int
dest_len: int
class MatchingBlock:
def __init__(self, a: int, b: int, size: int) -> None: ...
a: int # Start position in first string
b: int # Start position in second string
size: int # Length of the matching block
class ScoreAlignment:
def __init__(self, score: float, src_start: int, src_end: int, dest_start: int, dest_end: int) -> None: ...
score: float # Similarity/distance score
src_start: int # Start position in source
src_end: int # End position in source
dest_start: int # Start position in destination
dest_end: int # End position in destination
# Process function return types
ExtractResult = tuple[str, float, int] # (match, score, index)
ExtractResultMapping = tuple[str, float, Any] # (match, score, key)