rapid fuzzy string matching
—
Quality
Pending
Does it follow best practices?
Impact
Pending
No eval scenarios have been run
Low-level distance algorithms that provide raw distance calculations, similarity scores, normalized metrics, and edit operation sequences. These form the foundation of RapidFuzz's fuzzy matching capabilities and offer fine-grained control over string comparison algorithms.
The most commonly used edit distance, allowing insertions, deletions, and substitutions.
class Levenshtein:
@staticmethod
def distance(
s1: Sequence[Hashable],
s2: Sequence[Hashable],
*,
weights: tuple[int, int, int] | None = (1, 1, 1),
processor: Callable | None = None,
score_cutoff: int | None = None
) -> int
@staticmethod
def similarity(
s1: Sequence[Hashable],
s2: Sequence[Hashable],
*,
weights: tuple[int, int, int] | None = (1, 1, 1),
processor: Callable | None = None,
score_cutoff: int | None = None
) -> int
@staticmethod
def normalized_distance(
s1: Sequence[Hashable],
s2: Sequence[Hashable],
*,
weights: tuple[int, int, int] | None = (1, 1, 1),
processor: Callable | None = None,
score_cutoff: float | None = None
) -> float
@staticmethod
def normalized_similarity(
s1: Sequence[Hashable],
s2: Sequence[Hashable],
*,
weights: tuple[int, int, int] | None = (1, 1, 1),
processor: Callable | None = None,
score_cutoff: float | None = None
) -> float
@staticmethod
def editops(
s1: Sequence[Hashable],
s2: Sequence[Hashable],
*,
processor: Callable | None = None
) -> Editops
@staticmethod
def opcodes(
s1: Sequence[Hashable],
s2: Sequence[Hashable],
*,
processor: Callable | None = None
) -> Opcodes
Parameters:
weights: Cost tuple (insertion, deletion, substitution) - default (1, 1, 1)
processor: String preprocessing function
score_cutoff: Threshold for early termination
Usage Example:
from rapidfuzz.distance import Levenshtein
# Raw edit distance (number of operations needed)
dist = Levenshtein.distance("kitten", "sitting")
print(dist) # 3
# Similarity (max_length - distance)
sim = Levenshtein.similarity("kitten", "sitting")
print(sim) # 4
# Normalized distance (0.0 to 1.0, where 0.0 = identical)
norm_dist = Levenshtein.normalized_distance("kitten", "sitting")
print(norm_dist) # 0.43
# Normalized similarity (0.0 to 1.0, where 1.0 = identical)
norm_sim = Levenshtein.normalized_similarity("kitten", "sitting")
print(norm_sim) # 0.57
# Custom weights (insert=1, delete=2, substitute=1)
dist = Levenshtein.distance("kitten", "sitting", weights=(1, 2, 1))
print(dist) # Different cost
# Get edit operations
ops = Levenshtein.editops("kitten", "sitting")
for op in ops:
print(f"{op.tag}: src_pos={op.src_pos}, dest_pos={op.dest_pos}")
Extended Levenshtein distance that also allows transpositions (swapping adjacent characters).
class DamerauLevenshtein:
@staticmethod
def distance(s1: Sequence[Hashable], s2: Sequence[Hashable], *, processor: Callable | None = None, score_cutoff: int | None = None) -> int
@staticmethod
def similarity(s1: Sequence[Hashable], s2: Sequence[Hashable], *, processor: Callable | None = None, score_cutoff: int | None = None) -> int
@staticmethod
def normalized_distance(s1: Sequence[Hashable], s2: Sequence[Hashable], *, processor: Callable | None = None, score_cutoff: float | None = None) -> float
@staticmethod
def normalized_similarity(s1: Sequence[Hashable], s2: Sequence[Hashable], *, processor: Callable | None = None, score_cutoff: float | None = None) -> float
Usage Example:
from rapidfuzz.distance import DamerauLevenshtein, Levenshtein
# Better for transposition errors
dist1 = DamerauLevenshtein.distance("abcd", "acbd") # 1 (transposition)
dist2 = Levenshtein.distance("abcd", "acbd") # 2 (two substitutions)
Compares strings of equal length, counting position-wise character differences.
class Hamming:
@staticmethod
def distance(s1: Sequence[Hashable], s2: Sequence[Hashable], *, processor: Callable | None = None, score_cutoff: int | None = None) -> int
@staticmethod
def similarity(s1: Sequence[Hashable], s2: Sequence[Hashable], *, processor: Callable | None = None, score_cutoff: int | None = None) -> int
@staticmethod
def normalized_distance(s1: Sequence[Hashable], s2: Sequence[Hashable], *, processor: Callable | None = None, score_cutoff: float | None = None) -> float
@staticmethod
def normalized_similarity(s1: Sequence[Hashable], s2: Sequence[Hashable], *, processor: Callable | None = None, score_cutoff: float | None = None) -> float
@staticmethod
def editops(s1: Sequence[Hashable], s2: Sequence[Hashable], *, processor: Callable | None = None) -> Editops
@staticmethod
def opcodes(s1: Sequence[Hashable], s2: Sequence[Hashable], *, processor: Callable | None = None) -> Opcodes
Usage Example:
from rapidfuzz.distance import Hamming
# Only works with equal-length strings
dist = Hamming.distance("abcde", "aXcYe") # 2 (positions 1 and 3 differ)
print(dist)
# Raises ValueError for different lengths
# Hamming.distance("abc", "abcd") # Error!
Measures similarity based on matching characters and their transpositions, good for shorter strings.
class Jaro:
@staticmethod
def distance(s1: Sequence[Hashable], s2: Sequence[Hashable], *, processor: Callable | None = None, score_cutoff: float | None = None) -> float
@staticmethod
def similarity(s1: Sequence[Hashable], s2: Sequence[Hashable], *, processor: Callable | None = None, score_cutoff: float | None = None) -> float
@staticmethod
def normalized_distance(s1: Sequence[Hashable], s2: Sequence[Hashable], *, processor: Callable | None = None, score_cutoff: float | None = None) -> float
@staticmethod
def normalized_similarity(s1: Sequence[Hashable], s2: Sequence[Hashable], *, processor: Callable | None = None, score_cutoff: float | None = None) -> float
Extension of Jaro distance that gives higher scores to strings with common prefixes.
class JaroWinkler:
@staticmethod
def distance(s1: Sequence[Hashable], s2: Sequence[Hashable], *, prefix_weight: float = 0.1, processor: Callable | None = None, score_cutoff: float | None = None) -> float
@staticmethod
def similarity(s1: Sequence[Hashable], s2: Sequence[Hashable], *, prefix_weight: float = 0.1, processor: Callable | None = None, score_cutoff: float | None = None) -> float
@staticmethod
def normalized_distance(s1: Sequence[Hashable], s2: Sequence[Hashable], *, prefix_weight: float = 0.1, processor: Callable | None = None, score_cutoff: float | None = None) -> float
@staticmethod
def normalized_similarity(s1: Sequence[Hashable], s2: Sequence[Hashable], *, prefix_weight: float = 0.1, processor: Callable | None = None, score_cutoff: float | None = None) -> float
Parameters:
prefix_weight: Weight for common prefix bonus (0.0-0.25, default 0.1)
Usage Example:
from rapidfuzz.distance import Jaro, JaroWinkler
s1, s2 = "martha", "marhta"
jaro_sim = Jaro.similarity(s1, s2)
jw_sim = JaroWinkler.similarity(s1, s2)
print(f"Jaro: {jaro_sim:.3f}") # Jaro: 0.944
print(f"Jaro-Winkler: {jw_sim:.3f}") # Jaro-Winkler: 0.961 (higher due to 'ma' prefix)
Only allows insertions and deletions (no substitutions).
class Indel:
@staticmethod
def distance(s1: Sequence[Hashable], s2: Sequence[Hashable], *, processor: Callable | None = None, score_cutoff: int | None = None) -> int
@staticmethod
def similarity(s1: Sequence[Hashable], s2: Sequence[Hashable], *, processor: Callable | None = None, score_cutoff: int | None = None) -> int
@staticmethod
def normalized_distance(s1: Sequence[Hashable], s2: Sequence[Hashable], *, processor: Callable | None = None, score_cutoff: float | None = None) -> float
@staticmethod
def normalized_similarity(s1: Sequence[Hashable], s2: Sequence[Hashable], *, processor: Callable | None = None, score_cutoff: float | None = None) -> float
@staticmethod
def editops(s1: Sequence[Hashable], s2: Sequence[Hashable], *, processor: Callable | None = None) -> Editops
@staticmethod
def opcodes(s1: Sequence[Hashable], s2: Sequence[Hashable], *, processor: Callable | None = None) -> Opcodes
Like Damerau-Levenshtein but with the restriction that no substring can be edited more than once.
class OSA:
@staticmethod
def distance(s1: Sequence[Hashable], s2: Sequence[Hashable], *, processor: Callable | None = None, score_cutoff: int | None = None) -> int
@staticmethod
def similarity(s1: Sequence[Hashable], s2: Sequence[Hashable], *, processor: Callable | None = None, score_cutoff: int | None = None) -> int
@staticmethod
def normalized_distance(s1: Sequence[Hashable], s2: Sequence[Hashable], *, processor: Callable | None = None, score_cutoff: float | None = None) -> float
@staticmethod
def normalized_similarity(s1: Sequence[Hashable], s2: Sequence[Hashable], *, processor: Callable | None = None, score_cutoff: float | None = None) -> float
@staticmethod
def editops(s1: Sequence[Hashable], s2: Sequence[Hashable], *, processor: Callable | None = None) -> Editops
@staticmethod
def opcodes(s1: Sequence[Hashable], s2: Sequence[Hashable], *, processor: Callable | None = None) -> Opcodes
Finds the longest subsequence common to both strings.
class LCSseq:
@staticmethod
def distance(s1: Sequence[Hashable], s2: Sequence[Hashable], *, processor: Callable | None = None, score_cutoff: int | None = None) -> int
@staticmethod
def similarity(s1: Sequence[Hashable], s2: Sequence[Hashable], *, processor: Callable | None = None, score_cutoff: int | None = None) -> int
@staticmethod
def normalized_distance(s1: Sequence[Hashable], s2: Sequence[Hashable], *, processor: Callable | None = None, score_cutoff: float | None = None) -> float
@staticmethod
def normalized_similarity(s1: Sequence[Hashable], s2: Sequence[Hashable], *, processor: Callable | None = None, score_cutoff: float | None = None) -> float
@staticmethod
def editops(s1: Sequence[Hashable], s2: Sequence[Hashable], *, processor: Callable | None = None) -> Editops
@staticmethod
def opcodes(s1: Sequence[Hashable], s2: Sequence[Hashable], *, processor: Callable | None = None) -> Opcodes
Measures length of common prefix.
class Prefix:
@staticmethod
def distance(s1: Sequence[Hashable], s2: Sequence[Hashable], *, processor: Callable | None = None, score_cutoff: int | None = None) -> int
@staticmethod
def similarity(s1: Sequence[Hashable], s2: Sequence[Hashable], *, processor: Callable | None = None, score_cutoff: int | None = None) -> int
@staticmethod
def normalized_distance(s1: Sequence[Hashable], s2: Sequence[Hashable], *, processor: Callable | None = None, score_cutoff: float | None = None) -> float
@staticmethod
def normalized_similarity(s1: Sequence[Hashable], s2: Sequence[Hashable], *, processor: Callable | None = None, score_cutoff: float | None = None) -> float
Measures length of common suffix.
class Postfix:
@staticmethod
def distance(s1: Sequence[Hashable], s2: Sequence[Hashable], *, processor: Callable | None = None, score_cutoff: int | None = None) -> int
@staticmethod
def similarity(s1: Sequence[Hashable], s2: Sequence[Hashable], *, processor: Callable | None = None, score_cutoff: int | None = None) -> int
@staticmethod
def normalized_distance(s1: Sequence[Hashable], s2: Sequence[Hashable], *, processor: Callable | None = None, score_cutoff: float | None = None) -> float
@staticmethod
def normalized_similarity(s1: Sequence[Hashable], s2: Sequence[Hashable], *, processor: Callable | None = None, score_cutoff: float | None = None) -> float
Usage Example:
from rapidfuzz.distance import Levenshtein
s1, s2 = "kitten", "sitting"
# Raw distance (edit operations count)
dist = Levenshtein.distance(s1, s2) # 3
# Raw similarity (max_len - distance)
sim = Levenshtein.similarity(s1, s2) # 4 (7 - 3)
# Normalized distance (0.0 = identical, 1.0 = completely different)
norm_dist = Levenshtein.normalized_distance(s1, s2) # 0.43
# Normalized similarity (1.0 = identical, 0.0 = completely different)
norm_sim = Levenshtein.normalized_similarity(s1, s2) # 0.57
from rapidfuzz.distance import Levenshtein
s1, s2 = "kitten", "sitting"
# Get individual edit operations
editops = Levenshtein.editops(s1, s2)
print(f"Number of operations: {len(editops)}")
for op in editops:
if op.tag == "replace":
print(f"Replace '{s1[op.src_pos]}' with '{s2[op.dest_pos]}' at position {op.src_pos}")
elif op.tag == "insert":
print(f"Insert '{s2[op.dest_pos]}' at position {op.src_pos}")
elif op.tag == "delete":
print(f"Delete '{s1[op.src_pos]}' at position {op.src_pos}")
# Get opcodes (grouped operations)
opcodes = Levenshtein.opcodes(s1, s2)
for opcode in opcodes:
src_slice = s1[opcode.a1:opcode.a2]
dest_slice = s2[opcode.b1:opcode.b2]
print(f"{opcode.tag}: '{src_slice}' -> '{dest_slice}'")
from rapidfuzz.distance import Levenshtein
# Use score_cutoff for early termination
dist = Levenshtein.distance("long string", "very different string", score_cutoff=5)
# Stops early and returns score_cutoff + 1 once the distance is known to exceed 5
# Custom weights for specific use cases
dist = Levenshtein.distance("text", "test", weights=(1, 2, 1)) # Deletions cost more
# Process many comparisons efficiently
strings = ["..." for _ in range(1000)]
target = "target"
distances = [Levenshtein.distance(target, s, score_cutoff=3) for s in strings]
# Only calculates the full distance for strings within the cutoff
Install with Tessl CLI
npx tessl i tessl/pypi-rapidfuzz