Python extension for computing string edit distances and similarities.
---

Quality: Pending — best-practices review ("Does it follow best practices?") has not yet been performed.

Impact: Pending — no eval scenarios have been run.
Functions for computing various string distance metrics and similarity scores. These functions provide the core string comparison capabilities including Levenshtein distance, normalized similarity ratios, Hamming distance, and Jaro/Jaro-Winkler similarities.
Calculates the minimum number of insertions, deletions, and substitutions required to change one sequence into another according to Levenshtein with custom costs for insertion, deletion and substitution.
def distance(s1, s2, *, weights=(1, 1, 1), processor=None, score_cutoff=None, score_hint=None):
    """
    Calculate Levenshtein distance with custom operation weights.

    Calculates the minimum number of insertions, deletions, and substitutions
    required to change one sequence into another, with custom costs per
    operation.

    Parameters:
    - s1: First string to compare
    - s2: Second string to compare
    - weights: Tuple of (insertion, deletion, substitution) weights. Default (1, 1, 1)
    - processor: Optional callable to preprocess strings before comparison
    - score_cutoff: Maximum distance to consider. Returns cutoff + 1 if exceeded
    - score_hint: Expected distance hint for algorithm optimization

    Returns:
    int: Distance between s1 and s2

    Raises:
    ValueError: If unsupported weights are provided

    Usage Examples:
        import Levenshtein

        # Basic distance calculation
        dist = Levenshtein.distance("kitten", "sitting")
        print(dist)  # 3

        # Custom weights (insertion, deletion, substitution)
        dist = Levenshtein.distance("kitten", "sitting", weights=(1, 1, 2))
        print(dist)  # 5 (substitutions cost more; each is as expensive as a delete + insert)

        # With score cutoff for performance
        dist = Levenshtein.distance("very long string", "another long string", score_cutoff=5)
        print(dist)  # Returns actual distance or 6 if > 5
    """


# Calculates a normalized indel similarity in the range [0, 1]. The indel
# distance calculates the minimum number of insertions and deletions required
# to change one sequence into the other.
def ratio(s1, s2, *, processor=None, score_cutoff=None):
    """
    Calculate normalized indel similarity ratio [0, 1].

    This is calculated as 1 - (distance / (len1 + len2)), where distance is
    the indel distance (insertions and deletions only).

    Parameters:
    - s1: First string to compare
    - s2: Second string to compare
    - processor: Optional callable to preprocess strings before comparison
    - score_cutoff: Minimum similarity threshold. Returns 0 if below threshold

    Returns:
    float: Normalized similarity between s1 and s2 as a float between 0 and 1.0

    Usage Examples:
        import Levenshtein

        # Basic similarity ratio
        ratio = Levenshtein.ratio("kitten", "sitting")
        print(f"{ratio:.3f}")  # 0.615

        # With score cutoff
        ratio = Levenshtein.ratio("hello", "world", score_cutoff=0.5)
        print(ratio)  # 0.0 (similarity is below 0.5)

        # With custom processor
        def preprocess(s):
            return s.lower().strip()

        ratio = Levenshtein.ratio(" Hello ", "HELLO", processor=preprocess)
        print(ratio)  # 1.0 (identical after processing)
    """


# Calculates the Hamming distance between two strings. The hamming distance is
# defined as the number of positions where the two strings differ. It
# describes the minimum amount of substitutions required to transform s1 into
# s2.
def hamming(s1, s2, *, pad=True, processor=None, score_cutoff=None):
    """
    Calculate Hamming distance (substitutions only).

    Parameters:
    - s1: First string to compare
    - s2: Second string to compare
    - pad: Should strings be padded if there is a length difference
    - processor: Optional callable to preprocess strings before comparison
    - score_cutoff: Maximum distance to consider. Returns cutoff + 1 if exceeded

    Returns:
    int: Hamming distance between s1 and s2

    Raises:
    ValueError: If s1 and s2 have different length and pad=False

    Usage Examples:
        import Levenshtein

        # Same length strings
        dist = Levenshtein.hamming("karolin", "kathrin")
        print(dist)  # 3

        # Different length strings with padding (default)
        dist = Levenshtein.hamming("karolin", "kath")
        print(dist)  # 5 (2 substitutions in the overlap + 3 padded positions)

        # Different length strings without padding raise an error
        try:
            dist = Levenshtein.hamming("karolin", "kath", pad=False)
        except ValueError:
            print("Length mismatch error")
    """


# Calculates the Jaro similarity between two strings. Jaro similarity is
# particularly effective for comparing names and short strings with character
# transpositions.
def jaro(s1, s2, *, processor=None, score_cutoff=None):
    """
    Calculate Jaro similarity.

    Parameters:
    - s1: First string to compare
    - s2: Second string to compare
    - processor: Optional callable to preprocess strings before comparison
    - score_cutoff: Minimum similarity threshold. Returns 0 if below threshold

    Returns:
    float: Jaro similarity between s1 and s2 as a float between 0 and 1.0

    Usage Examples:
        import Levenshtein

        # Basic Jaro similarity
        sim = Levenshtein.jaro("martha", "marhta")
        print(f"{sim:.3f}")  # 0.944

        # Names comparison
        sim = Levenshtein.jaro("DIXON", "DICKSONX")
        print(f"{sim:.3f}")  # 0.767
    """


# Calculates the Jaro-Winkler similarity, which gives more favorable ratings
# to strings with common prefixes. This is particularly useful for comparing
# names and addresses.
def jaro_winkler(s1, s2, *, prefix_weight=0.1, processor=None, score_cutoff=None):
    """
    Calculate Jaro-Winkler similarity with prefix weighting.

    Parameters:
    - s1: First string to compare
    - s2: Second string to compare
    - prefix_weight: Weight used for common prefix (0 to 0.25). Default 0.1
    - processor: Optional callable to preprocess strings before comparison
    - score_cutoff: Minimum similarity threshold. Returns 0 if below threshold

    Returns:
    float: Jaro-Winkler similarity between s1 and s2 as a float between 0 and 1.0

    Raises:
    ValueError: If prefix_weight is not between 0 and 0.25

    Usage Examples:
        import Levenshtein

        # Basic Jaro-Winkler similarity
        sim = Levenshtein.jaro_winkler("martha", "marhta")
        print(f"{sim:.3f}")  # 0.961 (higher than Jaro due to common prefix "mar")

        # Custom prefix weight
        sim = Levenshtein.jaro_winkler("prefix_test", "prefix_demo", prefix_weight=0.2)
        print(f"{sim:.3f}")  # Higher weight for common prefix

        # Names with common prefix
        sim = Levenshtein.jaro_winkler("JOHNSON", "JOHNSTON")
        print(f"{sim:.3f}")  # 0.957
    """


# The helper examples below use the Levenshtein package directly.
import Levenshtein
def fuzzy_match(target, candidates, threshold=0.8):
    """Find best matches from a candidates list.

    Returns a list of (candidate, ratio) pairs whose similarity to *target*
    is at least *threshold*, sorted by ratio in descending order.
    """
    matches = []
    for candidate in candidates:
        # score_cutoff lets the extension bail out early: anything below the
        # threshold comes back as 0 and is discarded by the filter below, so
        # the returned pairs are identical to computing the full ratio.
        similarity = Levenshtein.ratio(target, candidate, score_cutoff=threshold)
        if similarity >= threshold:
            matches.append((candidate, similarity))
    return sorted(matches, key=lambda pair: pair[1], reverse=True)


# Example usage
candidates = ["apple", "application", "apply", "april", "ample"]
matches = fuzzy_match("appl", candidates, threshold=0.6)
print(matches)  # [('apple', 0.889), ('apply', 0.889), ('april', 0.667), ('ample', 0.667)]

import Levenshtein
def suggest_corrections(word, dictionary, max_distance=2, max_suggestions=5):
    """Suggest up to *max_suggestions* spelling corrections for *word*.

    Dictionary entries within *max_distance* edits are ranked by distance
    (ascending), ties broken by similarity ratio (descending).
    """
    suggestions = []
    for entry in dictionary:  # renamed from `word` to avoid shadowing the parameter
        # score_cutoff keeps the scan cheap: distances above max_distance are
        # returned as max_distance + 1 and skipped by the check below.
        dist = Levenshtein.distance(word, entry, score_cutoff=max_distance)
        if dist <= max_distance:
            similarity = Levenshtein.ratio(word, entry)
            suggestions.append((entry, dist, similarity))
    # Sort by distance (ascending) then ratio (descending)
    suggestions.sort(key=lambda item: (item[1], -item[2]))
    return [entry for entry, _, _ in suggestions[:max_suggestions]]


import Levenshtein
# Use score_cutoff for early termination
def fast_filter(query, candidates, max_distance=3):
    """Quickly filter candidates by maximum distance."""
    kept = []
    for item in candidates:
        # distance() returns max_distance + 1 when the cutoff is exceeded,
        # so anything over the limit is rejected by the guard below.
        edit_dist = Levenshtein.distance(query, item, score_cutoff=max_distance)
        if edit_dist > max_distance:
            continue
        kept.append((item, edit_dist))
    return kept
# Use score_hint when you have an expected distance
def optimized_distance(s1, s2, expected_dist=None):
    """Calculate Levenshtein distance with an optional optimization hint.

    score_hint tells the underlying algorithm roughly what distance to
    expect so it can choose a faster strategy; None means no hint.
    """
    return Levenshtein.distance(s1, s2, score_hint=expected_dist)


# Install with the Tessl CLI:
#   npx tessl i tessl/pypi-levenshtein