Python extension for computing string edit distances and similarities.
Quality: Pending — Does it follow best practices?
Impact: Pending — No eval scenarios have been run.
Functions for computing approximate median strings, improving strings toward a target set, and calculating sequence and set similarity ratios for multiple strings. These functions are particularly useful for finding representative strings from collections and analyzing groups of related strings.
Find approximate median string from a list of strings using a more thorough algorithm that considers all possible character combinations.
def median(strings, weights=None):
"""
Find approximate median string from a list of strings.
Uses a comprehensive algorithm to find a string that minimizes the sum of
edit distances to all input strings. More accurate but slower than quickmedian.
Parameters:
- strings: List of strings to find median for
- weights: Optional list of weights for each string (same length as strings)
Returns:
str: Approximate median string
"""

Usage Examples:
import Levenshtein
# Find median of similar strings
strings = ["SpSm", "mpamm", "Spam", "Spa", "Sua", "hSam"]
med = Levenshtein.median(strings)
print(f"Median: '{med}'") # 'Spam'
# Find median for spell-checking candidates
misspellings = [
"Levnhtein", "Leveshein", "Leenshten", "Leveshtei",
"Lenshtein", "Lvenstein", "Levenhtin", "evenshtei"
]
correct = Levenshtein.median(misspellings)
print(f"Correct spelling: '{correct}'") # 'Levenshtein'
# Find median of file names
filenames = ["document1.txt", "document2.txt", "document3.txt", "document4.txt"]
template = Levenshtein.median(filenames)
print(f"Template: '{template}'") # 'document.txt' (approximate)
# Weighted median (give more weight to certain strings)
strings = ["correct", "crect", "corect", "correct"]
weights = [3, 1, 1, 2] # Give more weight to "correct"
weighted_med = Levenshtein.median(strings, weights)
print(f"Weighted median: '{weighted_med}'") # Should be closer to "correct"

Fast approximate median string calculation with optional weights for different strings in the input set.
def quickmedian(strings, weights=None):
"""
Fast approximate median string calculation.
Uses a faster heuristic algorithm. Less accurate than median() but much faster
for large string sets. Supports optional weighting of input strings.
Parameters:
- strings: List of strings to find median for
- weights: Optional list of weights for each string (same length as strings)
Returns:
str: Approximate median string
"""

Usage Examples:
import Levenshtein
# Fast median calculation
misspellings = [
"Levnhtein", "Leveshein", "Leenshten", "Leveshtei",
"Lenshtein", "Lvenstein", "Levenhtin", "evenshtei"
]
quick_med = Levenshtein.quickmedian(misspellings)
print(f"Quick median: '{quick_med}'") # 'Levnshein'
# Weighted median (emphasize certain strings)
strings = ["apple", "aple", "appl", "apple"]
weights = [3, 1, 1, 2] # Give more weight to "apple"
weighted_med = Levenshtein.quickmedian(strings, weights)
print(f"Weighted median: '{weighted_med}'") # Closer to "apple"
# Handle zero weights (ignored strings)
strings = ["test", "testing", "tester"]
weights = [1, 0, 1] # Ignore "testing"
result = Levenshtein.quickmedian(strings, weights)
print(f"With zero weights: '{result}'") # Based only on "test" and "tester"

Improve a string towards the median of a given set of strings through iterative refinement.
def median_improve(string, strings, weights=None):
"""
Improve a string towards median of given strings.
Takes an initial string and iteratively improves it to be closer to the
median of the target string set. Can be applied multiple times for further improvement.
Parameters:
- string: Initial string to improve
- strings: List of target strings to improve towards
- weights: Optional list of weights for each string (same length as strings)
Returns:
str: Improved string that is closer to the median of the target strings
"""

Usage Examples:
import Levenshtein
# Single improvement step
target_strings = [
"Levnhtein", "Leveshein", "Leenshten", "Leveshtei",
"Lenshtein", "Lvenstein", "Levenhtin", "evenshtei"
]
# Start with a poor guess
initial = "spam"
improved1 = Levenshtein.median_improve(initial, target_strings)
print(f"First improvement: '{initial}' -> '{improved1}'") # 'spam' -> 'enhtein'
# Apply improvement again
improved2 = Levenshtein.median_improve(improved1, target_strings)
print(f"Second improvement: '{improved1}' -> '{improved2}'") # 'enhtein' -> 'Levenshtein'
# Iterative improvement function
def iterative_improve(initial, targets, max_iterations=10):
    """Iteratively improve a string towards the median of *targets*.

    Stops as soon as a pass produces no change, or after
    ``max_iterations`` successful improvements.
    """
    current = initial
    step = 0
    while step < max_iterations:
        candidate = Levenshtein.median_improve(current, targets)
        if candidate == current:
            # Converged: another pass would return the same string.
            break
        step += 1
        print(f"Iteration {step}: '{current}' -> '{candidate}'")
        current = candidate
    return current
# Example
final = iterative_improve("xyz", target_strings)
print(f"Final result: '{final}'")
# Using weights to emphasize certain target strings
target_strings_weighted = ["python", "programming", "coding", "development"]
weights = [3, 2, 1, 1] # Give more weight to "python" and "programming"
initial = "prog"
improved_weighted = Levenshtein.median_improve(initial, target_strings_weighted, weights)
print(f"Weighted improvement: '{initial}' -> '{improved_weighted}'")

Find median from set of strings treating each string as a character set rather than a sequence.
def setmedian(strings, weights=None):
"""
Find median from set of strings (treats as character sets).
Treats each input string as a set of characters rather than a sequence,
finding a string that best represents the common characters across all inputs.
Parameters:
- strings: List of strings to find set median for
- weights: Optional list of weights for each string (same length as strings)
Returns:
str: Set median string containing representative characters
"""

Usage Examples:
import Levenshtein
# Set median treats strings as character sets
strings = [
"ehee", "cceaes", "chees", "chreesc",
"chees", "cheesee", "cseese", "chetese"
]
set_med = Levenshtein.setmedian(strings)
print(f"Set median: '{set_med}'") # 'chees'
# Compare with regular median
regular_med = Levenshtein.median(strings)
print(f"Regular median: '{regular_med}'")
# Character frequency analysis
cheese_variants = ["cheese", "chese", "cheeze", "chees", "cheess"]
set_result = Levenshtein.setmedian(cheese_variants)
print(f"Cheese variants set median: '{set_result}'")
# Weighted set median (emphasize certain variants)
weights = [3, 1, 1, 2, 1] # Give more weight to "cheese" and "chees"
weighted_set_result = Levenshtein.setmedian(cheese_variants, weights)
print(f"Weighted set median: '{weighted_set_result}'")

Calculate similarity ratio for string sequences, measuring how similar the strings are when treated as sequences.
def seqratio(strings1, strings2):
"""
Calculate similarity ratio between two string sequences.
Computes a similarity measure between two collections of strings treated as sequences,
measuring how similar they are in terms of character order and position.
Parameters:
- strings1: First list of strings to compare
- strings2: Second list of strings to compare
Returns:
float: Sequence similarity ratio between the two string collections
"""

Usage Examples:
import Levenshtein
# Compare two similar sequence collections
seq1 = ["newspaper", "litter bin", "tinny", "antelope"]
seq2 = ["caribou", "sausage", "gorn", "woody"]
seq_ratio = Levenshtein.seqratio(seq1, seq2)
print(f"Sequence similarity ratio: {seq_ratio:.3f}") # ~0.215
# Compare identical sequences
identical1 = ["abc", "def", "ghi"]
identical2 = ["abc", "def", "ghi"]
seq_ratio = Levenshtein.seqratio(identical1, identical2)
print(f"Identical sequences ratio: {seq_ratio:.3f}") # Should be high
# Compare sequences with different lengths
short_seq = ["a", "b"]
long_seq = ["apple", "banana", "cherry", "date"]
seq_ratio = Levenshtein.seqratio(short_seq, long_seq)
print(f"Different length sequences ratio: {seq_ratio:.3f}")

Calculate similarity ratio for string sets, measuring how similar the strings are when treated as character sets.
def setratio(strings1, strings2):
"""
Calculate similarity ratio between two string sets.
Computes a similarity measure between two collections of strings treated as character sets,
measuring how much they overlap in terms of unique characters present.
Parameters:
- strings1: First list of strings to compare
- strings2: Second list of strings to compare
Returns:
float: Set similarity ratio between the two string collections
"""

Usage Examples:
import Levenshtein
# Compare two string collections as character sets
set1 = ["newspaper", "litter bin", "tinny", "antelope"]
set2 = ["caribou", "sausage", "gorn", "woody"]
set_ratio = Levenshtein.setratio(set1, set2)
print(f"Set similarity ratio: {set_ratio:.3f}") # ~0.282
# Compare sets with overlapping characters
overlapping1 = ["hello", "help", "held"]
overlapping2 = ["hell", "helping", "holder"]
set_ratio = Levenshtein.setratio(overlapping1, overlapping2)
print(f"Overlapping character sets ratio: {set_ratio:.3f}")
# Compare sets with identical character composition but different words
identical_chars1 = ["abc", "bca", "cab"]
identical_chars2 = ["acb", "cba", "abc"]
set_ratio = Levenshtein.setratio(identical_chars1, identical_chars2)
print(f"Same character sets ratio: {set_ratio:.3f}") # Should be high
# Compare completely different character sets
different1 = ["abc", "def"]
different2 = ["xyz", "uvw"]
set_ratio = Levenshtein.setratio(different1, different2)
print(f"Different character sets ratio: {set_ratio:.3f}") # Should be low

import Levenshtein
def spell_check_with_median(word, dictionary, k=5):
    """
    Advanced spell checking using the median of the closest matches.

    Parameters:
    - word: Possibly misspelled word to correct
    - dictionary: Iterable of known-good words
    - k: Number of closest dictionary words to consider (default 5)

    Returns:
    tuple: (suggested correction, list of the k closest dictionary words)
    """
    import heapq

    # Select the k closest words in O(n log k) instead of materializing
    # and fully sorting the whole candidate list (O(n log n)).
    # heapq.nsmallest is stable for equal keys, matching the original
    # sort's tie-breaking by dictionary order.
    closest_words = heapq.nsmallest(
        k, dictionary, key=lambda candidate: Levenshtein.distance(word, candidate)
    )
    # The median of the nearest neighbours is a robust suggestion even
    # when no single dictionary word is an exact match.
    suggestion = Levenshtein.median(closest_words)
    return suggestion, closest_words
# Example usage
dictionary = ["hello", "help", "helm", "held", "hell", "heal"]
word = "helo"
suggestion, candidates = spell_check_with_median(word, dictionary)
print(f"Word: '{word}'")
print(f"Candidates: {candidates}")
print(f"Suggested correction: '{suggestion}'")

import Levenshtein
def analyze_string_cluster(strings1, strings2):
    """
    Analyze two clusters of strings for similarity patterns.

    Computes the plain, quick and set medians of each cluster plus the
    sequence and set similarity ratios between them, prints a summary,
    and returns the collected metrics as a dict.
    """
    med_a = Levenshtein.median(strings1)
    med_b = Levenshtein.median(strings2)
    quick_a = Levenshtein.quickmedian(strings1)
    quick_b = Levenshtein.quickmedian(strings2)
    set_a = Levenshtein.setmedian(strings1)
    set_b = Levenshtein.setmedian(strings2)
    seq_sim = Levenshtein.seqratio(strings1, strings2)
    set_sim = Levenshtein.setratio(strings1, strings2)

    print("String Cluster Analysis:")
    print(f"Input strings1: {strings1}")
    print(f"Input strings2: {strings2}")
    print(f"Median1: '{med_a}'")
    print(f"Median2: '{med_b}'")
    print(f"Quick median1: '{quick_a}'")
    print(f"Quick median2: '{quick_b}'")
    print(f"Set median1: '{set_a}'")
    print(f"Set median2: '{set_b}'")
    print(f"Sequence similarity: {seq_sim:.3f}")
    print(f"Set similarity: {set_sim:.3f}")

    return {
        'median1': med_a,
        'median2': med_b,
        'quick_median1': quick_a,
        'quick_median2': quick_b,
        'set_median1': set_a,
        'set_median2': set_b,
        'sequence_ratio': seq_sim,
        'set_ratio': set_sim,
    }
# Example
file_variants1 = [
"data_file_001.csv", "data_file_002.csv", "data_file_003.csv"
]
file_variants2 = [
"info_doc_001.txt", "info_doc_002.txt", "info_doc_003.txt"
]
analyze_string_cluster(file_variants1, file_variants2)

import Levenshtein
def progressive_improvement(initial, targets, verbose=True):
    """
    Progressively improve a string towards a target set.

    Repeatedly applies Levenshtein.median_improve until the result stops
    changing, with a hard cap of 20 iterations as a safety net.

    Returns:
    tuple: (final string, list of all intermediate strings including the
    initial one)
    """
    current = initial
    history = [current]
    for step in range(20):  # cap prevents a pathological infinite loop
        candidate = Levenshtein.median_improve(current, targets)
        if candidate == current:
            if verbose:
                print(f"Converged after {step} iterations")
            break
        if verbose:
            # Report how the mean edit distance to the targets changed.
            before = sum(Levenshtein.distance(current, t) for t in targets) / len(targets)
            after = sum(Levenshtein.distance(candidate, t) for t in targets) / len(targets)
            print(f"Iteration {step + 1}: '{current}' -> '{candidate}' "
                  f"(avg distance: {before:.2f} -> {after:.2f})")
        current = candidate
        history.append(current)
    return current, history
# Example
targets = ["python", "programming", "development", "coding"]
final, history = progressive_improvement("xyz", targets)
print(f"\nFinal result: '{final}'")
print(f"Improvement history: {' -> '.join(history[:5])}{'...' if len(history) > 5 else ''}")

import Levenshtein
def weighted_median_with_confidence(strings, confidences=None):
    """
    Calculate weighted median with confidence scores.

    Parameters:
    - strings: Non-empty list of candidate strings
    - confidences: Optional per-string confidence scores (same length as
      strings); defaults to equal confidence for every string

    Returns:
    dict: regular and confidence-weighted medians plus the average edit
    distance of each median to the inputs, and the integer weights used.

    Raises:
    ValueError: if strings is empty, if confidences has a different
    length than strings, or if confidences do not sum to a positive value.
    """
    # Guard the divisions below: the original crashed with
    # ZeroDivisionError on empty input or all-zero confidences.
    if not strings:
        raise ValueError("strings must be a non-empty list")
    if confidences is None:
        confidences = [1.0] * len(strings)
    if len(confidences) != len(strings):
        raise ValueError("confidences must have the same length as strings")
    total_conf = sum(confidences)
    if total_conf <= 0:
        raise ValueError("confidences must sum to a positive value")
    # Scale confidences to integer weights. Note int() truncation can
    # drop a very low confidence to weight 0, which quickmedian treats
    # as "ignore this string".
    weights = [int(conf * 100 / total_conf) for conf in confidences]
    if sum(weights) == 0:
        # Every weight truncated to zero (e.g. many near-equal tiny
        # confidences) - fall back to equal weights rather than dividing
        # by zero below.
        weights = [1] * len(strings)
    # Calculate different medians
    regular_median = Levenshtein.median(strings)
    weighted_median = Levenshtein.quickmedian(strings, weights)
    # Unweighted mean distance of the regular median to every input.
    reg_avg_dist = sum(Levenshtein.distance(regular_median, s) for s in strings) / len(strings)
    # Weight-adjusted mean distance of the weighted median.
    weighted_avg_dist = sum(Levenshtein.distance(weighted_median, s) * w
                            for s, w in zip(strings, weights)) / sum(weights)
    return {
        'regular_median': regular_median,
        'weighted_median': weighted_median,
        'regular_avg_distance': reg_avg_dist,
        'weighted_avg_distance': weighted_avg_dist,
        'weights_used': weights,
    }
# Example
strings = ["correct", "corect", "corerct", "correct", "crect"]
confidences = [0.9, 0.3, 0.2, 0.9, 0.1] # High confidence in first and fourth
result = weighted_median_with_confidence(strings, confidences)
print("Weighted Median Analysis:")
for key, value in result.items():
print(f"{key}: {value}")

Install with Tessl CLI
npx tessl i tessl/pypi-levenshtein