Python extension for computing string edit distances and similarities.
Quality: Pending — Does it follow best practices?
Impact: Pending — No eval scenarios have been run.
Functions for computing approximate median strings, improving strings toward a target set, and calculating sequence and set similarity ratios for multiple strings. These functions are particularly useful for finding representative strings from collections and analyzing groups of related strings.
Find approximate median string from a list of strings using a more thorough algorithm that considers all possible character combinations.
def median(strings, weights=None):
"""
Find approximate median string from a list of strings.
Uses a comprehensive algorithm to find a string that minimizes the sum of
edit distances to all input strings. More accurate but slower than quickmedian.
Parameters:
- strings: List of strings to find median for
- weights: Optional list of weights for each string (same length as strings)
Returns:
str: Approximate median string
"""

Usage Examples:
import Levenshtein
# Find median of similar strings
strings = ["SpSm", "mpamm", "Spam", "Spa", "Sua", "hSam"]
med = Levenshtein.median(strings)
print(f"Median: '{med}'") # 'Spam'
# Find median for spell-checking candidates
misspellings = [
"Levnhtein", "Leveshein", "Leenshten", "Leveshtei",
"Lenshtein", "Lvenstein", "Levenhtin", "evenshtei"
]
correct = Levenshtein.median(misspellings)
print(f"Correct spelling: '{correct}'") # 'Levenshtein'
# Find median of file names
filenames = ["document1.txt", "document2.txt", "document3.txt", "document4.txt"]
template = Levenshtein.median(filenames)
print(f"Template: '{template}'") # 'document.txt' (approximate)
# Weighted median (give more weight to certain strings)
strings = ["correct", "crect", "corect", "correct"]
weights = [3, 1, 1, 2] # Give more weight to "correct"
weighted_med = Levenshtein.median(strings, weights)
print(f"Weighted median: '{weighted_med}'") # Should be closer to "correct"

Fast approximate median string calculation with optional weights for different strings in the input set.
def quickmedian(strings, weights=None):
"""
Fast approximate median string calculation.
Uses a faster heuristic algorithm. Less accurate than median() but much faster
for large string sets. Supports optional weighting of input strings.
Parameters:
- strings: List of strings to find median for
- weights: Optional list of weights for each string (same length as strings)
Returns:
str: Approximate median string
"""

Usage Examples:
import Levenshtein
# Fast median calculation
misspellings = [
"Levnhtein", "Leveshein", "Leenshten", "Leveshtei",
"Lenshtein", "Lvenstein", "Levenhtin", "evenshtei"
]
quick_med = Levenshtein.quickmedian(misspellings)
print(f"Quick median: '{quick_med}'") # 'Levnshein'
# Weighted median (emphasize certain strings)
strings = ["apple", "aple", "appl", "apple"]
weights = [3, 1, 1, 2] # Give more weight to "apple"
weighted_med = Levenshtein.quickmedian(strings, weights)
print(f"Weighted median: '{weighted_med}'") # Closer to "apple"
# Handle zero weights (ignored strings)
strings = ["test", "testing", "tester"]
weights = [1, 0, 1] # Ignore "testing"
result = Levenshtein.quickmedian(strings, weights)
print(f"With zero weights: '{result}'") # Based only on "test" and "tester"

Improve a string towards the median of a given set of strings through iterative refinement.
def median_improve(string, strings, weights=None):
"""
Improve a string towards median of given strings.
Takes an initial string and iteratively improves it to be closer to the
median of the target string set. Can be applied multiple times for further improvement.
Parameters:
- string: Initial string to improve
- strings: List of target strings to improve towards
- weights: Optional list of weights for each string (same length as strings)
Returns:
str: Improved string that is closer to the median of the target strings
"""

Usage Examples:
import Levenshtein
# Single improvement step
target_strings = [
"Levnhtein", "Leveshein", "Leenshten", "Leveshtei",
"Lenshtein", "Lvenstein", "Levenhtin", "evenshtei"
]
# Start with a poor guess
initial = "spam"
improved1 = Levenshtein.median_improve(initial, target_strings)
print(f"First improvement: '{initial}' -> '{improved1}'") # 'spam' -> 'enhtein'
# Apply improvement again
improved2 = Levenshtein.median_improve(improved1, target_strings)
print(f"Second improvement: '{improved1}' -> '{improved2}'") # 'enhtein' -> 'Levenshtein'
# Iterative improvement function
def iterative_improve(initial, targets, max_iterations=10):
    """Iteratively improve a string towards the median of *targets*.

    Stops as soon as a pass produces no change, or after
    ``max_iterations`` successful improvements.
    """
    current = initial
    step = 0
    while step < max_iterations:
        candidate = Levenshtein.median_improve(current, targets)
        if candidate == current:
            # Converged: another pass would return the same string.
            break
        step += 1
        print(f"Iteration {step}: '{current}' -> '{candidate}'")
        current = candidate
    return current
# Example
final = iterative_improve("xyz", target_strings)
print(f"Final result: '{final}'")
# Using weights to emphasize certain target strings
target_strings_weighted = ["python", "programming", "coding", "development"]
weights = [3, 2, 1, 1] # Give more weight to "python" and "programming"
initial = "prog"
improved_weighted = Levenshtein.median_improve(initial, target_strings_weighted, weights)
print(f"Weighted improvement: '{initial}' -> '{improved_weighted}'")

Find median from set of strings treating each string as a character set rather than a sequence.
def setmedian(strings, weights=None):
"""
Find median from set of strings (treats as character sets).
Treats each input string as a set of characters rather than a sequence,
finding a string that best represents the common characters across all inputs.
Parameters:
- strings: List of strings to find set median for
- weights: Optional list of weights for each string (same length as strings)
Returns:
str: Set median string containing representative characters
"""

Usage Examples:
import Levenshtein
# Set median treats strings as character sets
strings = [
"ehee", "cceaes", "chees", "chreesc",
"chees", "cheesee", "cseese", "chetese"
]
set_med = Levenshtein.setmedian(strings)
print(f"Set median: '{set_med}'") # 'chees'
# Compare with regular median
regular_med = Levenshtein.median(strings)
print(f"Regular median: '{regular_med}'")
# Character frequency analysis
cheese_variants = ["cheese", "chese", "cheeze", "chees", "cheess"]
set_result = Levenshtein.setmedian(cheese_variants)
print(f"Cheese variants set median: '{set_result}'")
# Weighted set median (emphasize certain variants)
weights = [3, 1, 1, 2, 1] # Give more weight to "cheese" and "chees"
weighted_set_result = Levenshtein.setmedian(cheese_variants, weights)
print(f"Weighted set median: '{weighted_set_result}'")

Calculate similarity ratio for string sequences, measuring how similar the strings are when treated as sequences.
def seqratio(strings1, strings2):
"""
Calculate similarity ratio between two string sequences.
Computes a similarity measure between two collections of strings treated as sequences,
measuring how similar they are in terms of character order and position.
Parameters:
- strings1: First list of strings to compare
- strings2: Second list of strings to compare
Returns:
float: Sequence similarity ratio between the two string collections
"""

Usage Examples:
import Levenshtein
# Compare two similar sequence collections
seq1 = ["newspaper", "litter bin", "tinny", "antelope"]
seq2 = ["caribou", "sausage", "gorn", "woody"]
seq_ratio = Levenshtein.seqratio(seq1, seq2)
print(f"Sequence similarity ratio: {seq_ratio:.3f}") # ~0.215
# Compare identical sequences
identical1 = ["abc", "def", "ghi"]
identical2 = ["abc", "def", "ghi"]
seq_ratio = Levenshtein.seqratio(identical1, identical2)
print(f"Identical sequences ratio: {seq_ratio:.3f}") # Should be high
# Compare sequences with different lengths
short_seq = ["a", "b"]
long_seq = ["apple", "banana", "cherry", "date"]
seq_ratio = Levenshtein.seqratio(short_seq, long_seq)
print(f"Different length sequences ratio: {seq_ratio:.3f}")

Calculate similarity ratio for string sets, measuring how similar the strings are when treated as character sets.
def setratio(strings1, strings2):
"""
Calculate similarity ratio between two string sets.
Computes a similarity measure between two collections of strings treated as character sets,
measuring how much they overlap in terms of unique characters present.
Parameters:
- strings1: First list of strings to compare
- strings2: Second list of strings to compare
Returns:
float: Set similarity ratio between the two string collections
"""

Usage Examples:
import Levenshtein
# Compare two string collections as character sets
set1 = ["newspaper", "litter bin", "tinny", "antelope"]
set2 = ["caribou", "sausage", "gorn", "woody"]
set_ratio = Levenshtein.setratio(set1, set2)
print(f"Set similarity ratio: {set_ratio:.3f}") # ~0.282
# Compare sets with overlapping characters
overlapping1 = ["hello", "help", "held"]
overlapping2 = ["hell", "helping", "holder"]
set_ratio = Levenshtein.setratio(overlapping1, overlapping2)
print(f"Overlapping character sets ratio: {set_ratio:.3f}")
# Compare sets with identical character composition but different words
identical_chars1 = ["abc", "bca", "cab"]
identical_chars2 = ["acb", "cba", "abc"]
set_ratio = Levenshtein.setratio(identical_chars1, identical_chars2)
print(f"Same character sets ratio: {set_ratio:.3f}") # Should be high
# Compare completely different character sets
different1 = ["abc", "def"]
different2 = ["xyz", "uvw"]
set_ratio = Levenshtein.setratio(different1, different2)
print(f"Different character sets ratio: {set_ratio:.3f}") # Should be low

import Levenshtein
def spell_check_with_median(word, dictionary, k=5):
    """
    Advanced spell checking using the median of the closest matches.

    Parameters:
    - word: Possibly misspelled word to correct
    - dictionary: Iterable of known-good words
    - k: Number of closest dictionary words to consider (default 5)

    Returns:
    tuple: (suggested correction, list of the k closest dictionary words)
    """
    import heapq

    # Select the k closest words in O(n log k) instead of materializing
    # and fully sorting the whole candidate list (O(n log n)).
    # heapq.nsmallest is stable for equal keys, matching the original
    # sort's tie-breaking by dictionary order.
    closest_words = heapq.nsmallest(
        k, dictionary, key=lambda candidate: Levenshtein.distance(word, candidate)
    )
    # The median of the nearest neighbours is a robust suggestion even
    # when no single dictionary word is an exact match.
    suggestion = Levenshtein.median(closest_words)
    return suggestion, closest_words
# Example usage
dictionary = ["hello", "help", "helm", "held", "hell", "heal"]
word = "helo"
suggestion, candidates = spell_check_with_median(word, dictionary)
print(f"Word: '{word}'")
print(f"Candidates: {candidates}")
print(f"Suggested correction: '{suggestion}'")

import Levenshtein
def analyze_string_cluster(strings1, strings2):
    """
    Analyze two clusters of strings for similarity patterns.

    Computes the plain, quick and set medians of each cluster plus the
    sequence and set similarity ratios between them, prints a summary,
    and returns the collected metrics as a dict.
    """
    med_a = Levenshtein.median(strings1)
    med_b = Levenshtein.median(strings2)
    quick_a = Levenshtein.quickmedian(strings1)
    quick_b = Levenshtein.quickmedian(strings2)
    set_a = Levenshtein.setmedian(strings1)
    set_b = Levenshtein.setmedian(strings2)
    seq_sim = Levenshtein.seqratio(strings1, strings2)
    set_sim = Levenshtein.setratio(strings1, strings2)

    print("String Cluster Analysis:")
    print(f"Input strings1: {strings1}")
    print(f"Input strings2: {strings2}")
    print(f"Median1: '{med_a}'")
    print(f"Median2: '{med_b}'")
    print(f"Quick median1: '{quick_a}'")
    print(f"Quick median2: '{quick_b}'")
    print(f"Set median1: '{set_a}'")
    print(f"Set median2: '{set_b}'")
    print(f"Sequence similarity: {seq_sim:.3f}")
    print(f"Set similarity: {set_sim:.3f}")

    return {
        'median1': med_a,
        'median2': med_b,
        'quick_median1': quick_a,
        'quick_median2': quick_b,
        'set_median1': set_a,
        'set_median2': set_b,
        'sequence_ratio': seq_sim,
        'set_ratio': set_sim,
    }
# Example
file_variants1 = [
"data_file_001.csv", "data_file_002.csv", "data_file_003.csv"
]
file_variants2 = [
"info_doc_001.txt", "info_doc_002.txt", "info_doc_003.txt"
]
analyze_string_cluster(file_variants1, file_variants2)

import Levenshtein
def progressive_improvement(initial, targets, verbose=True):
    """
    Progressively improve a string towards a target set.

    Repeatedly applies Levenshtein.median_improve until the result stops
    changing, with a hard cap of 20 iterations as a safety net.

    Returns:
    tuple: (final string, list of all intermediate strings including the
    initial one)
    """
    current = initial
    history = [current]
    for step in range(20):  # cap prevents a pathological infinite loop
        candidate = Levenshtein.median_improve(current, targets)
        if candidate == current:
            if verbose:
                print(f"Converged after {step} iterations")
            break
        if verbose:
            # Report how the mean edit distance to the targets changed.
            before = sum(Levenshtein.distance(current, t) for t in targets) / len(targets)
            after = sum(Levenshtein.distance(candidate, t) for t in targets) / len(targets)
            print(f"Iteration {step + 1}: '{current}' -> '{candidate}' "
                  f"(avg distance: {before:.2f} -> {after:.2f})")
        current = candidate
        history.append(current)
    return current, history
# Example
targets = ["python", "programming", "development", "coding"]
final, history = progressive_improvement("xyz", targets)
print(f"\nFinal result: '{final}'")
print(f"Improvement history: {' -> '.join(history[:5])}{'...' if len(history) > 5 else ''}")

import Levenshtein
def weighted_median_with_confidence(strings, confidences=None):
    """
    Calculate weighted median with confidence scores.

    Parameters:
    - strings: Non-empty list of candidate strings
    - confidences: Optional per-string confidence scores (same length as
      strings); defaults to equal confidence for every string

    Returns:
    dict: regular and confidence-weighted medians plus the average edit
    distance of each median to the inputs, and the integer weights used.

    Raises:
    ValueError: if strings is empty, if confidences has a different
    length than strings, or if confidences do not sum to a positive value.
    """
    # Guard the divisions below: the original crashed with
    # ZeroDivisionError on empty input or all-zero confidences.
    if not strings:
        raise ValueError("strings must be a non-empty list")
    if confidences is None:
        confidences = [1.0] * len(strings)
    if len(confidences) != len(strings):
        raise ValueError("confidences must have the same length as strings")
    total_conf = sum(confidences)
    if total_conf <= 0:
        raise ValueError("confidences must sum to a positive value")
    # Scale confidences to integer weights. Note int() truncation can
    # drop a very low confidence to weight 0, which quickmedian treats
    # as "ignore this string".
    weights = [int(conf * 100 / total_conf) for conf in confidences]
    if sum(weights) == 0:
        # Every weight truncated to zero (e.g. many near-equal tiny
        # confidences) - fall back to equal weights rather than dividing
        # by zero below.
        weights = [1] * len(strings)
    # Calculate different medians
    regular_median = Levenshtein.median(strings)
    weighted_median = Levenshtein.quickmedian(strings, weights)
    # Unweighted mean distance of the regular median to every input.
    reg_avg_dist = sum(Levenshtein.distance(regular_median, s) for s in strings) / len(strings)
    # Weight-adjusted mean distance of the weighted median.
    weighted_avg_dist = sum(Levenshtein.distance(weighted_median, s) * w
                            for s, w in zip(strings, weights)) / sum(weights)
    return {
        'regular_median': regular_median,
        'weighted_median': weighted_median,
        'regular_avg_distance': reg_avg_dist,
        'weighted_avg_distance': weighted_avg_dist,
        'weights_used': weights,
    }
# Example
strings = ["correct", "corect", "corerct", "correct", "crect"]
confidences = [0.9, 0.3, 0.2, 0.9, 0.1] # High confidence in first and fourth
result = weighted_median_with_confidence(strings, confidences)
print("Weighted Median Analysis:")
for key, value in result.items():
print(f"{key}: {value}")

Install with Tessl CLI
npx tessl i tessl/pypi-levenshtein