tessl/pypi-strsim

A library implementing different string similarity and distance measures

Overview

Eval results

Files

Factory and Utility Classes

Name: tessl/pypi-strsim
Author: tessl

Factory pattern for algorithm instantiation and utility interfaces for customizing algorithm behavior. These components provide convenient ways to work with multiple algorithms and customize their operation.

Capabilities

Algorithm Factory

The Factory class provides a convenient way to instantiate algorithms using an enumeration, making it easy to switch between different similarity measures programmatically.

from enum import IntEnum

class Algorithm(IntEnum):
    """Enumeration of available string similarity algorithms."""
    COSINE = 1
    DAMERAU = 2
    JACCARD = 3
    JARO_WINKLE = 4
    LEVENSHTEIN = 5
    LCS = 6
    METRIC_LCS = 7
    N_GRAM = 8
    NORMALIZED_LEVENSHTEIN = 9
    OPTIMAL_STRING_ALIGNMENT = 10
    Q_GRAM = 11
    SORENSEN_DICE = 12
    WEIGHTED_LEVENSHTEIN = 13

class Factory:
    """Factory for creating string similarity algorithm instances."""
    
    @staticmethod
    def get_algorithm(algorithm: Algorithm, k: int = 3):
        """
        Create an algorithm instance using the factory pattern.
        
        Args:
            algorithm: Algorithm type from Algorithm enum
            k: Shingle size for n-gram based algorithms (default: 3)
            
        Returns:
            Algorithm instance ready for use
            
        Raises:
            TypeError: For WEIGHTED_LEVENSHTEIN (use get_weighted_levenshtein instead)
        """
    
    @staticmethod
    def get_weighted_levenshtein(char_sub: CharacterSubstitutionInterface, 
                                char_change: CharacterInsDelInterface):
        """
        Create a WeightedLevenshtein instance with custom cost interfaces.
        
        Args:
            char_sub: Interface defining character substitution costs
            char_change: Interface defining insertion/deletion costs
            
        Returns:
            WeightedLevenshtein: Configured weighted Levenshtein algorithm
        """

Usage Examples:

from similarity.similarity import Factory, Algorithm

# Basic factory usage
levenshtein = Factory.get_algorithm(Algorithm.LEVENSHTEIN)
distance = levenshtein.distance('hello', 'hallo')

# N-gram algorithms with custom k
cosine = Factory.get_algorithm(Algorithm.COSINE, k=4)
jaccard = Factory.get_algorithm(Algorithm.JACCARD, k=2)

# Programmatic algorithm selection
def compare_strings(s1, s2, algorithm_name):
    """Compare strings using specified algorithm."""
    algo_map = {
        'levenshtein': Algorithm.LEVENSHTEIN,
        'jaro_winkler': Algorithm.JARO_WINKLE,
        'cosine': Algorithm.COSINE,
        'jaccard': Algorithm.JACCARD
    }
    
    if algorithm_name not in algo_map:
        raise ValueError(f"Unknown algorithm: {algorithm_name}")
    
    algorithm = Factory.get_algorithm(algo_map[algorithm_name])
    
    if hasattr(algorithm, 'similarity'):
        return algorithm.similarity(s1, s2)
    else:
        return algorithm.distance(s1, s2)

# Algorithm comparison utility
def compare_with_all_algorithms(s1, s2):
    """Compare two strings using all available algorithms."""
    results = {}
    
    # Distance-only algorithms
    distance_algos = [
        Algorithm.LEVENSHTEIN,
        Algorithm.DAMERAU,
        Algorithm.LCS,
        Algorithm.OPTIMAL_STRING_ALIGNMENT,
        Algorithm.Q_GRAM,
        Algorithm.N_GRAM
    ]
    
    for algo in distance_algos:
        instance = Factory.get_algorithm(algo)
        results[algo.name] = {'distance': instance.distance(s1, s2)}
    
    # Similarity/distance algorithms  
    sim_dist_algos = [
        Algorithm.JARO_WINKLE,
        Algorithm.NORMALIZED_LEVENSHTEIN,
        Algorithm.COSINE,
        Algorithm.JACCARD,
        Algorithm.SORENSEN_DICE,
        Algorithm.METRIC_LCS
    ]
    
    for algo in sim_dist_algos:
        instance = Factory.get_algorithm(algo)
        results[algo.name] = {
            'similarity': instance.similarity(s1, s2),
            'distance': instance.distance(s1, s2)
        }
    
    return results

# Example usage
results = compare_with_all_algorithms('programming', 'program')
for algo_name, scores in results.items():
    print(f"{algo_name}: {scores}")

Weighted Levenshtein Factory Method

Special factory method for creating WeightedLevenshtein instances with custom cost functions:

Usage Examples:

from similarity.similarity import Factory
from similarity.weighted_levenshtein import CharacterSubstitutionInterface, CharacterInsDelInterface

# Custom substitution costs for OCR
class OCRSubstitution(CharacterSubstitutionInterface):
    def cost(self, c0, c1):
        # Visually similar characters have lower cost
        similar_pairs = [
            ('o', '0'), ('O', '0'),
            ('l', '1'), ('I', '1'),
            ('S', '5'), ('s', '5'),
            ('Z', '2'), ('z', '2')
        ]
        
        if (c0, c1) in similar_pairs or (c1, c0) in similar_pairs:
            return 0.3
        elif c0.lower() == c1.lower():  # Case differences
            return 0.1
        else:
            return 1.0

# Custom insertion/deletion costs
class KeyboardInsDelCosts(CharacterInsDelInterface):
    def insertion_cost(self, c):
        # Common accidentally typed characters
        if c in ' \t\n':
            return 0.5  # Whitespace often accidental
        return 1.0
    
    def deletion_cost(self, c):
        # Vowels are often dropped in informal text
        if c.lower() in 'aeiou':
            return 0.7
        return 1.0

# Create weighted Levenshtein with custom costs
substitution = OCRSubstitution()
ins_del = KeyboardInsDelCosts()
weighted_lev = Factory.get_weighted_levenshtein(substitution, ins_del)

# OCR correction example
ocr_text = "Th3 qu1ck br0wn f0x"
correct_text = "The quick brown fox"
distance = weighted_lev.distance(ocr_text, correct_text)
print(f"Weighted distance: {distance}")

# Standard Levenshtein for comparison
standard_lev = Factory.get_algorithm(Algorithm.LEVENSHTEIN)
standard_distance = standard_lev.distance(ocr_text, correct_text)
print(f"Standard distance: {standard_distance}")

Utility Interfaces

Base interfaces that can be implemented to customize algorithm behavior:

class CharacterSubstitutionInterface:
    """Interface for defining custom character substitution costs."""
    
    def cost(self, c0: str, c1: str) -> float:
        """
        Return the cost of substituting character c0 with c1.
        
        Args:
            c0: Original character
            c1: Replacement character
            
        Returns:
            float: Cost of substitution (typically 0.0-1.0)
        """

class CharacterInsDelInterface:
    """Interface for defining custom insertion and deletion costs."""
    
    def deletion_cost(self, c: str) -> float:
        """
        Return the cost of deleting character c.
        
        Args:
            c: Character to delete
            
        Returns:
            float: Cost of deletion
        """
    
    def insertion_cost(self, c: str) -> float:
        """
        Return the cost of inserting character c.
        
        Args:
            c: Character to insert
            
        Returns:
            float: Cost of insertion
        """

Advanced Customization Examples:

# Domain-specific costs for name matching
class NameMatchingCosts(CharacterSubstitutionInterface):
    def cost(self, c0, c1):
        # Common name variations
        name_variants = {
            ('c', 'k'), ('ph', 'f'), ('y', 'i'),
            ('ie', 'y'), ('son', 'sen'), ('sen', 'son')
        }
        
        # Handle multi-character patterns
        pair = (c0, c1)
        if pair in name_variants or (c1, c0) in name_variants:
            return 0.2
        
        # Phonetically similar
        phonetic_groups = [
            'bp', 'dt', 'kg', 'sz', 'fv'
        ]
        
        for group in phonetic_groups:
            if c0 in group and c1 in group:
                return 0.4
        
        return 1.0

# Keyboard layout based costs
class QWERTYSubstitution(CharacterSubstitutionInterface):
    def __init__(self):
        # Define keyboard layout
        self.keyboard = [
            'qwertyuiop',
            'asdfghjkl',
            'zxcvbnm'
        ]
        self.positions = {}
        for row, keys in enumerate(self.keyboard):
            for col, key in enumerate(keys):
                self.positions[key] = (row, col)
    
    def cost(self, c0, c1):
        if c0 == c1:
            return 0.0
        
        c0, c1 = c0.lower(), c1.lower()
        
        if c0 in self.positions and c1 in self.positions:
            pos0 = self.positions[c0]
            pos1 = self.positions[c1]
            
            # Manhattan distance on keyboard
            distance = abs(pos0[0] - pos1[0]) + abs(pos0[1] - pos1[1])
            
            # Adjacent keys have lower cost
            if distance == 1:
                return 0.3
            elif distance == 2:
                return 0.6
        
        return 1.0

# Usage with factory
keyboard_costs = QWERTYSubstitution()
weighted_lev = Factory.get_weighted_levenshtein(keyboard_costs, None)

# Typo detection
typos = [
    ("hello", "heklo"),   # Adjacent key error
    ("world", "qorld"),   # q instead of w
    ("python", "pytjon")  # j instead of h
]

for correct, typo in typos:
    distance = weighted_lev.distance(correct, typo)
    print(f"'{correct}' vs '{typo}': {distance}")

Error Handling

All factory methods and interfaces include proper error handling:

TypeError: Raised when attempting to create WEIGHTED_LEVENSHTEIN through get_algorithm()
ValueError: May be raised by custom interface implementations for invalid inputs
AttributeError: Raised when accessing methods not supported by specific algorithms

Example Error Handling:

from similarity.similarity import Factory, Algorithm

try:
    # This will raise TypeError
    weighted = Factory.get_algorithm(Algorithm.WEIGHTED_LEVENSHTEIN)
except TypeError as e:
    print(f"Error: {e}")
    print("Use get_weighted_levenshtein() instead")

# Proper way to handle different algorithm types
def safe_get_similarity(s1, s2, algorithm_type):
    """Safely get similarity score from any algorithm."""
    try:
        algorithm = Factory.get_algorithm(algorithm_type)
        
        if hasattr(algorithm, 'similarity'):
            return algorithm.similarity(s1, s2)
        else:
            # Convert distance to similarity (approximate)
            distance = algorithm.distance(s1, s2)
            max_len = max(len(s1), len(s2))
            return max(0, 1 - distance / max_len) if max_len > 0 else 1.0
            
    except TypeError:
        return None  # Weighted Levenshtein requires special handling

Install with Tessl CLI