A library implementing different string similarity and distance measures
Factory pattern for algorithm instantiation and utility interfaces for customizing algorithm behavior. These components provide convenient ways to work with multiple algorithms and customize their operation.
The Factory class provides a convenient way to instantiate algorithms using an enumeration, making it easy to switch between different similarity measures programmatically.
from enum import IntEnum
class Algorithm(IntEnum):
"""Enumeration of available string similarity algorithms."""
COSINE = 1
DAMERAU = 2
JACCARD = 3
JARO_WINKLE = 4
LEVENSHTEIN = 5
LCS = 6
METRIC_LCS = 7
N_GRAM = 8
NORMALIZED_LEVENSHTEIN = 9
OPTIMAL_STRING_ALIGNMENT = 10
Q_GRAM = 11
SORENSEN_DICE = 12
WEIGHTED_LEVENSHTEIN = 13
class Factory:
"""Factory for creating string similarity algorithm instances."""
@staticmethod
def get_algorithm(algorithm: Algorithm, k: int = 3):
"""
Create an algorithm instance using the factory pattern.
Args:
algorithm: Algorithm type from Algorithm enum
k: Shingle size for n-gram based algorithms (default: 3)
Returns:
Algorithm instance ready for use
Raises:
TypeError: For WEIGHTED_LEVENSHTEIN (use get_weighted_levenshtein instead)
"""
@staticmethod
def get_weighted_levenshtein(char_sub: CharacterSubstitutionInterface,
char_change: CharacterInsDelInterface):
"""
Create a WeightedLevenshtein instance with custom cost interfaces.
Args:
char_sub: Interface defining character substitution costs
char_change: Interface defining insertion/deletion costs
Returns:
WeightedLevenshtein: Configured weighted Levenshtein algorithm
"""Usage Examples:
from similarity.similarity import Factory, Algorithm
# Basic factory usage
levenshtein = Factory.get_algorithm(Algorithm.LEVENSHTEIN)
distance = levenshtein.distance('hello', 'hallo')
# N-gram algorithms with custom k
cosine = Factory.get_algorithm(Algorithm.COSINE, k=4)
jaccard = Factory.get_algorithm(Algorithm.JACCARD, k=2)
# Programmatic algorithm selection
def compare_strings(s1, s2, algorithm_name):
"""Compare strings using specified algorithm."""
algo_map = {
'levenshtein': Algorithm.LEVENSHTEIN,
'jaro_winkler': Algorithm.JARO_WINKLE,
'cosine': Algorithm.COSINE,
'jaccard': Algorithm.JACCARD
}
if algorithm_name not in algo_map:
raise ValueError(f"Unknown algorithm: {algorithm_name}")
algorithm = Factory.get_algorithm(algo_map[algorithm_name])
if hasattr(algorithm, 'similarity'):
return algorithm.similarity(s1, s2)
else:
return algorithm.distance(s1, s2)
# Algorithm comparison utility
def compare_with_all_algorithms(s1, s2):
"""Compare two strings using all available algorithms."""
results = {}
# Distance-only algorithms
distance_algos = [
Algorithm.LEVENSHTEIN,
Algorithm.DAMERAU,
Algorithm.LCS,
Algorithm.OPTIMAL_STRING_ALIGNMENT,
Algorithm.Q_GRAM,
Algorithm.N_GRAM
]
for algo in distance_algos:
instance = Factory.get_algorithm(algo)
results[algo.name] = {'distance': instance.distance(s1, s2)}
# Similarity/distance algorithms
sim_dist_algos = [
Algorithm.JARO_WINKLE,
Algorithm.NORMALIZED_LEVENSHTEIN,
Algorithm.COSINE,
Algorithm.JACCARD,
Algorithm.SORENSEN_DICE,
Algorithm.METRIC_LCS
]
for algo in sim_dist_algos:
instance = Factory.get_algorithm(algo)
results[algo.name] = {
'similarity': instance.similarity(s1, s2),
'distance': instance.distance(s1, s2)
}
return results
# Example usage
results = compare_with_all_algorithms('programming', 'program')
for algo_name, scores in results.items():
print(f"{algo_name}: {scores}")Special factory method for creating WeightedLevenshtein instances with custom cost functions:
Usage Examples:
from similarity.similarity import Factory
from similarity.weighted_levenshtein import CharacterSubstitutionInterface, CharacterInsDelInterface
# Custom substitution costs for OCR
class OCRSubstitution(CharacterSubstitutionInterface):
def cost(self, c0, c1):
# Visually similar characters have lower cost
similar_pairs = [
('o', '0'), ('O', '0'),
('l', '1'), ('I', '1'),
('S', '5'), ('s', '5'),
('Z', '2'), ('z', '2')
]
if (c0, c1) in similar_pairs or (c1, c0) in similar_pairs:
return 0.3
elif c0.lower() == c1.lower(): # Case differences
return 0.1
else:
return 1.0
# Custom insertion/deletion costs
class KeyboardInsDelCosts(CharacterInsDelInterface):
def insertion_cost(self, c):
# Common accidentally typed characters
if c in ' \t\n':
return 0.5 # Whitespace often accidental
return 1.0
def deletion_cost(self, c):
# Vowels are often dropped in informal text
if c.lower() in 'aeiou':
return 0.7
return 1.0
# Create weighted Levenshtein with custom costs
substitution = OCRSubstitution()
ins_del = KeyboardInsDelCosts()
weighted_lev = Factory.get_weighted_levenshtein(substitution, ins_del)
# OCR correction example
ocr_text = "Th3 qu1ck br0wn f0x"
correct_text = "The quick brown fox"
distance = weighted_lev.distance(ocr_text, correct_text)
print(f"Weighted distance: {distance}")
# Standard Levenshtein for comparison
standard_lev = Factory.get_algorithm(Algorithm.LEVENSHTEIN)
standard_distance = standard_lev.distance(ocr_text, correct_text)
print(f"Standard distance: {standard_distance}")Base interfaces that can be implemented to customize algorithm behavior:
class CharacterSubstitutionInterface:
"""Interface for defining custom character substitution costs."""
def cost(self, c0: str, c1: str) -> float:
"""
Return the cost of substituting character c0 with c1.
Args:
c0: Original character
c1: Replacement character
Returns:
float: Cost of substitution (typically 0.0-1.0)
"""
class CharacterInsDelInterface:
"""Interface for defining custom insertion and deletion costs."""
def deletion_cost(self, c: str) -> float:
"""
Return the cost of deleting character c.
Args:
c: Character to delete
Returns:
float: Cost of deletion
"""
def insertion_cost(self, c: str) -> float:
"""
Return the cost of inserting character c.
Args:
c: Character to insert
Returns:
float: Cost of insertion
"""Advanced Customization Examples:
# Domain-specific costs for name matching
class NameMatchingCosts(CharacterSubstitutionInterface):
def cost(self, c0, c1):
# Common name variations
name_variants = {
('c', 'k'), ('ph', 'f'), ('y', 'i'),
('ie', 'y'), ('son', 'sen'), ('sen', 'son')
}
# Handle multi-character patterns
pair = (c0, c1)
if pair in name_variants or (c1, c0) in name_variants:
return 0.2
# Phonetically similar
phonetic_groups = [
'bp', 'dt', 'kg', 'sz', 'fv'
]
for group in phonetic_groups:
if c0 in group and c1 in group:
return 0.4
return 1.0
# Keyboard layout based costs
class QWERTYSubstitution(CharacterSubstitutionInterface):
def __init__(self):
# Define keyboard layout
self.keyboard = [
'qwertyuiop',
'asdfghjkl',
'zxcvbnm'
]
self.positions = {}
for row, keys in enumerate(self.keyboard):
for col, key in enumerate(keys):
self.positions[key] = (row, col)
def cost(self, c0, c1):
if c0 == c1:
return 0.0
c0, c1 = c0.lower(), c1.lower()
if c0 in self.positions and c1 in self.positions:
pos0 = self.positions[c0]
pos1 = self.positions[c1]
# Manhattan distance on keyboard
distance = abs(pos0[0] - pos1[0]) + abs(pos0[1] - pos1[1])
# Adjacent keys have lower cost
if distance == 1:
return 0.3
elif distance == 2:
return 0.6
return 1.0
# Usage with factory
keyboard_costs = QWERTYSubstitution()
weighted_lev = Factory.get_weighted_levenshtein(keyboard_costs, None)
# Typo detection
typos = [
("hello", "heklo"), # Adjacent key error
("world", "qorld"), # q instead of w
("python", "pytjon") # j instead of h
]
for correct, typo in typos:
distance = weighted_lev.distance(correct, typo)
print(f"'{correct}' vs '{typo}': {distance}")All factory methods and interfaces include proper error handling:
get_algorithm()Example Error Handling:
from similarity.similarity import Factory, Algorithm
try:
# This will raise TypeError
weighted = Factory.get_algorithm(Algorithm.WEIGHTED_LEVENSHTEIN)
except TypeError as e:
print(f"Error: {e}")
print("Use get_weighted_levenshtein() instead")
# Proper way to handle different algorithm types
def safe_get_similarity(s1, s2, algorithm_type):
"""Safely get similarity score from any algorithm."""
try:
algorithm = Factory.get_algorithm(algorithm_type)
if hasattr(algorithm, 'similarity'):
return algorithm.similarity(s1, s2)
else:
# Convert distance to similarity (approximate)
distance = algorithm.distance(s1, s2)
max_len = max(len(s1), len(s2))
return max(0, 1 - distance / max_len) if max_len > 0 else 1.0
except TypeError:
return None # Weighted Levenshtein requires special handlingInstall with Tessl CLI
npx tessl i tessl/pypi-strsim