A library implementing different string similarity and distance measures
npx @tessl/cli install tessl/pypi-strsim@0.0.0

A comprehensive Python library implementing different string similarity and distance measures. strsim provides over a dozen algorithms, including Levenshtein edit distance, Jaro-Winkler, Longest Common Subsequence, cosine similarity, and various n-gram based measures, for applications such as record linkage, duplicate detection, typo correction, and general text comparison tasks.
pip install strsim

Direct algorithm imports:
from similarity.levenshtein import Levenshtein
from similarity.jarowinkler import JaroWinkler
from similarity.cosine import Cosine
from similarity.jaccard import Jaccard

Factory pattern imports:
from similarity.similarity import Factory, Algorithm

Basic usage example:

from similarity.levenshtein import Levenshtein
from similarity.jarowinkler import JaroWinkler
from similarity.similarity import Factory, Algorithm
# Direct algorithm usage
levenshtein = Levenshtein()
distance = levenshtein.distance('hello', 'hallo') # Returns: 1.0
# Normalized similarity
jarowinkler = JaroWinkler()
similarity = jarowinkler.similarity('hello', 'hallo') # Returns: 0.933...
# Factory pattern usage
algorithm = Factory.get_algorithm(Algorithm.LEVENSHTEIN)
distance = algorithm.distance('hello', 'hallo')  # Returns: 1.0

The library is built around a hierarchy of base interface classes that define different types of string comparison algorithms:
This design enables consistent interfaces across algorithm types while providing flexibility for specialized implementations and pre-computed profile optimizations for large datasets.
Classic edit distance algorithms including Levenshtein, Damerau-Levenshtein, Weighted Levenshtein, and Optimal String Alignment. These algorithms measure the minimum number of character operations needed to transform one string into another.
class Levenshtein(MetricStringDistance):
def distance(self, s0: str, s1: str) -> float: ...
class NormalizedLevenshtein(NormalizedStringDistance, NormalizedStringSimilarity):
def __init__(self): ...
def distance(self, s0: str, s1: str) -> float: ...
def similarity(self, s0: str, s1: str) -> float: ...
class WeightedLevenshtein(StringDistance):
def __init__(self, character_substitution: CharacterSubstitutionInterface,
character_ins_del: CharacterInsDelInterface = None): ...
    def distance(self, s0: str, s1: str) -> float: ...

Algorithms designed for fuzzy matching, typo correction, and record linkage applications. These are optimized for short strings such as person names and handle character transpositions intelligently.
class JaroWinkler(NormalizedStringSimilarity, NormalizedStringDistance):
def __init__(self, threshold: float = 0.7): ...
def get_threshold(self) -> float: ...
def similarity(self, s0: str, s1: str) -> float: ...
def distance(self, s0: str, s1: str) -> float: ...
class Damerau(MetricStringDistance):
    def distance(self, s0: str, s1: str) -> float: ...

Algorithms based on longest common subsequences and sequence alignment, commonly used in diff utilities, version control systems, and bioinformatics applications.
class LongestCommonSubsequence(StringDistance):
def distance(self, s0: str, s1: str) -> float: ...
@staticmethod
def length(s0: str, s1: str) -> float: ...
class MetricLCS(MetricStringDistance, NormalizedStringDistance):
def __init__(self): ...
    def distance(self, s0: str, s1: str) -> float: ...

Algorithms that convert strings into sets or profiles of n-character sequences (shingles) and compute similarity based on these representations. They support both direct string comparison and pre-computed profile optimization for large datasets.
class Cosine(ShingleBased, NormalizedStringDistance, NormalizedStringSimilarity):
def __init__(self, k: int): ...
def distance(self, s0: str, s1: str) -> float: ...
def similarity(self, s0: str, s1: str) -> float: ...
def similarity_profiles(self, profile0: dict, profile1: dict) -> float: ...
class QGram(ShingleBased, StringDistance):
def __init__(self, k: int = 3): ...
def distance(self, s0: str, s1: str) -> float: ...
@staticmethod
    def distance_profile(profile0: dict, profile1: dict) -> float: ...

Factory pattern for algorithm instantiation, and utility interfaces for customizing algorithm behavior.
from enum import IntEnum
class Algorithm(IntEnum):
COSINE = 1
DAMERAU = 2
JACCARD = 3
JARO_WINKLE = 4
LEVENSHTEIN = 5
LCS = 6
METRIC_LCS = 7
N_GRAM = 8
NORMALIZED_LEVENSHTEIN = 9
OPTIMAL_STRING_ALIGNMENT = 10
Q_GRAM = 11
SORENSEN_DICE = 12
WEIGHTED_LEVENSHTEIN = 13
class Factory:
@staticmethod
def get_algorithm(algorithm: Algorithm, k: int = 3): ...
@staticmethod
def get_weighted_levenshtein(char_sub: CharacterSubstitutionInterface,
                                 char_change: CharacterInsDelInterface): ...

# Base interface classes
class StringSimilarity:
def similarity(self, s0: str, s1: str) -> float: ...
class NormalizedStringSimilarity(StringSimilarity):
def similarity(self, s0: str, s1: str) -> float: ...
class StringDistance:
def distance(self, s0: str, s1: str) -> float: ...
class NormalizedStringDistance(StringDistance):
def distance(self, s0: str, s1: str) -> float: ...
class MetricStringDistance(StringDistance):
def distance(self, s0: str, s1: str) -> float: ...
# Shingle-based algorithms base class
class ShingleBased:
def __init__(self, k: int = 3): ...
def get_k(self) -> int: ...
def get_profile(self, string: str) -> dict: ...
# Weighted Levenshtein interfaces
class CharacterSubstitutionInterface:
def cost(self, c0: str, c1: str) -> float: ...
class CharacterInsDelInterface:
def deletion_cost(self, c: str) -> float: ...
def insertion_cost(self, c: str) -> float: ...