Tessl Tile for pypi/strsim@0.0.0

or run

npx @tessl/cli init

Version

Tile

Overview

Evals

Files

docs

edit-distance.md factory-utilities.md index.md ngram-shingle.md phonetic-record-linkage.md sequence-based.md

factory-utilities.mddocs/

0
# Factory and Utility Classes
1

2
Factory pattern for algorithm instantiation and utility interfaces for customizing algorithm behavior. These components provide convenient ways to work with multiple algorithms and customize their operation.
3

4
## Capabilities
5

6
### Algorithm Factory
7

8
The Factory class provides a convenient way to instantiate algorithms using an enumeration, making it easy to switch between different similarity measures programmatically.
9

10
```python { .api }
11
from enum import IntEnum
12

13
class Algorithm(IntEnum):
14
    """Enumeration of available string similarity algorithms."""
15
    COSINE = 1
16
    DAMERAU = 2
17
    JACCARD = 3
18
    JARO_WINKLE = 4
19
    LEVENSHTEIN = 5
20
    LCS = 6
21
    METRIC_LCS = 7
22
    N_GRAM = 8
23
    NORMALIZED_LEVENSHTEIN = 9
24
    OPTIMAL_STRING_ALIGNMENT = 10
25
    Q_GRAM = 11
26
    SORENSEN_DICE = 12
27
    WEIGHTED_LEVENSHTEIN = 13
28

29
class Factory:
30
    """Factory for creating string similarity algorithm instances."""
31
    
32
    @staticmethod
33
    def get_algorithm(algorithm: Algorithm, k: int = 3):
34
        """
35
        Create an algorithm instance using the factory pattern.
36
        
37
        Args:
38
            algorithm: Algorithm type from Algorithm enum
39
            k: Shingle size for n-gram based algorithms (default: 3)
40
            
41
        Returns:
42
            Algorithm instance ready for use
43
            
44
        Raises:
45
            TypeError: For WEIGHTED_LEVENSHTEIN (use get_weighted_levenshtein instead)
46
        """
47
    
48
    @staticmethod
49
    def get_weighted_levenshtein(char_sub: CharacterSubstitutionInterface, 
50
                                char_change: CharacterInsDelInterface):
51
        """
52
        Create a WeightedLevenshtein instance with custom cost interfaces.
53
        
54
        Args:
55
            char_sub: Interface defining character substitution costs
56
            char_change: Interface defining insertion/deletion costs
57
            
58
        Returns:
59
            WeightedLevenshtein: Configured weighted Levenshtein algorithm
60
        """
61
```
62

63
**Usage Examples:**
64

65
```python
66
from similarity.similarity import Factory, Algorithm
67

68
# Basic factory usage
69
levenshtein = Factory.get_algorithm(Algorithm.LEVENSHTEIN)
70
distance = levenshtein.distance('hello', 'hallo')
71

72
# N-gram algorithms with custom k
73
cosine = Factory.get_algorithm(Algorithm.COSINE, k=4)
74
jaccard = Factory.get_algorithm(Algorithm.JACCARD, k=2)
75

76
# Programmatic algorithm selection
77
def compare_strings(s1, s2, algorithm_name):
78
    """Compare strings using specified algorithm."""
79
    algo_map = {
80
        'levenshtein': Algorithm.LEVENSHTEIN,
81
        'jaro_winkler': Algorithm.JARO_WINKLE,
82
        'cosine': Algorithm.COSINE,
83
        'jaccard': Algorithm.JACCARD
84
    }
85
    
86
    if algorithm_name not in algo_map:
87
        raise ValueError(f"Unknown algorithm: {algorithm_name}")
88
    
89
    algorithm = Factory.get_algorithm(algo_map[algorithm_name])
90
    
91
    if hasattr(algorithm, 'similarity'):
92
        return algorithm.similarity(s1, s2)
93
    else:
94
        return algorithm.distance(s1, s2)
95

96
# Algorithm comparison utility
97
def compare_with_all_algorithms(s1, s2):
98
    """Compare two strings using all available algorithms."""
99
    results = {}
100
    
101
    # Distance-only algorithms
102
    distance_algos = [
103
        Algorithm.LEVENSHTEIN,
104
        Algorithm.DAMERAU,
105
        Algorithm.LCS,
106
        Algorithm.OPTIMAL_STRING_ALIGNMENT,
107
        Algorithm.Q_GRAM,
108
        Algorithm.N_GRAM
109
    ]
110
    
111
    for algo in distance_algos:
112
        instance = Factory.get_algorithm(algo)
113
        results[algo.name] = {'distance': instance.distance(s1, s2)}
114
    
115
    # Similarity/distance algorithms  
116
    sim_dist_algos = [
117
        Algorithm.JARO_WINKLE,
118
        Algorithm.NORMALIZED_LEVENSHTEIN,
119
        Algorithm.COSINE,
120
        Algorithm.JACCARD,
121
        Algorithm.SORENSEN_DICE,
122
        Algorithm.METRIC_LCS
123
    ]
124
    
125
    for algo in sim_dist_algos:
126
        instance = Factory.get_algorithm(algo)
127
        results[algo.name] = {
128
            'similarity': instance.similarity(s1, s2),
129
            'distance': instance.distance(s1, s2)
130
        }
131
    
132
    return results
133

134
# Example usage
135
results = compare_with_all_algorithms('programming', 'program')
136
for algo_name, scores in results.items():
137
    print(f"{algo_name}: {scores}")
138
```
139

140
### Weighted Levenshtein Factory Method
141

142
Special factory method for creating WeightedLevenshtein instances with custom cost functions:
143

144
**Usage Examples:**
145

146
```python
147
from similarity.similarity import Factory
148
from similarity.weighted_levenshtein import CharacterSubstitutionInterface, CharacterInsDelInterface
149

150
# Custom substitution costs for OCR
151
class OCRSubstitution(CharacterSubstitutionInterface):
152
    def cost(self, c0, c1):
153
        # Visually similar characters have lower cost
154
        similar_pairs = [
155
            ('o', '0'), ('O', '0'),
156
            ('l', '1'), ('I', '1'),
157
            ('S', '5'), ('s', '5'),
158
            ('Z', '2'), ('z', '2')
159
        ]
160
        
161
        if (c0, c1) in similar_pairs or (c1, c0) in similar_pairs:
162
            return 0.3
163
        elif c0.lower() == c1.lower():  # Case differences
164
            return 0.1
165
        else:
166
            return 1.0
167

168
# Custom insertion/deletion costs
169
class KeyboardInsDelCosts(CharacterInsDelInterface):
170
    def insertion_cost(self, c):
171
        # Common accidentally typed characters
172
        if c in ' \t\n':
173
            return 0.5  # Whitespace often accidental
174
        return 1.0
175
    
176
    def deletion_cost(self, c):
177
        # Vowels are often dropped in informal text
178
        if c.lower() in 'aeiou':
179
            return 0.7
180
        return 1.0
181

182
# Create weighted Levenshtein with custom costs
183
substitution = OCRSubstitution()
184
ins_del = KeyboardInsDelCosts()
185
weighted_lev = Factory.get_weighted_levenshtein(substitution, ins_del)
186

187
# OCR correction example
188
ocr_text = "Th3 qu1ck br0wn f0x"
189
correct_text = "The quick brown fox"
190
distance = weighted_lev.distance(ocr_text, correct_text)
191
print(f"Weighted distance: {distance}")
192

193
# Standard Levenshtein for comparison
194
standard_lev = Factory.get_algorithm(Algorithm.LEVENSHTEIN)
195
standard_distance = standard_lev.distance(ocr_text, correct_text)
196
print(f"Standard distance: {standard_distance}")
197
```
198

199
### Utility Interfaces
200

201
Base interfaces that can be implemented to customize algorithm behavior:
202

203
```python { .api }
204
class CharacterSubstitutionInterface:
205
    """Interface for defining custom character substitution costs."""
206
    
207
    def cost(self, c0: str, c1: str) -> float:
208
        """
209
        Return the cost of substituting character c0 with c1.
210
        
211
        Args:
212
            c0: Original character
213
            c1: Replacement character
214
            
215
        Returns:
216
            float: Cost of substitution (typically 0.0-1.0)
217
        """
218

219
class CharacterInsDelInterface:
220
    """Interface for defining custom insertion and deletion costs."""
221
    
222
    def deletion_cost(self, c: str) -> float:
223
        """
224
        Return the cost of deleting character c.
225
        
226
        Args:
227
            c: Character to delete
228
            
229
        Returns:
230
            float: Cost of deletion
231
        """
232
    
233
    def insertion_cost(self, c: str) -> float:
234
        """
235
        Return the cost of inserting character c.
236
        
237
        Args:
238
            c: Character to insert
239
            
240
        Returns:
241
            float: Cost of insertion
242
        """
243
```
244

245
**Advanced Customization Examples:**
246

247
```python
248
# Domain-specific costs for name matching
249
class NameMatchingCosts(CharacterSubstitutionInterface):
250
    def cost(self, c0, c1):
251
        # Common name variations
252
        name_variants = {
253
            ('c', 'k'), ('ph', 'f'), ('y', 'i'),
254
            ('ie', 'y'), ('son', 'sen'), ('sen', 'son')
255
        }
256
        
257
        # Handle multi-character patterns
258
        pair = (c0, c1)
259
        if pair in name_variants or (c1, c0) in name_variants:
260
            return 0.2
261
        
262
        # Phonetically similar
263
        phonetic_groups = [
264
            'bp', 'dt', 'kg', 'sz', 'fv'
265
        ]
266
        
267
        for group in phonetic_groups:
268
            if c0 in group and c1 in group:
269
                return 0.4
270
        
271
        return 1.0
272

273
# Keyboard layout based costs
274
class QWERTYSubstitution(CharacterSubstitutionInterface):
275
    def __init__(self):
276
        # Define keyboard layout
277
        self.keyboard = [
278
            'qwertyuiop',
279
            'asdfghjkl',
280
            'zxcvbnm'
281
        ]
282
        self.positions = {}
283
        for row, keys in enumerate(self.keyboard):
284
            for col, key in enumerate(keys):
285
                self.positions[key] = (row, col)
286
    
287
    def cost(self, c0, c1):
288
        if c0 == c1:
289
            return 0.0
290
        
291
        c0, c1 = c0.lower(), c1.lower()
292
        
293
        if c0 in self.positions and c1 in self.positions:
294
            pos0 = self.positions[c0]
295
            pos1 = self.positions[c1]
296
            
297
            # Manhattan distance on keyboard
298
            distance = abs(pos0[0] - pos1[0]) + abs(pos0[1] - pos1[1])
299
            
300
            # Adjacent keys have lower cost
301
            if distance == 1:
302
                return 0.3
303
            elif distance == 2:
304
                return 0.6
305
        
306
        return 1.0
307

308
# Usage with factory
309
keyboard_costs = QWERTYSubstitution()
310
weighted_lev = Factory.get_weighted_levenshtein(keyboard_costs, None)
311

312
# Typo detection
313
typos = [
314
    ("hello", "heklo"),   # Adjacent key error
315
    ("world", "qorld"),   # q instead of w
316
    ("python", "pytjon")  # j instead of h
317
]
318

319
for correct, typo in typos:
320
    distance = weighted_lev.distance(correct, typo)
321
    print(f"'{correct}' vs '{typo}': {distance}")
322
```
323

324
### Error Handling
325

326
All factory methods and interfaces include proper error handling:
327

328
- **TypeError**: Raised when attempting to create WEIGHTED_LEVENSHTEIN through `get_algorithm()`
329
- **ValueError**: May be raised by custom interface implementations for invalid inputs
330
- **AttributeError**: Raised when accessing methods not supported by specific algorithms
331

332
**Example Error Handling:**
333

334
```python
335
from similarity.similarity import Factory, Algorithm
336

337
try:
338
    # This will raise TypeError
339
    weighted = Factory.get_algorithm(Algorithm.WEIGHTED_LEVENSHTEIN)
340
except TypeError as e:
341
    print(f"Error: {e}")
342
    print("Use get_weighted_levenshtein() instead")
343

344
# Proper way to handle different algorithm types
345
def safe_get_similarity(s1, s2, algorithm_type):
346
    """Safely get similarity score from any algorithm."""
347
    try:
348
        algorithm = Factory.get_algorithm(algorithm_type)
349
        
350
        if hasattr(algorithm, 'similarity'):
351
            return algorithm.similarity(s1, s2)
352
        else:
353
            # Convert distance to similarity (approximate)
354
            distance = algorithm.distance(s1, s2)
355
            max_len = max(len(s1), len(s2))
356
            return max(0, 1 - distance / max_len) if max_len > 0 else 1.0
357
            
358
    except TypeError:
359
        return None  # Weighted Levenshtein requires special handling
360
```

Version

Tile

Files

factory-utilities.md.css-3qkkll{font-size:var(--chakra-font-sizes-sm);font-weight:var(--chakra-font-weights-normal);color:var(--chakra-colors-gray-300);}docs/

factory-utilities.mddocs/