# String Averaging and Median

Functions for computing approximate median strings, improving strings toward a target set, and calculating sequence and set similarity ratios for multiple strings. These functions are particularly useful for finding representative strings from collections and analyzing groups of related strings.

## Capabilities

### Median String

Find an approximate median string for a list of strings: a string with a small total edit distance to all of the inputs.

```python { .api }
def median(strings, weights=None):
    """
    Find approximate median string from a list of strings.

    Searches for a string that approximately minimizes the sum of edit
    distances to all input strings. More accurate but slower than quickmedian().

    Parameters:
    - strings: List of strings to find median for
    - weights: Optional list of weights for each string (same length as strings)

    Returns:
    str: Approximate median string
    """
```

**Usage Examples:**

```python
import Levenshtein

# Find median of similar strings
strings = ["SpSm", "mpamm", "Spam", "Spa", "Sua", "hSam"]
med = Levenshtein.median(strings)
print(f"Median: '{med}'")  # 'Spam'

# Find median for spell-checking candidates
misspellings = [
    "Levnhtein", "Leveshein", "Leenshten", "Leveshtei",
    "Lenshtein", "Lvenstein", "Levenhtin", "evenshtei"
]
correct = Levenshtein.median(misspellings)
print(f"Correct spelling: '{correct}'")  # 'Levenshtein'

# Find median of file names
filenames = ["document1.txt", "document2.txt", "document3.txt", "document4.txt"]
template = Levenshtein.median(filenames)
print(f"Template: '{template}'")  # A representative file name (approximate)

# Weighted median (give more weight to certain strings)
strings = ["correct", "crect", "corect", "correct"]
weights = [3, 1, 1, 2]  # Give more weight to "correct"
weighted_med = Levenshtein.median(strings, weights)
print(f"Weighted median: '{weighted_med}'")  # Should be closer to "correct"
```
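The docstring above describes `median()` as approximately minimizing the total edit distance to the inputs. The following sketch (not part of the library, just a check built from `Levenshtein.distance`) makes that concrete by comparing the median's total distance against each raw input used as the representative:

```python
import Levenshtein

strings = ["SpSm", "mpamm", "Spam", "Spa", "Sua", "hSam"]
med = Levenshtein.median(strings)

# Total edit distance from the median to every input string
median_cost = sum(Levenshtein.distance(med, s) for s in strings)
print(f"median '{med}': total distance {median_cost}")

# Compare against using one of the raw inputs as the representative
for candidate in strings:
    cost = sum(Levenshtein.distance(candidate, s) for s in strings)
    print(f"'{candidate}': total distance {cost}")
```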
### Quick Median String

Fast approximate median string calculation with optional weights for different strings in the input set.

```python { .api }
def quickmedian(strings, weights=None):
    """
    Fast approximate median string calculation.

    Uses a faster heuristic algorithm. Less accurate than median() but much faster
    for large string sets. Supports optional weighting of input strings.

    Parameters:
    - strings: List of strings to find median for
    - weights: Optional list of weights for each string (same length as strings)

    Returns:
    str: Approximate median string
    """
```

**Usage Examples:**

```python
import Levenshtein

# Fast median calculation
misspellings = [
    "Levnhtein", "Leveshein", "Leenshten", "Leveshtei",
    "Lenshtein", "Lvenstein", "Levenhtin", "evenshtei"
]
quick_med = Levenshtein.quickmedian(misspellings)
print(f"Quick median: '{quick_med}'")  # 'Levnshein'

# Weighted median (emphasize certain strings)
strings = ["apple", "aple", "appl", "apple"]
weights = [3, 1, 1, 2]  # Give more weight to "apple"
weighted_med = Levenshtein.quickmedian(strings, weights)
print(f"Weighted median: '{weighted_med}'")  # Closer to "apple"

# Handle zero weights (ignored strings)
strings = ["test", "testing", "tester"]
weights = [1, 0, 1]  # Ignore "testing"
result = Levenshtein.quickmedian(strings, weights)
print(f"With zero weights: '{result}'")  # Based only on "test" and "tester"
```
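To make the speed/accuracy trade-off between `quickmedian()` and `median()` concrete, a rough comparison sketch is shown below; the timings and totals are illustrative only and will vary with your data and machine.

```python
import time
import Levenshtein

strings = [
    "Levnhtein", "Leveshein", "Leenshten", "Leveshtei",
    "Lenshtein", "Lvenstein", "Levenhtin", "evenshtei"
]

def total_distance(candidate, targets):
    """Sum of edit distances from a candidate string to every target."""
    return sum(Levenshtein.distance(candidate, t) for t in targets)

for name, func in [("median", Levenshtein.median), ("quickmedian", Levenshtein.quickmedian)]:
    start = time.perf_counter()
    result = func(strings)
    elapsed = time.perf_counter() - start
    print(f"{name}: '{result}' (total distance {total_distance(result, strings)}, {elapsed:.6f}s)")
```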
### Median Improvement

Improve a string towards the median of a given set of strings; the operation can be applied repeatedly for further refinement.

```python { .api }
def median_improve(string, strings, weights=None):
    """
    Improve a string towards median of given strings.

    Takes an initial string and improves it to be closer to the median of the
    target string set. Can be applied multiple times for further improvement.

    Parameters:
    - string: Initial string to improve
    - strings: List of target strings to improve towards
    - weights: Optional list of weights for each string (same length as strings)

    Returns:
    str: Improved string that is closer to the median of the target strings
    """
```

**Usage Examples:**

```python
import Levenshtein

# Single improvement step
target_strings = [
    "Levnhtein", "Leveshein", "Leenshten", "Leveshtei",
    "Lenshtein", "Lvenstein", "Levenhtin", "evenshtei"
]

# Start with a poor guess
initial = "spam"
improved1 = Levenshtein.median_improve(initial, target_strings)
print(f"First improvement: '{initial}' -> '{improved1}'")  # 'spam' -> 'enhtein'

# Apply improvement again
improved2 = Levenshtein.median_improve(improved1, target_strings)
print(f"Second improvement: '{improved1}' -> '{improved2}'")  # 'enhtein' -> 'Levenshtein'

# Iterative improvement function
def iterative_improve(initial, targets, max_iterations=10):
    """Iteratively improve a string."""
    current = initial
    for i in range(max_iterations):
        improved = Levenshtein.median_improve(current, targets)
        if improved == current:  # No more improvement possible
            break
        print(f"Iteration {i+1}: '{current}' -> '{improved}'")
        current = improved
    return current

# Example
final = iterative_improve("xyz", target_strings)
print(f"Final result: '{final}'")

# Using weights to emphasize certain target strings
target_strings_weighted = ["python", "programming", "coding", "development"]
weights = [3, 2, 1, 1]  # Give more weight to "python" and "programming"
initial = "prog"
improved_weighted = Levenshtein.median_improve(initial, target_strings_weighted, weights)
print(f"Weighted improvement: '{initial}' -> '{improved_weighted}'")
```
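Since `median_improve()` only needs a starting guess, one reasonable pattern (a workflow suggestion, not something the library requires) is to seed it with the fast `quickmedian()` result and refine until the string stops changing:

```python
import Levenshtein

target_strings = [
    "Levnhtein", "Leveshein", "Leenshten", "Leveshtei",
    "Lenshtein", "Lvenstein", "Levenhtin", "evenshtei"
]

# Cheap first guess from quickmedian, then refine it with median_improve
guess = Levenshtein.quickmedian(target_strings)
for _ in range(10):  # cap the number of refinement passes
    refined = Levenshtein.median_improve(guess, target_strings)
    if refined == guess:
        break
    guess = refined

print(f"Refined median: '{guess}'")
```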
### Set Median

Find the set median of a list of strings: the input string with the smallest total edit distance to all of the other strings.

```python { .api }
def setmedian(strings, weights=None):
    """
    Find the set median of a list of strings.

    The set median is chosen from the input strings themselves: it is the
    string with the smallest sum of edit distances to all the other strings.
    Unlike median(), it never constructs a new string.

    Parameters:
    - strings: List of strings to find set median for
    - weights: Optional list of weights for each string (same length as strings)

    Returns:
    str: Set median string (always one of the input strings)
    """
```

**Usage Examples:**

```python
import Levenshtein

# The set median is the most central of the input strings
strings = [
    "ehee", "cceaes", "chees", "chreesc",
    "chees", "cheesee", "cseese", "chetese"
]
set_med = Levenshtein.setmedian(strings)
print(f"Set median: '{set_med}'")  # 'chees'

# Compare with regular median
regular_med = Levenshtein.median(strings)
print(f"Regular median: '{regular_med}'")

# Pick the most representative spelling from a list of variants
cheese_variants = ["cheese", "chese", "cheeze", "chees", "cheess"]
set_result = Levenshtein.setmedian(cheese_variants)
print(f"Cheese variants set median: '{set_result}'")

# Weighted set median (emphasize certain variants)
weights = [3, 1, 1, 2, 1]  # Give more weight to "cheese" and "chees"
weighted_set_result = Levenshtein.setmedian(cheese_variants, weights)
print(f"Weighted set median: '{weighted_set_result}'")
```
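Unlike `median()`, which may construct a string that appears nowhere in the input, the set median is selected from the inputs themselves. A quick membership check (a small sketch, assuming the behaviour described above) illustrates the difference:

```python
import Levenshtein

strings = [
    "ehee", "cceaes", "chees", "chreesc",
    "chees", "cheesee", "cseese", "chetese"
]

set_med = Levenshtein.setmedian(strings)
gen_med = Levenshtein.median(strings)

# The set median is one of the inputs; the generalized median need not be.
print(f"Set median '{set_med}' appears in the inputs: {set_med in strings}")
print(f"Generalized median '{gen_med}' appears in the inputs: {gen_med in strings}")
```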
### Sequence Ratio

Calculate a similarity ratio for two sequences of strings, where the order of the strings in each list matters.

```python { .api }
def seqratio(strings1, strings2):
    """
    Calculate similarity ratio between two sequences of strings.

    Works like ratio(), but on sequences of strings instead of sequences of
    characters: the two lists are compared in order, and a ratio-like measure
    is used for the cost of changing one string into another.

    Parameters:
    - strings1: First list of strings to compare
    - strings2: Second list of strings to compare

    Returns:
    float: Sequence similarity ratio between the two string lists (0.0-1.0)
    """
```

**Usage Examples:**

```python
import Levenshtein

# Compare two lists of mostly unrelated strings
seq1 = ["newspaper", "litter bin", "tinny", "antelope"]
seq2 = ["caribou", "sausage", "gorn", "woody"]
seq_ratio = Levenshtein.seqratio(seq1, seq2)
print(f"Sequence similarity ratio: {seq_ratio:.3f}")  # ~0.215

# Compare identical sequences
identical1 = ["abc", "def", "ghi"]
identical2 = ["abc", "def", "ghi"]
seq_ratio = Levenshtein.seqratio(identical1, identical2)
print(f"Identical sequences ratio: {seq_ratio:.3f}")  # 1.0

# Compare sequences with different lengths
short_seq = ["a", "b"]
long_seq = ["apple", "banana", "cherry", "date"]
seq_ratio = Levenshtein.seqratio(short_seq, long_seq)
print(f"Different length sequences ratio: {seq_ratio:.3f}")
```
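Because `seqratio()` takes lists of strings, it can also be applied to tokenized text, for example comparing two sentences word by word. This is a usage sketch rather than a feature specific to the library:

```python
import Levenshtein

sentence1 = "the quick brown fox jumps over the lazy dog"
sentence2 = "the quick brown cat jumps over a lazy dog"

# Compare the sentences as sequences of words rather than characters
words1 = sentence1.split()
words2 = sentence2.split()
print(f"Word-level sequence ratio: {Levenshtein.seqratio(words1, words2):.3f}")
```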
### Set Ratio

Calculate a similarity ratio for two collections of strings treated as unordered sets of strings.

```python { .api }
def setratio(strings1, strings2):
    """
    Calculate similarity ratio between two sets of strings.

    The best match between the strings of the first collection and the strings
    of the second collection is used, so the order of the strings in each list
    does not matter.

    Parameters:
    - strings1: First list of strings to compare
    - strings2: Second list of strings to compare

    Returns:
    float: Set similarity ratio between the two string collections (0.0-1.0)
    """
```

**Usage Examples:**

```python
import Levenshtein

# Compare two collections of strings, ignoring order
set1 = ["newspaper", "litter bin", "tinny", "antelope"]
set2 = ["caribou", "sausage", "gorn", "woody"]
set_ratio = Levenshtein.setratio(set1, set2)
print(f"Set similarity ratio: {set_ratio:.3f}")  # ~0.282

# Compare collections of similar, overlapping strings
overlapping1 = ["hello", "help", "held"]
overlapping2 = ["hell", "helping", "holder"]
set_ratio = Levenshtein.setratio(overlapping1, overlapping2)
print(f"Overlapping strings ratio: {set_ratio:.3f}")

# Similar strings listed in a different order still match well
scrambled1 = ["abc", "bca", "cab"]
scrambled2 = ["acb", "cba", "abc"]
set_ratio = Levenshtein.setratio(scrambled1, scrambled2)
print(f"Scrambled order ratio: {set_ratio:.3f}")  # Relatively high: each string has a close match

# Compare collections with nothing in common
different1 = ["abc", "def"]
different2 = ["xyz", "uvw"]
set_ratio = Levenshtein.setratio(different1, different2)
print(f"Unrelated strings ratio: {set_ratio:.3f}")  # Should be very low
```
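The practical difference between the two ratios is that `seqratio()` compares the lists in order, while `setratio()` pairs each string with its best match regardless of position. Reordering one list therefore tends to lower `seqratio()` but should leave `setratio()` unchanged, as the sketch below suggests:

```python
import Levenshtein

names = ["alpha", "beta", "gamma", "delta"]
shuffled = ["delta", "gamma", "beta", "alpha"]

print(f"seqratio, same order:     {Levenshtein.seqratio(names, names):.3f}")
print(f"seqratio, shuffled order: {Levenshtein.seqratio(names, shuffled):.3f}")
print(f"setratio, same order:     {Levenshtein.setratio(names, names):.3f}")
print(f"setratio, shuffled order: {Levenshtein.setratio(names, shuffled):.3f}")
```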
## Advanced Usage Patterns

### Spell Checking with Median

```python
import Levenshtein

def spell_check_with_median(word, dictionary, k=5):
    """
    Advanced spell checking using median of closest matches.
    """
    # Find k closest dictionary words
    candidates = []
    for dict_word in dictionary:
        dist = Levenshtein.distance(word, dict_word)
        candidates.append((dict_word, dist))

    # Sort by distance and take top k
    candidates.sort(key=lambda x: x[1])
    closest_words = [w for w, _ in candidates[:k]]

    # Find median of closest matches
    suggestion = Levenshtein.median(closest_words)
    return suggestion, closest_words

# Example usage
dictionary = ["hello", "help", "helm", "held", "hell", "heal"]
word = "helo"
suggestion, candidates = spell_check_with_median(word, dictionary)
print(f"Word: '{word}'")
print(f"Candidates: {candidates}")
print(f"Suggested correction: '{suggestion}'")
```
### String Clustering Analysis

```python
import Levenshtein

def analyze_string_cluster(strings1, strings2):
    """
    Analyze two clusters of strings for similarity patterns.
    """
    results = {
        'median1': Levenshtein.median(strings1),
        'median2': Levenshtein.median(strings2),
        'quick_median1': Levenshtein.quickmedian(strings1),
        'quick_median2': Levenshtein.quickmedian(strings2),
        'set_median1': Levenshtein.setmedian(strings1),
        'set_median2': Levenshtein.setmedian(strings2),
        'sequence_ratio': Levenshtein.seqratio(strings1, strings2),
        'set_ratio': Levenshtein.setratio(strings1, strings2)
    }

    print("String Cluster Analysis:")
    print(f"Input strings1: {strings1}")
    print(f"Input strings2: {strings2}")
    print(f"Median1: '{results['median1']}'")
    print(f"Median2: '{results['median2']}'")
    print(f"Quick median1: '{results['quick_median1']}'")
    print(f"Quick median2: '{results['quick_median2']}'")
    print(f"Set median1: '{results['set_median1']}'")
    print(f"Set median2: '{results['set_median2']}'")
    print(f"Sequence similarity: {results['sequence_ratio']:.3f}")
    print(f"Set similarity: {results['set_ratio']:.3f}")

    return results

# Example
file_variants1 = [
    "data_file_001.csv", "data_file_002.csv", "data_file_003.csv"
]
file_variants2 = [
    "info_doc_001.txt", "info_doc_002.txt", "info_doc_003.txt"
]
analyze_string_cluster(file_variants1, file_variants2)
```
### Progressive String Improvement

```python
import Levenshtein

def progressive_improvement(initial, targets, verbose=True):
    """
    Progressive improvement of a string towards a target set.
    """
    current = initial
    iterations = 0
    improvements = [current]

    while iterations < 20:  # Prevent infinite loops
        improved = Levenshtein.median_improve(current, targets)

        if improved == current:
            if verbose:
                print(f"Converged after {iterations} iterations")
            break

        if verbose:
            # Calculate average distance to targets
            avg_dist_before = sum(Levenshtein.distance(current, t) for t in targets) / len(targets)
            avg_dist_after = sum(Levenshtein.distance(improved, t) for t in targets) / len(targets)
            print(f"Iteration {iterations + 1}: '{current}' -> '{improved}' "
                  f"(avg distance: {avg_dist_before:.2f} -> {avg_dist_after:.2f})")

        current = improved
        improvements.append(current)
        iterations += 1

    return current, improvements

# Example
targets = ["python", "programming", "development", "coding"]
final, history = progressive_improvement("xyz", targets)
print(f"\nFinal result: '{final}'")
print(f"Improvement history: {' -> '.join(history[:5])}{'...' if len(history) > 5 else ''}")
```
### Weighted Median with Confidence

```python
import Levenshtein

def weighted_median_with_confidence(strings, confidences=None):
    """
    Calculate weighted median with confidence scores.
    """
    if confidences is None:
        confidences = [1.0] * len(strings)

    # Normalize confidences to use as weights
    total_conf = sum(confidences)
    weights = [int(conf * 100 / total_conf) for conf in confidences]

    # Calculate different medians
    regular_median = Levenshtein.median(strings)
    weighted_median = Levenshtein.quickmedian(strings, weights)

    # Calculate confidence metrics
    reg_avg_dist = sum(Levenshtein.distance(regular_median, s) for s in strings) / len(strings)
    weighted_avg_dist = sum(Levenshtein.distance(weighted_median, s) * w
                            for s, w in zip(strings, weights)) / sum(weights)

    return {
        'regular_median': regular_median,
        'weighted_median': weighted_median,
        'regular_avg_distance': reg_avg_dist,
        'weighted_avg_distance': weighted_avg_dist,
        'weights_used': weights
    }

# Example
strings = ["correct", "corect", "corerct", "correct", "crect"]
confidences = [0.9, 0.3, 0.2, 0.9, 0.1]  # High confidence in first and fourth

result = weighted_median_with_confidence(strings, confidences)
print("Weighted Median Analysis:")
for key, value in result.items():
    print(f"{key}: {value}")
```
```