# String Averaging and Median

Functions for computing approximate median strings, improving strings toward a target set, and calculating sequence and set similarity ratios for multiple strings. These functions are particularly useful for finding representative strings from collections and analyzing groups of related strings.

## Capabilities

### Median String

Find an approximate median string for a list of strings: a string with a small total edit distance to all of the inputs.

```python { .api }
def median(strings, weights=None):
    """
    Find approximate median string from a list of strings.

    Searches for a string that approximately minimizes the sum of edit
    distances to all input strings. More accurate but slower than quickmedian().

    Parameters:
    - strings: List of strings to find median for
    - weights: Optional list of weights for each string (same length as strings)

    Returns:
    str: Approximate median string
    """
```

**Usage Examples:**

```python
import Levenshtein

# Find median of similar strings
strings = ["SpSm", "mpamm", "Spam", "Spa", "Sua", "hSam"]
med = Levenshtein.median(strings)
print(f"Median: '{med}'")  # 'Spam'

# Find median for spell-checking candidates
misspellings = [
    "Levnhtein", "Leveshein", "Leenshten", "Leveshtei",
    "Lenshtein", "Lvenstein", "Levenhtin", "evenshtei"
]
correct = Levenshtein.median(misspellings)
print(f"Correct spelling: '{correct}'")  # 'Levenshtein'

# Find median of file names
filenames = ["document1.txt", "document2.txt", "document3.txt", "document4.txt"]
template = Levenshtein.median(filenames)
print(f"Template: '{template}'")  # A representative file name (approximate)

# Weighted median (give more weight to certain strings)
strings = ["correct", "crect", "corect", "correct"]
weights = [3, 1, 1, 2]  # Give more weight to "correct"
weighted_med = Levenshtein.median(strings, weights)
print(f"Weighted median: '{weighted_med}'")  # Should be closer to "correct"
```
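The docstring above describes `median()` as approximately minimizing the total edit distance to the inputs. The following sketch (not part of the library, just a check built from `Levenshtein.distance`) makes that concrete by comparing the median's total distance against each raw input used as the representative:

```python
import Levenshtein

strings = ["SpSm", "mpamm", "Spam", "Spa", "Sua", "hSam"]
med = Levenshtein.median(strings)

# Total edit distance from the median to every input string
median_cost = sum(Levenshtein.distance(med, s) for s in strings)
print(f"median '{med}': total distance {median_cost}")

# Compare against using one of the raw inputs as the representative
for candidate in strings:
    cost = sum(Levenshtein.distance(candidate, s) for s in strings)
    print(f"'{candidate}': total distance {cost}")
```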
### Quick Median String

Fast approximate median string calculation with optional weights for different strings in the input set.

```python { .api }
def quickmedian(strings, weights=None):
    """
    Fast approximate median string calculation.

    Uses a faster heuristic algorithm. Less accurate than median() but much faster
    for large string sets. Supports optional weighting of input strings.

    Parameters:
    - strings: List of strings to find median for
    - weights: Optional list of weights for each string (same length as strings)

    Returns:
    str: Approximate median string
    """
```

**Usage Examples:**

```python
import Levenshtein

# Fast median calculation
misspellings = [
    "Levnhtein", "Leveshein", "Leenshten", "Leveshtei",
    "Lenshtein", "Lvenstein", "Levenhtin", "evenshtei"
]
quick_med = Levenshtein.quickmedian(misspellings)
print(f"Quick median: '{quick_med}'")  # 'Levnshein'

# Weighted median (emphasize certain strings)
strings = ["apple", "aple", "appl", "apple"]
weights = [3, 1, 1, 2]  # Give more weight to "apple"
weighted_med = Levenshtein.quickmedian(strings, weights)
print(f"Weighted median: '{weighted_med}'")  # Closer to "apple"

# Handle zero weights (ignored strings)
strings = ["test", "testing", "tester"]
weights = [1, 0, 1]  # Ignore "testing"
result = Levenshtein.quickmedian(strings, weights)
print(f"With zero weights: '{result}'")  # Based only on "test" and "tester"
```
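To make the speed/accuracy trade-off between `quickmedian()` and `median()` concrete, a rough comparison sketch is shown below; the timings and totals are illustrative only and will vary with your data and machine.

```python
import time
import Levenshtein

strings = [
    "Levnhtein", "Leveshein", "Leenshten", "Leveshtei",
    "Lenshtein", "Lvenstein", "Levenhtin", "evenshtei"
]

def total_distance(candidate, targets):
    """Sum of edit distances from a candidate string to every target."""
    return sum(Levenshtein.distance(candidate, t) for t in targets)

for name, func in [("median", Levenshtein.median), ("quickmedian", Levenshtein.quickmedian)]:
    start = time.perf_counter()
    result = func(strings)
    elapsed = time.perf_counter() - start
    print(f"{name}: '{result}' (total distance {total_distance(result, strings)}, {elapsed:.6f}s)")
```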
### Median Improvement

Improve a string towards the median of a given set of strings; the operation can be applied repeatedly for further refinement.

```python { .api }
def median_improve(string, strings, weights=None):
    """
    Improve a string towards median of given strings.

    Takes an initial string and improves it to be closer to the median of the
    target string set. Can be applied multiple times for further improvement.

    Parameters:
    - string: Initial string to improve
    - strings: List of target strings to improve towards
    - weights: Optional list of weights for each string (same length as strings)

    Returns:
    str: Improved string that is closer to the median of the target strings
    """
```

**Usage Examples:**

```python
import Levenshtein

# Single improvement step
target_strings = [
    "Levnhtein", "Leveshein", "Leenshten", "Leveshtei",
    "Lenshtein", "Lvenstein", "Levenhtin", "evenshtei"
]

# Start with a poor guess
initial = "spam"
improved1 = Levenshtein.median_improve(initial, target_strings)
print(f"First improvement: '{initial}' -> '{improved1}'")  # 'spam' -> 'enhtein'

# Apply improvement again
improved2 = Levenshtein.median_improve(improved1, target_strings)
print(f"Second improvement: '{improved1}' -> '{improved2}'")  # 'enhtein' -> 'Levenshtein'

# Iterative improvement function
def iterative_improve(initial, targets, max_iterations=10):
    """Iteratively improve a string."""
    current = initial
    for i in range(max_iterations):
        improved = Levenshtein.median_improve(current, targets)
        if improved == current:  # No more improvement possible
            break
        print(f"Iteration {i+1}: '{current}' -> '{improved}'")
        current = improved
    return current

# Example
final = iterative_improve("xyz", target_strings)
print(f"Final result: '{final}'")

# Using weights to emphasize certain target strings
target_strings_weighted = ["python", "programming", "coding", "development"]
weights = [3, 2, 1, 1]  # Give more weight to "python" and "programming"
initial = "prog"
improved_weighted = Levenshtein.median_improve(initial, target_strings_weighted, weights)
print(f"Weighted improvement: '{initial}' -> '{improved_weighted}'")
```
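Since `median_improve()` only needs a starting guess, one reasonable pattern (a workflow suggestion, not something the library requires) is to seed it with the fast `quickmedian()` result and refine until the string stops changing:

```python
import Levenshtein

target_strings = [
    "Levnhtein", "Leveshein", "Leenshten", "Leveshtei",
    "Lenshtein", "Lvenstein", "Levenhtin", "evenshtei"
]

# Cheap first guess from quickmedian, then refine it with median_improve
guess = Levenshtein.quickmedian(target_strings)
for _ in range(10):  # cap the number of refinement passes
    refined = Levenshtein.median_improve(guess, target_strings)
    if refined == guess:
        break
    guess = refined

print(f"Refined median: '{guess}'")
```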
### Set Median

Find the set median of a list of strings: the input string with the smallest total edit distance to all of the other strings.

```python { .api }
def setmedian(strings, weights=None):
    """
    Find the set median of a list of strings.

    The set median is chosen from the input strings themselves: it is the
    string with the smallest sum of edit distances to all the other strings.
    Unlike median(), it never constructs a new string.

    Parameters:
    - strings: List of strings to find set median for
    - weights: Optional list of weights for each string (same length as strings)

    Returns:
    str: Set median string (always one of the input strings)
    """
```

**Usage Examples:**

```python
import Levenshtein

# The set median is the most central of the input strings
strings = [
    "ehee", "cceaes", "chees", "chreesc",
    "chees", "cheesee", "cseese", "chetese"
]
set_med = Levenshtein.setmedian(strings)
print(f"Set median: '{set_med}'")  # 'chees'

# Compare with regular median
regular_med = Levenshtein.median(strings)
print(f"Regular median: '{regular_med}'")

# Pick the most representative spelling from a list of variants
cheese_variants = ["cheese", "chese", "cheeze", "chees", "cheess"]
set_result = Levenshtein.setmedian(cheese_variants)
print(f"Cheese variants set median: '{set_result}'")

# Weighted set median (emphasize certain variants)
weights = [3, 1, 1, 2, 1]  # Give more weight to "cheese" and "chees"
weighted_set_result = Levenshtein.setmedian(cheese_variants, weights)
print(f"Weighted set median: '{weighted_set_result}'")
```
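Unlike `median()`, which may construct a string that appears nowhere in the input, the set median is selected from the inputs themselves. A quick membership check (a small sketch, assuming the behaviour described above) illustrates the difference:

```python
import Levenshtein

strings = [
    "ehee", "cceaes", "chees", "chreesc",
    "chees", "cheesee", "cseese", "chetese"
]

set_med = Levenshtein.setmedian(strings)
gen_med = Levenshtein.median(strings)

# The set median is one of the inputs; the generalized median need not be.
print(f"Set median '{set_med}' appears in the inputs: {set_med in strings}")
print(f"Generalized median '{gen_med}' appears in the inputs: {gen_med in strings}")
```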
### Sequence Ratio

Calculate a similarity ratio for two sequences of strings, where the order of the strings in each list matters.

```python { .api }
def seqratio(strings1, strings2):
    """
    Calculate similarity ratio between two sequences of strings.

    Works like ratio(), but on sequences of strings instead of sequences of
    characters: the two lists are compared in order, and a ratio-like measure
    is used for the cost of changing one string into another.

    Parameters:
    - strings1: First list of strings to compare
    - strings2: Second list of strings to compare

    Returns:
    float: Sequence similarity ratio between the two string lists (0.0-1.0)
    """
```

**Usage Examples:**

```python
import Levenshtein

# Compare two lists of mostly unrelated strings
seq1 = ["newspaper", "litter bin", "tinny", "antelope"]
seq2 = ["caribou", "sausage", "gorn", "woody"]
seq_ratio = Levenshtein.seqratio(seq1, seq2)
print(f"Sequence similarity ratio: {seq_ratio:.3f}")  # ~0.215

# Compare identical sequences
identical1 = ["abc", "def", "ghi"]
identical2 = ["abc", "def", "ghi"]
seq_ratio = Levenshtein.seqratio(identical1, identical2)
print(f"Identical sequences ratio: {seq_ratio:.3f}")  # 1.0

# Compare sequences with different lengths
short_seq = ["a", "b"]
long_seq = ["apple", "banana", "cherry", "date"]
seq_ratio = Levenshtein.seqratio(short_seq, long_seq)
print(f"Different length sequences ratio: {seq_ratio:.3f}")
```
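Because `seqratio()` takes lists of strings, it can also be applied to tokenized text, for example comparing two sentences word by word. This is a usage sketch rather than a feature specific to the library:

```python
import Levenshtein

sentence1 = "the quick brown fox jumps over the lazy dog"
sentence2 = "the quick brown cat jumps over a lazy dog"

# Compare the sentences as sequences of words rather than characters
words1 = sentence1.split()
words2 = sentence2.split()
print(f"Word-level sequence ratio: {Levenshtein.seqratio(words1, words2):.3f}")
```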
### Set Ratio

Calculate a similarity ratio for two collections of strings treated as unordered sets of strings.

```python { .api }
def setratio(strings1, strings2):
    """
    Calculate similarity ratio between two sets of strings.

    The best match between the strings of the first collection and the strings
    of the second collection is used, so the order of the strings in each list
    does not matter.

    Parameters:
    - strings1: First list of strings to compare
    - strings2: Second list of strings to compare

    Returns:
    float: Set similarity ratio between the two string collections (0.0-1.0)
    """
```

**Usage Examples:**

```python
import Levenshtein

# Compare two collections of strings, ignoring order
set1 = ["newspaper", "litter bin", "tinny", "antelope"]
set2 = ["caribou", "sausage", "gorn", "woody"]
set_ratio = Levenshtein.setratio(set1, set2)
print(f"Set similarity ratio: {set_ratio:.3f}")  # ~0.282

# Compare collections of similar, overlapping strings
overlapping1 = ["hello", "help", "held"]
overlapping2 = ["hell", "helping", "holder"]
set_ratio = Levenshtein.setratio(overlapping1, overlapping2)
print(f"Overlapping strings ratio: {set_ratio:.3f}")

# Similar strings listed in a different order still match well
scrambled1 = ["abc", "bca", "cab"]
scrambled2 = ["acb", "cba", "abc"]
set_ratio = Levenshtein.setratio(scrambled1, scrambled2)
print(f"Scrambled order ratio: {set_ratio:.3f}")  # Relatively high: each string has a close match

# Compare collections with nothing in common
different1 = ["abc", "def"]
different2 = ["xyz", "uvw"]
set_ratio = Levenshtein.setratio(different1, different2)
print(f"Unrelated strings ratio: {set_ratio:.3f}")  # Should be very low
```
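The practical difference between the two ratios is that `seqratio()` compares the lists in order, while `setratio()` pairs each string with its best match regardless of position. Reordering one list therefore tends to lower `seqratio()` but should leave `setratio()` unchanged, as the sketch below suggests:

```python
import Levenshtein

names = ["alpha", "beta", "gamma", "delta"]
shuffled = ["delta", "gamma", "beta", "alpha"]

print(f"seqratio, same order:     {Levenshtein.seqratio(names, names):.3f}")
print(f"seqratio, shuffled order: {Levenshtein.seqratio(names, shuffled):.3f}")
print(f"setratio, same order:     {Levenshtein.setratio(names, names):.3f}")
print(f"setratio, shuffled order: {Levenshtein.setratio(names, shuffled):.3f}")
```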
## Advanced Usage Patterns

### Spell Checking with Median

```python
import Levenshtein

def spell_check_with_median(word, dictionary, k=5):
    """
    Advanced spell checking using median of closest matches.
    """
    # Find k closest dictionary words
    candidates = []
    for dict_word in dictionary:
        dist = Levenshtein.distance(word, dict_word)
        candidates.append((dict_word, dist))

    # Sort by distance and take top k
    candidates.sort(key=lambda x: x[1])
    closest_words = [w for w, _ in candidates[:k]]

    # Find median of closest matches
    suggestion = Levenshtein.median(closest_words)
    return suggestion, closest_words

# Example usage
dictionary = ["hello", "help", "helm", "held", "hell", "heal"]
word = "helo"
suggestion, candidates = spell_check_with_median(word, dictionary)
print(f"Word: '{word}'")
print(f"Candidates: {candidates}")
print(f"Suggested correction: '{suggestion}'")
```
### String Clustering Analysis

```python
import Levenshtein

def analyze_string_cluster(strings1, strings2):
    """
    Analyze two clusters of strings for similarity patterns.
    """
    results = {
        'median1': Levenshtein.median(strings1),
        'median2': Levenshtein.median(strings2),
        'quick_median1': Levenshtein.quickmedian(strings1),
        'quick_median2': Levenshtein.quickmedian(strings2),
        'set_median1': Levenshtein.setmedian(strings1),
        'set_median2': Levenshtein.setmedian(strings2),
        'sequence_ratio': Levenshtein.seqratio(strings1, strings2),
        'set_ratio': Levenshtein.setratio(strings1, strings2)
    }

    print("String Cluster Analysis:")
    print(f"Input strings1: {strings1}")
    print(f"Input strings2: {strings2}")
    print(f"Median1: '{results['median1']}'")
    print(f"Median2: '{results['median2']}'")
    print(f"Quick median1: '{results['quick_median1']}'")
    print(f"Quick median2: '{results['quick_median2']}'")
    print(f"Set median1: '{results['set_median1']}'")
    print(f"Set median2: '{results['set_median2']}'")
    print(f"Sequence similarity: {results['sequence_ratio']:.3f}")
    print(f"Set similarity: {results['set_ratio']:.3f}")

    return results

# Example
file_variants1 = [
    "data_file_001.csv", "data_file_002.csv", "data_file_003.csv"
]
file_variants2 = [
    "info_doc_001.txt", "info_doc_002.txt", "info_doc_003.txt"
]
analyze_string_cluster(file_variants1, file_variants2)
```
### Progressive String Improvement

```python
import Levenshtein

def progressive_improvement(initial, targets, verbose=True):
    """
    Progressive improvement of a string towards a target set.
    """
    current = initial
    iterations = 0
    improvements = [current]

    while iterations < 20:  # Prevent infinite loops
        improved = Levenshtein.median_improve(current, targets)

        if improved == current:
            if verbose:
                print(f"Converged after {iterations} iterations")
            break

        if verbose:
            # Calculate average distance to targets
            avg_dist_before = sum(Levenshtein.distance(current, t) for t in targets) / len(targets)
            avg_dist_after = sum(Levenshtein.distance(improved, t) for t in targets) / len(targets)
            print(f"Iteration {iterations + 1}: '{current}' -> '{improved}' "
                  f"(avg distance: {avg_dist_before:.2f} -> {avg_dist_after:.2f})")

        current = improved
        improvements.append(current)
        iterations += 1

    return current, improvements

# Example
targets = ["python", "programming", "development", "coding"]
final, history = progressive_improvement("xyz", targets)
print(f"\nFinal result: '{final}'")
print(f"Improvement history: {' -> '.join(history[:5])}{'...' if len(history) > 5 else ''}")
```
### Weighted Median with Confidence

```python
import Levenshtein

def weighted_median_with_confidence(strings, confidences=None):
    """
    Calculate weighted median with confidence scores.
    """
    if confidences is None:
        confidences = [1.0] * len(strings)

    # Normalize confidences to use as weights
    total_conf = sum(confidences)
    weights = [int(conf * 100 / total_conf) for conf in confidences]

    # Calculate different medians
    regular_median = Levenshtein.median(strings)
    weighted_median = Levenshtein.quickmedian(strings, weights)

    # Calculate confidence metrics
    reg_avg_dist = sum(Levenshtein.distance(regular_median, s) for s in strings) / len(strings)
    weighted_avg_dist = sum(Levenshtein.distance(weighted_median, s) * w
                            for s, w in zip(strings, weights)) / sum(weights)

    return {
        'regular_median': regular_median,
        'weighted_median': weighted_median,
        'regular_avg_distance': reg_avg_dist,
        'weighted_avg_distance': weighted_avg_dist,
        'weights_used': weights
    }

# Example
strings = ["correct", "corect", "corerct", "correct", "crect"]
confidences = [0.9, 0.3, 0.2, 0.9, 0.1]  # High confidence in first and fourth

result = weighted_median_with_confidence(strings, confidences)
print("Weighted Median Analysis:")
for key, value in result.items():
    print(f"{key}: {value}")
```
```