
# String Averaging and Median

Functions for computing approximate median strings, improving strings toward a target set, and calculating sequence and set similarity ratios for multiple strings. These functions are particularly useful for finding representative strings from collections and analyzing groups of related strings.

## Capabilities

### Median String

Find an approximate generalized median string for a list of strings: a string that minimizes the sum of edit distances to all inputs. More thorough, and slower, than `quickmedian()`.

```python { .api }
def median(strings, weights=None):
    """
    Find approximate median string from a list of strings.

    Uses a greedy approximation to find a string that minimizes the sum of
    edit distances to all input strings. More accurate but slower than quickmedian().

    Parameters:
    - strings: List of strings to find median for
    - weights: Optional list of weights for each string (same length as strings)

    Returns:
    str: Approximate median string
    """
```

**Usage Examples:**

```python
import Levenshtein

# Find median of similar strings
strings = ["SpSm", "mpamm", "Spam", "Spa", "Sua", "hSam"]
med = Levenshtein.median(strings)
print(f"Median: '{med}'")  # 'Spam'

# Find median for spell-checking candidates
misspellings = [
    "Levnhtein", "Leveshein", "Leenshten", "Leveshtei",
    "Lenshtein", "Lvenstein", "Levenhtin", "evenshtei"
]
correct = Levenshtein.median(misspellings)
print(f"Correct spelling: '{correct}'")  # 'Levenshtein'

# Find median of file names
filenames = ["document1.txt", "document2.txt", "document3.txt", "document4.txt"]
template = Levenshtein.median(filenames)
print(f"Template: '{template}'")  # 'document.txt' (approximate)

# Weighted median (give more weight to certain strings)
strings = ["correct", "crect", "corect", "correct"]
weights = [3, 1, 1, 2]  # Give more weight to "correct"
weighted_med = Levenshtein.median(strings, weights)
print(f"Weighted median: '{weighted_med}'")  # Should be closer to "correct"
```
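
Because `median()` only approximates the generalized median, it can help to sanity-check the result against its objective: the sum of edit distances to the inputs. The snippet below is a minimal sketch that reuses the `strings` list from the example above and compares the returned median against each input string used as a candidate.

```python
import Levenshtein

strings = ["SpSm", "mpamm", "Spam", "Spa", "Sua", "hSam"]
med = Levenshtein.median(strings)

# Total edit distance from the median to all inputs -- the quantity median() tries to minimize
total = sum(Levenshtein.distance(med, s) for s in strings)
print(f"Median '{med}': total distance {total}")

# Compare against each input string treated as a candidate median
for candidate in strings:
    cand_total = sum(Levenshtein.distance(candidate, s) for s in strings)
    print(f"Candidate '{candidate}': total distance {cand_total}")
```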

### Quick Median String

Fast approximate median string calculation with optional weights for different strings in the input set.

```python { .api }
def quickmedian(strings, weights=None):
    """
    Fast approximate median string calculation.

    Uses a faster heuristic algorithm. Less accurate than median() but much faster
    for large string sets. Supports optional weighting of input strings.

    Parameters:
    - strings: List of strings to find median for
    - weights: Optional list of weights for each string (same length as strings)

    Returns:
    str: Approximate median string
    """
```

**Usage Examples:**

```python
import Levenshtein

# Fast median calculation
misspellings = [
    "Levnhtein", "Leveshein", "Leenshten", "Leveshtei",
    "Lenshtein", "Lvenstein", "Levenhtin", "evenshtei"
]
quick_med = Levenshtein.quickmedian(misspellings)
print(f"Quick median: '{quick_med}'")  # 'Levnshein'

# Weighted median (emphasize certain strings)
strings = ["apple", "aple", "appl", "apple"]
weights = [3, 1, 1, 2]  # Give more weight to "apple"
weighted_med = Levenshtein.quickmedian(strings, weights)
print(f"Weighted median: '{weighted_med}'")  # Closer to "apple"

# Handle zero weights (ignored strings)
strings = ["test", "testing", "tester"]
weights = [1, 0, 1]  # Ignore "testing"
result = Levenshtein.quickmedian(strings, weights)
print(f"With zero weights: '{result}'")  # Based only on "test" and "tester"
```
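
Since `quickmedian()` trades accuracy for speed, it is worth comparing its result and runtime against `median()` on the same input before choosing one. This is a small sketch (timing numbers will vary by machine) that reuses the `misspellings` list from above.

```python
import time
import Levenshtein

misspellings = [
    "Levnhtein", "Leveshein", "Leenshten", "Leveshtei",
    "Lenshtein", "Lvenstein", "Levenhtin", "evenshtei"
]

def total_distance(candidate, strings):
    """Sum of edit distances from a candidate median to every input string."""
    return sum(Levenshtein.distance(candidate, s) for s in strings)

# Compare quality (total distance) and runtime of the two median functions
for func in (Levenshtein.median, Levenshtein.quickmedian):
    start = time.perf_counter()
    result = func(misspellings)
    elapsed = time.perf_counter() - start
    print(f"{func.__name__}: '{result}' "
          f"(total distance {total_distance(result, misspellings)}, {elapsed:.6f}s)")
```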

### Median Improvement

Improve a string towards the median of a given set of strings; the function can be applied repeatedly for iterative refinement.

```python { .api }
def median_improve(string, strings, weights=None):
    """
    Improve a string towards median of given strings.

    Takes an initial string and perturbs it to be closer to the median of the
    target string set. Can be applied repeatedly for further improvement.

    Parameters:
    - string: Initial string to improve
    - strings: List of target strings to improve towards
    - weights: Optional list of weights for each string (same length as strings)

    Returns:
    str: Improved string that is closer to the median of the target strings
    """
```

**Usage Examples:**

```python
import Levenshtein

# Single improvement step
target_strings = [
    "Levnhtein", "Leveshein", "Leenshten", "Leveshtei",
    "Lenshtein", "Lvenstein", "Levenhtin", "evenshtei"
]

# Start with a poor guess
initial = "spam"
improved1 = Levenshtein.median_improve(initial, target_strings)
print(f"First improvement: '{initial}' -> '{improved1}'")  # 'spam' -> 'enhtein'

# Apply improvement again
improved2 = Levenshtein.median_improve(improved1, target_strings)
print(f"Second improvement: '{improved1}' -> '{improved2}'")  # 'enhtein' -> 'Levenshtein'

# Iterative improvement function
def iterative_improve(initial, targets, max_iterations=10):
    """Iteratively improve a string."""
    current = initial
    for i in range(max_iterations):
        improved = Levenshtein.median_improve(current, targets)
        if improved == current:  # No more improvement possible
            break
        print(f"Iteration {i+1}: '{current}' -> '{improved}'")
        current = improved
    return current

# Example
final = iterative_improve("xyz", target_strings)
print(f"Final result: '{final}'")

# Using weights to emphasize certain target strings
target_strings_weighted = ["python", "programming", "coding", "development"]
weights = [3, 2, 1, 1]  # Give more weight to "python" and "programming"
initial = "prog"
improved_weighted = Levenshtein.median_improve(initial, target_strings_weighted, weights)
print(f"Weighted improvement: '{initial}' -> '{improved_weighted}'")
```
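
A practical pattern is to seed `median_improve()` with the fast `quickmedian()` result rather than an arbitrary guess, then refine it. This is a brief sketch of that combination; how many refinement rounds are worthwhile depends on the data.

```python
import Levenshtein

misspellings = [
    "Levnhtein", "Leveshein", "Leenshten", "Leveshtei",
    "Lenshtein", "Lvenstein", "Levenhtin", "evenshtei"
]

# Fast first guess, then refine it towards the median of the set
guess = Levenshtein.quickmedian(misspellings)
refined = Levenshtein.median_improve(guess, misspellings)
print(f"quickmedian: '{guess}' -> median_improve: '{refined}'")
```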

### Set Median

Find the set median of a list of strings: the input string with the smallest total edit distance to all the others. Unlike `median()`, the result is always one of the input strings.

```python { .api }
def setmedian(strings, weights=None):
    """
    Find set median of a list of strings.

    Selects the input string that minimizes the total edit distance to all other
    input strings. Unlike median(), the result is always one of the input strings.

    Parameters:
    - strings: List of strings to find set median for
    - weights: Optional list of weights for each string (same length as strings)

    Returns:
    str: Set median string (one of the input strings)
    """
```

**Usage Examples:**

```python
import Levenshtein

# Set median is always one of the input strings
strings = [
    "ehee", "cceaes", "chees", "chreesc",
    "chees", "cheesee", "cseese", "chetese"
]
set_med = Levenshtein.setmedian(strings)
print(f"Set median: '{set_med}'")  # 'chees'

# Compare with regular median
regular_med = Levenshtein.median(strings)
print(f"Regular median: '{regular_med}'")

# Pick a representative spelling from noisy variants
cheese_variants = ["cheese", "chese", "cheeze", "chees", "cheess"]
set_result = Levenshtein.setmedian(cheese_variants)
print(f"Cheese variants set median: '{set_result}'")

# Weighted set median (emphasize certain variants)
weights = [3, 1, 1, 2, 1]  # Give more weight to "cheese" and "chees"
weighted_set_result = Levenshtein.setmedian(cheese_variants, weights)
print(f"Weighted set median: '{weighted_set_result}'")
```
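
To see the difference from `median()` directly, check whether each result appears in the input list: under the set-median behaviour described above, `setmedian()` always returns one of the inputs, while `median()` may synthesize a string that appears nowhere in the list. A minimal sketch:

```python
import Levenshtein

strings = ["SpSm", "mpamm", "Spam", "Spa", "Sua", "hSam"]

set_med = Levenshtein.setmedian(strings)
gen_med = Levenshtein.median(strings)

# The set median is drawn from the inputs; the generalized median need not be
print(f"setmedian: '{set_med}' (in inputs: {set_med in strings})")
print(f"median:    '{gen_med}' (in inputs: {gen_med in strings})")
```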

### Sequence Ratio

Calculate a similarity ratio between two sequences of strings, comparing the collections element by element in order (like `ratio()`, but for lists of strings).

```python { .api }
def seqratio(strings1, strings2):
    """
    Calculate similarity ratio between two string sequences.

    Computes a similarity measure between two collections of strings treated as
    ordered sequences; corresponding elements are matched in order and compared
    with a ratio()-like measure.

    Parameters:
    - strings1: First list of strings to compare
    - strings2: Second list of strings to compare

    Returns:
    float: Sequence similarity ratio between the two string collections
    """
```

**Usage Examples:**

```python
import Levenshtein

# Compare two unrelated sequence collections
seq1 = ["newspaper", "litter bin", "tinny", "antelope"]
seq2 = ["caribou", "sausage", "gorn", "woody"]
seq_ratio = Levenshtein.seqratio(seq1, seq2)
print(f"Sequence similarity ratio: {seq_ratio:.3f}")  # ~0.215

# Compare identical sequences
identical1 = ["abc", "def", "ghi"]
identical2 = ["abc", "def", "ghi"]
seq_ratio = Levenshtein.seqratio(identical1, identical2)
print(f"Identical sequences ratio: {seq_ratio:.3f}")  # 1.0 for identical sequences

# Compare sequences with different lengths
short_seq = ["a", "b"]
long_seq = ["apple", "banana", "cherry", "date"]
seq_ratio = Levenshtein.seqratio(short_seq, long_seq)
print(f"Different length sequences ratio: {seq_ratio:.3f}")
```

### Set Ratio

Calculate a similarity ratio between two collections of strings treated as unordered sets, matching each string with its best counterpart in the other collection.

```python { .api }
def setratio(strings1, strings2):
    """
    Calculate similarity ratio between two string sets.

    Computes a similarity measure between two collections of strings treated as
    unordered sets; each string is matched with its best counterpart in the other
    collection, so the order of the strings does not matter.

    Parameters:
    - strings1: First list of strings to compare
    - strings2: Second list of strings to compare

    Returns:
    float: Set similarity ratio between the two string collections
    """
```

**Usage Examples:**

```python
import Levenshtein

# Compare two unrelated string collections (order does not matter)
set1 = ["newspaper", "litter bin", "tinny", "antelope"]
set2 = ["caribou", "sausage", "gorn", "woody"]
set_ratio = Levenshtein.setratio(set1, set2)
print(f"Set similarity ratio: {set_ratio:.3f}")  # ~0.282

# Compare sets with overlapping strings
overlapping1 = ["hello", "help", "held"]
overlapping2 = ["hell", "helping", "holder"]
set_ratio = Levenshtein.setratio(overlapping1, overlapping2)
print(f"Overlapping sets ratio: {set_ratio:.3f}")

# Compare sets of anagrams (same characters, different orderings)
identical_chars1 = ["abc", "bca", "cab"]
identical_chars2 = ["acb", "cba", "abc"]
set_ratio = Levenshtein.setratio(identical_chars1, identical_chars2)
print(f"Anagram sets ratio: {set_ratio:.3f}")  # Should be high

# Compare completely different sets
different1 = ["abc", "def"]
different2 = ["xyz", "uvw"]
set_ratio = Levenshtein.setratio(different1, different2)
print(f"Different sets ratio: {set_ratio:.3f}")  # Should be low
```
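
The practical difference between `seqratio()` and `setratio()` is order sensitivity: `seqratio()` compares the collections position by position, while `setratio()` pairs each string with its best match regardless of position. A short sketch illustrating this on a reordered copy of the same list (exact values may vary by library version):

```python
import Levenshtein

names = ["alpha", "beta", "gamma", "delta"]
shuffled = ["delta", "gamma", "beta", "alpha"]

# seqratio compares position by position, so reordering lowers the score;
# setratio pairs each string with its best match, so a reordered copy still scores highly
print(f"seqratio: {Levenshtein.seqratio(names, shuffled):.3f}")
print(f"setratio: {Levenshtein.setratio(names, shuffled):.3f}")
```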

## Advanced Usage Patterns

### Spell Checking with Median

```python
import Levenshtein

def spell_check_with_median(word, dictionary, k=5):
    """
    Advanced spell checking using median of closest matches.
    """
    # Find k closest dictionary words
    candidates = []
    for dict_word in dictionary:
        dist = Levenshtein.distance(word, dict_word)
        candidates.append((dict_word, dist))

    # Sort by distance and take top k
    candidates.sort(key=lambda x: x[1])
    closest_words = [w for w, _ in candidates[:k]]

    # Find median of closest matches
    suggestion = Levenshtein.median(closest_words)
    return suggestion, closest_words

# Example usage
dictionary = ["hello", "help", "helm", "held", "hell", "heal"]
word = "helo"
suggestion, candidates = spell_check_with_median(word, dictionary)
print(f"Word: '{word}'")
print(f"Candidates: {candidates}")
print(f"Suggested correction: '{suggestion}'")
```

### String Clustering Analysis

```python
import Levenshtein

def analyze_string_cluster(strings1, strings2):
    """
    Analyze two clusters of strings for similarity patterns.
    """
    results = {
        'median1': Levenshtein.median(strings1),
        'median2': Levenshtein.median(strings2),
        'quick_median1': Levenshtein.quickmedian(strings1),
        'quick_median2': Levenshtein.quickmedian(strings2),
        'set_median1': Levenshtein.setmedian(strings1),
        'set_median2': Levenshtein.setmedian(strings2),
        'sequence_ratio': Levenshtein.seqratio(strings1, strings2),
        'set_ratio': Levenshtein.setratio(strings1, strings2)
    }

    print("String Cluster Analysis:")
    print(f"Input strings1: {strings1}")
    print(f"Input strings2: {strings2}")
    print(f"Median1: '{results['median1']}'")
    print(f"Median2: '{results['median2']}'")
    print(f"Quick median1: '{results['quick_median1']}'")
    print(f"Quick median2: '{results['quick_median2']}'")
    print(f"Set median1: '{results['set_median1']}'")
    print(f"Set median2: '{results['set_median2']}'")
    print(f"Sequence similarity: {results['sequence_ratio']:.3f}")
    print(f"Set similarity: {results['set_ratio']:.3f}")

    return results

# Example
file_variants1 = [
    "data_file_001.csv", "data_file_002.csv", "data_file_003.csv"
]
file_variants2 = [
    "info_doc_001.txt", "info_doc_002.txt", "info_doc_003.txt"
]
analyze_string_cluster(file_variants1, file_variants2)
```

### Progressive String Improvement

```python
import Levenshtein

def progressive_improvement(initial, targets, verbose=True):
    """
    Progressive improvement of a string towards a target set.
    """
    current = initial
    iterations = 0
    improvements = [current]

    while iterations < 20:  # Prevent infinite loops
        improved = Levenshtein.median_improve(current, targets)

        if improved == current:
            if verbose:
                print(f"Converged after {iterations} iterations")
            break

        if verbose:
            # Calculate average distance to targets
            avg_dist_before = sum(Levenshtein.distance(current, t) for t in targets) / len(targets)
            avg_dist_after = sum(Levenshtein.distance(improved, t) for t in targets) / len(targets)
            print(f"Iteration {iterations + 1}: '{current}' -> '{improved}' "
                  f"(avg distance: {avg_dist_before:.2f} -> {avg_dist_after:.2f})")

        current = improved
        improvements.append(current)
        iterations += 1

    return current, improvements

# Example
targets = ["python", "programming", "development", "coding"]
final, history = progressive_improvement("xyz", targets)
print(f"\nFinal result: '{final}'")
print(f"Improvement history: {' -> '.join(history[:5])}{'...' if len(history) > 5 else ''}")
```

### Weighted Median with Confidence

```python
import Levenshtein

def weighted_median_with_confidence(strings, confidences=None):
    """
    Calculate weighted median with confidence scores.
    """
    if confidences is None:
        confidences = [1.0] * len(strings)

    # Normalize confidences to use as weights
    total_conf = sum(confidences)
    weights = [int(conf * 100 / total_conf) for conf in confidences]

    # Calculate different medians
    regular_median = Levenshtein.median(strings)
    weighted_median = Levenshtein.quickmedian(strings, weights)

    # Calculate confidence metrics
    reg_avg_dist = sum(Levenshtein.distance(regular_median, s) for s in strings) / len(strings)
    weighted_avg_dist = sum(Levenshtein.distance(weighted_median, s) * w
                            for s, w in zip(strings, weights)) / sum(weights)

    return {
        'regular_median': regular_median,
        'weighted_median': weighted_median,
        'regular_avg_distance': reg_avg_dist,
        'weighted_avg_distance': weighted_avg_dist,
        'weights_used': weights
    }

# Example
strings = ["correct", "corect", "corerct", "correct", "crect"]
confidences = [0.9, 0.3, 0.2, 0.9, 0.1]  # High confidence in first and fourth

result = weighted_median_with_confidence(strings, confidences)
print("Weighted Median Analysis:")
for key, value in result.items():
    print(f"{key}: {value}")
```