or run

npx @tessl/cli init
Log in

Version

Tile

Overview

Evals

Files

Files

docs

batch-processing.md · distance-metrics.md · fuzzy-matching.md · index.md · string-preprocessing.md

docs/distance-metrics.md

0

# Distance Metrics

1

2

Low-level distance algorithms that provide raw distance calculations, similarity scores, normalized metrics, and edit operation sequences. These form the foundation of RapidFuzz's fuzzy matching capabilities and offer fine-grained control over string comparison algorithms.

3

4

## Capabilities

5

6

### Levenshtein Distance

7

8

The most commonly used edit distance, allowing insertions, deletions, and substitutions.

9

10

```python { .api }

11

class Levenshtein:

12

@staticmethod

13

def distance(

14

s1: Sequence[Hashable],

15

s2: Sequence[Hashable],

16

*,

17

weights: tuple[int, int, int] | None = (1, 1, 1),

18

processor: Callable | None = None,

19

score_cutoff: int | None = None

20

) -> int

21

22

@staticmethod

23

def similarity(

24

s1: Sequence[Hashable],

25

s2: Sequence[Hashable],

26

*,

27

weights: tuple[int, int, int] | None = (1, 1, 1),

28

processor: Callable | None = None,

29

score_cutoff: int | None = None

30

) -> int

31

32

@staticmethod

33

def normalized_distance(

34

s1: Sequence[Hashable],

35

s2: Sequence[Hashable],

36

*,

37

weights: tuple[int, int, int] | None = (1, 1, 1),

38

processor: Callable | None = None,

39

score_cutoff: float | None = None

40

) -> float

41

42

@staticmethod

43

def normalized_similarity(

44

s1: Sequence[Hashable],

45

s2: Sequence[Hashable],

46

*,

47

weights: tuple[int, int, int] | None = (1, 1, 1),

48

processor: Callable | None = None,

49

score_cutoff: float | None = None

50

) -> float

51

52

@staticmethod

53

def editops(

54

s1: Sequence[Hashable],

55

s2: Sequence[Hashable],

56

*,

57

processor: Callable | None = None

58

) -> Editops

59

60

@staticmethod

61

def opcodes(

62

s1: Sequence[Hashable],

63

s2: Sequence[Hashable],

64

*,

65

processor: Callable | None = None

66

) -> Opcodes

67

```

68

69

**Parameters:**

70

- `weights`: Cost tuple (insertion, deletion, substitution) - default (1,1,1)

71

- `processor`: String preprocessing function

72

- `score_cutoff`: Threshold for early termination

73

74

**Usage Example:**

75

```python

76

from rapidfuzz.distance import Levenshtein

77

78

# Raw edit distance (number of operations needed)

79

dist = Levenshtein.distance("kitten", "sitting")

80

print(dist) # 3

81

82

# Similarity (max_length - distance)

83

sim = Levenshtein.similarity("kitten", "sitting")

84

print(sim) # 4

85

86

# Normalized distance (0.0 to 1.0, where 0.0 = identical)

87

norm_dist = Levenshtein.normalized_distance("kitten", "sitting")

88

print(norm_dist) # 0.43

89

90

# Normalized similarity (0.0 to 1.0, where 1.0 = identical)

91

norm_sim = Levenshtein.normalized_similarity("kitten", "sitting")

92

print(norm_sim) # 0.57

93

94

# Custom weights (insert=1, delete=2, substitute=1)

95

dist = Levenshtein.distance("kitten", "sitting", weights=(1, 2, 1))

96

print(dist) # 3 — unchanged here, since the optimal alignment (2 substitutions + 1 insertion) needs no deletions

97

98

# Get edit operations

99

ops = Levenshtein.editops("kitten", "sitting")

100

for op in ops:

101

print(f"{op.tag}: src_pos={op.src_pos}, dest_pos={op.dest_pos}")

102

```

103

104

### Damerau-Levenshtein Distance

105

106

Extended Levenshtein distance that also allows transpositions (swapping adjacent characters).

107

108

```python { .api }

109

class DamerauLevenshtein:

110

@staticmethod

111

def distance(s1: Sequence[Hashable], s2: Sequence[Hashable], *, processor: Callable | None = None, score_cutoff: int | None = None) -> int

112

113

@staticmethod

114

def similarity(s1: Sequence[Hashable], s2: Sequence[Hashable], *, processor: Callable | None = None, score_cutoff: int | None = None) -> int

115

116

@staticmethod

117

def normalized_distance(s1: Sequence[Hashable], s2: Sequence[Hashable], *, processor: Callable | None = None, score_cutoff: float | None = None) -> float

118

119

@staticmethod

120

def normalized_similarity(s1: Sequence[Hashable], s2: Sequence[Hashable], *, processor: Callable | None = None, score_cutoff: float | None = None) -> float

121

```

122

123

**Usage Example:**

124

```python

125

from rapidfuzz.distance import DamerauLevenshtein

126

127

# Better for transposition errors

128

dist1 = DamerauLevenshtein.distance("abcd", "acbd") # 1 (transposition)

129

dist2 = Levenshtein.distance("abcd", "acbd") # 2 (substitute twice)

130

```

131

132

### Hamming Distance

133

134

Compares strings of equal length, counting position-wise character differences.

135

136

```python { .api }

137

class Hamming:

138

@staticmethod

139

def distance(s1: Sequence[Hashable], s2: Sequence[Hashable], *, processor: Callable | None = None, score_cutoff: int | None = None) -> int

140

141

@staticmethod

142

def similarity(s1: Sequence[Hashable], s2: Sequence[Hashable], *, processor: Callable | None = None, score_cutoff: int | None = None) -> int

143

144

@staticmethod

145

def normalized_distance(s1: Sequence[Hashable], s2: Sequence[Hashable], *, processor: Callable | None = None, score_cutoff: float | None = None) -> float

146

147

@staticmethod

148

def normalized_similarity(s1: Sequence[Hashable], s2: Sequence[Hashable], *, processor: Callable | None = None, score_cutoff: float | None = None) -> float

149

150

@staticmethod

151

def editops(s1: Sequence[Hashable], s2: Sequence[Hashable], *, processor: Callable | None = None) -> Editops

152

153

@staticmethod

154

def opcodes(s1: Sequence[Hashable], s2: Sequence[Hashable], *, processor: Callable | None = None) -> Opcodes

155

```

156

157

**Usage Example:**

158

```python

159

from rapidfuzz.distance import Hamming

160

161

# Only works with equal-length strings

162

dist = Hamming.distance("abcde", "aXcYe") # 2 (positions 1 and 3 differ)

163

print(dist)

164

165

# Raises ValueError for different lengths

166

# Hamming.distance("abc", "abcd") # Error!

167

```

168

169

### Jaro Distance

170

171

Measures similarity based on matching characters and their transpositions, good for shorter strings.

172

173

```python { .api }

174

class Jaro:

175

@staticmethod

176

def distance(s1: Sequence[Hashable], s2: Sequence[Hashable], *, processor: Callable | None = None, score_cutoff: float | None = None) -> float

177

178

@staticmethod

179

def similarity(s1: Sequence[Hashable], s2: Sequence[Hashable], *, processor: Callable | None = None, score_cutoff: float | None = None) -> float

180

181

@staticmethod

182

def normalized_distance(s1: Sequence[Hashable], s2: Sequence[Hashable], *, processor: Callable | None = None, score_cutoff: float | None = None) -> float

183

184

@staticmethod

185

def normalized_similarity(s1: Sequence[Hashable], s2: Sequence[Hashable], *, processor: Callable | None = None, score_cutoff: float | None = None) -> float

186

```

187

188

### Jaro-Winkler Distance

189

190

Extension of Jaro distance that gives higher scores to strings with common prefixes.

191

192

```python { .api }

193

class JaroWinkler:

194

@staticmethod

195

def distance(s1: Sequence[Hashable], s2: Sequence[Hashable], *, prefix_weight: float = 0.1, processor: Callable | None = None, score_cutoff: float | None = None) -> float

196

197

@staticmethod

198

def similarity(s1: Sequence[Hashable], s2: Sequence[Hashable], *, prefix_weight: float = 0.1, processor: Callable | None = None, score_cutoff: float | None = None) -> float

199

200

@staticmethod

201

def normalized_distance(s1: Sequence[Hashable], s2: Sequence[Hashable], *, prefix_weight: float = 0.1, processor: Callable | None = None, score_cutoff: float | None = None) -> float

202

203

@staticmethod

204

def normalized_similarity(s1: Sequence[Hashable], s2: Sequence[Hashable], *, prefix_weight: float = 0.1, processor: Callable | None = None, score_cutoff: float | None = None) -> float

205

```

206

207

**Parameters:**

208

- `prefix_weight`: Weight for common prefix bonus (0.0-0.25, default 0.1)

209

210

**Usage Example:**

211

```python

212

from rapidfuzz.distance import Jaro, JaroWinkler

213

214

s1, s2 = "martha", "marhta"

215

216

jaro_sim = Jaro.similarity(s1, s2)

217

jw_sim = JaroWinkler.similarity(s1, s2)

218

219

print(f"Jaro: {jaro_sim:.3f}") # Jaro: 0.944

220

print(f"Jaro-Winkler: {jw_sim:.3f}") # Jaro-Winkler: 0.961 (higher due to 'ma' prefix)

221

```

222

223

### Indel Distance

224

225

Only allows insertions and deletions (no substitutions).

226

227

```python { .api }

228

class Indel:

229

@staticmethod

230

def distance(s1: Sequence[Hashable], s2: Sequence[Hashable], *, processor: Callable | None = None, score_cutoff: int | None = None) -> int

231

232

@staticmethod

233

def similarity(s1: Sequence[Hashable], s2: Sequence[Hashable], *, processor: Callable | None = None, score_cutoff: int | None = None) -> int

234

235

@staticmethod

236

def normalized_distance(s1: Sequence[Hashable], s2: Sequence[Hashable], *, processor: Callable | None = None, score_cutoff: float | None = None) -> float

237

238

@staticmethod

239

def normalized_similarity(s1: Sequence[Hashable], s2: Sequence[Hashable], *, processor: Callable | None = None, score_cutoff: float | None = None) -> float

240

241

@staticmethod

242

def editops(s1: Sequence[Hashable], s2: Sequence[Hashable], *, processor: Callable | None = None) -> Editops

243

244

@staticmethod

245

def opcodes(s1: Sequence[Hashable], s2: Sequence[Hashable], *, processor: Callable | None = None) -> Opcodes

246

```

247

248

### OSA (Optimal String Alignment)

249

250

Like Damerau-Levenshtein but with the restriction that no substring can be edited more than once.

251

252

```python { .api }

253

class OSA:

254

@staticmethod

255

def distance(s1: Sequence[Hashable], s2: Sequence[Hashable], *, processor: Callable | None = None, score_cutoff: int | None = None) -> int

256

257

@staticmethod

258

def similarity(s1: Sequence[Hashable], s2: Sequence[Hashable], *, processor: Callable | None = None, score_cutoff: int | None = None) -> int

259

260

@staticmethod

261

def normalized_distance(s1: Sequence[Hashable], s2: Sequence[Hashable], *, processor: Callable | None = None, score_cutoff: float | None = None) -> float

262

263

@staticmethod

264

def normalized_similarity(s1: Sequence[Hashable], s2: Sequence[Hashable], *, processor: Callable | None = None, score_cutoff: float | None = None) -> float

265

266

@staticmethod

267

def editops(s1: Sequence[Hashable], s2: Sequence[Hashable], *, processor: Callable | None = None) -> Editops

268

269

@staticmethod

270

def opcodes(s1: Sequence[Hashable], s2: Sequence[Hashable], *, processor: Callable | None = None) -> Opcodes

271

```

272

273

### Longest Common Subsequence

274

275

Finds the longest subsequence common to both strings.

276

277

```python { .api }

278

class LCSseq:

279

@staticmethod

280

def distance(s1: Sequence[Hashable], s2: Sequence[Hashable], *, processor: Callable | None = None, score_cutoff: int | None = None) -> int

281

282

@staticmethod

283

def similarity(s1: Sequence[Hashable], s2: Sequence[Hashable], *, processor: Callable | None = None, score_cutoff: int | None = None) -> int

284

285

@staticmethod

286

def normalized_distance(s1: Sequence[Hashable], s2: Sequence[Hashable], *, processor: Callable | None = None, score_cutoff: float | None = None) -> float

287

288

@staticmethod

289

def normalized_similarity(s1: Sequence[Hashable], s2: Sequence[Hashable], *, processor: Callable | None = None, score_cutoff: float | None = None) -> float

290

291

@staticmethod

292

def editops(s1: Sequence[Hashable], s2: Sequence[Hashable], *, processor: Callable | None = None) -> Editops

293

294

@staticmethod

295

def opcodes(s1: Sequence[Hashable], s2: Sequence[Hashable], *, processor: Callable | None = None) -> Opcodes

296

```

297

298

### Prefix Distance

299

300

Measures length of common prefix.

301

302

```python { .api }

303

class Prefix:

304

@staticmethod

305

def distance(s1: Sequence[Hashable], s2: Sequence[Hashable], *, processor: Callable | None = None, score_cutoff: int | None = None) -> int

306

307

@staticmethod

308

def similarity(s1: Sequence[Hashable], s2: Sequence[Hashable], *, processor: Callable | None = None, score_cutoff: int | None = None) -> int

309

310

@staticmethod

311

def normalized_distance(s1: Sequence[Hashable], s2: Sequence[Hashable], *, processor: Callable | None = None, score_cutoff: float | None = None) -> float

312

313

@staticmethod

314

def normalized_similarity(s1: Sequence[Hashable], s2: Sequence[Hashable], *, processor: Callable | None = None, score_cutoff: float | None = None) -> float

315

```

316

317

### Postfix Distance

318

319

Measures length of common suffix.

320

321

```python { .api }

322

class Postfix:

323

@staticmethod

324

def distance(s1: Sequence[Hashable], s2: Sequence[Hashable], *, processor: Callable | None = None, score_cutoff: int | None = None) -> int

325

326

@staticmethod

327

def similarity(s1: Sequence[Hashable], s2: Sequence[Hashable], *, processor: Callable | None = None, score_cutoff: int | None = None) -> int

328

329

@staticmethod

330

def normalized_distance(s1: Sequence[Hashable], s2: Sequence[Hashable], *, processor: Callable | None = None, score_cutoff: float | None = None) -> float

331

332

@staticmethod

333

def normalized_similarity(s1: Sequence[Hashable], s2: Sequence[Hashable], *, processor: Callable | None = None, score_cutoff: float | None = None) -> float

334

```

335

336

## Usage Patterns

337

338

### Choosing the Right Metric

339

340

- **Levenshtein**: General-purpose, most common choice

341

- **Damerau-Levenshtein**: When transposition errors are common (typing errors)

342

- **Hamming**: Fixed-length strings, genetic sequences, checksums

343

- **Jaro/Jaro-Winkler**: Names, short strings, when prefix similarity important

344

- **Indel**: When only insertions/deletions occur

345

- **OSA**: Alternative to Damerau-Levenshtein with different constraints

346

- **LCSseq**: When order matters but gaps are allowed

347

- **Prefix/Postfix**: Hierarchical data, version strings, prefixed identifiers

348

349

### Distance vs Similarity vs Normalized

350

351

```python

352

from rapidfuzz.distance import Levenshtein

353

354

s1, s2 = "kitten", "sitting"

355

356

# Raw distance (edit operations count)

357

dist = Levenshtein.distance(s1, s2) # 3

358

# Raw similarity (max_len - distance)

359

sim = Levenshtein.similarity(s1, s2) # 4 (7 - 3)

360

# Normalized distance (0.0 = identical, 1.0 = completely different)

361

norm_dist = Levenshtein.normalized_distance(s1, s2) # 0.43

362

# Normalized similarity (1.0 = identical, 0.0 = completely different)

363

norm_sim = Levenshtein.normalized_similarity(s1, s2) # 0.57

364

```

365

366

### Edit Operations Analysis

367

368

```python

369

from rapidfuzz.distance import Levenshtein

370

371

s1, s2 = "kitten", "sitting"

372

373

# Get individual edit operations

374

editops = Levenshtein.editops(s1, s2)

375

print(f"Number of operations: {len(editops)}")

376

377

for op in editops:

378

if op.tag == "replace":

379

print(f"Replace '{s1[op.src_pos]}' with '{s2[op.dest_pos]}' at position {op.src_pos}")

380

elif op.tag == "insert":

381

print(f"Insert '{s2[op.dest_pos]}' at position {op.src_pos}")

382

elif op.tag == "delete":

383

print(f"Delete '{s1[op.src_pos]}' at position {op.src_pos}")

384

385

# Get opcodes (grouped operations)

386

opcodes = Levenshtein.opcodes(s1, s2)

387

for opcode in opcodes:

388

src_slice = s1[opcode.src_start:opcode.src_end]

dest_slice = s2[opcode.dest_start:opcode.dest_end]

390

print(f"{opcode.tag}: '{src_slice}' -> '{dest_slice}'")

391

```

392

393

### Performance Considerations

394

395

```python

396

from rapidfuzz.distance import Levenshtein

397

398

# Use score_cutoff for early termination

399

dist = Levenshtein.distance("long string", "very different string", score_cutoff=5)

400

# Stops early once the distance is known to exceed 5 and returns score_cutoff + 1

401

402

# Custom weights for specific use cases

403

dist = Levenshtein.distance("text", "test", weights=(1, 2, 1)) # Deletions cost more

404

405

# Process many comparisons efficiently

406

strings = ["..." for _ in range(1000)]

407

target = "target"

408

409

distances = [Levenshtein.distance(target, s, score_cutoff=3) for s in strings]

410

# Only calculates full distance for strings within cutoff

411

```