# Distance Metrics

Low-level distance algorithms that provide raw distance calculations, similarity scores, normalized metrics, and edit operation sequences. These form the foundation of RapidFuzz's fuzzy matching capabilities and offer fine-grained control over string comparison algorithms.

## Capabilities

### Levenshtein Distance

The most commonly used edit distance, allowing insertions, deletions, and substitutions.

```python { .api }
class Levenshtein:
    @staticmethod
    def distance(
        s1: Sequence[Hashable],
        s2: Sequence[Hashable],
        *,
        weights: tuple[int, int, int] | None = (1, 1, 1),
        processor: Callable | None = None,
        score_cutoff: int | None = None
    ) -> int

    @staticmethod
    def similarity(
        s1: Sequence[Hashable],
        s2: Sequence[Hashable],
        *,
        weights: tuple[int, int, int] | None = (1, 1, 1),
        processor: Callable | None = None,
        score_cutoff: int | None = None
    ) -> int

    @staticmethod
    def normalized_distance(
        s1: Sequence[Hashable],
        s2: Sequence[Hashable],
        *,
        weights: tuple[int, int, int] | None = (1, 1, 1),
        processor: Callable | None = None,
        score_cutoff: float | None = None
    ) -> float

    @staticmethod
    def normalized_similarity(
        s1: Sequence[Hashable],
        s2: Sequence[Hashable],
        *,
        weights: tuple[int, int, int] | None = (1, 1, 1),
        processor: Callable | None = None,
        score_cutoff: float | None = None
    ) -> float

    @staticmethod
    def editops(
        s1: Sequence[Hashable],
        s2: Sequence[Hashable],
        *,
        processor: Callable | None = None
    ) -> Editops

    @staticmethod
    def opcodes(
        s1: Sequence[Hashable],
        s2: Sequence[Hashable],
        *,
        processor: Callable | None = None
    ) -> Opcodes
```

**Parameters:**

- `weights`: Cost tuple `(insertion, deletion, substitution)`, default `(1, 1, 1)`
- `processor`: Optional callable applied to both inputs before comparison (see the sketch below)
- `score_cutoff`: Threshold for early termination (an upper bound for the distance variants, a lower bound for the similarity variants)

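As a minimal sketch of the `processor` hook: any callable that maps an input to a comparable sequence can be passed, and it runs on both inputs before the comparison. Here plain `str.lower` makes the match case-insensitive:

```python
from rapidfuzz.distance import Levenshtein

# processor is applied to both s1 and s2 before the distance is computed
dist = Levenshtein.distance("HELLO World", "hello world", processor=str.lower)
print(dist)  # 0 - the strings are identical after lowercasing
```
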
**Usage Example:**

```python
from rapidfuzz.distance import Levenshtein

# Raw edit distance (number of operations needed)
dist = Levenshtein.distance("kitten", "sitting")
print(dist)  # 3

# Similarity (max_length - distance)
sim = Levenshtein.similarity("kitten", "sitting")
print(sim)  # 4

# Normalized distance (0.0 to 1.0, where 0.0 = identical)
norm_dist = Levenshtein.normalized_distance("kitten", "sitting")
print(norm_dist)  # ~0.43

# Normalized similarity (0.0 to 1.0, where 1.0 = identical)
norm_sim = Levenshtein.normalized_similarity("kitten", "sitting")
print(norm_sim)  # ~0.57

# Custom weights (insert=1, delete=2, substitute=1)
dist = Levenshtein.distance("kitten", "sitting", weights=(1, 2, 1))
print(dist)  # still 3 - the optimal alignment here uses no deletions

# Get edit operations
ops = Levenshtein.editops("kitten", "sitting")
for op in ops:
    print(f"{op.tag}: src_pos={op.src_pos}, dest_pos={op.dest_pos}")
```

### Damerau-Levenshtein Distance

Extended Levenshtein distance that also allows transpositions (swapping adjacent characters).

```python { .api }
class DamerauLevenshtein:
    @staticmethod
    def distance(s1: Sequence[Hashable], s2: Sequence[Hashable], *, processor: Callable | None = None, score_cutoff: int | None = None) -> int

    @staticmethod
    def similarity(s1: Sequence[Hashable], s2: Sequence[Hashable], *, processor: Callable | None = None, score_cutoff: int | None = None) -> int

    @staticmethod
    def normalized_distance(s1: Sequence[Hashable], s2: Sequence[Hashable], *, processor: Callable | None = None, score_cutoff: float | None = None) -> float

    @staticmethod
    def normalized_similarity(s1: Sequence[Hashable], s2: Sequence[Hashable], *, processor: Callable | None = None, score_cutoff: float | None = None) -> float
```

**Usage Example:**

```python
from rapidfuzz.distance import DamerauLevenshtein, Levenshtein

# Better for transposition errors
dist1 = DamerauLevenshtein.distance("abcd", "acbd")  # 1 (one transposition)
dist2 = Levenshtein.distance("abcd", "acbd")         # 2 (two substitutions)
```

### Hamming Distance

Compares strings of equal length, counting position-wise character differences.

```python { .api }
class Hamming:
    @staticmethod
    def distance(s1: Sequence[Hashable], s2: Sequence[Hashable], *, processor: Callable | None = None, score_cutoff: int | None = None) -> int

    @staticmethod
    def similarity(s1: Sequence[Hashable], s2: Sequence[Hashable], *, processor: Callable | None = None, score_cutoff: int | None = None) -> int

    @staticmethod
    def normalized_distance(s1: Sequence[Hashable], s2: Sequence[Hashable], *, processor: Callable | None = None, score_cutoff: float | None = None) -> float

    @staticmethod
    def normalized_similarity(s1: Sequence[Hashable], s2: Sequence[Hashable], *, processor: Callable | None = None, score_cutoff: float | None = None) -> float

    @staticmethod
    def editops(s1: Sequence[Hashable], s2: Sequence[Hashable], *, processor: Callable | None = None) -> Editops

    @staticmethod
    def opcodes(s1: Sequence[Hashable], s2: Sequence[Hashable], *, processor: Callable | None = None) -> Opcodes
```

**Usage Example:**

```python
from rapidfuzz.distance import Hamming

# Position-wise comparison
dist = Hamming.distance("abcde", "aXcYe")  # 2 (positions 1 and 3 differ)
print(dist)

# Note: recent RapidFuzz versions pad the shorter string by default
# (pad=True); pass pad=False to raise a ValueError for unequal lengths.
```

### Jaro Distance

Measures similarity based on matching characters and their transpositions; well suited to shorter strings. All four methods return values in the range 0.0-1.0, with `distance` equal to `1.0 - similarity` (see the Jaro-Winkler usage example below, which compares both metrics).

```python { .api }
class Jaro:
    @staticmethod
    def distance(s1: Sequence[Hashable], s2: Sequence[Hashable], *, processor: Callable | None = None, score_cutoff: float | None = None) -> float

    @staticmethod
    def similarity(s1: Sequence[Hashable], s2: Sequence[Hashable], *, processor: Callable | None = None, score_cutoff: float | None = None) -> float

    @staticmethod
    def normalized_distance(s1: Sequence[Hashable], s2: Sequence[Hashable], *, processor: Callable | None = None, score_cutoff: float | None = None) -> float

    @staticmethod
    def normalized_similarity(s1: Sequence[Hashable], s2: Sequence[Hashable], *, processor: Callable | None = None, score_cutoff: float | None = None) -> float
```

### Jaro-Winkler Distance

Extension of Jaro distance that gives higher scores to strings with common prefixes.

```python { .api }
class JaroWinkler:
    @staticmethod
    def distance(s1: Sequence[Hashable], s2: Sequence[Hashable], *, prefix_weight: float = 0.1, processor: Callable | None = None, score_cutoff: float | None = None) -> float

    @staticmethod
    def similarity(s1: Sequence[Hashable], s2: Sequence[Hashable], *, prefix_weight: float = 0.1, processor: Callable | None = None, score_cutoff: float | None = None) -> float

    @staticmethod
    def normalized_distance(s1: Sequence[Hashable], s2: Sequence[Hashable], *, prefix_weight: float = 0.1, processor: Callable | None = None, score_cutoff: float | None = None) -> float

    @staticmethod
    def normalized_similarity(s1: Sequence[Hashable], s2: Sequence[Hashable], *, prefix_weight: float = 0.1, processor: Callable | None = None, score_cutoff: float | None = None) -> float
```

**Parameters:**

- `prefix_weight`: Weight for the common-prefix bonus (0.0-0.25, default 0.1)

**Usage Example:**

```python
from rapidfuzz.distance import Jaro, JaroWinkler

s1, s2 = "martha", "marhta"

jaro_sim = Jaro.similarity(s1, s2)
jw_sim = JaroWinkler.similarity(s1, s2)

print(f"Jaro: {jaro_sim:.3f}")         # Jaro: 0.944
print(f"Jaro-Winkler: {jw_sim:.3f}")   # Jaro-Winkler: 0.961 (higher due to the 'mar' prefix)
```

### Indel Distance

Only allows insertions and deletions (no substitutions), so replacing a character costs 2: one deletion plus one insertion (see the example below).

```python { .api }
class Indel:
    @staticmethod
    def distance(s1: Sequence[Hashable], s2: Sequence[Hashable], *, processor: Callable | None = None, score_cutoff: int | None = None) -> int

    @staticmethod
    def similarity(s1: Sequence[Hashable], s2: Sequence[Hashable], *, processor: Callable | None = None, score_cutoff: int | None = None) -> int

    @staticmethod
    def normalized_distance(s1: Sequence[Hashable], s2: Sequence[Hashable], *, processor: Callable | None = None, score_cutoff: float | None = None) -> float

    @staticmethod
    def normalized_similarity(s1: Sequence[Hashable], s2: Sequence[Hashable], *, processor: Callable | None = None, score_cutoff: float | None = None) -> float

    @staticmethod
    def editops(s1: Sequence[Hashable], s2: Sequence[Hashable], *, processor: Callable | None = None) -> Editops

    @staticmethod
    def opcodes(s1: Sequence[Hashable], s2: Sequence[Hashable], *, processor: Callable | None = None) -> Opcodes
```

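**Usage Example:** a quick contrast with Levenshtein on the same pair; the two substitutions in `kitten -> sitting` each become a delete plus an insert under Indel:

```python
from rapidfuzz.distance import Indel, Levenshtein

print(Levenshtein.distance("kitten", "sitting"))  # 3 (2 substitutions + 1 insertion)
print(Indel.distance("kitten", "sitting"))        # 5 (2 * 2 for the substitutions + 1 insertion)
```
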
### OSA (Optimal String Alignment)

Like Damerau-Levenshtein but with the restriction that no substring can be edited more than once (see the example below).

```python { .api }
class OSA:
    @staticmethod
    def distance(s1: Sequence[Hashable], s2: Sequence[Hashable], *, processor: Callable | None = None, score_cutoff: int | None = None) -> int

    @staticmethod
    def similarity(s1: Sequence[Hashable], s2: Sequence[Hashable], *, processor: Callable | None = None, score_cutoff: int | None = None) -> int

    @staticmethod
    def normalized_distance(s1: Sequence[Hashable], s2: Sequence[Hashable], *, processor: Callable | None = None, score_cutoff: float | None = None) -> float

    @staticmethod
    def normalized_similarity(s1: Sequence[Hashable], s2: Sequence[Hashable], *, processor: Callable | None = None, score_cutoff: float | None = None) -> float

    @staticmethod
    def editops(s1: Sequence[Hashable], s2: Sequence[Hashable], *, processor: Callable | None = None) -> Editops

    @staticmethod
    def opcodes(s1: Sequence[Hashable], s2: Sequence[Hashable], *, processor: Callable | None = None) -> Opcodes
```

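**Usage Example:** the textbook pair `"ca" -> "abc"` shows the restriction; true Damerau-Levenshtein may edit a transposed pair again, while OSA may not:

```python
from rapidfuzz.distance import OSA, DamerauLevenshtein

print(DamerauLevenshtein.distance("ca", "abc"))  # 2 (transpose "ca" -> "ac", then insert "b")
print(OSA.distance("ca", "abc"))                 # 3 (the transposed pair cannot be edited again)
```
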
### Longest Common Subsequence

Finds the longest subsequence common to both strings; `similarity` is the length of that subsequence (see the example below).

```python { .api }
class LCSseq:
    @staticmethod
    def distance(s1: Sequence[Hashable], s2: Sequence[Hashable], *, processor: Callable | None = None, score_cutoff: int | None = None) -> int

    @staticmethod
    def similarity(s1: Sequence[Hashable], s2: Sequence[Hashable], *, processor: Callable | None = None, score_cutoff: int | None = None) -> int

    @staticmethod
    def normalized_distance(s1: Sequence[Hashable], s2: Sequence[Hashable], *, processor: Callable | None = None, score_cutoff: float | None = None) -> float

    @staticmethod
    def normalized_similarity(s1: Sequence[Hashable], s2: Sequence[Hashable], *, processor: Callable | None = None, score_cutoff: float | None = None) -> float

    @staticmethod
    def editops(s1: Sequence[Hashable], s2: Sequence[Hashable], *, processor: Callable | None = None) -> Editops

    @staticmethod
    def opcodes(s1: Sequence[Hashable], s2: Sequence[Hashable], *, processor: Callable | None = None) -> Opcodes
```

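**Usage Example:** a minimal sketch for `"kitten"` and `"sitting"`, whose longest common subsequence is `"ittn"`; this assumes, as in current RapidFuzz, that the distance is the longer string's length minus the LCS length:

```python
from rapidfuzz.distance import LCSseq

print(LCSseq.similarity("kitten", "sitting"))  # 4 (length of "ittn")
print(LCSseq.distance("kitten", "sitting"))    # 3 (max length 7 minus LCS length 4)
```
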
### Prefix Distance

Measures the length of the common prefix: `similarity` is the number of shared leading characters (see the example below).

```python { .api }
class Prefix:
    @staticmethod
    def distance(s1: Sequence[Hashable], s2: Sequence[Hashable], *, processor: Callable | None = None, score_cutoff: int | None = None) -> int

    @staticmethod
    def similarity(s1: Sequence[Hashable], s2: Sequence[Hashable], *, processor: Callable | None = None, score_cutoff: int | None = None) -> int

    @staticmethod
    def normalized_distance(s1: Sequence[Hashable], s2: Sequence[Hashable], *, processor: Callable | None = None, score_cutoff: float | None = None) -> float

    @staticmethod
    def normalized_similarity(s1: Sequence[Hashable], s2: Sequence[Hashable], *, processor: Callable | None = None, score_cutoff: float | None = None) -> float
```

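**Usage Example:**

```python
from rapidfuzz.distance import Prefix

# Shared leading characters: "apple " (6 characters, including the space)
print(Prefix.similarity("apple pie", "apple cart"))  # 6
```
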
### Postfix Distance

Measures the length of the common suffix: `similarity` is the number of shared trailing characters (see the example below).

```python { .api }
class Postfix:
    @staticmethod
    def distance(s1: Sequence[Hashable], s2: Sequence[Hashable], *, processor: Callable | None = None, score_cutoff: int | None = None) -> int

    @staticmethod
    def similarity(s1: Sequence[Hashable], s2: Sequence[Hashable], *, processor: Callable | None = None, score_cutoff: int | None = None) -> int

    @staticmethod
    def normalized_distance(s1: Sequence[Hashable], s2: Sequence[Hashable], *, processor: Callable | None = None, score_cutoff: float | None = None) -> float

    @staticmethod
    def normalized_similarity(s1: Sequence[Hashable], s2: Sequence[Hashable], *, processor: Callable | None = None, score_cutoff: float | None = None) -> float
```

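**Usage Example:**

```python
from rapidfuzz.distance import Postfix

# Shared trailing characters: "ing"
print(Postfix.similarity("testing", "running"))  # 3
```
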
## Usage Patterns

### Choosing the Right Metric

- **Levenshtein**: General-purpose, most common choice
- **Damerau-Levenshtein**: When transposition errors are common (typing errors)
- **Hamming**: Fixed-length strings, genetic sequences, checksums
- **Jaro/Jaro-Winkler**: Names, short strings, when prefix similarity is important
- **Indel**: When only insertions/deletions occur
- **OSA**: Alternative to Damerau-Levenshtein with different constraints
- **LCSseq**: When order matters but gaps are allowed
- **Prefix/Postfix**: Hierarchical data, version strings, prefixed identifiers

The snippet after this list compares several metrics on the same typo to make the differences concrete.

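A small comparison on an adjacent-transposition typo (`"grammar"` vs `"gramamr"`); the Jaro-Winkler value shown is approximate:

```python
from rapidfuzz.distance import Levenshtein, DamerauLevenshtein, JaroWinkler

s1, s2 = "grammar", "gramamr"

print(Levenshtein.distance(s1, s2))              # 2 (counted as two substitutions)
print(DamerauLevenshtein.distance(s1, s2))       # 1 (one transposition)
print(round(JaroWinkler.similarity(s1, s2), 2))  # ~0.97 (the long "gram" prefix is rewarded)
```
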
### Distance vs Similarity vs Normalized

```python
from rapidfuzz.distance import Levenshtein

s1, s2 = "kitten", "sitting"

# Raw distance (edit operation count)
dist = Levenshtein.distance(s1, s2)  # 3
# Raw similarity (max_len - distance)
sim = Levenshtein.similarity(s1, s2)  # 4 (7 - 3)
# Normalized distance (0.0 = identical, 1.0 = completely different)
norm_dist = Levenshtein.normalized_distance(s1, s2)  # ~0.43
# Normalized similarity (1.0 = identical, 0.0 = completely different)
norm_sim = Levenshtein.normalized_similarity(s1, s2)  # ~0.57
```

### Edit Operations Analysis

```python
from rapidfuzz.distance import Levenshtein

s1, s2 = "kitten", "sitting"

# Get individual edit operations
editops = Levenshtein.editops(s1, s2)
print(f"Number of operations: {len(editops)}")

for op in editops:
    if op.tag == "replace":
        print(f"Replace '{s1[op.src_pos]}' with '{s2[op.dest_pos]}' at position {op.src_pos}")
    elif op.tag == "insert":
        print(f"Insert '{s2[op.dest_pos]}' at position {op.src_pos}")
    elif op.tag == "delete":
        print(f"Delete '{s1[op.src_pos]}' at position {op.src_pos}")

# Get opcodes (grouped operations)
opcodes = Levenshtein.opcodes(s1, s2)
for opcode in opcodes:
    src_slice = s1[opcode.src_start:opcode.src_end]
    dest_slice = s2[opcode.dest_start:opcode.dest_end]
    print(f"{opcode.tag}: '{src_slice}' -> '{dest_slice}'")
```

### Performance Considerations

```python
from rapidfuzz.distance import Levenshtein

# Use score_cutoff for early termination
dist = Levenshtein.distance("long string", "very different string", score_cutoff=5)
# If the real distance exceeds 5, the calculation stops early
# and score_cutoff + 1 (here 6) is returned instead

# Custom weights for specific use cases
dist = Levenshtein.distance("text", "test", weights=(1, 2, 1))  # deletions cost more

# Process many comparisons efficiently
strings = ["..." for _ in range(1000)]
target = "target"

distances = [Levenshtein.distance(target, s, score_cutoff=3) for s in strings]
# Strings whose distance exceeds the cutoff are abandoned early
```