or run

npx @tessl/cli init
Log in

Version

Tile

Overview

Evals

Files

Files

docs

edit-distance.mdfactory-utilities.mdindex.mdngram-shingle.mdphonetic-record-linkage.mdsequence-based.md

factory-utilities.mddocs/

0

# Factory and Utility Classes

1

2

Factory pattern for algorithm instantiation and utility interfaces for customizing algorithm behavior. These components provide convenient ways to work with multiple algorithms and customize their operation.

3

4

## Capabilities

5

6

### Algorithm Factory

7

8

The Factory class provides a convenient way to instantiate algorithms using an enumeration, making it easy to switch between different similarity measures programmatically.

9

10

```python { .api }

11

from enum import IntEnum

12

13

class Algorithm(IntEnum):

14

"""Enumeration of available string similarity algorithms."""

15

COSINE = 1

16

DAMERAU = 2

17

JACCARD = 3

18

JARO_WINKLE = 4

19

LEVENSHTEIN = 5

20

LCS = 6

21

METRIC_LCS = 7

22

N_GRAM = 8

23

NORMALIZED_LEVENSHTEIN = 9

24

OPTIMAL_STRING_ALIGNMENT = 10

25

Q_GRAM = 11

26

SORENSEN_DICE = 12

27

WEIGHTED_LEVENSHTEIN = 13

28

29

class Factory:

30

"""Factory for creating string similarity algorithm instances."""

31

32

@staticmethod

33

def get_algorithm(algorithm: Algorithm, k: int = 3):

34

"""

35

Create an algorithm instance using the factory pattern.

36

37

Args:

38

algorithm: Algorithm type from Algorithm enum

39

k: Shingle size for n-gram based algorithms (default: 3)

40

41

Returns:

42

Algorithm instance ready for use

43

44

Raises:

45

TypeError: For WEIGHTED_LEVENSHTEIN (use get_weighted_levenshtein instead)

46

"""

47

48

@staticmethod

49

def get_weighted_levenshtein(char_sub: CharacterSubstitutionInterface,

50

char_change: CharacterInsDelInterface):

51

"""

52

Create a WeightedLevenshtein instance with custom cost interfaces.

53

54

Args:

55

char_sub: Interface defining character substitution costs

56

char_change: Interface defining insertion/deletion costs

57

58

Returns:

59

WeightedLevenshtein: Configured weighted Levenshtein algorithm

60

"""

61

```

62

63

**Usage Examples:**

64

65

```python

66

from similarity.similarity import Factory, Algorithm

67

68

# Basic factory usage

69

levenshtein = Factory.get_algorithm(Algorithm.LEVENSHTEIN)

70

distance = levenshtein.distance('hello', 'hallo')

71

72

# N-gram algorithms with custom k

73

cosine = Factory.get_algorithm(Algorithm.COSINE, k=4)

74

jaccard = Factory.get_algorithm(Algorithm.JACCARD, k=2)

75

76

# Programmatic algorithm selection

77

def compare_strings(s1, s2, algorithm_name):

78

"""Compare strings using specified algorithm."""

79

algo_map = {

80

'levenshtein': Algorithm.LEVENSHTEIN,

81

'jaro_winkler': Algorithm.JARO_WINKLE,

82

'cosine': Algorithm.COSINE,

83

'jaccard': Algorithm.JACCARD

84

}

85

86

if algorithm_name not in algo_map:

87

raise ValueError(f"Unknown algorithm: {algorithm_name}")

88

89

algorithm = Factory.get_algorithm(algo_map[algorithm_name])

90

91

if hasattr(algorithm, 'similarity'):

92

return algorithm.similarity(s1, s2)

93

else:

94

return algorithm.distance(s1, s2)

95

96

# Algorithm comparison utility

97

def compare_with_all_algorithms(s1, s2):

98

"""Compare two strings using all available algorithms."""

99

results = {}

100

101

# Distance-only algorithms

102

distance_algos = [

103

Algorithm.LEVENSHTEIN,

104

Algorithm.DAMERAU,

105

Algorithm.LCS,

106

Algorithm.OPTIMAL_STRING_ALIGNMENT,

107

Algorithm.Q_GRAM,

108

Algorithm.N_GRAM

109

]

110

111

for algo in distance_algos:

112

instance = Factory.get_algorithm(algo)

113

results[algo.name] = {'distance': instance.distance(s1, s2)}

114

115

# Similarity/distance algorithms

116

sim_dist_algos = [

117

Algorithm.JARO_WINKLE,

118

Algorithm.NORMALIZED_LEVENSHTEIN,

119

Algorithm.COSINE,

120

Algorithm.JACCARD,

121

Algorithm.SORENSEN_DICE,

122

Algorithm.METRIC_LCS

123

]

124

125

for algo in sim_dist_algos:

126

instance = Factory.get_algorithm(algo)

127

results[algo.name] = {

128

'similarity': instance.similarity(s1, s2),

129

'distance': instance.distance(s1, s2)

130

}

131

132

return results

133

134

# Example usage

135

results = compare_with_all_algorithms('programming', 'program')

136

for algo_name, scores in results.items():

137

print(f"{algo_name}: {scores}")

138

```

139

140

### Weighted Levenshtein Factory Method

141

142

Special factory method for creating WeightedLevenshtein instances with custom cost functions:

143

144

**Usage Examples:**

145

146

```python

147

from similarity.similarity import Factory

148

from similarity.weighted_levenshtein import CharacterSubstitutionInterface, CharacterInsDelInterface

149

150

# Custom substitution costs for OCR

151

class OCRSubstitution(CharacterSubstitutionInterface):

152

def cost(self, c0, c1):

153

# Visually similar characters have lower cost

154

similar_pairs = [

155

('o', '0'), ('O', '0'),

156

('l', '1'), ('I', '1'),

157

('S', '5'), ('s', '5'),

158

('Z', '2'), ('z', '2')

159

]

160

161

if (c0, c1) in similar_pairs or (c1, c0) in similar_pairs:

162

return 0.3

163

elif c0.lower() == c1.lower(): # Case differences

164

return 0.1

165

else:

166

return 1.0

167

168

# Custom insertion/deletion costs

169

class KeyboardInsDelCosts(CharacterInsDelInterface):

170

def insertion_cost(self, c):

171

# Common accidentally typed characters

172

if c in ' \t\n':

173

return 0.5 # Whitespace often accidental

174

return 1.0

175

176

def deletion_cost(self, c):

177

# Vowels are often dropped in informal text

178

if c.lower() in 'aeiou':

179

return 0.7

180

return 1.0

181

182

# Create weighted Levenshtein with custom costs

183

substitution = OCRSubstitution()

184

ins_del = KeyboardInsDelCosts()

185

weighted_lev = Factory.get_weighted_levenshtein(substitution, ins_del)

186

187

# OCR correction example

188

ocr_text = "Th3 qu1ck br0wn f0x"

189

correct_text = "The quick brown fox"

190

distance = weighted_lev.distance(ocr_text, correct_text)

191

print(f"Weighted distance: {distance}")

192

193

# Standard Levenshtein for comparison

194

standard_lev = Factory.get_algorithm(Algorithm.LEVENSHTEIN)

195

standard_distance = standard_lev.distance(ocr_text, correct_text)

196

print(f"Standard distance: {standard_distance}")

197

```

198

199

### Utility Interfaces

200

201

Base interfaces that can be implemented to customize algorithm behavior:

202

203

```python { .api }

204

class CharacterSubstitutionInterface:

205

"""Interface for defining custom character substitution costs."""

206

207

def cost(self, c0: str, c1: str) -> float:

208

"""

209

Return the cost of substituting character c0 with c1.

210

211

Args:

212

c0: Original character

213

c1: Replacement character

214

215

Returns:

216

float: Cost of substitution (typically 0.0-1.0)

217

"""

218

219

class CharacterInsDelInterface:

220

"""Interface for defining custom insertion and deletion costs."""

221

222

def deletion_cost(self, c: str) -> float:

223

"""

224

Return the cost of deleting character c.

225

226

Args:

227

c: Character to delete

228

229

Returns:

230

float: Cost of deletion

231

"""

232

233

def insertion_cost(self, c: str) -> float:

234

"""

235

Return the cost of inserting character c.

236

237

Args:

238

c: Character to insert

239

240

Returns:

241

float: Cost of insertion

242

"""

243

```

244

245

**Advanced Customization Examples:**

246

247

```python

248

# Domain-specific costs for name matching

249

class NameMatchingCosts(CharacterSubstitutionInterface):

250

def cost(self, c0, c1):

251

# Common name variations

252

name_variants = {

253

('c', 'k'), ('ph', 'f'), ('y', 'i'),

254

('ie', 'y'), ('son', 'sen'), ('sen', 'son')

255

}

256

257

# Handle multi-character patterns

258

pair = (c0, c1)

259

if pair in name_variants or (c1, c0) in name_variants:

260

return 0.2

261

262

# Phonetically similar

263

phonetic_groups = [

264

'bp', 'dt', 'kg', 'sz', 'fv'

265

]

266

267

for group in phonetic_groups:

268

if c0 in group and c1 in group:

269

return 0.4

270

271

return 1.0

272

273

# Keyboard layout based costs

274

class QWERTYSubstitution(CharacterSubstitutionInterface):

275

def __init__(self):

276

# Define keyboard layout

277

self.keyboard = [

278

'qwertyuiop',

279

'asdfghjkl',

280

'zxcvbnm'

281

]

282

self.positions = {}

283

for row, keys in enumerate(self.keyboard):

284

for col, key in enumerate(keys):

285

self.positions[key] = (row, col)

286

287

def cost(self, c0, c1):

288

if c0 == c1:

289

return 0.0

290

291

c0, c1 = c0.lower(), c1.lower()

292

293

if c0 in self.positions and c1 in self.positions:

294

pos0 = self.positions[c0]

295

pos1 = self.positions[c1]

296

297

# Manhattan distance on keyboard

298

distance = abs(pos0[0] - pos1[0]) + abs(pos0[1] - pos1[1])

299

300

# Adjacent keys have lower cost

301

if distance == 1:

302

return 0.3

303

elif distance == 2:

304

return 0.6

305

306

return 1.0

307

308

# Usage with factory

309

keyboard_costs = QWERTYSubstitution()

310

weighted_lev = Factory.get_weighted_levenshtein(keyboard_costs, None)

311

312

# Typo detection

313

typos = [

314

("hello", "heklo"), # Adjacent key error

315

("world", "qorld"), # q instead of w

316

("python", "pytjon") # j instead of h

317

]

318

319

for correct, typo in typos:

320

distance = weighted_lev.distance(correct, typo)

321

print(f"'{correct}' vs '{typo}': {distance}")

322

```

323

324

### Error Handling

325

326

All factory methods and interfaces include proper error handling:

327

328

- **TypeError**: Raised when attempting to create WEIGHTED_LEVENSHTEIN through `get_algorithm()`

329

- **ValueError**: May be raised by custom interface implementations for invalid inputs

330

- **AttributeError**: Raised when accessing methods not supported by specific algorithms

331

332

**Example Error Handling:**

333

334

```python

335

from similarity.similarity import Factory, Algorithm

336

337

try:

338

# This will raise TypeError

339

weighted = Factory.get_algorithm(Algorithm.WEIGHTED_LEVENSHTEIN)

340

except TypeError as e:

341

print(f"Error: {e}")

342

print("Use get_weighted_levenshtein() instead")

343

344

# Proper way to handle different algorithm types

345

def safe_get_similarity(s1, s2, algorithm_type):

346

"""Safely get similarity score from any algorithm."""

347

try:

348

algorithm = Factory.get_algorithm(algorithm_type)

349

350

if hasattr(algorithm, 'similarity'):

351

return algorithm.similarity(s1, s2)

352

else:

353

# Convert distance to similarity (approximate)

354

distance = algorithm.distance(s1, s2)

355

max_len = max(len(s1), len(s2))

356

return max(0, 1 - distance / max_len) if max_len > 0 else 1.0

357

358

except TypeError:

359

return None # Weighted Levenshtein requires special handling

360

```