
# Similarity Computations

Efficient similarity calculations for documents and terms, with support for large-scale corpora through sharded indexing and various distance metrics. Gensim provides both exact and approximate similarity methods optimized for different use cases.

## Capabilities

### Document Similarity

Core similarity computations between documents using various distance metrics and indexing strategies.

```python { .api }
class Similarity:
    """Sharded similarity index for large corpora."""

    def __init__(
        self,
        corpus,
        num_features,
        num_best=None,
        chunksize=256,
        shardsize=32768,
        output_prefix=None
    ): ...

    def __getitem__(self, query): ...
    def get_similarities(self, query): ...
    def add_documents(self, corpus): ...
    def destroy(self): ...

class MatrixSimilarity:
    """Dense similarity matrix stored in memory."""

    def __init__(
        self,
        corpus,
        num_features=None,
        num_best=None,
        dtype=np.float32,
        normalize=True,
        maintain_sparsity=False
    ): ...

    def __getitem__(self, query): ...
    def get_similarities(self, query): ...

class SparseMatrixSimilarity:
    """Sparse similarity matrix for memory efficiency."""

    def __init__(
        self,
        corpus,
        num_features=None,
        num_terms=None,
        num_docs=None,
        num_nnz=None,
        num_best=None,
        chunksize=500,
        dtype=np.float32,
        maintain_sparsity=False
    ): ...

    def __getitem__(self, query): ...
    def get_similarities(self, query): ...

class SoftCosineSimilarity:
    """Soft cosine similarity with term relationship matrix."""

    def __init__(
        self,
        corpus,
        similarity_matrix,
        num_best=None,
        chunksize=256
    ): ...

    def __getitem__(self, query): ...
    def get_similarities(self, query): ...

class WmdSimilarity:
    """Word Mover's Distance similarity using word embeddings."""

    def __init__(
        self,
        corpus,
        w2v_model,
        num_best=None,
        normalize_w2v_and_replace=True,
        chunksize=256
    ): ...

    def __getitem__(self, query): ...
    def get_similarities(self, query): ...
```
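
The `num_best` parameter changes what `__getitem__` returns: with `num_best=None` you get a similarity score against every indexed document, while with `num_best` set you get only the top matches as `(document_index, similarity)` pairs. A minimal sketch of the two query modes, assuming a `corpus_tfidf`, `dictionary`, and `query_tfidf` like the ones built in the usage examples below:

```python
from gensim.similarities import MatrixSimilarity

# Full result: one cosine similarity per indexed document (numpy array).
full_index = MatrixSimilarity(corpus_tfidf, num_features=len(dictionary))
all_sims = full_index[query_tfidf]

# Top-k result: a short list of (document_index, similarity) pairs.
topk_index = MatrixSimilarity(corpus_tfidf, num_features=len(dictionary), num_best=3)
top_sims = topk_index[query_tfidf]

print(f"All similarities: {list(all_sims)}")
print(f"Top-3 similarities: {top_sims}")
```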

### Term Similarity

Similarity computations between individual terms and construction of term similarity matrices.

```python { .api }
class TermSimilarityIndex:
    """Base interface for term similarity computation."""

    def most_similar(self, term, topn=10): ...
    def similarity(self, term1, term2): ...
    def __getitem__(self, term): ...

class UniformTermSimilarityIndex(TermSimilarityIndex):
    """Uniform term similarity (all terms equally similar)."""

    def __init__(self, dictionary, term_similarity=1.0): ...

    def most_similar(self, term, topn=10): ...
    def similarity(self, term1, term2): ...

class WordEmbeddingSimilarityIndex(TermSimilarityIndex):
    """Term similarity based on word embeddings."""

    def __init__(self, keyed_vectors, threshold=0.0, exponent=2.0, kwargs=None): ...

    def most_similar(self, term, topn=10): ...
    def similarity(self, term1, term2): ...
    def __getitem__(self, term): ...

class SparseTermSimilarityMatrix:
    """Sparse matrix representation of term similarities."""

    def __init__(
        self,
        term_similarity_index,
        dictionary=None,
        tfidf=None,
        symmetric=True,
        dominant=False,
        nonzero_limit=100,
        dtype=np.float32
    ): ...

    def inner_product(self, X, Y): ...
    def __getitem__(self, bow): ...
```
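
The term matrix can be used directly: `inner_product` computes a soft-cosine-style inner product between two bag-of-words vectors without building a full `SoftCosineSimilarity` index (the raw product is unnormalized). A minimal sketch, assuming the `dictionary` and `term_index` objects constructed in the usage examples below:

```python
from gensim.similarities import SparseTermSimilarityMatrix

# Build the term-term matrix from any TermSimilarityIndex implementation.
term_matrix = SparseTermSimilarityMatrix(term_index, dictionary)

# Inner product between two bag-of-words documents under the term matrix.
bow1 = dictionary.doc2bow(['human', 'computer', 'interface'])
bow2 = dictionary.doc2bow(['user', 'computer', 'system'])
score = term_matrix.inner_product(bow1, bow2)
print(f"Soft inner product: {score}")
```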

### String Similarity

Similarity computations for raw strings using edit distance metrics.

```python { .api }
class LevenshteinSimilarityIndex:
    """Levenshtein distance-based string similarity."""

    def __init__(self, strings, alpha=1.0, beta=1.0, max_distance=10): ...

    def most_similar(self, query, topn=10): ...
    def __getitem__(self, stringlist): ...
```

## Usage Examples

### Basic Document Similarity

```python
from gensim import corpora, models, similarities
from gensim.test.utils import common_texts

# Create corpus and dictionary
dictionary = corpora.Dictionary(common_texts)
corpus = [dictionary.doc2bow(text) for text in common_texts]

# Create TF-IDF model
tfidf = models.TfidfModel(corpus)
corpus_tfidf = tfidf[corpus]

# Create similarity index
index = similarities.MatrixSimilarity(corpus_tfidf)

# Query with new document
query_doc = ['computer', 'human', 'interface']
query_bow = dictionary.doc2bow(query_doc)
query_tfidf = tfidf[query_bow]

# Get similarities
sims = index[query_tfidf]
print(f"Similarities: {list(enumerate(sims))}")

# Get most similar documents
sims_sorted = sorted(enumerate(sims), key=lambda x: x[1], reverse=True)
print(f"Most similar: {sims_sorted[:3]}")
```

### Large Corpus Similarity with Sharding

```python
from gensim.similarities import Similarity
import tempfile
import os

# Create temporary directory for shards
temp_dir = tempfile.mkdtemp()

# Create sharded similarity index for a large corpus
index = Similarity(
    output_prefix=os.path.join(temp_dir, 'similarity'),
    corpus=corpus_tfidf,
    num_features=len(dictionary),
    shardsize=1000,  # Documents per shard
    num_best=10      # Return top 10 similarities
)

# Query the index (avoid shadowing the gensim.similarities module)
top_sims = index[query_tfidf]
print(f"Top similarities: {top_sims}")

# Clean up shard files
index.destroy()
```
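
The sharded index can also grow in place: `add_documents` appends new vectors to the existing shards, so the index never has to be rebuilt from scratch. A minimal sketch, run before the `destroy()` call above, where `more_corpus_tfidf` is a hypothetical second iterable of TF-IDF vectors with the same `num_features`:

```python
# Append additional documents to the existing shards.
index.add_documents(more_corpus_tfidf)

# Subsequent queries see both the original and the newly added documents.
top_sims = index[query_tfidf]
print(f"Top similarities after adding documents: {top_sims}")
```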

### Soft Cosine Similarity with Term Relationships

```python
from gensim.similarities import SoftCosineSimilarity, SparseTermSimilarityMatrix
from gensim.similarities.termsim import WordEmbeddingSimilarityIndex
from gensim.models import Word2Vec

# Train word embeddings on the tokenized texts
w2v_model = Word2Vec(common_texts, vector_size=100, window=5, min_count=1)

# Create term similarity index
term_index = WordEmbeddingSimilarityIndex(w2v_model.wv)

# Create sparse term similarity matrix
similarity_matrix = SparseTermSimilarityMatrix(term_index, dictionary)

# Create soft cosine similarity index
soft_cosine_index = SoftCosineSimilarity(corpus, similarity_matrix)

# Query with soft cosine similarity
soft_similarities = soft_cosine_index[query_bow]
print(f"Soft cosine similarities: {list(enumerate(soft_similarities))}")
```
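
Soft cosine scores are commonly computed over TF-IDF-weighted vectors rather than raw counts, and the term matrix can take the TF-IDF model into account when selecting which term pairs to keep. A variant sketch using the `tfidf` model from the basic example above:

```python
# Weight the term matrix and the indexed corpus with TF-IDF.
similarity_matrix_tfidf = SparseTermSimilarityMatrix(term_index, dictionary, tfidf)
soft_cosine_index_tfidf = SoftCosineSimilarity(tfidf[corpus], similarity_matrix_tfidf)

soft_similarities_tfidf = soft_cosine_index_tfidf[query_tfidf]
print(f"TF-IDF soft cosine similarities: {list(enumerate(soft_similarities_tfidf))}")
```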

### Word Mover's Distance

```python
from gensim.similarities import WmdSimilarity

# Create WMD similarity index (WMD works on raw token lists, not BOW vectors)
wmd_index = WmdSimilarity(common_texts, w2v_model)

# Query with WMD using the raw tokens of the query document
wmd_similarities = wmd_index[query_doc]
print(f"WMD similarities: {list(enumerate(wmd_similarities))}")
```
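
For a one-off comparison between two documents, the word vectors themselves expose `wmdistance`, which returns a distance (lower means more similar) rather than a similarity score. A minimal sketch, assuming the `w2v_model` trained above; note that WMD needs an optimal-transport backend installed (pyemd or POT, depending on the gensim version):

```python
# Direct pairwise Word Mover's Distance between two token lists.
doc_a = ['human', 'computer', 'interface']
doc_b = ['user', 'computer', 'system']
distance = w2v_model.wv.wmdistance(doc_a, doc_b)
print(f"WMD between documents: {distance:.4f}")
```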

### Term Similarity Operations

```python
from gensim.similarities.termsim import WordEmbeddingSimilarityIndex

# Create term similarity index
term_sim_index = WordEmbeddingSimilarityIndex(w2v_model.wv)

# Find most similar terms
if 'computer' in w2v_model.wv:
    similar_terms = term_sim_index.most_similar('computer', topn=5)
    print(f"Terms similar to 'computer': {list(similar_terms)}")

# Calculate term similarity
if 'computer' in w2v_model.wv and 'system' in w2v_model.wv:
    sim_score = term_sim_index.similarity('computer', 'system')
    print(f"Similarity between 'computer' and 'system': {sim_score}")
```
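
`UniformTermSimilarityIndex` from the API above is occasionally useful as a baseline: every pair of distinct terms gets the same similarity, so any gain from real embeddings is easy to measure against it. A minimal sketch, assuming the `dictionary` from the basic example:

```python
from gensim.similarities.termsim import UniformTermSimilarityIndex

# Baseline index: every distinct term pair is assigned similarity 0.5.
uniform_index = UniformTermSimilarityIndex(dictionary, term_similarity=0.5)
print(f"Uniform neighbours of 'computer': {list(uniform_index.most_similar('computer', topn=3))}")
```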

### String Similarity with Levenshtein Distance

```python
from gensim.similarities import LevenshteinSimilarityIndex

# Create string similarity index
strings = ['computer', 'computing', 'computation', 'system', 'systematic']
string_index = LevenshteinSimilarityIndex(strings)

# Find similar strings
similar_strings = string_index.most_similar('compute', topn=3)
print(f"Strings similar to 'compute': {list(similar_strings)}")
```

### Batch Similarity Queries

```python
# Query multiple documents at once
queries = [
    dictionary.doc2bow(['computer', 'interface']),
    dictionary.doc2bow(['human', 'system']),
    dictionary.doc2bow(['response', 'time'])
]

# Get similarities for all queries
for i, query in enumerate(queries):
    query_tfidf = tfidf[query]
    sims = index[query_tfidf]
    top_sim = max(enumerate(sims), key=lambda x: x[1])
    print(f"Query {i+1} most similar to doc {top_sim[0]} (score: {top_sim[1]:.3f})")
```

### Similarity Index Persistence

```python
# Save similarity index
index.save('/tmp/similarity_index.index')

# Load similarity index
loaded_index = similarities.MatrixSimilarity.load('/tmp/similarity_index.index')

# Verify loaded index works
test_sims = loaded_index[query_tfidf]
print(f"Loaded index similarities: {list(enumerate(test_sims))}")
```
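
The sharded `Similarity` index can be persisted the same way; the extra consideration is that its shard files live under `output_prefix` and must stay on disk alongside the saved index (calling `destroy()` removes them). A minimal sketch, assuming `shard_index` is a sharded index built as in the sharding example and `temp_dir` is its shard directory:

```python
from gensim.similarities import Similarity
import os

# Persist the sharded index; its shard files under output_prefix must remain available.
shard_index_path = os.path.join(temp_dir, 'similarity.index')
shard_index.save(shard_index_path)

# Reload and query as before.
loaded_shard_index = Similarity.load(shard_index_path)
print(f"Reloaded sharded similarities: {loaded_shard_index[query_tfidf]}")
```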

### Memory-Efficient Sparse Similarity

```python
from gensim.similarities import SparseMatrixSimilarity

# Create sparse similarity index for memory efficiency
sparse_index = SparseMatrixSimilarity(
    corpus_tfidf,
    num_features=len(dictionary),
    num_best=5,              # Only return the top 5 similarities per query
    maintain_sparsity=True
)

# Query sparse index
sparse_sims = sparse_index[query_tfidf]
print(f"Sparse similarities: {sparse_sims}")
```
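
The index classes are also iterable: looping over an index yields, for each stored document in turn, its similarities to the whole corpus, which is a convenient way to compute an all-pairs similarity matrix chunk by chunk. A minimal sketch using the `sparse_index` built above:

```python
# Iterate the index to get each document's similarities against the full corpus.
for doc_position, doc_sims in enumerate(sparse_index):
    print(f"Document {doc_position}: {doc_sims}")
```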