# Similarity Computations

Efficient similarity calculations for documents and terms with support for large-scale corpora through sharded indexing and various distance metrics. Gensim provides both exact and approximate similarity methods optimized for different use cases.

## Capabilities

### Document Similarity

Core similarity computations between documents using various distance metrics and indexing strategies.

```python { .api }
class Similarity:
    """Sharded similarity index for large corpora."""

    def __init__(
        self,
        corpus,
        num_features,
        num_best=None,
        chunksize=256,
        shardsize=32768,
        output_prefix=None
    ): ...

    def __getitem__(self, query): ...
    def get_similarities(self, query): ...
    def add_documents(self, corpus): ...
    def destroy(self): ...

class MatrixSimilarity:
    """Dense similarity matrix stored in memory."""

    def __init__(
        self,
        corpus,
        num_features=None,
        num_best=None,
        dtype=np.float32,
        normalize=True,
        maintain_sparsity=False
    ): ...

    def __getitem__(self, query): ...
    def get_similarities(self, query): ...

class SparseMatrixSimilarity:
    """Sparse similarity matrix for memory efficiency."""

    def __init__(
        self,
        corpus,
        num_features=None,
        num_terms=None,
        num_docs=None,
        num_nnz=None,
        num_best=None,
        chunksize=500,
        dtype=np.float32,
        maintain_sparsity=False
    ): ...

    def __getitem__(self, query): ...
    def get_similarities(self, query): ...

class SoftCosineSimilarity:
    """Soft cosine similarity with term relationship matrix."""

    def __init__(
        self,
        corpus,
        similarity_matrix,
        num_best=None,
        chunksize=256
    ): ...

    def __getitem__(self, query): ...
    def get_similarities(self, query): ...

class WmdSimilarity:
    """Word Mover's Distance similarity using word embeddings."""

    def __init__(
        self,
        corpus,
        w2v_model,
        num_best=None,
        normalize_w2v_and_replace=True,
        chunksize=256
    ): ...

    def __getitem__(self, query): ...
    def get_similarities(self, query): ...
```

### Term Similarity

Similarity computations between individual terms and construction of term similarity matrices.

```python { .api }
class TermSimilarityIndex:
    """Base interface for term similarity computation."""

    def most_similar(self, term, topn=10): ...
    def similarity(self, term1, term2): ...
    def __getitem__(self, term): ...

class UniformTermSimilarityIndex(TermSimilarityIndex):
    """Uniform term similarity (all terms equally similar)."""

    def __init__(self, dictionary, term_similarity=1.0): ...

    def most_similar(self, term, topn=10): ...
    def similarity(self, term1, term2): ...

class WordEmbeddingSimilarityIndex(TermSimilarityIndex):
    """Term similarity based on word embeddings."""

    def __init__(self, keyed_vectors, threshold=0.0, exponent=2.0, kwargs=None): ...

    def most_similar(self, term, topn=10): ...
    def similarity(self, term1, term2): ...
    def __getitem__(self, term): ...

class SparseTermSimilarityMatrix:
    """Sparse matrix representation of term similarities."""

    def __init__(
        self,
        term_similarity_index,
        dictionary=None,
        tfidf=None,
        symmetric=True,
        dominant=False,
        nonzero_limit=100,
        dtype=np.float32
    ): ...

    def inner_product(self, X, Y): ...
    def __getitem__(self, bow): ...
```

### String Similarity

Similarity computations for raw strings using edit distance metrics.

```python { .api }
class LevenshteinSimilarityIndex:
    """Levenshtein distance-based string similarity."""

    def __init__(self, strings, alpha=1.0, beta=1.0, max_distance=10): ...

    def most_similar(self, query, topn=10): ...
    def __getitem__(self, stringlist): ...
```

## Usage Examples

### Basic Document Similarity

```python
from gensim import corpora, models, similarities
from gensim.test.utils import common_texts

# Create corpus and dictionary
dictionary = corpora.Dictionary(common_texts)
corpus = [dictionary.doc2bow(text) for text in common_texts]

# Create TF-IDF model
tfidf = models.TfidfModel(corpus)
corpus_tfidf = tfidf[corpus]

# Create similarity index
index = similarities.MatrixSimilarity(corpus_tfidf)

# Query with new document
query_doc = ['computer', 'human', 'interface']
query_bow = dictionary.doc2bow(query_doc)
query_tfidf = tfidf[query_bow]

# Get similarities
sims = index[query_tfidf]
print(f"Similarities: {list(enumerate(sims))}")

# Get most similar documents
sims_sorted = sorted(enumerate(sims), key=lambda x: x[1], reverse=True)
print(f"Most similar: {sims_sorted[:3]}")
```

### Large Corpus Similarity with Sharding

```python
from gensim.similarities import Similarity
import tempfile
import os

# Create temporary directory for shards
temp_dir = tempfile.mkdtemp()

# Create sharded similarity index for large corpus
index = Similarity(
    output_prefix=os.path.join(temp_dir, 'similarity'),
    corpus=corpus_tfidf,
    num_features=len(dictionary),
    shardsize=1000,  # Documents per shard
    num_best=10  # Return top 10 similarities
)

# Query similarity (result named `sims` so it does not shadow the
# `gensim.similarities` module imported in the other examples)
sims = index[query_tfidf]
print(f"Top similarities: {sims}")

# Clean up
index.destroy()
```

### Soft Cosine Similarity with Term Relationships

```python
from gensim.similarities import SoftCosineSimilarity, SparseTermSimilarityMatrix
from gensim.similarities.termsim import WordEmbeddingSimilarityIndex
from gensim.models import Word2Vec

# Train word embeddings
sentences = list(common_texts)
w2v_model = Word2Vec(sentences, vector_size=100, window=5, min_count=1)

# Create term similarity index
term_index = WordEmbeddingSimilarityIndex(w2v_model.wv)

# Create sparse term similarity matrix
similarity_matrix = SparseTermSimilarityMatrix(term_index, dictionary)

# Create soft cosine similarity index
soft_cosine_index = SoftCosineSimilarity(corpus, similarity_matrix)

# Query with soft cosine similarity
soft_similarities = soft_cosine_index[query_bow]
print(f"Soft cosine similarities: {list(enumerate(soft_similarities))}")
```

### Word Mover's Distance

```python
from gensim.similarities import WmdSimilarity

# Create WMD similarity index (requires word embeddings).
# Note: WMD operates on raw token lists, not BOW vectors — index the
# tokenized texts rather than the BOW corpus.
wmd_index = WmdSimilarity(common_texts, w2v_model)

# Query with WMD using raw tokens
wmd_similarities = wmd_index[query_doc]
print(f"WMD similarities: {list(enumerate(wmd_similarities))}")
```

### Term Similarity Operations

```python
from gensim.similarities.termsim import WordEmbeddingSimilarityIndex

# Create term similarity index
term_sim_index = WordEmbeddingSimilarityIndex(w2v_model.wv)

# Find most similar terms
if 'computer' in w2v_model.wv:
    similar_terms = term_sim_index.most_similar('computer', topn=5)
    print(f"Terms similar to 'computer': {similar_terms}")

# Calculate term similarity
if 'computer' in w2v_model.wv and 'system' in w2v_model.wv:
    sim_score = term_sim_index.similarity('computer', 'system')
    print(f"Similarity between 'computer' and 'system': {sim_score}")
```

### String Similarity with Levenshtein Distance

```python
from gensim.similarities import LevenshteinSimilarityIndex

# Create string similarity index
strings = ['computer', 'computing', 'computation', 'system', 'systematic']
string_index = LevenshteinSimilarityIndex(strings)

# Find similar strings (most_similar takes a single query string)
similar_strings = string_index.most_similar('compute', topn=3)
print(f"Strings similar to 'compute': {similar_strings}")
```

### Batch Similarity Queries

```python
# Query multiple documents at once
queries = [
    dictionary.doc2bow(['computer', 'interface']),
    dictionary.doc2bow(['human', 'system']),
    dictionary.doc2bow(['response', 'time'])
]

# Get similarities for all queries
for i, query in enumerate(queries):
    query_tfidf = tfidf[query]
    sims = index[query_tfidf]
    top_sim = max(enumerate(sims), key=lambda x: x[1])
    print(f"Query {i+1} most similar to doc {top_sim[0]} (score: {top_sim[1]:.3f})")
```

### Similarity Index Persistence

```python
# Save similarity index
index.save('/tmp/similarity_index.index')

# Load similarity index
loaded_index = similarities.MatrixSimilarity.load('/tmp/similarity_index.index')

# Verify loaded index works
test_sims = loaded_index[query_tfidf]
print(f"Loaded index similarities: {list(enumerate(test_sims))}")
```

### Memory-Efficient Sparse Similarity

```python
from gensim.similarities import SparseMatrixSimilarity

# Create sparse similarity index for memory efficiency
sparse_index = SparseMatrixSimilarity(
    corpus_tfidf,
    num_features=len(dictionary),
    num_best=5,  # Only store top 5 similarities
    maintain_sparsity=True
)

# Query sparse index
sparse_sims = sparse_index[query_tfidf]
print(f"Sparse similarities: {sparse_sims}")
```