Python library for topic modelling, document indexing and similarity retrieval with large corpora
Efficient similarity calculations for documents and terms with support for large-scale corpora through sharded indexing and various distance metrics. Gensim provides both exact and approximate similarity methods optimized for different use cases.
Core similarity computations between documents using various distance metrics and indexing strategies.
class Similarity:
    """Sharded similarity index for large corpora."""
    def __init__(
        self,
        output_prefix,
        corpus,
        num_features,
        num_best=None,
        chunksize=256,
        shardsize=32768
    ): ...
    def __getitem__(self, query): ...
    def get_similarities(self, query): ...
    def add_documents(self, corpus): ...
    def destroy(self): ...
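A sharded index can also be grown in place with add_documents. A minimal sketch of incremental indexing, assuming `corpus` and `more_documents` are bag-of-words corpora over the same dictionary (`more_documents` is a hypothetical second batch):
from gensim.similarities import Similarity
# Sketch: shards are persisted on disk under the given prefix
index = Similarity(output_prefix='/tmp/sim_shards', corpus=corpus, num_features=len(dictionary))
index.add_documents(more_documents)  # hypothetical batch, appended to the existing shards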
class MatrixSimilarity:
    """Dense similarity matrix stored in memory."""
    def __init__(
        self,
        corpus,
        num_features=None,
        num_best=None,
        dtype=np.float32,
        normalize=True,
        maintain_sparsity=False
    ): ...
    def __getitem__(self, query): ...
    def get_similarities(self, query): ...
class SparseMatrixSimilarity:
    """Sparse similarity matrix for memory efficiency."""
    def __init__(
        self,
        corpus,
        num_features=None,
        num_terms=None,
        num_docs=None,
        num_nnz=None,
        num_best=None,
        chunksize=500,
        dtype=np.float32,
        maintain_sparsity=False
    ): ...
    def __getitem__(self, query): ...
    def get_similarities(self, query): ...
class SoftCosineSimilarity:
    """Soft cosine similarity with term relationship matrix."""
    def __init__(
        self,
        corpus,
        similarity_matrix,
        num_best=None,
        chunksize=256
    ): ...
    def __getitem__(self, query): ...
    def get_similarities(self, query): ...
class WmdSimilarity:
    """Word Mover's Distance similarity using word embeddings."""
    def __init__(
        self,
        corpus,
        w2v_model,
        num_best=None,
        normalize_w2v_and_replace=True,
        chunksize=256
    ): ...
    def __getitem__(self, query): ...
    def get_similarities(self, query): ...

Similarity computations between individual terms and construction of term similarity matrices.
class TermSimilarityIndex:
    """Base interface for term similarity computation."""
    def most_similar(self, term, topn=10): ...
    def similarity(self, term1, term2): ...
    def __getitem__(self, term): ...
class UniformTermSimilarityIndex(TermSimilarityIndex):
    """Uniform term similarity (all terms equally similar)."""
    def __init__(self, dictionary, term_similarity=1.0): ...
    def most_similar(self, term, topn=10): ...
    def similarity(self, term1, term2): ...
class WordEmbeddingSimilarityIndex(TermSimilarityIndex):
    """Term similarity based on word embeddings."""
    def __init__(self, keyed_vectors, threshold=0.0, exponent=2.0, kwargs=None): ...
    def most_similar(self, term, topn=10): ...
    def similarity(self, term1, term2): ...
    def __getitem__(self, term): ...
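The threshold and exponent parameters of WordEmbeddingSimilarityIndex control how raw embedding cosine similarities become term similarities: neighbours at or below threshold are dropped, and surviving scores are raised to exponent (this reading follows the gensim implementation; verify on your version). A minimal sketch, assuming a trained w2v_model:
# Sketch: keep only close embedding neighbours and sharpen their scores
strict_index = WordEmbeddingSimilarityIndex(w2v_model.wv, threshold=0.5, exponent=3.0)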
class SparseTermSimilarityMatrix:
    """Sparse matrix representation of term similarities."""
    def __init__(
        self,
        term_similarity_index,
        dictionary=None,
        tfidf=None,
        symmetric=True,
        dominant=False,
        nonzero_limit=100,
        dtype=np.float32
    ): ...
    def inner_product(self, X, Y): ...
    def __getitem__(self, bow): ...

Similarity computations for raw strings using edit distance metrics.
class LevenshteinSimilarityIndex:
    """Levenshtein distance-based similarity between dictionary terms."""
    def __init__(self, dictionary, alpha=1.8, beta=5.0, max_distance=100): ...
    def most_similar(self, term, topn=10): ...
    def __getitem__(self, term): ...
from gensim import corpora, models, similarities
from gensim.test.utils import common_texts
# Create corpus and dictionary
dictionary = corpora.Dictionary(common_texts)
corpus = [dictionary.doc2bow(text) for text in common_texts]
# Create TF-IDF model
tfidf = models.TfidfModel(corpus)
corpus_tfidf = tfidf[corpus]
# Create similarity index
index = similarities.MatrixSimilarity(corpus_tfidf)
# Query with new document
query_doc = ['computer', 'human', 'interface']
query_bow = dictionary.doc2bow(query_doc)
query_tfidf = tfidf[query_bow]
# Get similarities
sims = index[query_tfidf]
print(f"Similarities: {list(enumerate(sims))}")
# Get most similar documents
sims_sorted = sorted(enumerate(sims), key=lambda x: x[1], reverse=True)
print(f"Most similar: {sims_sorted[:3]}")from gensim.similarities import Similarity
from gensim.similarities import Similarity
import tempfile
import os
# Create temporary directory for shards
temp_dir = tempfile.mkdtemp()
# Create sharded similarity index for large corpus
index = Similarity(
    output_prefix=os.path.join(temp_dir, 'similarity'),
    corpus=corpus_tfidf,
    num_features=len(dictionary),
    shardsize=1000,  # documents per shard
    num_best=10  # return the top 10 similarities per query
)
# Query the index; with num_best set, results are (doc_id, similarity) pairs
sims = index[query_tfidf]
print(f"Top similarities: {sims}")
# Clean up
index.destroy()
from gensim.similarities import SoftCosineSimilarity, SparseTermSimilarityMatrix
from gensim.similarities.termsim import WordEmbeddingSimilarityIndex
from gensim.models import Word2Vec
# Train word embeddings
w2v_model = Word2Vec(common_texts, vector_size=100, window=5, min_count=1)
# Create term similarity index
term_index = WordEmbeddingSimilarityIndex(w2v_model.wv)
# Create sparse term similarity matrix
similarity_matrix = SparseTermSimilarityMatrix(term_index, dictionary)
# Create soft cosine similarity index
soft_cosine_index = SoftCosineSimilarity(corpus, similarity_matrix)
# Query with soft cosine similarity
soft_similarities = soft_cosine_index[query_bow]
print(f"Soft cosine similarities: {list(enumerate(soft_similarities))}")from gensim.similarities import WmdSimilarity
from gensim.similarities import WmdSimilarity
# Create WMD similarity index over tokenized documents (WMD works on raw tokens, not BOW vectors)
wmd_index = WmdSimilarity(common_texts, w2v_model)
# Query with WMD
wmd_similarities = wmd_index[query_doc] # Note: WMD uses raw tokens, not BOW
print(f"WMD similarities: {list(enumerate(wmd_similarities))}")from gensim.similarities.termsim import WordEmbeddingSimilarityIndex
from gensim.similarities.termsim import WordEmbeddingSimilarityIndex
# Create term similarity index
term_sim_index = WordEmbeddingSimilarityIndex(w2v_model.wv)
# Find most similar terms
if 'computer' in w2v_model.wv:
    similar_terms = list(term_sim_index.most_similar('computer', topn=5))
    print(f"Terms similar to 'computer': {similar_terms}")
# Calculate term similarity
if 'computer' in w2v_model.wv and 'system' in w2v_model.wv:
    sim_score = term_sim_index.similarity('computer', 'system')
    print(f"Similarity between 'computer' and 'system': {sim_score}")
from gensim.similarities import LevenshteinSimilarityIndex
# Build a dictionary over the strings to compare; the index computes
# Levenshtein-based similarities between the terms of that dictionary
strings = ['computer', 'computing', 'computation', 'system', 'systematic']
string_dictionary = corpora.Dictionary([strings])
string_index = LevenshteinSimilarityIndex(string_dictionary)
# Find similar strings
similar_strings = list(string_index.most_similar('compute', topn=3))
print(f"Strings similar to 'compute': {similar_strings}")# Query multiple documents at once
# Query multiple documents at once (rebuild the in-memory index; the sharded one was destroyed above)
index = similarities.MatrixSimilarity(corpus_tfidf)
queries = [
    dictionary.doc2bow(['computer', 'interface']),
    dictionary.doc2bow(['human', 'system']),
    dictionary.doc2bow(['response', 'time'])
]
# Get similarities for all queries
for i, query in enumerate(queries):
    query_tfidf = tfidf[query]
    sims = index[query_tfidf]
    top_sim = max(enumerate(sims), key=lambda x: x[1])
    print(f"Query {i+1} most similar to doc {top_sim[0]} (score: {top_sim[1]:.3f})")
# Save similarity index
index.save('/tmp/similarity_index.index')
# Load similarity index
loaded_index = similarities.MatrixSimilarity.load('/tmp/similarity_index.index')
# Verify loaded index works
test_sims = loaded_index[query_tfidf]
print(f"Loaded index similarities: {list(enumerate(test_sims))}")from gensim.similarities import SparseMatrixSimilarity
from gensim.similarities import SparseMatrixSimilarity
# Create sparse similarity index for memory efficiency
sparse_index = SparseMatrixSimilarity(
    corpus_tfidf,
    num_features=len(dictionary),
    num_best=5,  # return only the top 5 similarities per query
    maintain_sparsity=True
)
# Query sparse index
sparse_sims = sparse_index[query_tfidf]
print(f"Sparse similarities: {sparse_sims}")Install with Tessl CLI
npx tessl i tessl/pypi-gensimdocs