CtrlK
BlogDocsLog inGet started
Tessl Logo

tessl/pypi-gensim

Python library for topic modelling, document indexing and similarity retrieval with large corpora

78

1.02x
Overview
Eval results
Files

docs/similarity-computations.md

Similarity Computations

Efficient similarity calculations for documents and terms with support for large-scale corpora through sharded indexing and various distance metrics. Gensim provides both exact and approximate similarity methods optimized for different use cases.

Capabilities

Document Similarity

Core similarity computations between documents using various distance metrics and indexing strategies.

class Similarity:
    """Sharded similarity index for large corpora.

    Stores the index in fixed-size shards on disk (see ``shardsize`` and the
    ``destroy`` method) so corpora larger than memory can be queried.
    NOTE(review): parameter semantics below are inferred from names/defaults
    and the usage examples in this document — confirm against gensim docs.
    """

    def __init__(
        self,
        corpus,             # documents to index (bag-of-words iterable, presumably)
        num_features,       # dimensionality of the feature/vector space
        num_best=None,      # if set, queries return only the top-N matches
        chunksize=256,      # documents processed per batch — TODO confirm
        shardsize=32768,    # documents stored per on-disk shard
        output_prefix=None  # filename prefix for the shard files on disk
    ): ...

    def __getitem__(self, query): ...       # similarities of `query` against the indexed docs
    def get_similarities(self, query): ...  # raw similarity computation for `query`
    def add_documents(self, corpus): ...    # append further documents to the index
    def destroy(self): ...                  # remove the on-disk shard files

class MatrixSimilarity:
    """Dense similarity matrix stored in memory.

    NOTE(review): parameter semantics are inferred from names/defaults —
    confirm against gensim docs.
    """

    def __init__(
        self,
        corpus,                  # documents to index (bag-of-words iterable, presumably)
        num_features=None,       # dimensionality of the vector space
        num_best=None,           # if set, queries return only the top-N matches
        dtype=np.float32,        # numpy dtype of the in-memory matrix
        normalize=True,          # normalize vectors before comparison — presumably cosine
        maintain_sparsity=False  # keep query results sparse where possible — TODO confirm
    ): ...

    def __getitem__(self, query): ...       # similarities of `query` against the indexed docs
    def get_similarities(self, query): ...  # raw similarity computation for `query`

class SparseMatrixSimilarity:
    """Sparse similarity matrix for memory efficiency.

    NOTE(review): parameter semantics are inferred from names/defaults —
    confirm against gensim docs.
    """

    def __init__(
        self,
        corpus,                  # documents to index (bag-of-words iterable, presumably)
        num_features=None,       # dimensionality of the vector space
        num_terms=None,          # vocabulary size — presumably an alias/hint for matrix shape
        num_docs=None,           # number of documents, if known in advance
        num_nnz=None,            # number of non-zero entries, if known in advance
        num_best=None,           # if set, queries return only the top-N matches
        chunksize=500,           # documents processed per batch
        dtype=np.float32,        # numpy dtype of the sparse matrix
        maintain_sparsity=False  # keep query results sparse where possible — TODO confirm
    ): ...

    def __getitem__(self, query): ...       # similarities of `query` against the indexed docs
    def get_similarities(self, query): ...  # raw similarity computation for `query`

class SoftCosineSimilarity:
    """Soft cosine similarity with term relationship matrix.

    Documents can match through related (not just identical) terms, using a
    precomputed term-to-term similarity matrix (see SparseTermSimilarityMatrix).
    """

    def __init__(
        self,
        corpus,             # documents to index (bag-of-words iterable, per usage example)
        similarity_matrix,  # term-to-term similarity matrix, e.g. SparseTermSimilarityMatrix
        num_best=None,      # if set, queries return only the top-N matches
        chunksize=256       # documents processed per batch
    ): ...

    def __getitem__(self, query): ...       # soft cosine similarities of `query` vs indexed docs
    def get_similarities(self, query): ...  # raw similarity computation for `query`

class WmdSimilarity:
    """Word Mover's Distance similarity using word embeddings.

    NOTE(review): per the usage example below, WMD queries use raw token
    lists rather than bag-of-words vectors — confirm expected corpus format.
    """

    def __init__(
        self,
        corpus,                         # documents to index (token lists, per usage example)
        w2v_model,                      # trained word-embedding model providing word vectors
        num_best=None,                  # if set, queries return only the top-N matches
        normalize_w2v_and_replace=True, # normalize embeddings in place — name suggests it mutates the model
        chunksize=256                   # documents processed per batch
    ): ...

    def __getitem__(self, query): ...       # WMD-based similarities of `query` vs indexed docs
    def get_similarities(self, query): ...  # raw similarity computation for `query`

Term Similarity

Similarity computations between individual terms and construction of term similarity matrices.

class TermSimilarityIndex:
    """Base interface for term similarity computation.

    Concrete subclasses (uniform, embedding-based, Levenshtein) implement
    how similar two vocabulary terms are.
    """

    def most_similar(self, term, topn=10): ...  # up to `topn` terms most similar to `term`
    def similarity(self, term1, term2): ...     # similarity score between two terms
    def __getitem__(self, term): ...            # similarities for `term` — TODO confirm return shape

class UniformTermSimilarityIndex(TermSimilarityIndex):
    """Uniform term similarity (all terms equally similar).

    Every pair of distinct dictionary terms gets the same fixed score.
    """

    def __init__(self, dictionary, term_similarity=1.0): ...  # dictionary: term vocabulary; term_similarity: the constant score

    def most_similar(self, term, topn=10): ...  # up to `topn` terms, all with the uniform score
    def similarity(self, term1, term2): ...     # the constant `term_similarity` — presumably

class WordEmbeddingSimilarityIndex(TermSimilarityIndex):
    """Term similarity based on word embeddings.

    NOTE(review): `threshold`/`exponent` presumably clip and sharpen raw
    embedding similarities — confirm against gensim docs.
    """

    def __init__(self, keyed_vectors, threshold=0.0, exponent=2.0, kwargs=None): ...  # keyed_vectors: trained vectors (e.g. Word2Vec.wv)

    def most_similar(self, term, topn=10): ...  # up to `topn` embedding-nearest terms
    def similarity(self, term1, term2): ...     # embedding-based similarity of the two terms
    def __getitem__(self, term): ...            # similarities for `term` — TODO confirm return shape

class SparseTermSimilarityMatrix:
    """Sparse matrix representation of term similarities.

    Built from a TermSimilarityIndex over a dictionary's vocabulary; used by
    SoftCosineSimilarity (see usage example below).
    """

    def __init__(
        self,
        term_similarity_index,  # source of pairwise term similarities
        dictionary=None,        # vocabulary defining the matrix rows/columns
        tfidf=None,             # optional TF-IDF model — presumably used to weight terms
        symmetric=True,         # build a symmetric matrix
        dominant=False,         # enforce a (diagonally) dominant matrix — TODO confirm
        nonzero_limit=100,      # cap on non-zero entries per column/term — presumably
        dtype=np.float32        # numpy dtype of the stored matrix
    ): ...

    def inner_product(self, X, Y): ...  # similarity-weighted inner product of two documents/corpora
    def __getitem__(self, bow): ...     # matrix application to a bag-of-words vector — TODO confirm

String Similarity

Similarity computations for raw strings using edit distance metrics.

class LevenshteinSimilarityIndex:
    """Levenshtein distance-based string similarity.

    NOTE(review): `alpha`/`beta` presumably scale/shape the distance-to-
    similarity mapping, and `max_distance` caps the edit distance considered
    — confirm against gensim docs.
    """

    def __init__(self, strings, alpha=1.0, beta=1.0, max_distance=10): ...  # strings: collection of strings to index

    def most_similar(self, query, topn=10): ...  # up to `topn` indexed strings closest to `query`
    def __getitem__(self, stringlist): ...       # similarities for a list of strings — TODO confirm

Usage Examples

Basic Document Similarity

from gensim import corpora, models, similarities
from gensim.test.utils import common_texts

# Build the dictionary and the bag-of-words corpus
dictionary = corpora.Dictionary(common_texts)
corpus = list(map(dictionary.doc2bow, common_texts))

# Fit a TF-IDF model and transform the whole corpus
tfidf = models.TfidfModel(corpus)
corpus_tfidf = tfidf[corpus]

# Build an in-memory dense similarity index
index = similarities.MatrixSimilarity(corpus_tfidf)

# Prepare a query document in the same TF-IDF space
query_doc = ['computer', 'human', 'interface']
query_bow = dictionary.doc2bow(query_doc)
query_tfidf = tfidf[query_bow]

# Similarity of the query against every indexed document
sims = index[query_tfidf]
print(f"Similarities: {list(enumerate(sims))}")

# Rank documents from most to least similar
ranked = sorted(enumerate(sims), key=lambda pair: pair[1], reverse=True)
print(f"Most similar: {ranked[:3]}")

Large Corpus Similarity with Sharding

from gensim.similarities import Similarity
import tempfile
import os

# Shards are written to disk, so use a scratch directory we can clean up
temp_dir = tempfile.mkdtemp()

# Create sharded similarity index for large corpus
index = Similarity(
    output_prefix=os.path.join(temp_dir, 'similarity'),
    corpus=corpus_tfidf,
    num_features=len(dictionary),
    shardsize=1000,  # Documents per shard
    num_best=10      # Return top 10 similarities
)

# Query similarity.  NOTE: do not name this result `similarities` — that
# would shadow the `gensim.similarities` module imported in the first
# example and break the later `similarities.MatrixSimilarity.load(...)` call.
top_sims = index[query_tfidf]
print(f"Top similarities: {top_sims}")

# Clean up the on-disk shard files
index.destroy()

Soft Cosine Similarity with Term Relationships

from gensim.similarities import SoftCosineSimilarity, SparseTermSimilarityMatrix
from gensim.similarities.termsim import WordEmbeddingSimilarityIndex
from gensim.models import Word2Vec

# Train word embeddings on the tokenized texts.
# (No copy needed: Word2Vec iterates the sentences as-is.)
sentences = common_texts
w2v_model = Word2Vec(sentences, vector_size=100, window=5, min_count=1)

# Term-to-term similarity derived from the embeddings
term_index = WordEmbeddingSimilarityIndex(w2v_model.wv)

# Sparse matrix of term similarities over the dictionary's vocabulary
similarity_matrix = SparseTermSimilarityMatrix(term_index, dictionary)

# Soft cosine: documents can match through related, not just identical, terms
soft_cosine_index = SoftCosineSimilarity(corpus, similarity_matrix)

# Query with soft cosine similarity
soft_similarities = soft_cosine_index[query_bow]
print(f"Soft cosine similarities: {list(enumerate(soft_similarities))}")

Word Mover's Distance

from gensim.similarities import WmdSimilarity

# WMD operates on raw token lists, not bag-of-words vectors, so index the
# tokenized texts (common_texts) rather than the BOW `corpus` used by the
# other examples.
wmd_index = WmdSimilarity(common_texts, w2v_model)

# Query with WMD, again using raw tokens
wmd_similarities = wmd_index[query_doc]
print(f"WMD similarities: {list(enumerate(wmd_similarities))}")

Term Similarity Operations

from gensim.similarities.termsim import WordEmbeddingSimilarityIndex

# Wrap the trained embeddings in a term-similarity index
term_sim_index = WordEmbeddingSimilarityIndex(w2v_model.wv)

# Nearest terms, guarded against out-of-vocabulary lookups
if 'computer' in w2v_model.wv:
    neighbours = term_sim_index.most_similar('computer', topn=5)
    print(f"Terms similar to 'computer': {neighbours}")

# Pairwise term similarity, again only for in-vocabulary terms
if all(term in w2v_model.wv for term in ('computer', 'system')):
    score = term_sim_index.similarity('computer', 'system')
    print(f"Similarity between 'computer' and 'system': {score}")

String Similarity with Levenshtein Distance

from gensim.similarities import LevenshteinSimilarityIndex

# Index a small collection of strings for edit-distance similarity
strings = ['computer', 'computing', 'computation', 'system', 'systematic']
string_index = LevenshteinSimilarityIndex(strings)

# Find similar strings.  most_similar() takes a single query term — passing
# a one-element list here was a bug; the printed message queries 'compute'.
similar_strings = string_index.most_similar('compute', topn=3)
print(f"Strings similar to 'compute': {similar_strings}")

Batch Similarity Queries

# Query multiple documents at once
queries = [
    dictionary.doc2bow(['computer', 'interface']),
    dictionary.doc2bow(['human', 'system']),
    dictionary.doc2bow(['response', 'time'])
]

# Get similarities for all queries
for i, query in enumerate(queries):
    query_tfidf = tfidf[query]
    sims = index[query_tfidf]
    top_sim = max(enumerate(sims), key=lambda x: x[1])
    print(f"Query {i+1} most similar to doc {top_sim[0]} (score: {top_sim[1]:.3f})")

Similarity Index Persistence

# Persist the similarity index to disk
index_path = '/tmp/similarity_index.index'
index.save(index_path)

# load() is the counterpart class method on MatrixSimilarity
loaded_index = similarities.MatrixSimilarity.load(index_path)

# Sanity-check: the reloaded index answers queries like the original
test_sims = loaded_index[query_tfidf]
print(f"Loaded index similarities: {list(enumerate(test_sims))}")

Memory-Efficient Sparse Similarity

from gensim.similarities import SparseMatrixSimilarity

# Sparse index: stores the corpus as a sparse matrix to save memory
sparse_index = SparseMatrixSimilarity(
    corpus_tfidf,
    num_features=len(dictionary),
    maintain_sparsity=True,
    num_best=5,  # Only store top 5 similarities
)

# Run the query against the sparse index
sparse_sims = sparse_index[query_tfidf]
print(f"Sparse similarities: {sparse_sims}")

Install with Tessl CLI

npx tessl i tessl/pypi-gensim

docs

corpus-management.md

data-downloading.md

index.md

mathematical-utilities.md

nlp-models.md

similarity-computations.md

text-preprocessing.md

tile.json