Python library for topic modelling, document indexing and similarity retrieval with large corpora
78
Linear algebra operations, vector manipulations, and distance metrics optimized for NLP tasks. Gensim's mathematical utilities provide efficient implementations of common operations needed for text processing and machine learning.
Core vector operations for normalizing, measuring, and manipulating document vectors.
def unitvec(vec, norm='l2', return_norm=False):
"""
Scale vector to unit length.
Parameters:
- vec: Input vector (scipy.sparse or numpy array)
- norm: Normalization method ('l1' or 'l2')
- return_norm: Whether to return the original norm
Returns:
Normalized vector, optionally with original norm
"""
def veclen(vec):
"""
Calculate length/magnitude of vector.
Parameters:
- vec: Input vector (scipy.sparse or numpy array)
Returns:
Vector length as float
"""
def cossim(vec1, vec2):
"""
Calculate cosine similarity between two vectors.
Parameters:
- vec1: First vector
- vec2: Second vector
Returns:
Cosine similarity as float (-1 to 1)
"""
def ret_normalized_vec(vec, length):
"""
Return vector normalized to specified length.
Parameters:
- vec: Input vector
- length: Target length
Returns:
Normalized vector of specified length
"""
def ret_log_normalize_vec(vec, axis=1):
"""
Log-normalize vector values.
Parameters:
- vec: Input vector
- axis: Normalization axis
Returns:
Log-normalized vector
"""
def isbow(vec):
"""
Check if vector is in bag-of-words format.
Parameters:
- vec: Input vector
Returns:
Boolean indicating if vector is BOW format
"""Efficient matrix operations for corpus processing and linear algebra.
def corpus2csc(corpus, num_terms=None, dtype=np.float64, num_docs=None, num_nnz=None, printprogress=0):
"""
Convert corpus to scipy.sparse.csc_matrix format.
Parameters:
- corpus: Input corpus in BOW format
- num_terms: Number of terms (optional)
- dtype: Data type for matrix values
- num_docs: Number of documents (optional)
- num_nnz: Number of non-zero elements (optional)
- printprogress: Progress reporting frequency
Returns:
CSC sparse matrix with documents as columns
"""
def corpus2dense(corpus, num_terms, num_docs=None, dtype=np.float32):
"""
Convert corpus to dense numpy matrix.
Parameters:
- corpus: Input corpus in BOW format
- num_terms: Number of terms
- num_docs: Number of documents (optional)
- dtype: Data type for matrix values
Returns:
Dense numpy matrix
"""
def pad(mat, padrow=False, padcol=False):
"""
Pad matrix with zeros.
Parameters:
- mat: Input matrix
- padrow: Whether to pad rows
- padcol: Whether to pad columns
Returns:
Padded matrix
"""
def zeros_aligned(shape, dtype, order='C', align=128):
"""
Create aligned zero array for optimized operations.
Parameters:
- shape: Array shape
- dtype: Data type
- order: Memory layout ('C' or 'F')
- align: Memory alignment in bytes
Returns:
Aligned zero array
"""
def ismatrix(m):
"""
Check if object is a matrix.
Parameters:
- m: Object to check
Returns:
Boolean indicating if object is matrix-like
"""Functions for converting between sparse and dense vector representations.
def sparse2full(vec, length):
"""
Convert sparse vector to dense representation.
Parameters:
- vec: Sparse vector in BOW format
- length: Length of full vector
Returns:
Dense numpy array
"""
def full2sparse(vec, eps=1e-9):
"""
Convert dense vector to sparse BOW format.
Parameters:
- vec: Dense vector
- eps: Minimum value threshold
Returns:
Sparse vector in BOW format
"""
def full2sparse_clipped(vec, topn, eps=1e-9):
"""
Convert dense vector to sparse format, keeping only top-N values.
Parameters:
- vec: Dense vector
- topn: Number of top values to keep
- eps: Minimum value threshold
Returns:
Clipped sparse vector in BOW format
"""
def any2sparse(vec, eps=1e-9):
"""
Convert vector to sparse format regardless of input type.
Parameters:
- vec: Input vector (any format)
- eps: Minimum value threshold
Returns:
Sparse vector in BOW format
"""
def scipy2sparse(vec):
"""
Convert scipy sparse vector to gensim sparse format.
Parameters:
- vec: Scipy sparse matrix/vector
Returns:
Gensim sparse vector (BOW format)
"""
def scipy2scipy_clipped(matrix, topn, eps=1e-9):
"""
Clip scipy sparse matrix to top-N values per row/column.
Parameters:
- matrix: Scipy sparse matrix
- topn: Number of top values to keep
- eps: Minimum value threshold
Returns:
Clipped scipy sparse matrix
"""Statistical distance measures for comparing probability distributions and vectors.
def kullback_leibler(vec1, vec2, num_features=None):
"""
Calculate Kullback-Leibler divergence between two probability distributions.
Parameters:
- vec1: First probability distribution
- vec2: Second probability distribution
- num_features: Number of features (optional)
Returns:
KL divergence as float
"""
def jensen_shannon(vec1, vec2, num_features=None):
"""
Calculate Jensen-Shannon distance between two probability distributions.
Parameters:
- vec1: First probability distribution
- vec2: Second probability distribution
- num_features: Number of features (optional)
Returns:
JS distance as float (0 to 1)
"""
def hellinger(vec1, vec2):
"""
Calculate Hellinger distance between two probability distributions.
Parameters:
- vec1: First probability distribution
- vec2: Second probability distribution
Returns:
Hellinger distance as float (0 to 1)
"""
def jaccard(vec1, vec2):
"""
Calculate Jaccard similarity coefficient.
Parameters:
- vec1: First vector
- vec2: Second vector
Returns:
Jaccard similarity as float (0 to 1)
"""
def jaccard_distance(vec1, vec2):
"""
Calculate Jaccard distance.
Parameters:
- vec1: First vector
- vec2: Second vector
Returns:
Jaccard distance as float (0 to 1)
"""Advanced linear algebra operations with BLAS integration.
def blas(name, ndarray):
"""
Get appropriate BLAS function for array operations.
Parameters:
- name: BLAS function name
- ndarray: Input array to determine data type
Returns:
BLAS function object
"""
def argsort(x, topn=None, reverse=False):
"""
Efficiently find indices of smallest/largest elements.
Parameters:
- x: Input array
- topn: Number of top elements to return
- reverse: Whether to return largest elements
Returns:
Array of indices
"""
def qr_destroy(la):
"""
QR decomposition that destroys input matrix for memory efficiency.
Parameters:
- la: Input matrix (will be destroyed)
Returns:
Q and R matrices from QR decomposition
"""import numpy as np
from gensim import matutils
# --- Basic vector operations with gensim.matutils ---
# Sample document vectors in gensim's bag-of-words (BOW) format:
# a list of (term_id, weight) tuples.
vec1 = [(0, 1.0), (1, 2.0), (2, 3.0)]  # BOW format
vec2 = [(0, 2.0), (1, 1.0), (3, 1.0)]  # BOW format

# Length/magnitude of a BOW vector.
length1 = matutils.veclen(vec1)
print(f"Vector 1 length: {length1}")

# Scale the vector to unit length.
unit_vec1 = matutils.unitvec(vec1)
print(f"Unit vector 1: {unit_vec1}")

# Cosine similarity between the two BOW vectors.
similarity = matutils.cossim(vec1, vec2)
print(f"Cosine similarity: {similarity}")

# Sanity check: confirm the vector really is in BOW format.
is_bow = matutils.isbow(vec1)
print(f"Is BOW format: {is_bow}")

# Convert sparse to dense
# --- Converting between sparse (BOW) and dense representations ---
# Expand the sparse BOW vector into a dense array of the given length.
dense_vec1 = matutils.sparse2full(vec1, length=5)
print(f"Dense vector: {dense_vec1}")

# Convert a dense array back to sparse BOW format
# (entries below the eps threshold are dropped).
dense_array = np.array([1.0, 2.0, 0.0, 3.0, 0.0])
sparse_vec = matutils.full2sparse(dense_array)
print(f"Sparse vector: {sparse_vec}")

# Keep only the top-2 values of the dense vector.
top2_sparse = matutils.full2sparse_clipped(dense_array, topn=2)
print(f"Top-2 sparse: {top2_sparse}")

from gensim import corpora
from gensim.test.utils import common_texts

# --- Converting a corpus to matrix form ---
# Build a small sample corpus from gensim's bundled example texts.
dictionary = corpora.Dictionary(common_texts)
corpus = [dictionary.doc2bow(text) for text in common_texts]

# Corpus -> scipy CSC sparse matrix (documents as columns).
csc_matrix = matutils.corpus2csc(corpus, num_terms=len(dictionary))
print(f"CSC matrix shape: {csc_matrix.shape}")
print(f"CSC matrix type: {type(csc_matrix)}")

# Corpus -> dense numpy matrix.
dense_matrix = matutils.corpus2dense(corpus, num_terms=len(dictionary))
print(f"Dense matrix shape: {dense_matrix.shape}")
print(f"Dense matrix type: {type(dense_matrix)}")

# Create probability distributions
# --- Statistical distance measures between distributions ---
# Two probability distributions in BOW format (weights sum to 1).
prob1 = [(0, 0.3), (1, 0.4), (2, 0.3)]
prob2 = [(0, 0.2), (1, 0.5), (2, 0.3)]

# Kullback-Leibler divergence (asymmetric).
kl_div = matutils.kullback_leibler(prob1, prob2)
print(f"KL divergence: {kl_div}")

# Jensen-Shannon distance (symmetric, in [0, 1]).
js_dist = matutils.jensen_shannon(prob1, prob2)
print(f"Jensen-Shannon distance: {js_dist}")

# Hellinger distance (in [0, 1]).
hellinger_dist = matutils.hellinger(prob1, prob2)
print(f"Hellinger distance: {hellinger_dist}")

# Jaccard similarity/distance for binary vectors.
binary1 = [(0, 1), (1, 1), (3, 1)]
binary2 = [(0, 1), (2, 1), (3, 1)]
jaccard_sim = matutils.jaccard(binary1, binary2)
jaccard_dist = matutils.jaccard_distance(binary1, binary2)
print(f"Jaccard similarity: {jaccard_sim}")
print(f"Jaccard distance: {jaccard_dist}")

# Create large array for demonstration
# --- Efficient partial argsort ---
large_array = np.random.rand(10000)

# Indices of the 10 largest values (reverse=True selects largest).
top10_indices = matutils.argsort(large_array, topn=10, reverse=True)
print(f"Top 10 indices: {top10_indices}")
print(f"Top 10 values: {large_array[top10_indices]}")

# Indices of the 5 smallest values.
bottom5_indices = matutils.argsort(large_array, topn=5, reverse=False)
print(f"Bottom 5 indices: {bottom5_indices}")
print(f"Bottom 5 values: {large_array[bottom5_indices]}")

# Get BLAS function for dot product
# --- Direct BLAS access ---
test_array = np.array([1.0, 2.0, 3.0], dtype=np.float64)

# Look up the BLAS routine appropriate for the array's data type.
dot_func = matutils.blas('dot', test_array)
print(f"BLAS dot function: {dot_func}")

# Call the BLAS routine directly for an efficient dot product.
result = dot_func(test_array, test_array)
print(f"Dot product result: {result}")

# Create aligned zero array for optimized operations
# --- Aligned allocation, matrix checks, and zero-padding ---
# Zero-initialized array with an aligned memory buffer.
aligned_zeros = matutils.zeros_aligned((1000, 100), dtype=np.float32)
print(f"Aligned array shape: {aligned_zeros.shape}")
print(f"Aligned array dtype: {aligned_zeros.dtype}")

# Check whether an object is matrix-like.
is_matrix = matutils.ismatrix(aligned_zeros)
print(f"Is matrix: {is_matrix}")

# Pad a matrix with zeros (extra row and column here).
small_matrix = np.array([[1, 2], [3, 4]])
padded_matrix = matutils.pad(small_matrix, padrow=True, padcol=True)
print(f"Original matrix:\n{small_matrix}")
print(f"Padded matrix:\n{padded_matrix}")

from scipy import sparse
# --- scipy sparse <-> gensim conversions ---
scipy_matrix = sparse.csr_matrix([[1, 0, 2], [0, 3, 0], [4, 0, 5]])

# Convert one scipy sparse row to gensim's BOW format.
gensim_sparse = matutils.scipy2sparse(scipy_matrix.getrow(0))
print(f"Scipy to gensim: {gensim_sparse}")

# Clip the scipy sparse matrix, keeping only the top-2 values.
clipped_matrix = matutils.scipy2scipy_clipped(scipy_matrix, topn=2)
print(f"Original matrix:\n{scipy_matrix.toarray()}")
print(f"Clipped matrix:\n{clipped_matrix.toarray()}")

# L2 normalization (default)
l2_normalized = matutils.unitvec(vec1, norm='l2')
print(f"L2 normalized: {l2_normalized}")
# L1 normalization
l1_normalized = matutils.unitvec(vec1, norm='l1')
print(f"L1 normalized: {l1_normalized}")
# Get normalized vector with original norm
normalized_with_norm = matutils.unitvec(vec1, return_norm=True)
print(f"Normalized vector: {normalized_with_norm[0]}")
print(f"Original norm: {normalized_with_norm[1]}")
# Log normalization
dense_vec = np.array([[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]])
log_normalized = matutils.ret_log_normalize_vec(dense_vec)
print(f"Log normalized:\n{log_normalized}")

Install with Tessl CLI
npx tessl i tessl/pypi-gensimdocs
evals
scenario-1
scenario-2
scenario-3
scenario-4
scenario-5
scenario-6
scenario-7
scenario-8
scenario-9