Python library for topic modelling, document indexing and similarity retrieval with large corpora
Core machine learning models and transformation algorithms that convert documents between different vector representations. Gensim's models support streaming training for datasets larger than memory and provide both supervised and unsupervised learning approaches for natural language processing tasks.
Probabilistic models that discover abstract topics within document collections. These models identify patterns of word co-occurrence to reveal thematic structure in large text corpora.
class LdaModel:
    """Latent Dirichlet Allocation topic model implementation.

    Supports streamed/incremental training via :meth:`update` and topic
    inspection via :meth:`print_topics` / :meth:`show_topics`.
    """

    def __init__(
        self,
        corpus=None,
        num_topics=100,
        id2word=None,
        distributed=False,
        chunksize=2000,
        passes=1,
        update_every=1,
        alpha='symmetric',
        eta=None,
        decay=0.5,
        offset=1.0,
        eval_every=10,
        iterations=50,
        gamma_threshold=0.001,
        minimum_probability=0.01,
        random_state=None,
        ns_conf=None,
        minimum_phi_value=0.01,
        per_word_topics=False,
        callbacks=None,
        dtype=np.float32,
    ): ...

    # Continue training on additional documents; per-call overrides default to
    # the values given at construction time when left as None.
    def update(self, corpus, chunksize=None, decay=None, offset=None, passes=None,
               update_every=None, eval_every=None, iterations=None,
               gamma_threshold=None, chunks_as_numpy=False): ...

    def log_perplexity(self, chunk, total_docs=None): ...

    def print_topics(self, num_topics=10, num_words=10): ...

    def show_topics(self, num_topics=10, num_words=10, log=False, formatted=True): ...

    def get_document_topics(self, bow, minimum_probability=None,
                            minimum_phi_value=None, per_word_topics=False): ...

    def get_topic_terms(self, topicid, topn=10): ...
class LdaMulticore:
    """Multicore implementation of LDA using multiple worker processes."""

    def __init__(
        self,
        corpus=None,
        num_topics=100,
        id2word=None,
        workers=None,   # None -> implementation picks the worker count
        chunksize=2000,
        passes=1,
        batch=False,
        alpha='symmetric',
        eta=None,
        decay=0.5,
        offset=1.0,
        eval_every=10,
        iterations=50,
        gamma_threshold=0.001,
        random_state=None,
        minimum_probability=0.01,
        minimum_phi_value=0.01,
        per_word_topics=False,
        dtype=np.float32,
    ): ...
class HdpModel:
    """Hierarchical Dirichlet Process topic model.

    Note: ``corpus`` and ``id2word`` are required (no defaults), unlike the
    other topic models in this listing.
    """

    def __init__(
        self,
        corpus,
        id2word,
        max_chunks=None,
        max_time=None,
        chunksize=256,
        kappa=1.0,
        tau=64.0,
        K=15,
        T=150,
        alpha=1,
        gamma=1,
        eta=0.01,
        scale=1.0,
        var_converge=0.0001,
        outputdir=None,
        random_state=None,
    ): ...

    def print_topics(self, topics=10, topn=10): ...

    def show_topics(self, topics=10, topn=10, log=False, formatted=True): ...
class LdaSeqModel:
    """Dynamic Topic Model for sequential/temporal topic modeling.

    ``time_slice`` partitions the corpus into consecutive time periods.
    """

    def __init__(
        self,
        corpus=None,
        time_slice=None,
        id2word=None,
        alphas=0.01,
        num_topics=10,
        initialize='gensim',
        sstats=None,
        lda_model=None,
        obs_variance=0.5,
        chain_variance=0.005,
        passes=10,
        random_state=None,
        lda_inference_max_iter=25,
        em_min_iter=6,
        em_max_iter=20,
        chunksize=100,
    ): ...

    def print_topics(self, time=0, top_terms=10): ...

    def doc_topics(self, doc_bow): ...
class AuthorTopicModel:
    """Author-Topic model for modeling documents with author information.

    Author/document links are supplied via ``author2doc`` and/or ``doc2author``.
    """

    def __init__(
        self,
        corpus=None,
        num_topics=10,
        id2word=None,
        author2doc=None,
        doc2author=None,
        chunksize=2000,
        passes=1,
        iterations=50,
        decay=0.5,
        offset=1.0,
        alpha='symmetric',
        eta='symmetric',
        update_every=1,
        eval_every=10,
        gamma_threshold=0.001,
        serialized=False,
        serialization_path=None,
        minimum_probability=0.01,
        random_state=None,
    ): ...

    def get_author_topics(self, author_name, minimum_probability=0.01): ...

    def get_document_topics(self, bow, minimum_probability=0.01): ...
class EnsembleLda:
    """Ensemble of LDA models for improved topic stability.

    Trains ``num_models`` base models (class chosen by ``topic_model_class``)
    and clusters their topics.
    """

    def __init__(
        self,
        corpus=None,
        id2word=None,
        num_topics=10,
        num_models=3,
        topic_model_class='ldamulticore',
        ensemble_workers=1,
        distance_workers=1,
        min_samples=None,
        epsilon=0.1,
        random_state=None,
        memory_friendly_ttda=True,
    ): ...

    def generate_gensim_representation(self): ...

    def get_topics(self): ...
class Nmf:
"""Non-negative Matrix Factorization for topic modeling."""
def __init__(
self,
corpus=None,
num_topics=100,
id2word=None,
chunksize=2000,
passes=1,
kappa=1.0,
minimum_probability=0.01,
w_max_iter=200,
w_stop_condition=1e-4,
h_max_iter=50,
h_stop_condition=1e-4,
eval_every=10,
normalize=True,
random_state=None
): ...
def print_topics(self, num_topics=10, num_words=10): ...
def show_topics(self, num_topics=10, num_words=10, log=False, formatted=True): ...Neural network models that learn dense vector representations of words and documents, capturing semantic relationships through continuous vector spaces.
class Word2Vec:
    """Word2Vec neural word embedding model.

    Trained vectors are exposed on the ``wv`` attribute (a KeyedVectors
    instance); the similarity helpers below mirror the KeyedVectors API.
    """

    # NOTE(review): the original listing read "def wv: KeyedVectors", which is
    # not valid Python. ``wv`` is an attribute, not a method; a string forward
    # reference is used because KeyedVectors is declared later in this file.
    wv: "KeyedVectors"

    def __init__(
        self,
        sentences=None,
        corpus_file=None,
        vector_size=100,
        alpha=0.025,
        window=5,
        min_count=5,
        max_vocab_size=None,
        sample=1e-3,
        seed=1,
        workers=3,
        min_alpha=0.0001,
        sg=0,
        hs=0,
        negative=5,
        ns_exponent=0.75,
        cbow_mean=1,
        hashfxn=hash,
        epochs=5,
        null_word=0,
        trim_rule=None,
        sorted_vocab=1,
        batch_words=10000,
        compute_loss=False,
        callbacks=(),
        comment=None,
        max_final_vocab=None,
        shrink_windows=True,
    ): ...

    def train(self, sentences=None, corpus_file=None, total_examples=None,
              total_words=None, epochs=None, start_alpha=None, end_alpha=None,
              word_count=0, queue_factor=2, report_delay=1.0,
              compute_loss=False, callbacks=(), **kwargs): ...

    def build_vocab(self, sentences=None, corpus_file=None, update=False,
                    progress_per=10000, keep_raw_vocab=False, trim_rule=None,
                    **kwargs): ...

    def most_similar(self, positive=None, negative=None, topn=10, clip_start=0,
                     clip_end=None, restrict_vocab=None, indexer=None): ...

    def most_similar_cosmul(self, positive=None, negative=None, topn=10): ...

    def similarity(self, w1, w2): ...

    def n_similarity(self, ws1, ws2): ...

    def doesnt_match(self, words): ...
class Doc2Vec:
    """Doc2Vec model for learning document embeddings.

    Extra Word2Vec-style options (vector_size, window, ...) pass through
    ``**kwargs``.
    """

    def __init__(
        self,
        documents=None,
        corpus_file=None,
        dm_mean=None,
        dm=1,
        dbow_words=0,
        dm_concat=0,
        dm_tag_count=1,
        docvecs=None,
        docvecs_mapfile=None,
        comment=None,
        trim_rule=None,
        callbacks=(),
        **kwargs,
    ): ...

    def train(self, documents=None, corpus_file=None, total_examples=None,
              total_words=None, epochs=None, start_alpha=None, end_alpha=None,
              word_count=0, queue_factor=2, report_delay=1.0, callbacks=(),
              **kwargs): ...

    def infer_vector(self, doc_words, alpha=None, min_alpha=None, epochs=None,
                     steps=None): ...

    def most_similar(self, positive=None, negative=None, topn=10, clip_start=0,
                     clip_end=None): ...

    def most_similar_cosmul(self, positive=None, negative=None, topn=10): ...

    def similarity(self, d1, d2): ...

    def n_similarity(self, doc_ids1, doc_ids2): ...

    def doesnt_match(self, docs): ...
class FastText:
    """FastText model with subword information.

    Character n-gram size is bounded by ``min_n``/``max_n``; n-grams are
    hashed into ``bucket`` slots.
    """

    def __init__(
        self,
        sentences=None,
        corpus_file=None,
        sg=0,
        hs=0,
        vector_size=100,
        alpha=0.025,
        window=5,
        min_count=5,
        max_vocab_size=None,
        word_ngrams=1,
        sample=1e-3,
        seed=1,
        workers=3,
        min_alpha=0.0001,
        negative=5,
        ns_exponent=0.75,
        cbow_mean=1,
        hashfxn=hash,
        epochs=5,
        null_word=0,
        min_n=3,
        max_n=6,
        sorted_vocab=1,
        bucket=2000000,
        trim_rule=None,
        batch_words=10000,
        callbacks=(),
        compatible_hash=True,
        shrink_windows=True,
    ): ...

    def build_vocab(self, sentences=None, corpus_file=None, update=False,
                    progress_per=10000, keep_raw_vocab=False, trim_rule=None,
                    **kwargs): ...

    def train(self, sentences=None, corpus_file=None, total_examples=None,
              total_words=None, epochs=None, start_alpha=None, end_alpha=None,
              word_count=0, queue_factor=2, report_delay=1.0, callbacks=(),
              **kwargs): ...
class KeyedVectors:
"""Standalone word vectors without training functionality."""
def __init__(self, vector_size, count=0, dtype=np.float32): ...
def most_similar(self, positive=None, negative=None, topn=10, clip_start=0, clip_end=None, restrict_vocab=None, indexer=None): ...
def most_similar_cosmul(self, positive=None, negative=None, topn=10): ...
def similarity(self, w1, w2): ...
def n_similarity(self, ws1, ws2): ...
def distance(self, w1, w2): ...
def distances(self, word_or_vector, other_words=()): ...
def word_vec(self, word, use_norm=False): ...
def get_vector(self, word, norm=False): ...
def words_closer_than(self, w1, w2): ...
def rank(self, w1, w2): ...
def evaluate_word_analogies(self, analogies, restrict_vocab=300000, case_insensitive=True, dummy4unknown=False): ...
def evaluate_word_pairs(self, pairs, delimiter='\t', restrict_vocab=300000, case_insensitive=True, dummy4unknown=False): ...
def save_word2vec_format(self, fname, fvocab=None, binary=False, total_vec=None): ...
@classmethod
def load_word2vec_format(cls, fname, fvocab=None, binary=False, encoding='utf8', unicode_errors='strict', limit=None, datatype=np.float32): ...Mathematical transformations that convert high-dimensional sparse document vectors into lower-dimensional dense representations, often improving computational efficiency and revealing latent structure.
class LsiModel:
    """Latent Semantic Indexing model using SVD.

    Supports incremental updates via :meth:`add_documents`.
    """

    def __init__(
        self,
        corpus=None,
        num_topics=200,
        id2word=None,
        chunksize=20000,
        decay=1.0,
        distributed=False,
        onepass=True,
        power_iters=2,
        extra_samples=100,
        dtype=np.float64,
    ): ...

    def add_documents(self, corpus, chunksize=None, decay=None): ...

    def print_topics(self, num_topics=10, num_words=10): ...

    def show_topics(self, num_topics=10, num_words=10, log=False, formatted=True): ...
class TfidfModel:
    """TF-IDF transformation model.

    ``wlocal``/``wglobal`` default to helpers defined elsewhere in the gensim
    package (``utils.identity`` and ``df2idf``); apply to a bag-of-words
    vector via ``model[bow]``.
    """

    def __init__(
        self,
        corpus=None,
        id2word=None,
        dictionary=None,
        wlocal=utils.identity,
        wglobal=df2idf,
        normalize=True,
        smartirs=None,
        pivot=None,
        slope=0.65,
    ): ...

    def __getitem__(self, bow): ...
class RpModel:
    """Random Projections model for dimensionality reduction.

    Apply to a bag-of-words vector via ``model[bow]``.
    """

    def __init__(self, corpus, id2word=None, num_topics=300): ...

    def __getitem__(self, bow): ...
class LogEntropyModel:
    """Log-entropy normalization model.

    Apply to a bag-of-words vector via ``model[bow]``.
    """

    def __init__(self, corpus, id2word=None, normalize=True): ...

    def __getitem__(self, bow): ...
class NormModel:
"""L2 normalization model."""
def __init__(self, corpus=None, norm='l2'): ...
def __getitem__(self, bow): ...Information retrieval ranking functions that score document relevance based on term frequency and document statistics.
class OkapiBM25Model:
    """Okapi BM25 ranking function."""

    def __init__(self, corpus, k1=1.2, b=0.75, epsilon=0.25): ...

    def get_scores(self, query): ...

    def get_batch_scores(self, query, doc_ids): ...
class LuceneBM25Model:
    """Lucene variant of BM25 (no ``epsilon`` parameter, unlike Okapi)."""

    def __init__(self, corpus, k1=1.2, b=0.75): ...

    def get_scores(self, query): ...

    def get_batch_scores(self, query, doc_ids): ...
class AtireBM25Model:
"""ATIRE variant of BM25."""
def __init__(self, corpus, k1=1.2, b=0.75): ...
def get_scores(self, query): ...
def get_batch_scores(self, query, doc_ids): ...Models for detecting phrases and handling n-gram construction from text corpora.
class Phrases:
"""Automatic phrase detection model."""
def __init__(
self,
sentences=None,
min_count=5,
threshold=10.0,
max_vocab_size=40000000,
delimiter=b'_',
progress_per=10000,
scoring='default',
common_terms=frozenset()
): ...
def add_vocab(self, sentences): ...
def export_phrases(self, sentences, out_delimiter=b' ', as_tuples=False): ...
def __getitem__(self, sentence): ...Tools for evaluating topic model quality and coherence.
class CoherenceModel:
"""Topic coherence evaluation model."""
def __init__(
self,
model=None,
topics=None,
texts=None,
corpus=None,
dictionary=None,
window_size=None,
keyed_vectors=None,
coherence='c_v',
topn=20,
processes=-1
): ...
def get_coherence(self): ...
def get_coherence_per_topic(self, with_std=False, with_confidence=False): ...Models for cross-language document translation and alignment.
class TranslationMatrix:
    """Translation matrix for cross-language document alignment."""

    def __init__(self, source_lang_vec, target_lang_vec, word_pairs=None,
                 random_state=None): ...

    def translate(self, source_words, topn=5): ...

    def apply(self, docs): ...
class BackMappingTranslationMatrix:
    """Back-mapping translation matrix."""

    def __init__(self, source_lang_vec, target_lang_vec, word_pairs=None,
                 random_state=None): ...

    def translate(self, source_words, topn=5): ...

from gensim.models import Word2Vec
from gensim.test.utils import common_texts

# Train Word2Vec on sample data
model = Word2Vec(sentences=common_texts, vector_size=100, window=5, min_count=1, workers=4)

# Find similar words
similar_words = model.wv.most_similar('computer', topn=5)
print(similar_words)

# Get word vector
vector = model.wv['computer']
print(f"Vector shape: {vector.shape}")

from gensim import corpora
from gensim.models import LdaModel
from gensim.test.utils import common_texts
# Create dictionary and corpus
dictionary = corpora.Dictionary(common_texts)
corpus = [dictionary.doc2bow(text) for text in common_texts]
# Train LDA model
lda = LdaModel(corpus=corpus, id2word=dictionary, num_topics=2, passes=10)
# Print topics
topics = lda.print_topics(num_words=4)
for topic in topics:
print(topic)# Get topic distribution for new document
new_doc = ['computer', 'time', 'graph']
new_doc_bow = dictionary.doc2bow(new_doc)
doc_topics = lda.get_document_topics(new_doc_bow)
print(doc_topics)Install with Tessl CLI
npx tessl i tessl/pypi-gensimdocs
evals
scenario-1
scenario-2
scenario-3
scenario-4
scenario-5
scenario-6
scenario-7
scenario-8
scenario-9