# NLP Models and Transformations

Core machine learning models and transformation algorithms that convert documents between different vector representations. Gensim's models support streaming training on datasets larger than available memory and provide unsupervised and self-supervised learning approaches for natural language processing tasks.

## Capabilities

### Topic Models

Probabilistic models that discover abstract topics within document collections. These models identify patterns of word co-occurrence to reveal the thematic structure of large text corpora.

```python { .api }
class LdaModel:
    """Latent Dirichlet Allocation topic model implementation."""

    def __init__(
        self,
        corpus=None,
        num_topics=100,
        id2word=None,
        distributed=False,
        chunksize=2000,
        passes=1,
        update_every=1,
        alpha='symmetric',
        eta=None,
        decay=0.5,
        offset=1.0,
        eval_every=10,
        iterations=50,
        gamma_threshold=0.001,
        minimum_probability=0.01,
        random_state=None,
        ns_conf=None,
        minimum_phi_value=0.01,
        per_word_topics=False,
        callbacks=None,
        dtype=np.float32
    ): ...

    def update(self, corpus, chunksize=None, decay=None, offset=None, passes=None, update_every=None, eval_every=None, iterations=None, gamma_threshold=None, chunks_as_numpy=False): ...
    def log_perplexity(self, chunk, total_docs=None): ...
    def print_topics(self, num_topics=10, num_words=10): ...
    def show_topics(self, num_topics=10, num_words=10, log=False, formatted=True): ...
    def get_document_topics(self, bow, minimum_probability=None, minimum_phi_value=None, per_word_topics=False): ...
    def get_topic_terms(self, topicid, topn=10): ...

class LdaMulticore:
    """Multicore implementation of LDA using multiple worker processes."""

    def __init__(
        self,
        corpus=None,
        num_topics=100,
        id2word=None,
        workers=None,
        chunksize=2000,
        passes=1,
        batch=False,
        alpha='symmetric',
        eta=None,
        decay=0.5,
        offset=1.0,
        eval_every=10,
        iterations=50,
        gamma_threshold=0.001,
        random_state=None,
        minimum_probability=0.01,
        minimum_phi_value=0.01,
        per_word_topics=False,
        dtype=np.float32
    ): ...

class HdpModel:
    """Hierarchical Dirichlet Process topic model."""

    def __init__(
        self,
        corpus,
        id2word,
        max_chunks=None,
        max_time=None,
        chunksize=256,
        kappa=1.0,
        tau=64.0,
        K=15,
        T=150,
        alpha=1,
        gamma=1,
        eta=0.01,
        scale=1.0,
        var_converge=0.0001,
        outputdir=None,
        random_state=None
    ): ...

    def print_topics(self, topics=10, topn=10): ...
    def show_topics(self, topics=10, topn=10, log=False, formatted=True): ...

class LdaSeqModel:
    """Dynamic Topic Model for sequential/temporal topic modeling."""

    def __init__(
        self,
        corpus=None,
        time_slice=None,
        id2word=None,
        alphas=0.01,
        num_topics=10,
        initialize='gensim',
        sstats=None,
        lda_model=None,
        obs_variance=0.5,
        chain_variance=0.005,
        passes=10,
        random_state=None,
        lda_inference_max_iter=25,
        em_min_iter=6,
        em_max_iter=20,
        chunksize=100
    ): ...

    def print_topics(self, time=0, top_terms=10): ...
    def doc_topics(self, doc_bow): ...

class AuthorTopicModel:
    """Author-Topic model for modeling documents with author information."""

    def __init__(
        self,
        corpus=None,
        num_topics=10,
        id2word=None,
        author2doc=None,
        doc2author=None,
        chunksize=2000,
        passes=1,
        iterations=50,
        decay=0.5,
        offset=1.0,
        alpha='symmetric',
        eta='symmetric',
        update_every=1,
        eval_every=10,
        gamma_threshold=0.001,
        serialized=False,
        serialization_path=None,
        minimum_probability=0.01,
        random_state=None
    ): ...

    def get_author_topics(self, author_name, minimum_probability=0.01): ...
    def get_document_topics(self, bow, minimum_probability=0.01): ...

class EnsembleLda:
    """Ensemble of LDA models for improved topic stability."""

    def __init__(
        self,
        corpus=None,
        id2word=None,
        num_topics=10,
        num_models=3,
        topic_model_class='ldamulticore',
        ensemble_workers=1,
        distance_workers=1,
        min_samples=None,
        epsilon=0.1,
        random_state=None,
        memory_friendly_ttda=True
    ): ...

    def generate_gensim_representation(self): ...
    def get_topics(self): ...

class Nmf:
    """Non-negative Matrix Factorization for topic modeling."""

    def __init__(
        self,
        corpus=None,
        num_topics=100,
        id2word=None,
        chunksize=2000,
        passes=1,
        kappa=1.0,
        minimum_probability=0.01,
        w_max_iter=200,
        w_stop_condition=1e-4,
        h_max_iter=50,
        h_stop_condition=1e-4,
        eval_every=10,
        normalize=True,
        random_state=None
    ): ...

    def print_topics(self, num_topics=10, num_words=10): ...
    def show_topics(self, num_topics=10, num_words=10, log=False, formatted=True): ...
```
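
For multi-core machines, `LdaMulticore` is a drop-in alternative to `LdaModel` that parallelizes training across worker processes. Below is a minimal sketch using the bundled `common_texts` fixture; the topic and worker counts are illustrative only.

```python
from gensim import corpora
from gensim.models import LdaMulticore
from gensim.test.utils import common_texts

# Build a dictionary and bag-of-words corpus from the sample texts
dictionary = corpora.Dictionary(common_texts)
corpus = [dictionary.doc2bow(text) for text in common_texts]

# Same parameters as LdaModel, plus `workers` for parallel training
lda = LdaMulticore(corpus=corpus, id2word=dictionary, num_topics=2, passes=10, workers=2)
for topic in lda.print_topics(num_words=4):
    print(topic)
```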

### Word Embeddings

Neural network models that learn dense vector representations of words and documents, capturing semantic relationships through continuous vector spaces.

```python { .api }
class Word2Vec:
    """Word2Vec neural word embedding model."""

    def __init__(
        self,
        sentences=None,
        corpus_file=None,
        vector_size=100,
        alpha=0.025,
        window=5,
        min_count=5,
        max_vocab_size=None,
        sample=1e-3,
        seed=1,
        workers=3,
        min_alpha=0.0001,
        sg=0,
        hs=0,
        negative=5,
        ns_exponent=0.75,
        cbow_mean=1,
        hashfxn=hash,
        epochs=5,
        null_word=0,
        trim_rule=None,
        sorted_vocab=1,
        batch_words=10000,
        compute_loss=False,
        callbacks=(),
        comment=None,
        max_final_vocab=None,
        shrink_windows=True
    ): ...

    def train(self, sentences=None, corpus_file=None, total_examples=None, total_words=None, epochs=None, start_alpha=None, end_alpha=None, word_count=0, queue_factor=2, report_delay=1.0, compute_loss=False, callbacks=(), **kwargs): ...
    def build_vocab(self, sentences=None, corpus_file=None, update=False, progress_per=10000, keep_raw_vocab=False, trim_rule=None, **kwargs): ...
    def most_similar(self, positive=None, negative=None, topn=10, clip_start=0, clip_end=None, restrict_vocab=None, indexer=None): ...
    def most_similar_cosmul(self, positive=None, negative=None, topn=10): ...
    def similarity(self, w1, w2): ...
    def n_similarity(self, ws1, ws2): ...
    def doesnt_match(self, words): ...
    wv: KeyedVectors  # trained word vectors, exposed as a KeyedVectors instance

class Doc2Vec:
    """Doc2Vec model for learning document embeddings."""

    def __init__(
        self,
        documents=None,
        corpus_file=None,
        dm_mean=None,
        dm=1,
        dbow_words=0,
        dm_concat=0,
        dm_tag_count=1,
        docvecs=None,
        docvecs_mapfile=None,
        comment=None,
        trim_rule=None,
        callbacks=(),
        **kwargs
    ): ...

    def train(self, documents=None, corpus_file=None, total_examples=None, total_words=None, epochs=None, start_alpha=None, end_alpha=None, word_count=0, queue_factor=2, report_delay=1.0, callbacks=(), **kwargs): ...
    def infer_vector(self, doc_words, alpha=None, min_alpha=None, epochs=None, steps=None): ...
    def most_similar(self, positive=None, negative=None, topn=10, clip_start=0, clip_end=None): ...
    def most_similar_cosmul(self, positive=None, negative=None, topn=10): ...
    def similarity(self, d1, d2): ...
    def n_similarity(self, doc_ids1, doc_ids2): ...
    def doesnt_match(self, docs): ...

class FastText:
    """FastText model with subword information."""

    def __init__(
        self,
        sentences=None,
        corpus_file=None,
        sg=0,
        hs=0,
        vector_size=100,
        alpha=0.025,
        window=5,
        min_count=5,
        max_vocab_size=None,
        word_ngrams=1,
        sample=1e-3,
        seed=1,
        workers=3,
        min_alpha=0.0001,
        negative=5,
        ns_exponent=0.75,
        cbow_mean=1,
        hashfxn=hash,
        epochs=5,
        null_word=0,
        min_n=3,
        max_n=6,
        sorted_vocab=1,
        bucket=2000000,
        trim_rule=None,
        batch_words=10000,
        callbacks=(),
        compatible_hash=True,
        shrink_windows=True
    ): ...

    def build_vocab(self, sentences=None, corpus_file=None, update=False, progress_per=10000, keep_raw_vocab=False, trim_rule=None, **kwargs): ...
    def train(self, sentences=None, corpus_file=None, total_examples=None, total_words=None, epochs=None, start_alpha=None, end_alpha=None, word_count=0, queue_factor=2, report_delay=1.0, callbacks=(), **kwargs): ...

class KeyedVectors:
    """Standalone word vectors without training functionality."""

    def __init__(self, vector_size, count=0, dtype=np.float32): ...

    def most_similar(self, positive=None, negative=None, topn=10, clip_start=0, clip_end=None, restrict_vocab=None, indexer=None): ...
    def most_similar_cosmul(self, positive=None, negative=None, topn=10): ...
    def similarity(self, w1, w2): ...
    def n_similarity(self, ws1, ws2): ...
    def distance(self, w1, w2): ...
    def distances(self, word_or_vector, other_words=()): ...
    def word_vec(self, word, use_norm=False): ...
    def get_vector(self, word, norm=False): ...
    def words_closer_than(self, w1, w2): ...
    def rank(self, w1, w2): ...
    def evaluate_word_analogies(self, analogies, restrict_vocab=300000, case_insensitive=True, dummy4unknown=False): ...
    def evaluate_word_pairs(self, pairs, delimiter='\t', restrict_vocab=300000, case_insensitive=True, dummy4unknown=False): ...
    def save_word2vec_format(self, fname, fvocab=None, binary=False, total_vec=None): ...
    @classmethod
    def load_word2vec_format(cls, fname, fvocab=None, binary=False, encoding='utf8', unicode_errors='strict', limit=None, datatype=np.float32): ...
```
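
`Doc2Vec` follows the same training pattern as `Word2Vec` but consumes `TaggedDocument` objects and can infer vectors for documents it has never seen. Below is a minimal sketch on the `common_texts` fixture; the hyperparameters are illustrative, and `model.dv` assumes the gensim 4.x attribute name (older releases expose `docvecs` instead).

```python
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from gensim.test.utils import common_texts

# Wrap each tokenized document with a unique tag
documents = [TaggedDocument(words, [i]) for i, words in enumerate(common_texts)]

model = Doc2Vec(documents, vector_size=50, min_count=1, epochs=40)

# Infer a vector for an unseen document and find the most similar training documents
vector = model.infer_vector(['human', 'computer', 'interface'])
print(model.dv.most_similar([vector], topn=3))
```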

### Dimensionality Reduction and Transformations

Mathematical transformations that convert high-dimensional sparse document vectors into lower-dimensional dense representations, often improving computational efficiency and revealing latent structure.

```python { .api }
class LsiModel:
    """Latent Semantic Indexing model using SVD."""

    def __init__(
        self,
        corpus=None,
        num_topics=200,
        id2word=None,
        chunksize=20000,
        decay=1.0,
        distributed=False,
        onepass=True,
        power_iters=2,
        extra_samples=100,
        dtype=np.float64
    ): ...

    def add_documents(self, corpus, chunksize=None, decay=None): ...
    def print_topics(self, num_topics=10, num_words=10): ...
    def show_topics(self, num_topics=10, num_words=10, log=False, formatted=True): ...

class TfidfModel:
    """TF-IDF transformation model."""

    def __init__(
        self,
        corpus=None,
        id2word=None,
        dictionary=None,
        wlocal=utils.identity,
        wglobal=df2idf,
        normalize=True,
        smartirs=None,
        pivot=None,
        slope=0.65
    ): ...

    def __getitem__(self, bow): ...

class RpModel:
    """Random Projections model for dimensionality reduction."""

    def __init__(self, corpus, id2word=None, num_topics=300): ...

    def __getitem__(self, bow): ...

class LogEntropyModel:
    """Log-entropy normalization model."""

    def __init__(self, corpus, id2word=None, normalize=True): ...

    def __getitem__(self, bow): ...

class NormModel:
    """L2 normalization model."""

    def __init__(self, corpus=None, norm='l2'): ...

    def __getitem__(self, bow): ...
```
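
These transformations are typically chained: bag-of-words vectors are re-weighted with `TfidfModel` and then projected into a low-dimensional latent space with `LsiModel`. Below is a minimal sketch on the `common_texts` fixture; the topic count is illustrative.

```python
from gensim import corpora
from gensim.models import TfidfModel, LsiModel
from gensim.test.utils import common_texts

dictionary = corpora.Dictionary(common_texts)
corpus = [dictionary.doc2bow(text) for text in common_texts]

# Re-weight the corpus with TF-IDF, then reduce dimensionality with LSI
tfidf = TfidfModel(corpus)
corpus_tfidf = tfidf[corpus]

lsi = LsiModel(corpus_tfidf, id2word=dictionary, num_topics=2)
for topic in lsi.print_topics(num_topics=2, num_words=4):
    print(topic)

# Transform a single document through both models
doc_bow = dictionary.doc2bow(['human', 'computer', 'interface'])
print(lsi[tfidf[doc_bow]])
```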

### Ranking Models

Information retrieval ranking functions that score document relevance based on term frequency and document statistics.

```python { .api }
class OkapiBM25Model:
    """Okapi BM25 ranking function."""

    def __init__(self, corpus, k1=1.2, b=0.75, epsilon=0.25): ...

    def get_scores(self, query): ...
    def get_batch_scores(self, query, doc_ids): ...

class LuceneBM25Model:
    """Lucene variant of BM25."""

    def __init__(self, corpus, k1=1.2, b=0.75): ...

    def get_scores(self, query): ...
    def get_batch_scores(self, query, doc_ids): ...

class AtireBM25Model:
    """ATIRE variant of BM25."""

    def __init__(self, corpus, k1=1.2, b=0.75): ...

    def get_scores(self, query): ...
    def get_batch_scores(self, query, doc_ids): ...
```
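
Below is a sketch of BM25 scoring that follows the constructor and `get_scores`/`get_batch_scores` signatures listed above; treat the exact call pattern as an assumption, since BM25 support and its integration with similarity indexes differ between gensim releases.

```python
from gensim.models import OkapiBM25Model

# `corpus` is assumed to be the document collection the model scores against,
# e.g. the tokenized common_texts fixture used elsewhere on this page.
bm25 = OkapiBM25Model(corpus, k1=1.2, b=0.75, epsilon=0.25)

# Score a tokenized query against every document in the corpus
scores = bm25.get_scores(['graph', 'trees'])
print(scores)

# Restrict scoring to a subset of documents
print(bm25.get_batch_scores(['graph', 'trees'], doc_ids=[0, 2, 4]))
```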

### Text Processing Models

Models for detecting common phrases (collocations) and constructing n-grams from text corpora.

```python { .api }
class Phrases:
    """Automatic phrase detection model."""

    def __init__(
        self,
        sentences=None,
        min_count=5,
        threshold=10.0,
        max_vocab_size=40000000,
        delimiter=b'_',
        progress_per=10000,
        scoring='default',
        common_terms=frozenset()
    ): ...

    def add_vocab(self, sentences): ...
    def export_phrases(self, sentences, out_delimiter=b' ', as_tuples=False): ...
    def __getitem__(self, sentence): ...
```
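
`Phrases` learns which token pairs co-occur often enough to be merged into a single token. Below is a minimal sketch on the `common_texts` fixture; the low threshold is only there so the tiny corpus yields at least one bigram.

```python
from gensim.models.phrases import Phrases
from gensim.test.utils import common_texts

# Learn bigram statistics from the corpus
bigram = Phrases(common_texts, min_count=1, threshold=0.1)

# Applying the model merges any detected collocations into single tokens joined by '_'
print(bigram[['graph', 'minors', 'trees']])

# Transform a whole corpus by streaming each sentence through the model
transformed = [bigram[sentence] for sentence in common_texts]
```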

### Model Evaluation

Tools for evaluating topic model quality and coherence.

```python { .api }
class CoherenceModel:
    """Topic coherence evaluation model."""

    def __init__(
        self,
        model=None,
        topics=None,
        texts=None,
        corpus=None,
        dictionary=None,
        window_size=None,
        keyed_vectors=None,
        coherence='c_v',
        topn=20,
        processes=-1
    ): ...

    def get_coherence(self): ...
    def get_coherence_per_topic(self, with_std=False, with_confidence=False): ...
```
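
`CoherenceModel` scores the topics of a trained model against the original texts. Below is a minimal sketch that trains a small LDA model on `common_texts` and computes `c_v` coherence; the topic count is illustrative.

```python
from gensim import corpora
from gensim.models import LdaModel, CoherenceModel
from gensim.test.utils import common_texts

dictionary = corpora.Dictionary(common_texts)
corpus = [dictionary.doc2bow(text) for text in common_texts]
lda = LdaModel(corpus=corpus, id2word=dictionary, num_topics=2, passes=10)

# c_v coherence needs the tokenized texts, not just the bag-of-words corpus
cm = CoherenceModel(model=lda, texts=common_texts, dictionary=dictionary, coherence='c_v')
print(cm.get_coherence())           # single aggregate score
print(cm.get_coherence_per_topic()) # one score per topic
```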

### Translation and Cross-Language Models

Models for cross-language document translation and alignment.

```python { .api }
class TranslationMatrix:
    """Translation matrix for cross-language document alignment."""

    def __init__(self, source_lang_vec, target_lang_vec, word_pairs=None, random_state=None): ...

    def translate(self, source_words, topn=5): ...
    def apply(self, docs): ...

class BackMappingTranslationMatrix:
    """Back-mapping translation matrix."""

    def __init__(self, source_lang_vec, target_lang_vec, word_pairs=None, random_state=None): ...

    def translate(self, source_words, topn=5): ...
```
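
`TranslationMatrix` learns a linear mapping between two embedding spaces from a seed lexicon of word pairs. Below is a minimal sketch following the constructor and `translate` signatures listed above; the vector files and the seed lexicon are hypothetical placeholders.

```python
from gensim.models import KeyedVectors
from gensim.models.translation_matrix import TranslationMatrix

# Hypothetical pre-trained vectors for the source and target languages (placeholder paths)
en_vectors = KeyedVectors.load_word2vec_format('en.vec')
de_vectors = KeyedVectors.load_word2vec_format('de.vec')

# Small seed lexicon of (source_word, target_word) pairs
word_pairs = [('one', 'eins'), ('two', 'zwei'), ('three', 'drei')]

trans = TranslationMatrix(en_vectors, de_vectors, word_pairs=word_pairs)
print(trans.translate(['four', 'five'], topn=3))
```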

## Usage Examples

### Training a Word2Vec Model

```python
from gensim.models import Word2Vec
from gensim.test.utils import common_texts

# Train Word2Vec on sample data
model = Word2Vec(sentences=common_texts, vector_size=100, window=5, min_count=1, workers=4)

# Find similar words
similar_words = model.wv.most_similar('computer', topn=5)
print(similar_words)

# Get word vector
vector = model.wv['computer']
print(f"Vector shape: {vector.shape}")
```
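
### Saving and Loading Vectors

Trained models are usually persisted and reloaded, or exported as lightweight `KeyedVectors` once no further training is needed. Below is a minimal sketch continuing from the model above; the file names are placeholders.

```python
from gensim.models import Word2Vec, KeyedVectors

# Save and reload the full, trainable model
model.save("word2vec.model")
model = Word2Vec.load("word2vec.model")

# Export just the vectors in word2vec text format and reload them read-only
model.wv.save_word2vec_format("vectors.txt", binary=False)
wv = KeyedVectors.load_word2vec_format("vectors.txt", binary=False)
print(wv.most_similar('computer', topn=3))
```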

### Training an LDA Topic Model

```python
from gensim import corpora
from gensim.models import LdaModel
from gensim.test.utils import common_texts

# Create dictionary and corpus
dictionary = corpora.Dictionary(common_texts)
corpus = [dictionary.doc2bow(text) for text in common_texts]

# Train LDA model
lda = LdaModel(corpus=corpus, id2word=dictionary, num_topics=2, passes=10)

# Print topics
topics = lda.print_topics(num_words=4)
for topic in topics:
    print(topic)
```

### Document Topic Inference

```python
# Get the topic distribution for a new document (reuses `dictionary` and `lda` from above)
new_doc = ['computer', 'time', 'graph']
new_doc_bow = dictionary.doc2bow(new_doc)
doc_topics = lda.get_document_topics(new_doc_bow)
print(doc_topics)
```
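
### Updating a Model with New Documents

`LdaModel.update` supports the streaming workflow described at the top of this page: an existing model can be refined with additional batches of documents without retraining from scratch. Below is a minimal sketch reusing the `dictionary` and `lda` objects from the previous examples; the extra documents are illustrative.

```python
# Additional documents arriving after initial training (illustrative)
more_texts = [['graph', 'minors', 'survey'], ['human', 'system', 'response']]
more_corpus = [dictionary.doc2bow(text) for text in more_texts]

# Continue training the existing model on the new batch
lda.update(more_corpus)
print(lda.print_topics(num_words=4))
```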