# NLP Models and Transformations

Core machine learning models and transformation algorithms that convert documents between different vector representations. Gensim's models support streaming training for datasets larger than memory and provide both supervised and unsupervised learning approaches for natural language processing tasks.
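Any object that can be iterated over repeatedly, yielding one document at a time, can serve as a corpus, so nothing forces the full dataset into memory. A minimal sketch of that streaming pattern (the file name, its one-document-per-line layout, and the whitespace tokenization are illustrative assumptions, not part of this API):

```python
from gensim import corpora
from gensim.models import LdaModel

class StreamedCorpus:
    """Re-iterable corpus that yields one bag-of-words document per line of a text file."""

    def __init__(self, path, dictionary):
        self.path = path
        self.dictionary = dictionary

    def __iter__(self):
        with open(self.path, encoding='utf8') as fh:
            for line in fh:
                yield self.dictionary.doc2bow(line.lower().split())

# One streaming pass builds the vocabulary; training then streams the file again in chunks.
dictionary = corpora.Dictionary(
    line.lower().split() for line in open('corpus.txt', encoding='utf8')
)
lda = LdaModel(corpus=StreamedCorpus('corpus.txt', dictionary), id2word=dictionary, num_topics=10)
```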

## Capabilities

### Topic Models

Probabilistic models that discover abstract topics within document collections. These models identify patterns of word co-occurrence to reveal thematic structure in large text corpora.

```python { .api }
class LdaModel:
    """Latent Dirichlet Allocation topic model implementation."""

    def __init__(
        self,
        corpus=None,
        num_topics=100,
        id2word=None,
        distributed=False,
        chunksize=2000,
        passes=1,
        update_every=1,
        alpha='symmetric',
        eta=None,
        decay=0.5,
        offset=1.0,
        eval_every=10,
        iterations=50,
        gamma_threshold=0.001,
        minimum_probability=0.01,
        random_state=None,
        ns_conf=None,
        minimum_phi_value=0.01,
        per_word_topics=False,
        callbacks=None,
        dtype=np.float32
    ): ...

    def update(self, corpus, chunksize=None, decay=None, offset=None, passes=None, update_every=None, eval_every=None, iterations=None, gamma_threshold=None, chunks_as_numpy=False): ...
    def log_perplexity(self, chunk, total_docs=None): ...
    def print_topics(self, num_topics=10, num_words=10): ...
    def show_topics(self, num_topics=10, num_words=10, log=False, formatted=True): ...
    def get_document_topics(self, bow, minimum_probability=None, minimum_phi_value=None, per_word_topics=False): ...
    def get_topic_terms(self, topicid, topn=10): ...

class LdaMulticore:
    """Multicore implementation of LDA using multiple worker processes."""

    def __init__(
        self,
        corpus=None,
        num_topics=100,
        id2word=None,
        workers=None,
        chunksize=2000,
        passes=1,
        batch=False,
        alpha='symmetric',
        eta=None,
        decay=0.5,
        offset=1.0,
        eval_every=10,
        iterations=50,
        gamma_threshold=0.001,
        random_state=None,
        minimum_probability=0.01,
        minimum_phi_value=0.01,
        per_word_topics=False,
        dtype=np.float32
    ): ...

class HdpModel:
    """Hierarchical Dirichlet Process topic model."""

    def __init__(
        self,
        corpus,
        id2word,
        max_chunks=None,
        max_time=None,
        chunksize=256,
        kappa=1.0,
        tau=64.0,
        K=15,
        T=150,
        alpha=1,
        gamma=1,
        eta=0.01,
        scale=1.0,
        var_converge=0.0001,
        outputdir=None,
        random_state=None
    ): ...

    def print_topics(self, topics=10, topn=10): ...
    def show_topics(self, topics=10, topn=10, log=False, formatted=True): ...

class LdaSeqModel:
    """Dynamic Topic Model for sequential/temporal topic modeling."""

    def __init__(
        self,
        corpus=None,
        time_slice=None,
        id2word=None,
        alphas=0.01,
        num_topics=10,
        initialize='gensim',
        sstats=None,
        lda_model=None,
        obs_variance=0.5,
        chain_variance=0.005,
        passes=10,
        random_state=None,
        lda_inference_max_iter=25,
        em_min_iter=6,
        em_max_iter=20,
        chunksize=100
    ): ...

    def print_topics(self, time=0, top_terms=10): ...
    def doc_topics(self, doc_bow): ...

class AuthorTopicModel:
    """Author-Topic model for modeling documents with author information."""

    def __init__(
        self,
        corpus=None,
        num_topics=10,
        id2word=None,
        author2doc=None,
        doc2author=None,
        chunksize=2000,
        passes=1,
        iterations=50,
        decay=0.5,
        offset=1.0,
        alpha='symmetric',
        eta='symmetric',
        update_every=1,
        eval_every=10,
        gamma_threshold=0.001,
        serialized=False,
        serialization_path=None,
        minimum_probability=0.01,
        random_state=None
    ): ...

    def get_author_topics(self, author_name, minimum_probability=0.01): ...
    def get_document_topics(self, bow, minimum_probability=0.01): ...

class EnsembleLda:
    """Ensemble of LDA models for improved topic stability."""

    def __init__(
        self,
        corpus=None,
        id2word=None,
        num_topics=10,
        num_models=3,
        topic_model_class='ldamulticore',
        ensemble_workers=1,
        distance_workers=1,
        min_samples=None,
        epsilon=0.1,
        random_state=None,
        memory_friendly_ttda=True
    ): ...

    def generate_gensim_representation(self): ...
    def get_topics(self): ...

class Nmf:
    """Non-negative Matrix Factorization for topic modeling."""

    def __init__(
        self,
        corpus=None,
        num_topics=100,
        id2word=None,
        chunksize=2000,
        passes=1,
        kappa=1.0,
        minimum_probability=0.01,
        w_max_iter=200,
        w_stop_condition=1e-4,
        h_max_iter=50,
        h_stop_condition=1e-4,
        eval_every=10,
        normalize=True,
        random_state=None
    ): ...

    def print_topics(self, num_topics=10, num_words=10): ...
    def show_topics(self, num_topics=10, num_words=10, log=False, formatted=True): ...
```
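The topic models above share the same corpus/id2word construction pattern, so switching algorithms is largely a one-line change. A minimal sketch using Nmf on the bundled toy corpus (`num_topics`, `passes`, and `random_state` are illustrative):

```python
from gensim import corpora
from gensim.models.nmf import Nmf
from gensim.test.utils import common_texts

# Build the dictionary and bag-of-words corpus once; any topic model can consume them.
dictionary = corpora.Dictionary(common_texts)
corpus = [dictionary.doc2bow(text) for text in common_texts]

# Nmf takes the same corpus/id2word pair as LdaModel.
nmf = Nmf(corpus=corpus, id2word=dictionary, num_topics=2, passes=10, random_state=42)
for topic in nmf.print_topics(num_words=4):
    print(topic)
```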

### Word Embeddings

Neural network models that learn dense vector representations of words and documents, capturing semantic relationships through continuous vector spaces.

```python { .api }
class Word2Vec:
    """Word2Vec neural word embedding model."""

    def __init__(
        self,
        sentences=None,
        corpus_file=None,
        vector_size=100,
        alpha=0.025,
        window=5,
        min_count=5,
        max_vocab_size=None,
        sample=1e-3,
        seed=1,
        workers=3,
        min_alpha=0.0001,
        sg=0,
        hs=0,
        negative=5,
        ns_exponent=0.75,
        cbow_mean=1,
        hashfxn=hash,
        epochs=5,
        null_word=0,
        trim_rule=None,
        sorted_vocab=1,
        batch_words=10000,
        compute_loss=False,
        callbacks=(),
        comment=None,
        max_final_vocab=None,
        shrink_windows=True
    ): ...

    def train(self, sentences=None, corpus_file=None, total_examples=None, total_words=None, epochs=None, start_alpha=None, end_alpha=None, word_count=0, queue_factor=2, report_delay=1.0, compute_loss=False, callbacks=(), **kwargs): ...
    def build_vocab(self, sentences=None, corpus_file=None, update=False, progress_per=10000, keep_raw_vocab=False, trim_rule=None, **kwargs): ...
    def most_similar(self, positive=None, negative=None, topn=10, clip_start=0, clip_end=None, restrict_vocab=None, indexer=None): ...
    def most_similar_cosmul(self, positive=None, negative=None, topn=10): ...
    def similarity(self, w1, w2): ...
    def n_similarity(self, ws1, ws2): ...
    def doesnt_match(self, words): ...

    wv: KeyedVectors  # trained word vectors, exposed as a KeyedVectors instance

class Doc2Vec:
    """Doc2Vec model for learning document embeddings."""

    def __init__(
        self,
        documents=None,
        corpus_file=None,
        dm_mean=None,
        dm=1,
        dbow_words=0,
        dm_concat=0,
        dm_tag_count=1,
        docvecs=None,
        docvecs_mapfile=None,
        comment=None,
        trim_rule=None,
        callbacks=(),
        **kwargs
    ): ...

    def train(self, documents=None, corpus_file=None, total_examples=None, total_words=None, epochs=None, start_alpha=None, end_alpha=None, word_count=0, queue_factor=2, report_delay=1.0, callbacks=(), **kwargs): ...
    def infer_vector(self, doc_words, alpha=None, min_alpha=None, epochs=None, steps=None): ...
    def most_similar(self, positive=None, negative=None, topn=10, clip_start=0, clip_end=None): ...
    def most_similar_cosmul(self, positive=None, negative=None, topn=10): ...
    def similarity(self, d1, d2): ...
    def n_similarity(self, doc_ids1, doc_ids2): ...
    def doesnt_match(self, docs): ...

class FastText:
    """FastText model with subword information."""

    def __init__(
        self,
        sentences=None,
        corpus_file=None,
        sg=0,
        hs=0,
        vector_size=100,
        alpha=0.025,
        window=5,
        min_count=5,
        max_vocab_size=None,
        word_ngrams=1,
        sample=1e-3,
        seed=1,
        workers=3,
        min_alpha=0.0001,
        negative=5,
        ns_exponent=0.75,
        cbow_mean=1,
        hashfxn=hash,
        epochs=5,
        null_word=0,
        min_n=3,
        max_n=6,
        sorted_vocab=1,
        bucket=2000000,
        trim_rule=None,
        batch_words=10000,
        callbacks=(),
        compatible_hash=True,
        shrink_windows=True
    ): ...

    def build_vocab(self, sentences=None, corpus_file=None, update=False, progress_per=10000, keep_raw_vocab=False, trim_rule=None, **kwargs): ...
    def train(self, sentences=None, corpus_file=None, total_examples=None, total_words=None, epochs=None, start_alpha=None, end_alpha=None, word_count=0, queue_factor=2, report_delay=1.0, callbacks=(), **kwargs): ...

class KeyedVectors:
    """Standalone word vectors without training functionality."""

    def __init__(self, vector_size, count=0, dtype=np.float32): ...

    def most_similar(self, positive=None, negative=None, topn=10, clip_start=0, clip_end=None, restrict_vocab=None, indexer=None): ...
    def most_similar_cosmul(self, positive=None, negative=None, topn=10): ...
    def similarity(self, w1, w2): ...
    def n_similarity(self, ws1, ws2): ...
    def distance(self, w1, w2): ...
    def distances(self, word_or_vector, other_words=()): ...
    def word_vec(self, word, use_norm=False): ...
    def get_vector(self, word, norm=False): ...
    def words_closer_than(self, w1, w2): ...
    def rank(self, w1, w2): ...
    def evaluate_word_analogies(self, analogies, restrict_vocab=300000, case_insensitive=True, dummy4unknown=False): ...
    def evaluate_word_pairs(self, pairs, delimiter='\t', restrict_vocab=300000, case_insensitive=True, dummy4unknown=False): ...
    def save_word2vec_format(self, fname, fvocab=None, binary=False, total_vec=None): ...

    @classmethod
    def load_word2vec_format(cls, fname, fvocab=None, binary=False, encoding='utf8', unicode_errors='strict', limit=None, datatype=np.float32): ...
```
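FastText builds word vectors from character n-grams, so it can assemble embeddings even for words never seen during training. A minimal sketch on the bundled toy corpus (the hyperparameters and probe words are illustrative):

```python
from gensim.models import FastText
from gensim.test.utils import common_texts

# Tiny model; vector_size, min_count and epochs are illustrative, not recommended settings.
model = FastText(sentences=common_texts, vector_size=32, window=3, min_count=1, epochs=10)

# Out-of-vocabulary lookup works because the vector is assembled from subword n-grams.
oov_vector = model.wv['computational']
print(oov_vector.shape)
print(model.wv.most_similar('computer', topn=3))
```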

### Dimensionality Reduction and Transformations

Mathematical transformations that convert high-dimensional sparse document vectors into lower-dimensional dense representations, often improving computational efficiency and revealing latent structure.

```python { .api }
class LsiModel:
    """Latent Semantic Indexing model using SVD."""

    def __init__(
        self,
        corpus=None,
        num_topics=200,
        id2word=None,
        chunksize=20000,
        decay=1.0,
        distributed=False,
        onepass=True,
        power_iters=2,
        extra_samples=100,
        dtype=np.float64
    ): ...

    def add_documents(self, corpus, chunksize=None, decay=None): ...
    def print_topics(self, num_topics=10, num_words=10): ...
    def show_topics(self, num_topics=10, num_words=10, log=False, formatted=True): ...

class TfidfModel:
    """TF-IDF transformation model."""

    def __init__(
        self,
        corpus=None,
        id2word=None,
        dictionary=None,
        wlocal=utils.identity,
        wglobal=df2idf,
        normalize=True,
        smartirs=None,
        pivot=None,
        slope=0.65
    ): ...

    def __getitem__(self, bow): ...

class RpModel:
    """Random Projections model for dimensionality reduction."""

    def __init__(self, corpus, id2word=None, num_topics=300): ...

    def __getitem__(self, bow): ...

class LogEntropyModel:
    """Log-entropy normalization model."""

    def __init__(self, corpus, id2word=None, normalize=True): ...

    def __getitem__(self, bow): ...

class NormModel:
    """L2 normalization model."""

    def __init__(self, corpus=None, norm='l2'): ...

    def __getitem__(self, bow): ...
```
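These transformations compose: a bag-of-words corpus is typically re-weighted with TF-IDF before being projected by LSI. A minimal sketch of that pipeline on the bundled toy corpus (`num_topics=2` is arbitrary):

```python
from gensim import corpora
from gensim.models import LsiModel, TfidfModel
from gensim.test.utils import common_texts

dictionary = corpora.Dictionary(common_texts)
corpus = [dictionary.doc2bow(text) for text in common_texts]

# Re-weight raw counts with TF-IDF, then project into a 2-dimensional latent space.
tfidf = TfidfModel(corpus, id2word=dictionary)
lsi = LsiModel(tfidf[corpus], id2word=dictionary, num_topics=2)

# Transform a single document through both models.
doc_bow = dictionary.doc2bow(['human', 'computer', 'interface'])
print(lsi[tfidf[doc_bow]])
```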

### Ranking Models

Information retrieval ranking functions that score document relevance based on term frequency and document statistics.

```python { .api }
class OkapiBM25Model:
    """Okapi BM25 ranking function."""

    def __init__(self, corpus, k1=1.2, b=0.75, epsilon=0.25): ...

    def get_scores(self, query): ...
    def get_batch_scores(self, query, doc_ids): ...

class LuceneBM25Model:
    """Lucene variant of BM25."""

    def __init__(self, corpus, k1=1.2, b=0.75): ...

    def get_scores(self, query): ...
    def get_batch_scores(self, query, doc_ids): ...

class AtireBM25Model:
    """ATIRE variant of BM25."""

    def __init__(self, corpus, k1=1.2, b=0.75): ...

    def get_scores(self, query): ...
    def get_batch_scores(self, query, doc_ids): ...
```
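Assuming the constructor and `get_scores` interface documented above, ranking every document in a corpus against a bag-of-words query would look roughly like the sketch below; the import path, toy corpus, and query terms are illustrative assumptions rather than part of this specification.

```python
from gensim import corpora
from gensim.test.utils import common_texts

# Import path assumed from the class name above; adjust to where OkapiBM25Model
# is exposed in the installed gensim version.
from gensim.models import OkapiBM25Model

dictionary = corpora.Dictionary(common_texts)
corpus = [dictionary.doc2bow(text) for text in common_texts]

bm25 = OkapiBM25Model(corpus, k1=1.2, b=0.75)

# Score all documents against a bag-of-words query (higher score = more relevant).
query = dictionary.doc2bow(['graph', 'trees'])
print(bm25.get_scores(query))
```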

### Text Processing Models

Models for detecting phrases and handling n-gram construction from text corpora.

```python { .api }
class Phrases:
    """Automatic phrase detection model."""

    def __init__(
        self,
        sentences=None,
        min_count=5,
        threshold=10.0,
        max_vocab_size=40000000,
        delimiter=b'_',
        progress_per=10000,
        scoring='default',
        common_terms=frozenset()
    ): ...

    def add_vocab(self, sentences): ...
    def export_phrases(self, sentences, out_delimiter=b' ', as_tuples=False): ...
    def __getitem__(self, sentence): ...
```
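A minimal sketch of bigram detection on the bundled toy corpus; `min_count` and `threshold` are set artificially low here so that the tiny corpus can produce phrases at all, and on real data the defaults are a better starting point.

```python
from gensim.models.phrases import Phrases
from gensim.test.utils import common_texts

# Collect co-occurrence statistics; low thresholds are only sensible on a toy corpus.
bigram = Phrases(common_texts, min_count=1, threshold=1)

# Applying the model to a token list merges detected collocations into single tokens
# (on this toy corpus there may be few or none).
print(bigram[['human', 'computer', 'interface', 'survey']])
```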

### Model Evaluation

Tools for evaluating topic model quality and coherence.

```python { .api }
class CoherenceModel:
    """Topic coherence evaluation model."""

    def __init__(
        self,
        model=None,
        topics=None,
        texts=None,
        corpus=None,
        dictionary=None,
        window_size=None,
        keyed_vectors=None,
        coherence='c_v',
        topn=20,
        processes=-1
    ): ...

    def get_coherence(self): ...
    def get_coherence_per_topic(self, with_std=False, with_confidence=False): ...
```
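Coherence scores are the usual way to compare topic models trained with different settings. A minimal sketch scoring an LDA model with the `c_v` measure on the bundled toy corpus (the hyperparameters are illustrative):

```python
from gensim import corpora
from gensim.models import CoherenceModel, LdaModel
from gensim.test.utils import common_texts

dictionary = corpora.Dictionary(common_texts)
corpus = [dictionary.doc2bow(text) for text in common_texts]

lda = LdaModel(corpus=corpus, id2word=dictionary, num_topics=2, passes=10, random_state=0)

# The 'c_v' measure needs the tokenized texts, not just the bag-of-words corpus.
cm = CoherenceModel(model=lda, texts=common_texts, dictionary=dictionary, coherence='c_v')
print(cm.get_coherence())
print(cm.get_coherence_per_topic())
```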

### Translation and Cross-Language Models

Models for cross-language document translation and alignment.

```python { .api }
class TranslationMatrix:
    """Translation matrix for cross-language document alignment."""

    def __init__(self, source_lang_vec, target_lang_vec, word_pairs=None, random_state=None): ...

    def translate(self, source_words, topn=5): ...
    def apply(self, docs): ...

class BackMappingTranslationMatrix:
    """Back-mapping translation matrix."""

    def __init__(self, source_lang_vec, target_lang_vec, word_pairs=None, random_state=None): ...

    def translate(self, source_words, topn=5): ...
```
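A minimal sketch of learning a linear mapping between two embedding spaces from seed word pairs. The two toy Word2Vec models stand in for source- and target-language vectors, and the identical word pairs are purely illustrative; real use requires genuinely bilingual vectors and a seed dictionary.

```python
from gensim.models import Word2Vec
from gensim.models.translation_matrix import TranslationMatrix
from gensim.test.utils import common_texts

# Two toy embedding spaces standing in for source- and target-language vectors.
source_vectors = Word2Vec(common_texts, vector_size=16, min_count=1, seed=1).wv
target_vectors = Word2Vec(common_texts, vector_size=16, min_count=1, seed=2).wv

# Seed (source word, target word) pairs used to fit the translation matrix.
word_pairs = [('human', 'human'), ('computer', 'computer'), ('graph', 'graph'), ('trees', 'trees')]

tm = TranslationMatrix(source_vectors, target_vectors, word_pairs=word_pairs)
print(tm.translate(['system', 'user'], topn=3))
```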

## Usage Examples

### Training a Word2Vec Model

```python
from gensim.models import Word2Vec
from gensim.test.utils import common_texts

# Train Word2Vec on sample data
model = Word2Vec(sentences=common_texts, vector_size=100, window=5, min_count=1, workers=4)

# Find similar words
similar_words = model.wv.most_similar('computer', topn=5)
print(similar_words)

# Get word vector
vector = model.wv['computer']
print(f"Vector shape: {vector.shape}")
```

### Training an LDA Topic Model

```python
from gensim import corpora
from gensim.models import LdaModel
from gensim.test.utils import common_texts

# Create dictionary and corpus
dictionary = corpora.Dictionary(common_texts)
corpus = [dictionary.doc2bow(text) for text in common_texts]

# Train LDA model
lda = LdaModel(corpus=corpus, id2word=dictionary, num_topics=2, passes=10)

# Print topics
topics = lda.print_topics(num_words=4)
for topic in topics:
    print(topic)
```

### Document Topic Inference

```python
# Get topic distribution for a new document (reuses dictionary and lda from the previous example)
new_doc = ['computer', 'time', 'graph']
new_doc_bow = dictionary.doc2bow(new_doc)
doc_topics = lda.get_document_topics(new_doc_bow)
print(doc_topics)
```