# NLP Models and Transformations

Core machine learning models and transformation algorithms that convert documents between different vector representations. Gensim's models support streaming training for datasets larger than memory and provide both supervised and unsupervised learning approaches for natural language processing tasks.
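Any object that can be iterated over repeatedly, yielding one document at a time, can serve as a corpus, so nothing forces the full dataset into memory. A minimal sketch of that streaming pattern (the file name, its one-document-per-line layout, and the whitespace tokenization are illustrative assumptions, not part of this API):

```python
from gensim import corpora
from gensim.models import LdaModel

class StreamedCorpus:
    """Re-iterable corpus that yields one bag-of-words document per line of a text file."""

    def __init__(self, path, dictionary):
        self.path = path
        self.dictionary = dictionary

    def __iter__(self):
        with open(self.path, encoding='utf8') as fh:
            for line in fh:
                yield self.dictionary.doc2bow(line.lower().split())

# One streaming pass builds the vocabulary; training then streams the file again in chunks.
dictionary = corpora.Dictionary(
    line.lower().split() for line in open('corpus.txt', encoding='utf8')
)
lda = LdaModel(corpus=StreamedCorpus('corpus.txt', dictionary), id2word=dictionary, num_topics=10)
```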

## Capabilities

### Topic Models

Probabilistic models that discover abstract topics within document collections. These models identify patterns of word co-occurrence to reveal thematic structure in large text corpora.

```python { .api }
class LdaModel:
    """Latent Dirichlet Allocation topic model implementation."""

    def __init__(
        self,
        corpus=None,
        num_topics=100,
        id2word=None,
        distributed=False,
        chunksize=2000,
        passes=1,
        update_every=1,
        alpha='symmetric',
        eta=None,
        decay=0.5,
        offset=1.0,
        eval_every=10,
        iterations=50,
        gamma_threshold=0.001,
        minimum_probability=0.01,
        random_state=None,
        ns_conf=None,
        minimum_phi_value=0.01,
        per_word_topics=False,
        callbacks=None,
        dtype=np.float32
    ): ...

    def update(self, corpus, chunksize=None, decay=None, offset=None, passes=None, update_every=None, eval_every=None, iterations=None, gamma_threshold=None, chunks_as_numpy=False): ...
    def log_perplexity(self, chunk, total_docs=None): ...
    def print_topics(self, num_topics=10, num_words=10): ...
    def show_topics(self, num_topics=10, num_words=10, log=False, formatted=True): ...
    def get_document_topics(self, bow, minimum_probability=None, minimum_phi_value=None, per_word_topics=False): ...
    def get_topic_terms(self, topicid, topn=10): ...

class LdaMulticore:
    """Multicore implementation of LDA using multiple worker processes."""

    def __init__(
        self,
        corpus=None,
        num_topics=100,
        id2word=None,
        workers=None,
        chunksize=2000,
        passes=1,
        batch=False,
        alpha='symmetric',
        eta=None,
        decay=0.5,
        offset=1.0,
        eval_every=10,
        iterations=50,
        gamma_threshold=0.001,
        random_state=None,
        minimum_probability=0.01,
        minimum_phi_value=0.01,
        per_word_topics=False,
        dtype=np.float32
    ): ...

class HdpModel:
    """Hierarchical Dirichlet Process topic model."""

    def __init__(
        self,
        corpus,
        id2word,
        max_chunks=None,
        max_time=None,
        chunksize=256,
        kappa=1.0,
        tau=64.0,
        K=15,
        T=150,
        alpha=1,
        gamma=1,
        eta=0.01,
        scale=1.0,
        var_converge=0.0001,
        outputdir=None,
        random_state=None
    ): ...

    def print_topics(self, topics=10, topn=10): ...
    def show_topics(self, topics=10, topn=10, log=False, formatted=True): ...

class LdaSeqModel:
    """Dynamic Topic Model for sequential/temporal topic modeling."""

    def __init__(
        self,
        corpus=None,
        time_slice=None,
        id2word=None,
        alphas=0.01,
        num_topics=10,
        initialize='gensim',
        sstats=None,
        lda_model=None,
        obs_variance=0.5,
        chain_variance=0.005,
        passes=10,
        random_state=None,
        lda_inference_max_iter=25,
        em_min_iter=6,
        em_max_iter=20,
        chunksize=100
    ): ...

    def print_topics(self, time=0, top_terms=10): ...
    def doc_topics(self, doc_bow): ...

class AuthorTopicModel:
    """Author-Topic model for modeling documents with author information."""

    def __init__(
        self,
        corpus=None,
        num_topics=10,
        id2word=None,
        author2doc=None,
        doc2author=None,
        chunksize=2000,
        passes=1,
        iterations=50,
        decay=0.5,
        offset=1.0,
        alpha='symmetric',
        eta='symmetric',
        update_every=1,
        eval_every=10,
        gamma_threshold=0.001,
        serialized=False,
        serialization_path=None,
        minimum_probability=0.01,
        random_state=None
    ): ...

    def get_author_topics(self, author_name, minimum_probability=0.01): ...
    def get_document_topics(self, bow, minimum_probability=0.01): ...

class EnsembleLda:
    """Ensemble of LDA models for improved topic stability."""

    def __init__(
        self,
        corpus=None,
        id2word=None,
        num_topics=10,
        num_models=3,
        topic_model_class='ldamulticore',
        ensemble_workers=1,
        distance_workers=1,
        min_samples=None,
        epsilon=0.1,
        random_state=None,
        memory_friendly_ttda=True
    ): ...

    def generate_gensim_representation(self): ...
    def get_topics(self): ...

class Nmf:
    """Non-negative Matrix Factorization for topic modeling."""

    def __init__(
        self,
        corpus=None,
        num_topics=100,
        id2word=None,
        chunksize=2000,
        passes=1,
        kappa=1.0,
        minimum_probability=0.01,
        w_max_iter=200,
        w_stop_condition=1e-4,
        h_max_iter=50,
        h_stop_condition=1e-4,
        eval_every=10,
        normalize=True,
        random_state=None
    ): ...

    def print_topics(self, num_topics=10, num_words=10): ...
    def show_topics(self, num_topics=10, num_words=10, log=False, formatted=True): ...
```
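The topic models above share the same corpus/id2word construction pattern, so switching algorithms is largely a one-line change. A minimal sketch using Nmf on the bundled toy corpus (`num_topics`, `passes`, and `random_state` are illustrative):

```python
from gensim import corpora
from gensim.models.nmf import Nmf
from gensim.test.utils import common_texts

# Build the dictionary and bag-of-words corpus once; any topic model can consume them.
dictionary = corpora.Dictionary(common_texts)
corpus = [dictionary.doc2bow(text) for text in common_texts]

# Nmf takes the same corpus/id2word pair as LdaModel.
nmf = Nmf(corpus=corpus, id2word=dictionary, num_topics=2, passes=10, random_state=42)
for topic in nmf.print_topics(num_words=4):
    print(topic)
```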

### Word Embeddings

Neural network models that learn dense vector representations of words and documents, capturing semantic relationships through continuous vector spaces.

```python { .api }
class Word2Vec:
    """Word2Vec neural word embedding model."""

    def __init__(
        self,
        sentences=None,
        corpus_file=None,
        vector_size=100,
        alpha=0.025,
        window=5,
        min_count=5,
        max_vocab_size=None,
        sample=1e-3,
        seed=1,
        workers=3,
        min_alpha=0.0001,
        sg=0,
        hs=0,
        negative=5,
        ns_exponent=0.75,
        cbow_mean=1,
        hashfxn=hash,
        epochs=5,
        null_word=0,
        trim_rule=None,
        sorted_vocab=1,
        batch_words=10000,
        compute_loss=False,
        callbacks=(),
        comment=None,
        max_final_vocab=None,
        shrink_windows=True
    ): ...

    def train(self, sentences=None, corpus_file=None, total_examples=None, total_words=None, epochs=None, start_alpha=None, end_alpha=None, word_count=0, queue_factor=2, report_delay=1.0, compute_loss=False, callbacks=(), **kwargs): ...
    def build_vocab(self, sentences=None, corpus_file=None, update=False, progress_per=10000, keep_raw_vocab=False, trim_rule=None, **kwargs): ...
    def most_similar(self, positive=None, negative=None, topn=10, clip_start=0, clip_end=None, restrict_vocab=None, indexer=None): ...
    def most_similar_cosmul(self, positive=None, negative=None, topn=10): ...
    def similarity(self, w1, w2): ...
    def n_similarity(self, ws1, ws2): ...
    def doesnt_match(self, words): ...

    wv: KeyedVectors  # trained word vectors, exposed as a KeyedVectors instance

class Doc2Vec:
    """Doc2Vec model for learning document embeddings."""

    def __init__(
        self,
        documents=None,
        corpus_file=None,
        dm_mean=None,
        dm=1,
        dbow_words=0,
        dm_concat=0,
        dm_tag_count=1,
        docvecs=None,
        docvecs_mapfile=None,
        comment=None,
        trim_rule=None,
        callbacks=(),
        **kwargs
    ): ...

    def train(self, documents=None, corpus_file=None, total_examples=None, total_words=None, epochs=None, start_alpha=None, end_alpha=None, word_count=0, queue_factor=2, report_delay=1.0, callbacks=(), **kwargs): ...
    def infer_vector(self, doc_words, alpha=None, min_alpha=None, epochs=None, steps=None): ...
    def most_similar(self, positive=None, negative=None, topn=10, clip_start=0, clip_end=None): ...
    def most_similar_cosmul(self, positive=None, negative=None, topn=10): ...
    def similarity(self, d1, d2): ...
    def n_similarity(self, doc_ids1, doc_ids2): ...
    def doesnt_match(self, docs): ...

class FastText:
    """FastText model with subword information."""

    def __init__(
        self,
        sentences=None,
        corpus_file=None,
        sg=0,
        hs=0,
        vector_size=100,
        alpha=0.025,
        window=5,
        min_count=5,
        max_vocab_size=None,
        word_ngrams=1,
        sample=1e-3,
        seed=1,
        workers=3,
        min_alpha=0.0001,
        negative=5,
        ns_exponent=0.75,
        cbow_mean=1,
        hashfxn=hash,
        epochs=5,
        null_word=0,
        min_n=3,
        max_n=6,
        sorted_vocab=1,
        bucket=2000000,
        trim_rule=None,
        batch_words=10000,
        callbacks=(),
        compatible_hash=True,
        shrink_windows=True
    ): ...

    def build_vocab(self, sentences=None, corpus_file=None, update=False, progress_per=10000, keep_raw_vocab=False, trim_rule=None, **kwargs): ...
    def train(self, sentences=None, corpus_file=None, total_examples=None, total_words=None, epochs=None, start_alpha=None, end_alpha=None, word_count=0, queue_factor=2, report_delay=1.0, callbacks=(), **kwargs): ...

class KeyedVectors:
    """Standalone word vectors without training functionality."""

    def __init__(self, vector_size, count=0, dtype=np.float32): ...

    def most_similar(self, positive=None, negative=None, topn=10, clip_start=0, clip_end=None, restrict_vocab=None, indexer=None): ...
    def most_similar_cosmul(self, positive=None, negative=None, topn=10): ...
    def similarity(self, w1, w2): ...
    def n_similarity(self, ws1, ws2): ...
    def distance(self, w1, w2): ...
    def distances(self, word_or_vector, other_words=()): ...
    def word_vec(self, word, use_norm=False): ...
    def get_vector(self, word, norm=False): ...
    def words_closer_than(self, w1, w2): ...
    def rank(self, w1, w2): ...
    def evaluate_word_analogies(self, analogies, restrict_vocab=300000, case_insensitive=True, dummy4unknown=False): ...
    def evaluate_word_pairs(self, pairs, delimiter='\t', restrict_vocab=300000, case_insensitive=True, dummy4unknown=False): ...
    def save_word2vec_format(self, fname, fvocab=None, binary=False, total_vec=None): ...

    @classmethod
    def load_word2vec_format(cls, fname, fvocab=None, binary=False, encoding='utf8', unicode_errors='strict', limit=None, datatype=np.float32): ...
```
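FastText builds word vectors from character n-grams, so it can assemble embeddings even for words never seen during training. A minimal sketch on the bundled toy corpus (the hyperparameters and probe words are illustrative):

```python
from gensim.models import FastText
from gensim.test.utils import common_texts

# Tiny model; vector_size, min_count and epochs are illustrative, not recommended settings.
model = FastText(sentences=common_texts, vector_size=32, window=3, min_count=1, epochs=10)

# Out-of-vocabulary lookup works because the vector is assembled from subword n-grams.
oov_vector = model.wv['computational']
print(oov_vector.shape)
print(model.wv.most_similar('computer', topn=3))
```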

### Dimensionality Reduction and Transformations

Mathematical transformations that convert high-dimensional sparse document vectors into lower-dimensional dense representations, often improving computational efficiency and revealing latent structure.

```python { .api }
class LsiModel:
    """Latent Semantic Indexing model using SVD."""

    def __init__(
        self,
        corpus=None,
        num_topics=200,
        id2word=None,
        chunksize=20000,
        decay=1.0,
        distributed=False,
        onepass=True,
        power_iters=2,
        extra_samples=100,
        dtype=np.float64
    ): ...

    def add_documents(self, corpus, chunksize=None, decay=None): ...
    def print_topics(self, num_topics=10, num_words=10): ...
    def show_topics(self, num_topics=10, num_words=10, log=False, formatted=True): ...

class TfidfModel:
    """TF-IDF transformation model."""

    def __init__(
        self,
        corpus=None,
        id2word=None,
        dictionary=None,
        wlocal=utils.identity,
        wglobal=df2idf,
        normalize=True,
        smartirs=None,
        pivot=None,
        slope=0.65
    ): ...

    def __getitem__(self, bow): ...

class RpModel:
    """Random Projections model for dimensionality reduction."""

    def __init__(self, corpus, id2word=None, num_topics=300): ...

    def __getitem__(self, bow): ...

class LogEntropyModel:
    """Log-entropy normalization model."""

    def __init__(self, corpus, id2word=None, normalize=True): ...

    def __getitem__(self, bow): ...

class NormModel:
    """L2 normalization model."""

    def __init__(self, corpus=None, norm='l2'): ...

    def __getitem__(self, bow): ...
```
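These transformations compose: a bag-of-words corpus is typically re-weighted with TF-IDF before being projected by LSI. A minimal sketch of that pipeline on the bundled toy corpus (`num_topics=2` is arbitrary):

```python
from gensim import corpora
from gensim.models import LsiModel, TfidfModel
from gensim.test.utils import common_texts

dictionary = corpora.Dictionary(common_texts)
corpus = [dictionary.doc2bow(text) for text in common_texts]

# Re-weight raw counts with TF-IDF, then project into a 2-dimensional latent space.
tfidf = TfidfModel(corpus, id2word=dictionary)
lsi = LsiModel(tfidf[corpus], id2word=dictionary, num_topics=2)

# Transform a single document through both models.
doc_bow = dictionary.doc2bow(['human', 'computer', 'interface'])
print(lsi[tfidf[doc_bow]])
```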

### Ranking Models

Information retrieval ranking functions that score document relevance based on term frequency and document statistics.

```python { .api }
class OkapiBM25Model:
    """Okapi BM25 ranking function."""

    def __init__(self, corpus, k1=1.2, b=0.75, epsilon=0.25): ...

    def get_scores(self, query): ...
    def get_batch_scores(self, query, doc_ids): ...

class LuceneBM25Model:
    """Lucene variant of BM25."""

    def __init__(self, corpus, k1=1.2, b=0.75): ...

    def get_scores(self, query): ...
    def get_batch_scores(self, query, doc_ids): ...

class AtireBM25Model:
    """ATIRE variant of BM25."""

    def __init__(self, corpus, k1=1.2, b=0.75): ...

    def get_scores(self, query): ...
    def get_batch_scores(self, query, doc_ids): ...
```
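Assuming the constructor and `get_scores` interface documented above, ranking every document in a corpus against a bag-of-words query would look roughly like the sketch below; the import path, toy corpus, and query terms are illustrative assumptions rather than part of this specification.

```python
from gensim import corpora
from gensim.test.utils import common_texts

# Import path assumed from the class name above; adjust to where OkapiBM25Model
# is exposed in the installed gensim version.
from gensim.models import OkapiBM25Model

dictionary = corpora.Dictionary(common_texts)
corpus = [dictionary.doc2bow(text) for text in common_texts]

bm25 = OkapiBM25Model(corpus, k1=1.2, b=0.75)

# Score all documents against a bag-of-words query (higher score = more relevant).
query = dictionary.doc2bow(['graph', 'trees'])
print(bm25.get_scores(query))
```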

### Text Processing Models

Models for detecting phrases and handling n-gram construction from text corpora.

```python { .api }
class Phrases:
    """Automatic phrase detection model."""

    def __init__(
        self,
        sentences=None,
        min_count=5,
        threshold=10.0,
        max_vocab_size=40000000,
        delimiter=b'_',
        progress_per=10000,
        scoring='default',
        common_terms=frozenset()
    ): ...

    def add_vocab(self, sentences): ...
    def export_phrases(self, sentences, out_delimiter=b' ', as_tuples=False): ...
    def __getitem__(self, sentence): ...
```
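A minimal sketch of bigram detection on the bundled toy corpus; `min_count` and `threshold` are set artificially low here so that the tiny corpus can produce phrases at all, and on real data the defaults are a better starting point.

```python
from gensim.models.phrases import Phrases
from gensim.test.utils import common_texts

# Collect co-occurrence statistics; low thresholds are only sensible on a toy corpus.
bigram = Phrases(common_texts, min_count=1, threshold=1)

# Applying the model to a token list merges detected collocations into single tokens
# (on this toy corpus there may be few or none).
print(bigram[['human', 'computer', 'interface', 'survey']])
```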

### Model Evaluation

Tools for evaluating topic model quality and coherence.

```python { .api }
class CoherenceModel:
    """Topic coherence evaluation model."""

    def __init__(
        self,
        model=None,
        topics=None,
        texts=None,
        corpus=None,
        dictionary=None,
        window_size=None,
        keyed_vectors=None,
        coherence='c_v',
        topn=20,
        processes=-1
    ): ...

    def get_coherence(self): ...
    def get_coherence_per_topic(self, with_std=False, with_confidence=False): ...
```
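Coherence scores are the usual way to compare topic models trained with different settings. A minimal sketch scoring an LDA model with the `c_v` measure on the bundled toy corpus (the hyperparameters are illustrative):

```python
from gensim import corpora
from gensim.models import CoherenceModel, LdaModel
from gensim.test.utils import common_texts

dictionary = corpora.Dictionary(common_texts)
corpus = [dictionary.doc2bow(text) for text in common_texts]

lda = LdaModel(corpus=corpus, id2word=dictionary, num_topics=2, passes=10, random_state=0)

# The 'c_v' measure needs the tokenized texts, not just the bag-of-words corpus.
cm = CoherenceModel(model=lda, texts=common_texts, dictionary=dictionary, coherence='c_v')
print(cm.get_coherence())
print(cm.get_coherence_per_topic())
```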

### Translation and Cross-Language Models

Models for cross-language document translation and alignment.

```python { .api }
class TranslationMatrix:
    """Translation matrix for cross-language document alignment."""

    def __init__(self, source_lang_vec, target_lang_vec, word_pairs=None, random_state=None): ...

    def translate(self, source_words, topn=5): ...
    def apply(self, docs): ...

class BackMappingTranslationMatrix:
    """Back-mapping translation matrix."""

    def __init__(self, source_lang_vec, target_lang_vec, word_pairs=None, random_state=None): ...

    def translate(self, source_words, topn=5): ...
```
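A minimal sketch of learning a linear mapping between two embedding spaces from seed word pairs. The two toy Word2Vec models stand in for source- and target-language vectors, and the identical word pairs are purely illustrative; real use requires genuinely bilingual vectors and a seed dictionary.

```python
from gensim.models import Word2Vec
from gensim.models.translation_matrix import TranslationMatrix
from gensim.test.utils import common_texts

# Two toy embedding spaces standing in for source- and target-language vectors.
source_vectors = Word2Vec(common_texts, vector_size=16, min_count=1, seed=1).wv
target_vectors = Word2Vec(common_texts, vector_size=16, min_count=1, seed=2).wv

# Seed (source word, target word) pairs used to fit the translation matrix.
word_pairs = [('human', 'human'), ('computer', 'computer'), ('graph', 'graph'), ('trees', 'trees')]

tm = TranslationMatrix(source_vectors, target_vectors, word_pairs=word_pairs)
print(tm.translate(['system', 'user'], topn=3))
```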

## Usage Examples

### Training a Word2Vec Model

```python
from gensim.models import Word2Vec
from gensim.test.utils import common_texts

# Train Word2Vec on sample data
model = Word2Vec(sentences=common_texts, vector_size=100, window=5, min_count=1, workers=4)

# Find similar words
similar_words = model.wv.most_similar('computer', topn=5)
print(similar_words)

# Get word vector
vector = model.wv['computer']
print(f"Vector shape: {vector.shape}")
```

### Training an LDA Topic Model

```python
from gensim import corpora
from gensim.models import LdaModel
from gensim.test.utils import common_texts

# Create dictionary and corpus
dictionary = corpora.Dictionary(common_texts)
corpus = [dictionary.doc2bow(text) for text in common_texts]

# Train LDA model
lda = LdaModel(corpus=corpus, id2word=dictionary, num_topics=2, passes=10)

# Print topics
topics = lda.print_topics(num_words=4)
for topic in topics:
    print(topic)
```

### Document Topic Inference

```python
# Get topic distribution for a new document (reuses dictionary and lda from the previous example)
new_doc = ['computer', 'time', 'graph']
new_doc_bow = dictionary.doc2bow(new_doc)
doc_topics = lda.get_document_topics(new_doc_bow)
print(doc_topics)
```