# Corpus Management

Comprehensive corpus I/O system supporting streaming document collections in multiple formats. Gensim's corpus infrastructure enables memory-efficient processing of datasets larger than available RAM through lazy evaluation and format-agnostic interfaces.
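
For instance, a serialized corpus can be consumed as a pure stream; a minimal sketch of the pattern, assuming a `/tmp/corpus.mm` file like the one created in the usage examples below:

```python
from gensim.corpora import MmCorpus

# Documents are read lazily from disk one at a time, so the whole
# corpus never has to fit in RAM.
corpus = MmCorpus('/tmp/corpus.mm')

# Each doc is a sparse bag-of-words: a list of (token_id, frequency) pairs.
total_tokens = sum(freq for doc in corpus for _, freq in doc)
print(total_tokens)
```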

## Capabilities

### Dictionary Management

Core vocabulary management with word-to-integer ID mappings, corpus statistics, and vocabulary filtering operations.

```python { .api }
class Dictionary:
    """Mapping between words and their integer IDs."""

    def __init__(self, documents=None, prune_at=2000000): ...

    def add_documents(self, documents, prune_at=2000000): ...
    def doc2bow(self, document, allow_update=False, return_missing=False): ...
    def filter_extremes(self, no_below=5, no_above=0.5, keep_n=100000, keep_tokens=None): ...
    def filter_n_most_frequent(self, remove_n): ...
    def filter_tokens(self, bad_ids=None, good_ids=None): ...
    def compactify(self): ...
    def save_as_text(self, fname, sort_by_word=True): ...
    def merge_with(self, other): ...
    def patch_with_special_tokens(self, special_tokens): ...
    def most_common(self, n=None): ...

    @classmethod
    def load_from_text(cls, fname): ...
    @classmethod
    def from_documents(cls, documents): ...
    @classmethod
    def from_corpus(cls, corpus, id2word=None): ...

    def __getitem__(self, tokenid): ...
    def __len__(self): ...
    def __str__(self): ...
    def keys(self): ...
    def __contains__(self, tokenid): ...

class HashDictionary:
    """Memory-efficient dictionary using hashing."""

    def __init__(self, documents=None, id_range=32000, debug=True): ...

    def add_documents(self, documents): ...
    def doc2bow(self, document, allow_update=False, return_missing=False): ...
    def filter_tokens(self, bad_ids=None, good_ids=None): ...
    def save_as_text(self, fname, sort_by_word=True): ...

    def __getitem__(self, tokenid): ...
    def __len__(self): ...
    def keys(self): ...
```
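
`HashDictionary` is not demonstrated in the usage examples below, so here is a minimal sketch; the token lists are illustrative. It maps tokens to IDs by hashing rather than bookkeeping, so no global vocabulary pass is needed, at the cost of possible collisions:

```python
from gensim.corpora import HashDictionary

docs = [["streaming", "corpus", "io"], ["corpus", "statistics"]]

# IDs come from hashing each token into a fixed id_range.
hash_dict = HashDictionary(docs, id_range=32000)
bow = hash_dict.doc2bow(["corpus", "io"])
print(bow)  # list of (hashed_id, frequency) pairs
```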

### Corpus Formats

Multiple corpus I/O formats covering common data exchange standards, for interoperability with external tools.

```python { .api }
class MmCorpus:
    """Matrix Market format corpus."""

    def __init__(self, fname): ...

    @staticmethod
    def save_corpus(fname, corpus, id2word=None, labels=None, comments=None, metadata=False): ...

    @classmethod
    def serialize(serializer, fname, corpus, id2word=None, index_fname=None, progress_cnt=None, labels=None, metadata=False): ...

    def __iter__(self): ...
    def __len__(self): ...
    def docbyoffset(self, offset): ...

class BleiCorpus:
    """David Blei's LDA-C format corpus."""

    def __init__(self, fname): ...

    @staticmethod
    def save_corpus(fname, corpus, id2word=None, metadata=False): ...

    def __iter__(self): ...
    def __len__(self): ...

class SvmLightCorpus:
    """SVMlight format corpus."""

    def __init__(self, fname, store_labels=True): ...

    @staticmethod
    def save_corpus(fname, corpus, id2word=None, labels=None, metadata=False): ...

    def __iter__(self): ...
    def __len__(self): ...

class LowCorpus:
    """GibbsLDA++ format corpus."""

    def __init__(self, fname): ...

    @staticmethod
    def save_corpus(fname, corpus, id2word=None, metadata=False): ...

    def __iter__(self): ...

class UciCorpus:
    """UCI Bag-of-Words format corpus."""

    def __init__(self, fname): ...

    @staticmethod
    def save_corpus(fname, corpus, id2word=None, metadata=False): ...

    def __iter__(self): ...
    def __len__(self): ...

class MalletCorpus:
    """Mallet format corpus."""

    def __init__(self, fname): ...

    @staticmethod
    def save_corpus(fname, corpus, id2word=None, metadata=False): ...

    def __iter__(self): ...

class OpinosisCorpus:
    """Opinosis dataset corpus format."""

    def __init__(self, fname): ...

    def __iter__(self): ...
    def __len__(self): ...
```
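
Because every format class shares the same `save_corpus` / iteration contract, converting between formats is just loading with one class and saving with another. A sketch, assuming the `/tmp/corpus.mm` file and `dictionary` from the usage examples below:

```python
from gensim.corpora import BleiCorpus, MmCorpus

# Stream documents out of Matrix Market and re-save them in LDA-C
# format; documents pass through one at a time, so memory stays flat.
mm = MmCorpus('/tmp/corpus.mm')
BleiCorpus.save_corpus('/tmp/corpus.lda-c', mm, id2word=dictionary)
```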

### Text Corpus Processing

Specialized corpus classes for processing text documents with built-in preprocessing and tokenization.

```python { .api }
class TextCorpus:
    """Generic text corpus with preprocessing."""

    def __init__(
        self,
        input=None,
        dictionary=None,
        metadata=False,
        character_filters=None,
        tokenizer=None,
        token_filters=None
    ): ...

    def preprocess_text(self, text): ...
    def sample_texts(self, n, seed=None, length_range=(10, 500)): ...
    def __iter__(self): ...
    def __len__(self): ...
    def getstream(self): ...

class TextDirectoryCorpus(TextCorpus):
    """Corpus from directory of text files."""

    def __init__(
        self,
        input,
        dictionary=None,
        metadata=False,
        min_depth=0,
        max_depth=None,
        pattern=None,
        exclude_pattern=None,
        lines_are_documents=False,
        **kwargs
    ): ...

    def iter_filepaths(self): ...
```
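
The `character_filters`, `tokenizer`, and `token_filters` hooks let you swap in custom preprocessing. A hedged sketch, assuming an illustrative input path; each hook is a callable (or list of callables) applied in order:

```python
from gensim.corpora import TextCorpus

def keep_long_tokens(tokens):
    # token_filters receive and return the full token list
    return [t for t in tokens if len(t) > 2]

corpus = TextCorpus(
    input='/path/to/corpus.txt',       # illustrative path
    character_filters=[str.lower],     # applied to the raw text, in order
    tokenizer=str.split,               # plain whitespace tokenization
    token_filters=[keep_long_tokens],  # applied to the token list
)
print(corpus.preprocess_text("Streaming Corpus IO"))  # ['streaming', 'corpus']
```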

### Specialized Corpus Types

Domain-specific corpus processors for particular data sources like Wikipedia.

```python { .api }
class WikiCorpus:
    """Wikipedia dump corpus processor."""

    def __init__(
        self,
        fname,
        processes=None,
        lemmatize=True,
        dictionary=None,
        filter_namespaces=('0',),
        tokenizer_func=tokenize,
        article_min_tokens=50,
        token_min_len=2,
        token_max_len=15,
        lower=True
    ): ...

    def get_texts(self): ...
    def extract_pages(self, out, compress=True): ...

    def __iter__(self): ...
    def __len__(self): ...

class IndexedCorpus:
    """Base class for indexed corpora with random access."""

    def __init__(self, fname, index_fname=None): ...

    def __getitem__(self, docno): ...
    def __iter__(self): ...
    def __len__(self): ...
    def save(self, fname_or_handle, separately=None, sep_limit=10485760, ignore=frozenset(), pickle_protocol=2): ...

    @classmethod
    def load(cls, fname, mmap=None): ...
```
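
Random access needs the side index that `serialize` writes next to the corpus file. A sketch, reusing the `corpus` and `dictionary` objects from the usage examples below:

```python
from gensim.corpora import MmCorpus

# serialize() stores the corpus plus a byte-offset index, enabling
# IndexedCorpus-style random access by document number.
MmCorpus.serialize('/tmp/indexed.mm', corpus, id2word=dictionary)

mm = MmCorpus('/tmp/indexed.mm')
print(mm[0])  # jump straight to document 0 without scanning
```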

## Usage Examples

### Creating and Using Dictionaries

```python
from gensim import corpora
from gensim.test.utils import common_texts

# Create dictionary from documents
dictionary = corpora.Dictionary(common_texts)
print(f"Dictionary size: {len(dictionary)}")

# Convert documents to bag-of-words
corpus = [dictionary.doc2bow(text) for text in common_texts]
print(f"Corpus: {corpus[0]}")  # Show first document

# Filter extremes (note: this compacts the dictionary and reassigns
# token IDs, so re-run doc2bow afterwards if you need a consistent corpus)
dictionary.filter_extremes(no_below=2, no_above=0.8)

# Save and load dictionary
dictionary.save('/tmp/dictionary.dict')
loaded_dict = corpora.Dictionary.load('/tmp/dictionary.dict')
```
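
`doc2bow` can also report out-of-vocabulary tokens instead of silently dropping them; a small sketch using the dictionary above (the unknown token is illustrative):

```python
# Tokens absent from the dictionary come back separately with counts
bow, missing = dictionary.doc2bow(['human', 'computer', 'qwerty'],
                                  return_missing=True)
print(bow)      # (token_id, frequency) pairs for known tokens
print(missing)  # {'qwerty': 1}
```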

### Working with Different Corpus Formats

```python
from gensim.corpora import MmCorpus, SvmLightCorpus

# Save corpus in Matrix Market format
MmCorpus.save_corpus('/tmp/corpus.mm', corpus, id2word=dictionary)

# Load corpus
mm_corpus = MmCorpus('/tmp/corpus.mm')
print(f"Corpus length: {len(mm_corpus)}")

# Convert to SVMlight format
SvmLightCorpus.save_corpus('/tmp/corpus.svmlight', corpus, id2word=dictionary)
svm_corpus = SvmLightCorpus('/tmp/corpus.svmlight')

# Iterate over documents
for doc in mm_corpus:
    print(doc)
    break  # Just show first document
```

### Processing Text Directories

```python
from gensim.corpora import TextDirectoryCorpus

# Create corpus from text files in a directory
text_corpus = TextDirectoryCorpus('/path/to/text/files', min_depth=1)

# The dictionary is built automatically when the corpus is created
dictionary = text_corpus.dictionary

# Convert to bag-of-words
bow_corpus = [dictionary.doc2bow(doc) for doc in text_corpus.get_texts()]
```
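
File selection can be narrowed with `pattern` / `exclude_pattern`, and `lines_are_documents=True` treats each line of a file as its own document; a sketch with an illustrative directory:

```python
# Only match .txt files, treating each line as a separate document
line_corpus = TextDirectoryCorpus(
    '/path/to/text/files',
    pattern=r'.*\.txt$',
    lines_are_documents=True,
)
print(list(line_corpus.iter_filepaths()))  # which files were matched
```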

### Working with Wikipedia Dumps

```python
from gensim.corpora import WikiCorpus

# Process Wikipedia dump
wiki_corpus = WikiCorpus('/path/to/wikipedia/dump.xml.bz2',
                         lemmatize=True,
                         processes=4)

# Extract articles as text
wiki_corpus.extract_pages('/tmp/wiki_articles', compress=True)

# Access the dictionary built while scanning the dump
dictionary = wiki_corpus.dictionary

# Convert to bag-of-words
bow_corpus = [dictionary.doc2bow(article) for article in wiki_corpus.get_texts()]
```
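
For large dumps it is common to persist both the dictionary and the bag-of-words stream so the expensive extraction only runs once. A sketch building on the objects above (paths are illustrative); the generator keeps memory use flat:

```python
from gensim.corpora import MmCorpus

dictionary.save('/tmp/wiki.dict')
MmCorpus.serialize(
    '/tmp/wiki_bow.mm',
    (dictionary.doc2bow(article) for article in wiki_corpus.get_texts()),
    id2word=dictionary,
)
```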

### Dictionary Filtering and Manipulation

```python
# Filter extremes: remove words that appear in fewer than 5 documents
# or in more than 50% of documents
dictionary.filter_extremes(no_below=5, no_above=0.5)

# Keep only the 10000 most frequent words
dictionary.filter_extremes(keep_n=10000)

# Remove the n most frequent words (filter_n_most_frequent removes
# the top n, it does not keep them)
dictionary.filter_n_most_frequent(20)

# Merge dictionaries
other_dict = corpora.Dictionary(other_documents)
dictionary.merge_with(other_dict)

# Get word frequencies
word_freq = dictionary.cfs
most_common = dictionary.most_common(10)
print(f"Most common words: {most_common}")

# Check if a word exists
if 'computer' in dictionary.token2id:
    word_id = dictionary.token2id['computer']
    print(f"'computer' has ID: {word_id}")
```
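
`patch_with_special_tokens` pins chosen tokens to specific IDs, e.g. to reserve ID 0 for padding in sequence models; a brief sketch (the token strings are illustrative):

```python
# Force '[PAD]' to ID 0; any token currently holding that ID is
# reassigned so the mapping stays consistent.
dictionary.patch_with_special_tokens({'[PAD]': 0})
print(dictionary[0])  # '[PAD]'
```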

### Corpus Statistics and Analysis

```python
# Get corpus statistics
num_docs = len(corpus)
num_tokens = sum(sum(freq for _, freq in doc) for doc in corpus)
print(f"Corpus: {num_docs} documents, {num_tokens} tokens")

# Get document lengths
doc_lengths = [sum(freq for _, freq in doc) for doc in corpus]
avg_length = sum(doc_lengths) / len(doc_lengths)
print(f"Average document length: {avg_length:.2f} tokens")

# Find sparse documents
sparse_docs = [i for i, doc in enumerate(corpus) if len(doc) < 10]
print(f"Sparse documents (< 10 unique tokens): {len(sparse_docs)}")
```
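
The `Dictionary` itself also accumulates corpus-level counters while it is built, which complement the per-document statistics above:

```python
# Document frequencies: in how many documents each token ID appears
print(dictionary.dfs)

# Totals gathered during add_documents / doc2bow(allow_update=True)
print(dictionary.num_docs)  # number of documents processed
print(dictionary.num_pos)   # total number of corpus positions (tokens)
print(dictionary.num_nnz)   # total number of non-zero BOW entries
```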