# Corpus Management

A comprehensive corpus I/O system for streaming document collections in multiple formats. Gensim's corpus infrastructure enables memory-efficient processing of datasets larger than available RAM through lazy evaluation and format-agnostic interfaces.
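
The contract behind that memory efficiency is simple: a corpus is any object whose `__iter__` yields documents as bag-of-words vectors (lists of `(token_id, count)` pairs), so documents are produced on demand rather than held in RAM. A minimal sketch, assuming a placeholder file path with one tokenized document per line:

```python
class StreamingCorpus:
    """Stream bag-of-words vectors from disk, one document at a time."""

    def __init__(self, path, dictionary):
        self.path = path              # placeholder: one document per line
        self.dictionary = dictionary  # a gensim.corpora.Dictionary

    def __iter__(self):
        with open(self.path, encoding='utf-8') as fin:
            for line in fin:
                # Lazily convert each line to a bag-of-words vector;
                # the full collection is never materialized in memory
                yield self.dictionary.doc2bow(line.lower().split())
```

Any Gensim component that accepts a corpus can consume such an object directly, and iteration can be repeated as many times as needed.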

## Capabilities

### Dictionary Management

Core vocabulary management with word-to-integer ID mappings, corpus statistics, and vocabulary filtering operations.

```python { .api }
class Dictionary:
    """Mapping between words and their integer IDs."""

    def __init__(self, documents=None, prune_at=2000000): ...

    def add_documents(self, documents, prune_at=2000000): ...
    def doc2bow(self, document, allow_update=False, return_missing=False): ...
    def filter_extremes(self, no_below=5, no_above=0.5, keep_n=100000, keep_tokens=None): ...
    def filter_n_most_frequent(self, remove_n): ...
    def filter_tokens(self, bad_ids=None, good_ids=None): ...
    def compactify(self, sort_by_word=True): ...
    def save_as_text(self, fname, sort_by_word=True): ...
    def merge_with(self, other): ...
    def patch_with_special_tokens(self, special_tokens): ...
    def most_common(self, n=None): ...

    @classmethod
    def load_from_text(cls, fname): ...
    @classmethod
    def from_documents(cls, documents): ...
    @classmethod
    def from_corpus(cls, corpus, id2word=None): ...

    def __getitem__(self, tokenid): ...
    def __len__(self): ...
    def __str__(self): ...
    def keys(self): ...
    def __contains__(self, tokenid): ...

class HashDictionary:
    """Memory-efficient dictionary using hashing."""

    def __init__(self, documents=None, id_range=32000, debug=True): ...

    def add_documents(self, documents): ...
    def doc2bow(self, document, allow_update=False, return_missing=False): ...
    def filter_tokens(self, bad_ids=None, good_ids=None): ...
    def save_as_text(self, fname, sort_by_word=True): ...

    def __getitem__(self, tokenid): ...
    def __len__(self): ...
    def keys(self): ...
```
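
`HashDictionary` trades exactness for bounded memory: IDs come from hashing tokens into a fixed range rather than from a learned vocabulary, so no initial pass over the data is needed, but distinct tokens may collide on one ID. A short sketch using Gensim's bundled toy texts:

```python
from gensim.corpora import HashDictionary
from gensim.test.utils import common_texts

# IDs are computed by hashing into [0, id_range), so the mapping is
# fixed up front and new documents never grow the vocabulary
hash_dict = HashDictionary(common_texts, id_range=32000)

bow = hash_dict.doc2bow(common_texts[0])
print(bow)  # [(token_id, count), ...] -- ids may collide across tokens
```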

### Corpus Formats

Multiple corpus I/O formats for different data exchange standards and compatibility with external tools.

```python { .api }
class MmCorpus:
    """Matrix Market format corpus."""

    def __init__(self, fname): ...

    @staticmethod
    def save_corpus(fname, corpus, id2word=None, labels=None, comments=None, metadata=False): ...
    @classmethod
    def serialize(serializer, fname, corpus, id2word=None, index_fname=None, progress_cnt=None, labels=None, metadata=False): ...

    def __iter__(self): ...
    def __len__(self): ...
    def docbyoffset(self, offset): ...

class BleiCorpus:
    """David Blei's LDA-C format corpus."""

    def __init__(self, fname): ...

    @staticmethod
    def save_corpus(fname, corpus, id2word=None, metadata=False): ...

    def __iter__(self): ...
    def __len__(self): ...

class SvmLightCorpus:
    """SVMlight format corpus."""

    def __init__(self, fname, store_labels=True): ...

    @staticmethod
    def save_corpus(fname, corpus, id2word=None, labels=None, metadata=False): ...

    def __iter__(self): ...
    def __len__(self): ...

class LowCorpus:
    """GibbsLDA++ format corpus."""

    def __init__(self, fname): ...

    @staticmethod
    def save_corpus(fname, corpus, id2word=None, metadata=False): ...

    def __iter__(self): ...

class UciCorpus:
    """UCI Bag-of-Words format corpus."""

    def __init__(self, fname): ...

    @staticmethod
    def save_corpus(fname, corpus, id2word=None, metadata=False): ...

    def __iter__(self): ...
    def __len__(self): ...

class MalletCorpus:
    """Mallet format corpus."""

    def __init__(self, fname): ...

    @staticmethod
    def save_corpus(fname, corpus, id2word=None, metadata=False): ...

    def __iter__(self): ...

class OpinosisCorpus:
    """Opinosis dataset corpus format."""

    def __init__(self, fname): ...

    def __iter__(self): ...
    def __len__(self): ...
```
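
Because every format class reads and writes the same streamed bag-of-words interface, converting between formats is a single lazy pass with no format-specific translation code. For example, re-saving a Matrix Market corpus in Blei's LDA-C format (paths are placeholders):

```python
from gensim.corpora import MmCorpus, BleiCorpus

mm = MmCorpus('/tmp/corpus.mm')                  # streamed reader, loads nothing up front
BleiCorpus.save_corpus('/tmp/corpus.lda-c', mm)  # documents copied one at a time
```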

### Text Corpus Processing

Specialized corpus classes for processing text documents with built-in preprocessing and tokenization.

```python { .api }
class TextCorpus:
    """Generic text corpus with preprocessing."""

    def __init__(
        self,
        input=None,
        dictionary=None,
        metadata=False,
        character_filters=None,
        tokenizer=None,
        token_filters=None
    ): ...

    def preprocess_text(self, text): ...
    def sample_texts(self, n, seed=None, length_range=(10, 500)): ...
    def __iter__(self): ...
    def __len__(self): ...
    def getstream(self): ...

class TextDirectoryCorpus(TextCorpus):
    """Corpus from directory of text files."""

    def __init__(
        self,
        input,
        dictionary=None,
        metadata=False,
        min_depth=0,
        max_depth=None,
        pattern=None,
        exclude_pattern=None,
        lines_are_documents=False,
        **kwargs
    ): ...

    def iter_filepaths(self): ...
```
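
The preprocessing pipeline is pluggable: each `character_filters` callable maps string to string, `tokenizer` splits the filtered string into tokens, and each `token_filters` callable maps token list to token list; passing any of these replaces the corresponding defaults. A sketch with custom filters, assuming a placeholder input path with one document per line:

```python
from gensim.corpora.textcorpus import TextCorpus

def strip_digits(text):
    # character filter: str -> str
    return ''.join(ch for ch in text if not ch.isdigit())

def keep_long_tokens(tokens):
    # token filter: list[str] -> list[str]
    return [t for t in tokens if len(t) > 2]

corpus = TextCorpus(
    input='/path/to/docs.txt',
    character_filters=[str.lower, strip_digits],
    tokenizer=str.split,
    token_filters=[keep_long_tokens],
)

for tokens in corpus.get_texts():
    print(tokens)  # preprocessed token list for the first document
    break
```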

### Specialized Corpus Types

Domain-specific corpus processors for particular data sources like Wikipedia.

```python { .api }
class WikiCorpus:
    """Wikipedia dump corpus processor."""

    def __init__(
        self,
        fname,
        processes=None,
        lemmatize=True,
        dictionary=None,
        filter_namespaces=('0',),
        tokenizer_func=tokenize,
        article_min_tokens=50,
        token_min_len=2,
        token_max_len=15,
        lower=True
    ): ...

    def get_texts(self): ...
    def extract_pages(self, out, compress=True): ...

    def __iter__(self): ...
    def __len__(self): ...

class IndexedCorpus:
    """Base class for indexed corpora with random access."""

    def __init__(self, fname, index_fname=None): ...

    def __getitem__(self, docno): ...
    def __iter__(self): ...
    def __len__(self): ...
    def save(self, fname_or_handle, separately=None, sep_limit=10485760, ignore=frozenset(), pickle_protocol=2): ...

    @classmethod
    def load(cls, fname, mmap=None): ...
```
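
`IndexedCorpus` is what `serialize` builds on: alongside the data file it writes an index of byte offsets, which is what makes `corpus[docno]` random access possible on an otherwise streamed format. A sketch, assuming `corpus` is any iterable of bag-of-words documents:

```python
from gensim.corpora import MmCorpus

# serialize() writes /tmp/indexed.mm plus /tmp/indexed.mm.index
MmCorpus.serialize('/tmp/indexed.mm', corpus)

mm = MmCorpus('/tmp/indexed.mm')
print(len(mm))  # document count
print(mm[3])    # jumps straight to document 3 via the stored offset
```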

## Usage Examples

### Creating and Using Dictionaries

```python
from gensim import corpora
from gensim.test.utils import common_texts

# Create dictionary from documents
dictionary = corpora.Dictionary(common_texts)
print(f"Dictionary size: {len(dictionary)}")

# Convert documents to bag-of-words
corpus = [dictionary.doc2bow(text) for text in common_texts]
print(f"Corpus: {corpus[0]}")  # Show first document

# Filter extremes
dictionary.filter_extremes(no_below=2, no_above=0.8)

# Save and load dictionary
dictionary.save('/tmp/dictionary.dict')
loaded_dict = corpora.Dictionary.load('/tmp/dictionary.dict')
```

### Working with Different Corpus Formats

```python
from gensim.corpora import MmCorpus, SvmLightCorpus

# Save corpus in Matrix Market format
MmCorpus.save_corpus('/tmp/corpus.mm', corpus, id2word=dictionary)

# Load corpus
mm_corpus = MmCorpus('/tmp/corpus.mm')
print(f"Corpus length: {len(mm_corpus)}")

# Convert to SVMlight format
SvmLightCorpus.save_corpus('/tmp/corpus.svmlight', corpus, id2word=dictionary)
svm_corpus = SvmLightCorpus('/tmp/corpus.svmlight')

# Iterate over documents
for doc in mm_corpus:
    print(doc)
    break  # Just show first document
```

### Processing Text Directories

```python
from gensim.corpora import TextDirectoryCorpus

# Create corpus from text files in directory
text_corpus = TextDirectoryCorpus('/path/to/text/files', min_depth=1)

# The dictionary is built automatically while the corpus is scanned
dictionary = text_corpus.dictionary

# Convert to bag-of-words
bow_corpus = [dictionary.doc2bow(doc) for doc in text_corpus.get_texts()]
```

### Working with Wikipedia Dumps

```python
from gensim.corpora import WikiCorpus

# Process Wikipedia dump (lemmatization requires the optional `pattern` package)
wiki_corpus = WikiCorpus('/path/to/wikipedia/dump.xml.bz2',
                         lemmatize=True,
                         processes=4)

# Extract articles as text
wiki_corpus.extract_pages('/tmp/wiki_articles', compress=True)

# The dictionary is built automatically while the dump is parsed
dictionary = wiki_corpus.dictionary

# Convert to bag-of-words
bow_corpus = [dictionary.doc2bow(article) for article in wiki_corpus.get_texts()]
```

### Dictionary Filtering and Manipulation

```python
# Filter extremes: remove words that appear in fewer than 5 documents
# or in more than 50% of documents
dictionary.filter_extremes(no_below=5, no_above=0.5)

# Remove the 10000 most frequent words (filter_n_most_frequent drops
# the top-n tokens, it does not keep them)
dictionary.filter_n_most_frequent(10000)

# Merge dictionaries (assumes `other_documents` is another tokenized collection)
other_dict = corpora.Dictionary(other_documents)
dictionary.merge_with(other_dict)

# Get word frequencies
word_freq = dictionary.cfs  # token id -> total count across the corpus
most_common = dictionary.most_common(10)
print(f"Most common words: {most_common}")

# Check if word exists
if 'computer' in dictionary.token2id:
    word_id = dictionary.token2id['computer']
    print(f"'computer' has ID: {word_id}")
```

### Corpus Statistics and Analysis

```python
# Get corpus statistics
num_docs = len(corpus)
num_tokens = sum(sum(freq for _, freq in doc) for doc in corpus)
print(f"Corpus: {num_docs} documents, {num_tokens} tokens")

# Get document lengths
doc_lengths = [sum(freq for _, freq in doc) for doc in corpus]
avg_length = sum(doc_lengths) / len(doc_lengths)
print(f"Average document length: {avg_length:.2f} tokens")

# Find sparse documents
sparse_docs = [i for i, doc in enumerate(corpus) if len(doc) < 10]
print(f"Sparse documents (< 10 unique tokens): {len(sparse_docs)}")
```