# Corpus Management

Comprehensive corpus I/O system supporting streaming document collections in multiple formats. Gensim's corpus infrastructure enables memory-efficient processing of datasets larger than available RAM through lazy evaluation and format-agnostic interfaces.
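
For instance, a serialized corpus can be consumed as a pure stream; a minimal sketch of the pattern, assuming a `/tmp/corpus.mm` file like the one created in the usage examples below:

```python
from gensim.corpora import MmCorpus

# Documents are read lazily from disk one at a time, so the whole
# corpus never has to fit in RAM.
corpus = MmCorpus('/tmp/corpus.mm')

# Each doc is a sparse bag-of-words: a list of (token_id, frequency) pairs.
total_tokens = sum(freq for doc in corpus for _, freq in doc)
print(total_tokens)
```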

## Capabilities

### Dictionary Management

Core vocabulary management with word-to-integer ID mappings, corpus statistics, and vocabulary filtering operations.

```python { .api }
class Dictionary:
    """Mapping between words and their integer IDs."""

    def __init__(self, documents=None, prune_at=2000000): ...

    def add_documents(self, documents, prune_at=2000000): ...
    def doc2bow(self, document, allow_update=False, return_missing=False): ...
    def filter_extremes(self, no_below=5, no_above=0.5, keep_n=100000, keep_tokens=None): ...
    def filter_n_most_frequent(self, remove_n): ...
    def filter_tokens(self, bad_ids=None, good_ids=None): ...
    def compactify(self): ...
    def save_as_text(self, fname, sort_by_word=True): ...
    def merge_with(self, other): ...
    def patch_with_special_tokens(self, special_tokens): ...
    def most_common(self, n=None): ...

    @classmethod
    def load_from_text(cls, fname): ...
    @classmethod
    def from_documents(cls, documents): ...
    @classmethod
    def from_corpus(cls, corpus, id2word=None): ...

    def __getitem__(self, tokenid): ...
    def __len__(self): ...
    def __str__(self): ...
    def keys(self): ...
    def __contains__(self, tokenid): ...

class HashDictionary:
    """Memory-efficient dictionary using hashing."""

    def __init__(self, documents=None, id_range=32000, debug=True): ...

    def add_documents(self, documents): ...
    def doc2bow(self, document, allow_update=False, return_missing=False): ...
    def filter_tokens(self, bad_ids=None, good_ids=None): ...
    def save_as_text(self, fname, sort_by_word=True): ...

    def __getitem__(self, tokenid): ...
    def __len__(self): ...
    def keys(self): ...
```
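
`HashDictionary` is not demonstrated in the usage examples below, so here is a minimal sketch; the token lists are illustrative. It maps tokens to IDs by hashing rather than bookkeeping, so no global vocabulary pass is needed, at the cost of possible collisions:

```python
from gensim.corpora import HashDictionary

docs = [["streaming", "corpus", "io"], ["corpus", "statistics"]]

# IDs come from hashing each token into a fixed id_range.
hash_dict = HashDictionary(docs, id_range=32000)
bow = hash_dict.doc2bow(["corpus", "io"])
print(bow)  # list of (hashed_id, frequency) pairs
```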

### Corpus Formats

Multiple corpus I/O formats covering common data exchange standards, for interoperability with external tools.

```python { .api }
class MmCorpus:
    """Matrix Market format corpus."""

    def __init__(self, fname): ...

    @staticmethod
    def save_corpus(fname, corpus, id2word=None, labels=None, comments=None, metadata=False): ...

    @classmethod
    def serialize(serializer, fname, corpus, id2word=None, index_fname=None, progress_cnt=None, labels=None, metadata=False): ...

    def __iter__(self): ...
    def __len__(self): ...
    def docbyoffset(self, offset): ...

class BleiCorpus:
    """David Blei's LDA-C format corpus."""

    def __init__(self, fname): ...

    @staticmethod
    def save_corpus(fname, corpus, id2word=None, metadata=False): ...

    def __iter__(self): ...
    def __len__(self): ...

class SvmLightCorpus:
    """SVMlight format corpus."""

    def __init__(self, fname, store_labels=True): ...

    @staticmethod
    def save_corpus(fname, corpus, id2word=None, labels=None, metadata=False): ...

    def __iter__(self): ...
    def __len__(self): ...

class LowCorpus:
    """GibbsLDA++ format corpus."""

    def __init__(self, fname): ...

    @staticmethod
    def save_corpus(fname, corpus, id2word=None, metadata=False): ...

    def __iter__(self): ...

class UciCorpus:
    """UCI Bag-of-Words format corpus."""

    def __init__(self, fname): ...

    @staticmethod
    def save_corpus(fname, corpus, id2word=None, metadata=False): ...

    def __iter__(self): ...
    def __len__(self): ...

class MalletCorpus:
    """Mallet format corpus."""

    def __init__(self, fname): ...

    @staticmethod
    def save_corpus(fname, corpus, id2word=None, metadata=False): ...

    def __iter__(self): ...

class OpinosisCorpus:
    """Opinosis dataset corpus format."""

    def __init__(self, fname): ...

    def __iter__(self): ...
    def __len__(self): ...
```
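
Because every format class shares the same `save_corpus` / iteration contract, converting between formats is just loading with one class and saving with another. A sketch, assuming the `/tmp/corpus.mm` file and `dictionary` from the usage examples below:

```python
from gensim.corpora import BleiCorpus, MmCorpus

# Stream documents out of Matrix Market and re-save them in LDA-C
# format; documents pass through one at a time, so memory stays flat.
mm = MmCorpus('/tmp/corpus.mm')
BleiCorpus.save_corpus('/tmp/corpus.lda-c', mm, id2word=dictionary)
```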

### Text Corpus Processing

Specialized corpus classes for processing text documents with built-in preprocessing and tokenization.

```python { .api }
class TextCorpus:
    """Generic text corpus with preprocessing."""

    def __init__(
        self,
        input=None,
        dictionary=None,
        metadata=False,
        character_filters=None,
        tokenizer=None,
        token_filters=None
    ): ...

    def preprocess_text(self, text): ...
    def sample_texts(self, n, seed=None, length_range=(10, 500)): ...
    def __iter__(self): ...
    def __len__(self): ...
    def getstream(self): ...

class TextDirectoryCorpus(TextCorpus):
    """Corpus from directory of text files."""

    def __init__(
        self,
        input,
        dictionary=None,
        metadata=False,
        min_depth=0,
        max_depth=None,
        pattern=None,
        exclude_pattern=None,
        lines_are_documents=False,
        **kwargs
    ): ...

    def iter_filepaths(self): ...
```
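
The `character_filters`, `tokenizer`, and `token_filters` hooks let you swap in custom preprocessing. A hedged sketch, assuming an illustrative input path; each hook is a callable (or list of callables) applied in order:

```python
from gensim.corpora import TextCorpus

def keep_long_tokens(tokens):
    # token_filters receive and return the full token list
    return [t for t in tokens if len(t) > 2]

corpus = TextCorpus(
    input='/path/to/corpus.txt',       # illustrative path
    character_filters=[str.lower],     # applied to the raw text, in order
    tokenizer=str.split,               # plain whitespace tokenization
    token_filters=[keep_long_tokens],  # applied to the token list
)
print(corpus.preprocess_text("Streaming Corpus IO"))  # ['streaming', 'corpus']
```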

### Specialized Corpus Types

Domain-specific corpus processors for particular data sources like Wikipedia.

```python { .api }
class WikiCorpus:
    """Wikipedia dump corpus processor."""

    def __init__(
        self,
        fname,
        processes=None,
        lemmatize=True,
        dictionary=None,
        filter_namespaces=('0',),
        tokenizer_func=tokenize,
        article_min_tokens=50,
        token_min_len=2,
        token_max_len=15,
        lower=True
    ): ...

    def get_texts(self): ...
    def extract_pages(self, out, compress=True): ...

    def __iter__(self): ...
    def __len__(self): ...

class IndexedCorpus:
    """Base class for indexed corpora with random access."""

    def __init__(self, fname, index_fname=None): ...

    def __getitem__(self, docno): ...
    def __iter__(self): ...
    def __len__(self): ...
    def save(self, fname_or_handle, separately=None, sep_limit=10485760, ignore=frozenset(), pickle_protocol=2): ...

    @classmethod
    def load(cls, fname, mmap=None): ...
```
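
Random access needs the side index that `serialize` writes next to the corpus file. A sketch, reusing the `corpus` and `dictionary` objects from the usage examples below:

```python
from gensim.corpora import MmCorpus

# serialize() stores the corpus plus a byte-offset index, enabling
# IndexedCorpus-style random access by document number.
MmCorpus.serialize('/tmp/indexed.mm', corpus, id2word=dictionary)

mm = MmCorpus('/tmp/indexed.mm')
print(mm[0])  # jump straight to document 0 without scanning
```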

## Usage Examples

### Creating and Using Dictionaries

```python
from gensim import corpora
from gensim.test.utils import common_texts

# Create dictionary from documents
dictionary = corpora.Dictionary(common_texts)
print(f"Dictionary size: {len(dictionary)}")

# Convert documents to bag-of-words
corpus = [dictionary.doc2bow(text) for text in common_texts]
print(f"Corpus: {corpus[0]}")  # Show first document

# Filter extremes (note: this compacts the dictionary and reassigns
# token IDs, so re-run doc2bow afterwards if you need a consistent corpus)
dictionary.filter_extremes(no_below=2, no_above=0.8)

# Save and load dictionary
dictionary.save('/tmp/dictionary.dict')
loaded_dict = corpora.Dictionary.load('/tmp/dictionary.dict')
```
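
`doc2bow` can also report out-of-vocabulary tokens instead of silently dropping them; a small sketch using the dictionary above (the unknown token is illustrative):

```python
# Tokens absent from the dictionary come back separately with counts
bow, missing = dictionary.doc2bow(['human', 'computer', 'qwerty'],
                                  return_missing=True)
print(bow)      # (token_id, frequency) pairs for known tokens
print(missing)  # {'qwerty': 1}
```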

### Working with Different Corpus Formats

```python
from gensim.corpora import MmCorpus, SvmLightCorpus

# Save corpus in Matrix Market format
MmCorpus.save_corpus('/tmp/corpus.mm', corpus, id2word=dictionary)

# Load corpus
mm_corpus = MmCorpus('/tmp/corpus.mm')
print(f"Corpus length: {len(mm_corpus)}")

# Convert to SVMlight format
SvmLightCorpus.save_corpus('/tmp/corpus.svmlight', corpus, id2word=dictionary)
svm_corpus = SvmLightCorpus('/tmp/corpus.svmlight')

# Iterate over documents
for doc in mm_corpus:
    print(doc)
    break  # Just show first document
```

### Processing Text Directories

```python
from gensim.corpora import TextDirectoryCorpus

# Create corpus from text files in a directory
text_corpus = TextDirectoryCorpus('/path/to/text/files', min_depth=1)

# The dictionary is built automatically when the corpus is created
dictionary = text_corpus.dictionary

# Convert to bag-of-words
bow_corpus = [dictionary.doc2bow(doc) for doc in text_corpus.get_texts()]
```
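
File selection can be narrowed with `pattern` / `exclude_pattern`, and `lines_are_documents=True` treats each line of a file as its own document; a sketch with an illustrative directory:

```python
# Only match .txt files, treating each line as a separate document
line_corpus = TextDirectoryCorpus(
    '/path/to/text/files',
    pattern=r'.*\.txt$',
    lines_are_documents=True,
)
print(list(line_corpus.iter_filepaths()))  # which files were matched
```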

### Working with Wikipedia Dumps

```python
from gensim.corpora import WikiCorpus

# Process Wikipedia dump
wiki_corpus = WikiCorpus('/path/to/wikipedia/dump.xml.bz2',
                         lemmatize=True,
                         processes=4)

# Extract articles as text
wiki_corpus.extract_pages('/tmp/wiki_articles', compress=True)

# Access the dictionary built while scanning the dump
dictionary = wiki_corpus.dictionary

# Convert to bag-of-words
bow_corpus = [dictionary.doc2bow(article) for article in wiki_corpus.get_texts()]
```
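
For large dumps it is common to persist both the dictionary and the bag-of-words stream so the expensive extraction only runs once. A sketch building on the objects above (paths are illustrative); the generator keeps memory use flat:

```python
from gensim.corpora import MmCorpus

dictionary.save('/tmp/wiki.dict')
MmCorpus.serialize(
    '/tmp/wiki_bow.mm',
    (dictionary.doc2bow(article) for article in wiki_corpus.get_texts()),
    id2word=dictionary,
)
```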

### Dictionary Filtering and Manipulation

```python
# Filter extremes: remove words that appear in fewer than 5 documents
# or in more than 50% of documents
dictionary.filter_extremes(no_below=5, no_above=0.5)

# Keep only the 10000 most frequent words
dictionary.filter_extremes(keep_n=10000)

# Remove the n most frequent words (filter_n_most_frequent removes
# the top n, it does not keep them)
dictionary.filter_n_most_frequent(20)

# Merge dictionaries
other_dict = corpora.Dictionary(other_documents)
dictionary.merge_with(other_dict)

# Get word frequencies
word_freq = dictionary.cfs
most_common = dictionary.most_common(10)
print(f"Most common words: {most_common}")

# Check if a word exists
if 'computer' in dictionary.token2id:
    word_id = dictionary.token2id['computer']
    print(f"'computer' has ID: {word_id}")
```
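
`patch_with_special_tokens` pins chosen tokens to specific IDs, e.g. to reserve ID 0 for padding in sequence models; a brief sketch (the token strings are illustrative):

```python
# Force '[PAD]' to ID 0; any token currently holding that ID is
# reassigned so the mapping stays consistent.
dictionary.patch_with_special_tokens({'[PAD]': 0})
print(dictionary[0])  # '[PAD]'
```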

### Corpus Statistics and Analysis

```python
# Get corpus statistics
num_docs = len(corpus)
num_tokens = sum(sum(freq for _, freq in doc) for doc in corpus)
print(f"Corpus: {num_docs} documents, {num_tokens} tokens")

# Get document lengths
doc_lengths = [sum(freq for _, freq in doc) for doc in corpus]
avg_length = sum(doc_lengths) / len(doc_lengths)
print(f"Average document length: {avg_length:.2f} tokens")

# Find sparse documents
sparse_docs = [i for i, doc in enumerate(corpus) if len(doc) < 10]
print(f"Sparse documents (< 10 unique tokens): {len(sparse_docs)}")
```
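
The `Dictionary` itself also accumulates corpus-level counters while it is built, which complement the per-document statistics above:

```python
# Document frequencies: in how many documents each token ID appears
print(dictionary.dfs)

# Totals gathered during add_documents / doc2bow(allow_update=True)
print(dictionary.num_docs)  # number of documents processed
print(dictionary.num_pos)   # total number of corpus positions (tokens)
print(dictionary.num_nnz)   # total number of non-zero BOW entries
```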