# Data Downloading

Convenient API for downloading pre-trained models and datasets, including Word2Vec, GloVe, and FastText models as well as text corpora. The downloader handles caching, version management, and integrity verification automatically.

## Capabilities

### Core Download Functions

Primary functions for downloading and loading models and datasets from the gensim-data repository.

```python { .api }
def load(name: str, return_path: bool = False):
    """
    Download and load a model or dataset.

    Parameters:
    - name: Name of the model or dataset to load
    - return_path: If True, return file path instead of loaded object

    Returns:
    Loaded model/dataset object or file path

    Raises:
    Exception: If model/dataset not found or download fails
    """

def info(name: str = None, show_only_latest: bool = True, name_only: bool = False):
    """
    Get information about available models and datasets.

    Parameters:
    - name: Specific model/dataset name (optional)
    - show_only_latest: If True, hide outdated versions (only when name is None)
    - name_only: If True, return only names of available models and corpora

    Returns:
    Dictionary with model/dataset information.
    If name is None, returns info about all available items.
    If name is provided, returns detailed info about that specific item.
    If name_only is True, returns only the names.
    """
```
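
As a quick sketch of the `name_only` flag (assuming, per the docstring above, that the names come back grouped by category):

```python
import gensim.downloader as api

# List only the names of available corpora and models
names = api.info(name_only=True)
print(names["models"][:5])   # first few model names
print(names["corpora"][:5])  # first few corpus names
```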

### Configuration Constants

Configuration values for the download system.

```python { .api }
BASE_DIR: str
"""Default download directory (~/gensim-data by default).
Can be overridden with the GENSIM_DATA_DIR environment variable."""

DATA_LIST_URL: str
"""URL for the list of available models and datasets."""

DOWNLOAD_BASE_URL: str
"""Base URL for downloading models and datasets."""
```
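
Because `BASE_DIR` honors the `GENSIM_DATA_DIR` environment variable, the cache can be redirected elsewhere. A minimal sketch, assuming the variable is read when the module is imported (so it must be set beforehand):

```python
import os

# Set the cache location before importing the downloader;
# BASE_DIR is resolved when the module loads.
os.environ["GENSIM_DATA_DIR"] = "/data/gensim-cache"

import gensim.downloader as api
print(api.BASE_DIR)  # /data/gensim-cache
```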

## Usage Examples

### Loading Pre-trained Word Vectors

```python
import gensim.downloader as api

# Load pre-trained GloVe vectors
glove_vectors = api.load("glove-twitter-25")
print(f"Loaded GloVe vectors: {len(glove_vectors)} words")

# Find similar words
similar_words = glove_vectors.most_similar("python", topn=5)
print(f"Words similar to 'python': {similar_words}")

# Get a word vector
if "computer" in glove_vectors:
    vector = glove_vectors["computer"]
    print(f"'computer' vector shape: {vector.shape}")

# Calculate word similarity
if "computer" in glove_vectors and "technology" in glove_vectors:
    similarity = glove_vectors.similarity("computer", "technology")
    print(f"Similarity between 'computer' and 'technology': {similarity}")
```

### Loading Text Datasets

```python
# Load the text8 dataset (cleaned Wikipedia text)
text8_corpus = api.load("text8")
print("Loaded text8 dataset")

# text8 is an iterable of word lists
first_sentence = next(iter(text8_corpus))
print(f"First sentence length: {len(first_sentence)} words")
print(f"First 10 words: {first_sentence[:10]}")

# Use the dataset for training models
from gensim.models import Word2Vec

# Train Word2Vec on the dataset
model = Word2Vec(text8_corpus, vector_size=100, window=5, min_count=5, workers=4)
print(f"Trained Word2Vec model with {len(model.wv)} words")
```

### Getting Information About Available Data

```python
# Get information about all available models and datasets
all_info = api.info()
print(f"Available categories: {len(all_info)}")

# Show each category and its items
for category in all_info:
    items = all_info[category]
    print(f"{category}: {len(items)} items")

    # Show the first few items in each category
    for item_name in list(items.keys())[:3]:
        item_info = items[item_name]
        print(f"  - {item_name}: {item_info.get('description', 'No description')}")

# Get detailed information about a specific model
word2vec_info = api.info("word2vec-google-news-300")
print("\nWord2Vec Google News model info:")
print(f"Description: {word2vec_info.get('description')}")
print(f"Size: {word2vec_info.get('file_size')} bytes")
print(f"Vocabulary size: {word2vec_info.get('num_records')} words")
```

### Working with Different Model Types

```python
# Load different types of models
models_to_try = [
    "glove-wiki-gigaword-50",           # GloVe vectors
    "fasttext-wiki-news-subwords-300",  # FastText vectors
    "word2vec-google-news-300"          # Word2Vec vectors (large, may take time)
]

for model_name in models_to_try:
    try:
        # Get info first to check size
        model_info = api.info(model_name)
        file_size_mb = model_info.get('file_size', 0) / (1024 * 1024)

        print(f"\n{model_name}:")
        print(f"  Size: {file_size_mb:.1f} MB")
        print(f"  Description: {model_info.get('description', 'No description')}")

        # Only load smaller models for demonstration
        if file_size_mb < 100:  # Only load models smaller than 100 MB
            vectors = api.load(model_name)
            print(f"  Loaded: {len(vectors)} word vectors")

            # Test with a common word
            if "computer" in vectors:
                similar = vectors.most_similar("computer", topn=3)
                print(f"  Similar to 'computer': {[word for word, score in similar]}")
        else:
            print("  Skipping (too large for demo)")

    except Exception as e:
        print(f"  Error loading {model_name}: {e}")
```

### Loading Corpora for Model Training

```python
# Available text corpora
corpora_to_try = [
    "text8",                  # Wikipedia text
    "fake-news",              # Fake news dataset
    "lee_background_corpus"   # Lee background corpus
]

for corpus_name in corpora_to_try:
    try:
        print(f"\nLoading corpus: {corpus_name}")
        corpus = api.load(corpus_name)

        # Get the first few documents to understand the structure
        docs = []
        for i, doc in enumerate(corpus):
            docs.append(doc)
            if i >= 2:  # Just get the first 3 documents
                break

        print(f"  Number of documents (sample): {len(docs)}")
        if docs:
            print(f"  First document type: {type(docs[0])}")
            if isinstance(docs[0], list):
                print(f"  First document length: {len(docs[0])} tokens")
                print(f"  First few tokens: {docs[0][:10]}")

    except Exception as e:
        print(f"  Error loading {corpus_name}: {e}")
```

198

199

### Managing Download Cache

200

201

```python
import os

# Check current download directory
print(f"Download directory: {api.BASE_DIR}")

# Check if the directory exists and what's in it
if os.path.exists(api.BASE_DIR):
    items = os.listdir(api.BASE_DIR)
    print(f"Cached items: {len(items)}")
    for item in items[:5]:  # Show the first 5
        item_path = os.path.join(api.BASE_DIR, item)
        if os.path.isdir(item_path):
            print(f"  {item}/ (directory)")
        else:
            size = os.path.getsize(item_path) / (1024 * 1024)
            print(f"  {item} ({size:.1f} MB)")
else:
    print("Download directory doesn't exist yet")
```
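
The cache is plain files on disk, and the API above exposes no dedicated cleanup function, so evicting an item is just a matter of deleting its files. A hedged sketch, assuming each item lives in its own subdirectory under `BASE_DIR` as the listing above suggests:

```python
import os
import shutil
import gensim.downloader as api

# Remove a single cached item by deleting its subdirectory
item_dir = os.path.join(api.BASE_DIR, "glove-twitter-25")
if os.path.isdir(item_dir):
    shutil.rmtree(item_dir)
    print(f"Removed cached item: {item_dir}")
```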

### Using the Return Path Option

```python
# Get the file path instead of loading the model
model_path = api.load("glove-twitter-25", return_path=True)
print(f"Model file path: {model_path}")

# You can then load it manually if needed
from gensim.models import KeyedVectors
vectors = KeyedVectors.load_word2vec_format(model_path)
print(f"Manually loaded vectors: {len(vectors)} words")
```

234

235

### Error Handling and Validation

236

237

```python
def safe_load_model(model_name, max_size_mb=50):
    """Safely load a model with size checking."""
    try:
        # Get model info first
        info = api.info(model_name)
        if not info:
            print(f"Model '{model_name}' not found")
            return None

        size_mb = info.get('file_size', 0) / (1024 * 1024)
        if size_mb > max_size_mb:
            print(f"Model '{model_name}' is {size_mb:.1f} MB (exceeds {max_size_mb} MB limit)")
            return None

        print(f"Loading '{model_name}' ({size_mb:.1f} MB)...")
        model = api.load(model_name)
        print(f"Successfully loaded '{model_name}'")
        return model

    except Exception as e:
        print(f"Error loading '{model_name}': {e}")
        return None

# Test safe loading
model = safe_load_model("glove-twitter-25")
if model:
    print(f"Model has {len(model)} word vectors")
```

266

267

### Finding Models by Category

268

269

```python
def find_models_by_category(category_name):
    """Find all models in a specific category."""
    all_info = api.info()

    if category_name in all_info:
        category_models = all_info[category_name]
        print(f"\nModels in '{category_name}' category:")

        for model_name, model_info in category_models.items():
            size_mb = model_info.get('file_size', 0) / (1024 * 1024)
            description = model_info.get('description', 'No description')
            print(f"  {model_name}")
            print(f"    Size: {size_mb:.1f} MB")
            print(f"    Description: {description}")
            print()
    else:
        print(f"Category '{category_name}' not found")
        print(f"Available categories: {list(all_info.keys())}")

# Find word embedding models
find_models_by_category("models")

# Find text corpora
find_models_by_category("corpora")
```

295

296

### Integration with Model Training

297

298

```python
# Download a dataset and train a model
print("Loading training data...")
corpus = api.load("text8")

print("Training Word2Vec model...")
from gensim.models import Word2Vec

model = Word2Vec(
    sentences=corpus,
    vector_size=100,
    window=5,
    min_count=5,
    workers=4,
    epochs=5
)

print(f"Trained model with {len(model.wv)} words")

# Compare with pre-trained vectors
print("\nLoading pre-trained vectors for comparison...")
pretrained = api.load("glove-twitter-25")

# Test both models
test_word = "computer"
if test_word in model.wv and test_word in pretrained:
    custom_similar = model.wv.most_similar(test_word, topn=3)
    pretrained_similar = pretrained.most_similar(test_word, topn=3)

    print(f"\nSimilar to '{test_word}':")
    print(f"Custom model: {[word for word, score in custom_similar]}")
    print(f"Pre-trained: {[word for word, score in pretrained_similar]}")
```