# Data Downloading

Convenient API for downloading pre-trained models and datasets, including Word2Vec, GloVe, and FastText models as well as text corpora. The downloader handles caching, version management, and checksum-based integrity verification automatically.

## Capabilities

### Core Download Functions

Primary functions for downloading and loading models and datasets from the gensim-data repository.

```python { .api }
def load(name: str, return_path: bool = False):
    """
    Download (if necessary) and load a model or dataset.

    Parameters:
    - name: Name of the model or dataset to load
    - return_path: If True, return the file path instead of the loaded object

    Returns:
    Loaded model/dataset object, or the file path if return_path is True

    Raises:
    Exception: If the model/dataset is not found or the download fails
    """

def info(name: str = None, show_only_latest: bool = True, name_only: bool = False):
    """
    Get information about available models and datasets.

    Parameters:
    - name: Specific model/dataset name (optional)
    - show_only_latest: If True, hide outdated versions (applies only when name is None)
    - name_only: If True, return only the names of available models and corpora

    Returns:
    Dictionary with model/dataset information.
    If name is None, returns info about all available items.
    If name is provided, returns detailed info about that specific item.
    If name_only is True, returns only the names.
    """
```
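Before calling `load`, it helps to know which names are valid. A minimal sketch using `info(name_only=True)`, which returns just the available names grouped into `corpora` and `models`:

```python
import gensim.downloader as api

# Discover valid names before downloading anything
available = api.info(name_only=True)
print(available["models"][:5])   # first few model names
print(available["corpora"][:5])  # first few corpus names
```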
### Configuration Constants

Configuration values for the download system.

```python { .api }
BASE_DIR: str
"""Default download directory (~/gensim-data by default).
Can be overridden with the GENSIM_DATA_DIR environment variable."""

DATA_LIST_URL: str
"""URL of the list of available models and datasets."""

DOWNLOAD_BASE_URL: str
"""Base URL for downloading models and datasets."""
```
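The cache location can be redirected by setting `GENSIM_DATA_DIR` before `gensim.downloader` is first imported, since `BASE_DIR` is resolved at import time. A minimal sketch; the target path is illustrative:

```python
import os

# Set before importing gensim.downloader; BASE_DIR is read at import time.
os.environ["GENSIM_DATA_DIR"] = "/data/gensim-cache"  # illustrative path

import gensim.downloader as api

print(api.BASE_DIR)  # /data/gensim-cache
```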
## Usage Examples

### Loading Pre-trained Word Vectors

```python
import gensim.downloader as api

# Load pre-trained GloVe vectors
glove_vectors = api.load("glove-twitter-25")
print(f"Loaded GloVe vectors: {len(glove_vectors)} words")

# Find similar words
similar_words = glove_vectors.most_similar("python", topn=5)
print(f"Words similar to 'python': {similar_words}")

# Get a word vector
if "computer" in glove_vectors:
    vector = glove_vectors["computer"]
    print(f"'computer' vector shape: {vector.shape}")

# Calculate word similarity
if "computer" in glove_vectors and "technology" in glove_vectors:
    similarity = glove_vectors.similarity("computer", "technology")
    print(f"Similarity between 'computer' and 'technology': {similarity}")
```
### Loading Text Datasets

```python
# Load the text8 dataset (the first 100 MB of a cleaned English Wikipedia dump)
text8_corpus = api.load("text8")
print("Loaded text8 dataset")

# text8 is an iterable of word lists
first_sentence = next(iter(text8_corpus))
print(f"First sentence length: {len(first_sentence)} words")
print(f"First 10 words: {first_sentence[:10]}")

# Use the dataset to train a model
from gensim.models import Word2Vec

# Train Word2Vec on the dataset
model = Word2Vec(text8_corpus, vector_size=100, window=5, min_count=5, workers=4)
print(f"Trained Word2Vec model with {len(model.wv)} words")
```
### Getting Information About Available Data

```python
# Get information about all available models and datasets
all_info = api.info()
print(f"Top-level categories: {list(all_info.keys())}")  # 'corpora' and 'models'

# Show categories
for category in all_info:
    items = all_info[category]
    print(f"{category}: {len(items)} items")

    # Show the first few items in each category
    for item_name in list(items.keys())[:3]:
        item_info = items[item_name]
        print(f"  - {item_name}: {item_info.get('description', 'No description')}")

# Get detailed information about a specific model
word2vec_info = api.info("word2vec-google-news-300")
print("\nWord2Vec Google News model info:")
print(f"Description: {word2vec_info.get('description')}")
print(f"Size: {word2vec_info.get('file_size')} bytes")
print(f"Vocabulary size: {word2vec_info.get('num_records')} words")
```
### Working with Different Model Types

```python
# Load different types of models
models_to_try = [
    "glove-wiki-gigaword-50",           # GloVe vectors
    "fasttext-wiki-news-subwords-300",  # FastText vectors
    "word2vec-google-news-300",         # Word2Vec vectors (large, may take time)
]

for model_name in models_to_try:
    try:
        # Get info first to check the size
        model_info = api.info(model_name)
        file_size_mb = model_info.get('file_size', 0) / (1024 * 1024)

        print(f"\n{model_name}:")
        print(f"  Size: {file_size_mb:.1f} MB")
        print(f"  Description: {model_info.get('description', 'No description')}")

        # Only load smaller models for demonstration
        if file_size_mb < 100:  # only load models smaller than 100 MB
            vectors = api.load(model_name)
            print(f"  Loaded: {len(vectors)} word vectors")

            # Test with a common word
            if "computer" in vectors:
                similar = vectors.most_similar("computer", topn=3)
                print(f"  Similar to 'computer': {[word for word, score in similar]}")
        else:
            print("  Skipping (too large for demo)")

    except Exception as e:
        print(f"  Error loading {model_name}: {e}")
```
### Loading Corpora for Model Training

```python
# Available text corpora
corpora_to_try = [
    "text8",          # cleaned Wikipedia text
    "fake-news",      # fake news dataset
    "20-newsgroups",  # classic newsgroup postings dataset
]

for corpus_name in corpora_to_try:
    try:
        print(f"\nLoading corpus: {corpus_name}")
        corpus = api.load(corpus_name)

        # Inspect the first few documents to understand the structure
        docs = []
        for i, doc in enumerate(corpus):
            docs.append(doc)
            if i >= 2:  # just take the first 3 documents
                break

        print(f"  Number of documents (sample): {len(docs)}")
        if docs:
            print(f"  First document type: {type(docs[0])}")
            if isinstance(docs[0], list):
                print(f"  First document length: {len(docs[0])} tokens")
                print(f"  First few tokens: {docs[0][:10]}")

    except Exception as e:
        print(f"  Error loading {corpus_name}: {e}")
```
### Managing Download Cache

```python
import os

# Check the current download directory
print(f"Download directory: {api.BASE_DIR}")

# Check whether the directory exists and what's in it
if os.path.exists(api.BASE_DIR):
    items = os.listdir(api.BASE_DIR)
    print(f"Cached items: {len(items)}")
    for item in items[:5]:  # show the first 5
        item_path = os.path.join(api.BASE_DIR, item)
        if os.path.isdir(item_path):
            print(f"  {item}/ (directory)")
        else:
            size = os.path.getsize(item_path) / (1024 * 1024)
            print(f"  {item} ({size:.1f} MB)")
else:
    print("Download directory doesn't exist yet")
```
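The downloader has no dedicated eviction function, so freeing disk space means deleting a cached item's subdirectory under `BASE_DIR`; the next `load` of that name re-downloads it. A minimal sketch, assuming `glove-twitter-25` was downloaded earlier:

```python
import os
import shutil

import gensim.downloader as api

# Each downloaded item is cached in its own subdirectory of BASE_DIR
item_dir = os.path.join(api.BASE_DIR, "glove-twitter-25")
if os.path.isdir(item_dir):
    shutil.rmtree(item_dir)  # the next api.load("glove-twitter-25") re-downloads it
    print(f"Removed cached item: {item_dir}")
```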
### Using the return_path Option

```python
# Get the file path instead of loading the model
model_path = api.load("glove-twitter-25", return_path=True)
print(f"Model file path: {model_path}")

# You can then load it manually if needed
from gensim.models import KeyedVectors
vectors = KeyedVectors.load_word2vec_format(model_path)
print(f"Manually loaded vectors: {len(vectors)} words")
```
### Error Handling and Validation

```python
def safe_load_model(model_name, max_size_mb=50):
    """Safely load a model, refusing anything over a size limit."""
    try:
        # Get model info first
        model_info = api.info(model_name)
        if not model_info:
            print(f"Model '{model_name}' not found")
            return None

        size_mb = model_info.get('file_size', 0) / (1024 * 1024)
        if size_mb > max_size_mb:
            print(f"Model '{model_name}' is {size_mb:.1f} MB (exceeds {max_size_mb} MB limit)")
            return None

        print(f"Loading '{model_name}' ({size_mb:.1f} MB)...")
        model = api.load(model_name)
        print(f"Successfully loaded '{model_name}'")
        return model

    except Exception as e:
        print(f"Error loading '{model_name}': {e}")
        return None

# Test safe loading (glove-twitter-25 is roughly 100 MB, so raise the limit)
model = safe_load_model("glove-twitter-25", max_size_mb=200)
if model:
    print(f"Model has {len(model)} word vectors")
```
### Finding Models by Category

```python
def find_models_by_category(category_name):
    """Print all models in a specific category."""
    all_info = api.info()

    if category_name in all_info:
        category_models = all_info[category_name]
        print(f"\nModels in '{category_name}' category:")

        for model_name, model_info in category_models.items():
            size_mb = model_info.get('file_size', 0) / (1024 * 1024)
            description = model_info.get('description', 'No description')
            print(f"  {model_name}")
            print(f"    Size: {size_mb:.1f} MB")
            print(f"    Description: {description}")
            print()
    else:
        print(f"Category '{category_name}' not found")
        print(f"Available categories: {list(all_info.keys())}")

# Find word embedding models
find_models_by_category("models")

# Find text corpora
find_models_by_category("corpora")
```
### Integration with Model Training

```python
# Download a dataset and train a model on it
print("Loading training data...")
corpus = api.load("text8")

print("Training Word2Vec model...")
from gensim.models import Word2Vec

model = Word2Vec(
    sentences=corpus,
    vector_size=100,
    window=5,
    min_count=5,
    workers=4,
    epochs=5,
)

print(f"Trained model with {len(model.wv)} words")

# Compare with pre-trained vectors
print("\nLoading pre-trained vectors for comparison...")
pretrained = api.load("glove-twitter-25")

# Test both models
test_word = "computer"
if test_word in model.wv and test_word in pretrained:
    custom_similar = model.wv.most_similar(test_word, topn=3)
    pretrained_similar = pretrained.most_similar(test_word, topn=3)

    print(f"\nSimilar to '{test_word}':")
    print(f"Custom model: {[word for word, score in custom_similar]}")
    print(f"Pre-trained: {[word for word, score in pretrained_similar]}")
```