# Data Downloading

Convenient API for downloading pre-trained models and datasets, including Word2Vec, GloVe, and FastText models as well as text corpora. The downloader handles caching, version management, and checksum-based integrity verification automatically.

## Capabilities

### Core Download Functions

Primary functions for downloading and loading models and datasets from the gensim-data repository.

```python { .api }
def load(name: str, return_path: bool = False):
    """
    Download (if necessary) and load a model or dataset.

    Parameters:
    - name: Name of the model or dataset to load
    - return_path: If True, return the file path instead of the loaded object

    Returns:
    Loaded model/dataset object, or the file path if return_path is True

    Raises:
    Exception: If the model/dataset is not found or the download fails
    """

def info(name: str = None, show_only_latest: bool = True, name_only: bool = False):
    """
    Get information about available models and datasets.

    Parameters:
    - name: Specific model/dataset name (optional)
    - show_only_latest: If True, hide outdated versions (applies only when name is None)
    - name_only: If True, return only the names of available models and corpora

    Returns:
    Dictionary with model/dataset information.
    If name is None, returns info about all available items.
    If name is provided, returns detailed info about that specific item.
    If name_only is True, returns only the names.
    """
```
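Before calling `load`, it helps to know which names are valid. A minimal sketch using `info(name_only=True)`, which returns just the available names grouped into `corpora` and `models`:

```python
import gensim.downloader as api

# Discover valid names before downloading anything
available = api.info(name_only=True)
print(available["models"][:5])   # first few model names
print(available["corpora"][:5])  # first few corpus names
```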
### Configuration Constants

Configuration values for the download system.

```python { .api }
BASE_DIR: str
"""Default download directory (~/gensim-data by default).
Can be overridden with the GENSIM_DATA_DIR environment variable."""

DATA_LIST_URL: str
"""URL of the list of available models and datasets."""

DOWNLOAD_BASE_URL: str
"""Base URL for downloading models and datasets."""
```
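The cache location can be redirected by setting `GENSIM_DATA_DIR` before `gensim.downloader` is first imported, since `BASE_DIR` is resolved at import time. A minimal sketch; the target path is illustrative:

```python
import os

# Set before importing gensim.downloader; BASE_DIR is read at import time.
os.environ["GENSIM_DATA_DIR"] = "/data/gensim-cache"  # illustrative path

import gensim.downloader as api

print(api.BASE_DIR)  # /data/gensim-cache
```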
## Usage Examples

### Loading Pre-trained Word Vectors

```python
import gensim.downloader as api

# Load pre-trained GloVe vectors
glove_vectors = api.load("glove-twitter-25")
print(f"Loaded GloVe vectors: {len(glove_vectors)} words")

# Find similar words
similar_words = glove_vectors.most_similar("python", topn=5)
print(f"Words similar to 'python': {similar_words}")

# Get a word vector
if "computer" in glove_vectors:
    vector = glove_vectors["computer"]
    print(f"'computer' vector shape: {vector.shape}")

# Calculate word similarity
if "computer" in glove_vectors and "technology" in glove_vectors:
    similarity = glove_vectors.similarity("computer", "technology")
    print(f"Similarity between 'computer' and 'technology': {similarity}")
```
### Loading Text Datasets

```python
# Load the text8 dataset (the first 100 MB of a cleaned English Wikipedia dump)
text8_corpus = api.load("text8")
print("Loaded text8 dataset")

# text8 is an iterable of word lists
first_sentence = next(iter(text8_corpus))
print(f"First sentence length: {len(first_sentence)} words")
print(f"First 10 words: {first_sentence[:10]}")

# Use the dataset to train a model
from gensim.models import Word2Vec

# Train Word2Vec on the dataset
model = Word2Vec(text8_corpus, vector_size=100, window=5, min_count=5, workers=4)
print(f"Trained Word2Vec model with {len(model.wv)} words")
```
### Getting Information About Available Data

```python
# Get information about all available models and datasets
all_info = api.info()
print(f"Top-level categories: {list(all_info.keys())}")  # 'corpora' and 'models'

# Show categories
for category in all_info:
    items = all_info[category]
    print(f"{category}: {len(items)} items")

    # Show the first few items in each category
    for item_name in list(items.keys())[:3]:
        item_info = items[item_name]
        print(f"  - {item_name}: {item_info.get('description', 'No description')}")

# Get detailed information about a specific model
word2vec_info = api.info("word2vec-google-news-300")
print("\nWord2Vec Google News model info:")
print(f"Description: {word2vec_info.get('description')}")
print(f"Size: {word2vec_info.get('file_size')} bytes")
print(f"Vocabulary size: {word2vec_info.get('num_records')} words")
```
### Working with Different Model Types

```python
# Load different types of models
models_to_try = [
    "glove-wiki-gigaword-50",           # GloVe vectors
    "fasttext-wiki-news-subwords-300",  # FastText vectors
    "word2vec-google-news-300",         # Word2Vec vectors (large, may take time)
]

for model_name in models_to_try:
    try:
        # Get info first to check the size
        model_info = api.info(model_name)
        file_size_mb = model_info.get('file_size', 0) / (1024 * 1024)

        print(f"\n{model_name}:")
        print(f"  Size: {file_size_mb:.1f} MB")
        print(f"  Description: {model_info.get('description', 'No description')}")

        # Only load smaller models for demonstration
        if file_size_mb < 100:  # only load models smaller than 100 MB
            vectors = api.load(model_name)
            print(f"  Loaded: {len(vectors)} word vectors")

            # Test with a common word
            if "computer" in vectors:
                similar = vectors.most_similar("computer", topn=3)
                print(f"  Similar to 'computer': {[word for word, score in similar]}")
        else:
            print("  Skipping (too large for demo)")

    except Exception as e:
        print(f"  Error loading {model_name}: {e}")
```
### Loading Corpora for Model Training

```python
# Available text corpora
corpora_to_try = [
    "text8",          # cleaned Wikipedia text
    "fake-news",      # fake news dataset
    "20-newsgroups",  # classic newsgroup postings dataset
]

for corpus_name in corpora_to_try:
    try:
        print(f"\nLoading corpus: {corpus_name}")
        corpus = api.load(corpus_name)

        # Inspect the first few documents to understand the structure
        docs = []
        for i, doc in enumerate(corpus):
            docs.append(doc)
            if i >= 2:  # just take the first 3 documents
                break

        print(f"  Number of documents (sample): {len(docs)}")
        if docs:
            print(f"  First document type: {type(docs[0])}")
            if isinstance(docs[0], list):
                print(f"  First document length: {len(docs[0])} tokens")
                print(f"  First few tokens: {docs[0][:10]}")

    except Exception as e:
        print(f"  Error loading {corpus_name}: {e}")
```
### Managing Download Cache

```python
import os

# Check the current download directory
print(f"Download directory: {api.BASE_DIR}")

# Check whether the directory exists and what's in it
if os.path.exists(api.BASE_DIR):
    items = os.listdir(api.BASE_DIR)
    print(f"Cached items: {len(items)}")
    for item in items[:5]:  # show the first 5
        item_path = os.path.join(api.BASE_DIR, item)
        if os.path.isdir(item_path):
            print(f"  {item}/ (directory)")
        else:
            size = os.path.getsize(item_path) / (1024 * 1024)
            print(f"  {item} ({size:.1f} MB)")
else:
    print("Download directory doesn't exist yet")
```
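The downloader has no dedicated eviction function, so freeing disk space means deleting a cached item's subdirectory under `BASE_DIR`; the next `load` of that name re-downloads it. A minimal sketch, assuming `glove-twitter-25` was downloaded earlier:

```python
import os
import shutil

import gensim.downloader as api

# Each downloaded item is cached in its own subdirectory of BASE_DIR
item_dir = os.path.join(api.BASE_DIR, "glove-twitter-25")
if os.path.isdir(item_dir):
    shutil.rmtree(item_dir)  # the next api.load("glove-twitter-25") re-downloads it
    print(f"Removed cached item: {item_dir}")
```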
### Using the return_path Option

```python
# Get the file path instead of loading the model
model_path = api.load("glove-twitter-25", return_path=True)
print(f"Model file path: {model_path}")

# You can then load it manually if needed
from gensim.models import KeyedVectors
vectors = KeyedVectors.load_word2vec_format(model_path)
print(f"Manually loaded vectors: {len(vectors)} words")
```
### Error Handling and Validation

```python
def safe_load_model(model_name, max_size_mb=50):
    """Safely load a model, refusing anything over a size limit."""
    try:
        # Get model info first
        model_info = api.info(model_name)
        if not model_info:
            print(f"Model '{model_name}' not found")
            return None

        size_mb = model_info.get('file_size', 0) / (1024 * 1024)
        if size_mb > max_size_mb:
            print(f"Model '{model_name}' is {size_mb:.1f} MB (exceeds {max_size_mb} MB limit)")
            return None

        print(f"Loading '{model_name}' ({size_mb:.1f} MB)...")
        model = api.load(model_name)
        print(f"Successfully loaded '{model_name}'")
        return model

    except Exception as e:
        print(f"Error loading '{model_name}': {e}")
        return None

# Test safe loading (glove-twitter-25 is roughly 100 MB, so raise the limit)
model = safe_load_model("glove-twitter-25", max_size_mb=200)
if model:
    print(f"Model has {len(model)} word vectors")
```
### Finding Models by Category

```python
def find_models_by_category(category_name):
    """Print all models in a specific category."""
    all_info = api.info()

    if category_name in all_info:
        category_models = all_info[category_name]
        print(f"\nModels in '{category_name}' category:")

        for model_name, model_info in category_models.items():
            size_mb = model_info.get('file_size', 0) / (1024 * 1024)
            description = model_info.get('description', 'No description')
            print(f"  {model_name}")
            print(f"    Size: {size_mb:.1f} MB")
            print(f"    Description: {description}")
            print()
    else:
        print(f"Category '{category_name}' not found")
        print(f"Available categories: {list(all_info.keys())}")

# Find word embedding models
find_models_by_category("models")

# Find text corpora
find_models_by_category("corpora")
```
### Integration with Model Training

```python
# Download a dataset and train a model on it
print("Loading training data...")
corpus = api.load("text8")

print("Training Word2Vec model...")
from gensim.models import Word2Vec

model = Word2Vec(
    sentences=corpus,
    vector_size=100,
    window=5,
    min_count=5,
    workers=4,
    epochs=5,
)

print(f"Trained model with {len(model.wv)} words")

# Compare with pre-trained vectors
print("\nLoading pre-trained vectors for comparison...")
pretrained = api.load("glove-twitter-25")

# Test both models
test_word = "computer"
if test_word in model.wv and test_word in pretrained:
    custom_similar = model.wv.most_similar(test_word, topn=3)
    pretrained_similar = pretrained.most_similar(test_word, topn=3)

    print(f"\nSimilar to '{test_word}':")
    print(f"Custom model: {[word for word, score in custom_similar]}")
    print(f"Pre-trained: {[word for word, score in pretrained_similar]}")
```