Python library for topic modelling, document indexing and similarity retrieval with large corpora.
Convenient API for downloading pre-trained models and datasets including Word2Vec, GloVe, FastText models, and text corpora. The downloader handles caching, version management, and integrity verification automatically.
Primary functions for downloading and loading models and datasets from the gensim-data repository.
def load(name: str, return_path: bool = False):
    """
    Download and load a model or dataset from the gensim-data repository.

    Downloads are cached locally (see BASE_DIR), so repeated calls do not
    re-download.

    Parameters:
    - name: Name of the model or dataset to load
    - return_path: If True, return the local file path instead of the loaded object

    Returns:
    Loaded model/dataset object, or the file path when return_path is True

    Raises:
    Exception: If the model/dataset is not found or the download fails
    """
def info(name: str = None, show_only_latest: bool = True, name_only: bool = False):
    """
    Get information about available models and datasets.

    Parameters:
    - name: Specific model/dataset name (optional)
    - show_only_latest: If True, hide outdated versions (only when name is None)
    - name_only: If True, return only names of available models and corpora

    Returns:
    Dictionary with model/dataset information.
    If name is None, returns info about all available items.
    If name is provided, returns detailed info about that specific item.
    If name_only is True, returns only the names.
    """

# Configuration values for the download system.
# Module-level configuration constants exposed by gensim.downloader.
BASE_DIR: str
"""Default download directory (~/gensim-data by default).
Can be overridden with GENSIM_DATA_DIR environment variable."""

DATA_LIST_URL: str
"""URL for the list of available models and datasets."""

DOWNLOAD_BASE_URL: str
"""Base URL for downloading models and datasets."""

import gensim.downloader as api
# Example: load pre-trained GloVe vectors and query them.
glove_vectors = api.load("glove-twitter-25")
print(f"Loaded GloVe vectors: {len(glove_vectors)} words")

# Find similar words
similar_words = glove_vectors.most_similar("python", topn=5)
print(f"Words similar to 'python': {similar_words}")

# Get word vector (guard membership first: KeyedVectors raises on OOV words)
if "computer" in glove_vectors:
    vector = glove_vectors["computer"]
    print(f"'computer' vector shape: {vector.shape}")

# Calculate word similarity
if "computer" in glove_vectors and "technology" in glove_vectors:
    similarity = glove_vectors.similarity("computer", "technology")
    print(f"Similarity between 'computer' and 'technology': {similarity}")

# Load text8 dataset (Wikipedia dump)
# Example: load a text corpus and train a model on it.
text8_corpus = api.load("text8")
print(f"Loaded text8 dataset")

# text8 is an iterable of word lists
first_sentence = next(iter(text8_corpus))
print(f"First sentence length: {len(first_sentence)} words")
print(f"First 10 words: {first_sentence[:10]}")

# Use dataset for training models
from gensim.models import Word2Vec

# Train Word2Vec on the dataset
model = Word2Vec(text8_corpus, vector_size=100, window=5, min_count=5, workers=4)
print(f"Trained Word2Vec model with {len(model.wv)} words")

# Get information about all available models and datasets
# Example: inspect the catalog of available models and corpora.
all_info = api.info()
print(f"Available items: {len(all_info)}")

# Show categories (top-level keys are categories such as 'models' / 'corpora')
for category in all_info:
    items = all_info[category]
    print(f"{category}: {len(items)} items")
    # Show first few items in each category
    for item_name in list(items.keys())[:3]:
        item_info = items[item_name]
        print(f" - {item_name}: {item_info.get('description', 'No description')}")

# Get detailed information about a specific model
word2vec_info = api.info("word2vec-google-news-300")
print(f"\nWord2Vec Google News model info:")
print(f"Description: {word2vec_info.get('description')}")
print(f"Size: {word2vec_info.get('file_size')} bytes")
print(f"Vocabulary size: {word2vec_info.get('num_records')} words")

# Load different types of models
# Example: check model sizes before downloading, and only load small ones.
models_to_try = [
    "glove-wiki-gigaword-50",           # GloVe vectors
    "fasttext-wiki-news-subwords-300",  # FastText vectors
    "word2vec-google-news-300",         # Word2Vec vectors (large, may take time)
]
for model_name in models_to_try:
    try:
        # Get info first to check size
        model_info = api.info(model_name)
        file_size_mb = model_info.get('file_size', 0) / (1024 * 1024)
        print(f"\n{model_name}:")
        print(f" Size: {file_size_mb:.1f} MB")
        print(f" Description: {model_info.get('description', 'No description')}")
        # Only load smaller models for demonstration
        if file_size_mb < 100:  # Only load models smaller than 100MB
            vectors = api.load(model_name)
            print(f" Loaded: {len(vectors)} word vectors")
            # Test with a common word
            if "computer" in vectors:
                similar = vectors.most_similar("computer", topn=3)
                print(f" Similar to 'computer': {[word for word, score in similar]}")
        else:
            print(f" Skipping (too large for demo)")
    except Exception as e:
        # Best-effort demo loop: report and continue with the next model
        print(f" Error loading {model_name}: {e}")

# Available text corpora
# Example: sample the first few documents of several corpora.
corpora_to_try = [
    "text8",                  # Wikipedia text
    "fake-news",              # Fake news dataset
    "lee_background_corpus",  # Lee background corpus
]
for corpus_name in corpora_to_try:
    try:
        print(f"\nLoading corpus: {corpus_name}")
        corpus = api.load(corpus_name)
        # Get first few documents to understand structure
        docs = []
        for i, doc in enumerate(corpus):
            docs.append(doc)
            if i >= 2:  # Just get first 3 documents
                break
        print(f" Number of documents (sample): {len(docs)}")
        if docs:
            print(f" First document type: {type(docs[0])}")
            if isinstance(docs[0], list):
                print(f" First document length: {len(docs[0])} tokens")
                print(f" First few tokens: {docs[0][:10]}")
    except Exception as e:
        # Best-effort demo loop: report and continue with the next corpus
        print(f" Error loading {corpus_name}: {e}")

import os
# Example: inspect the local download cache.
# Check current download directory
print(f"Download directory: {api.BASE_DIR}")

# Check if directory exists and what's in it
if os.path.exists(api.BASE_DIR):
    items = os.listdir(api.BASE_DIR)
    print(f"Cached items: {len(items)}")
    for item in items[:5]:  # Show first 5
        item_path = os.path.join(api.BASE_DIR, item)
        if os.path.isdir(item_path):
            print(f" {item}/ (directory)")
        else:
            size = os.path.getsize(item_path) / (1024 * 1024)
            print(f" {item} ({size:.1f} MB)")
else:
    print("Download directory doesn't exist yet")

# Get file path instead of loading the model
# Example: fetch only the file path, then load manually.
model_path = api.load("glove-twitter-25", return_path=True)
print(f"Model file path: {model_path}")

# You can then load it manually if needed
from gensim.models import KeyedVectors

vectors = KeyedVectors.load_word2vec_format(model_path)
print(f"Manually loaded vectors: {len(vectors)} words")


def safe_load_model(model_name, max_size_mb=50):
    """Safely load a model with size checking.

    Parameters:
    - model_name: name of the model/dataset in the gensim-data repository
    - max_size_mb: size limit; larger models are skipped rather than downloaded

    Returns:
    The loaded model, or None if the model is unknown, exceeds the size
    limit, or an error occurs while loading.
    """
    try:
        # Get model info first so we can refuse oversized downloads
        info = api.info(model_name)
        if not info:
            print(f"Model '{model_name}' not found")
            return None
        size_mb = info.get('file_size', 0) / (1024 * 1024)
        if size_mb > max_size_mb:
            print(f"Model '{model_name}' is {size_mb:.1f} MB (exceeds {max_size_mb} MB limit)")
            return None
        print(f"Loading '{model_name}' ({size_mb:.1f} MB)...")
        model = api.load(model_name)
        print(f"Successfully loaded '{model_name}'")
        return model
    except Exception as e:
        # Deliberately broad: this helper is best-effort and reports failures
        print(f"Error loading '{model_name}': {e}")
        return None


# Test safe loading
model = safe_load_model("glove-twitter-25")
if model:
    print(f"Model has {len(model)} word vectors")


def find_models_by_category(category_name):
    """Find and print all models in a specific category of api.info()."""
    all_info = api.info()
    if category_name in all_info:
        category_models = all_info[category_name]
        print(f"\nModels in '{category_name}' category:")
        for model_name, model_info in category_models.items():
            size_mb = model_info.get('file_size', 0) / (1024 * 1024)
            description = model_info.get('description', 'No description')
            print(f" {model_name}")
            print(f" Size: {size_mb:.1f} MB")
            print(f" Description: {description}")
            print()
    else:
        print(f"Category '{category_name}' not found")
        print(f"Available categories: {list(all_info.keys())}")


# Find word embedding models
find_models_by_category("models")
# Find text corpora
find_models_by_category("corpora")

# Download dataset and train a model
print("Loading training data...")
corpus = api.load("text8")
print("Training Word2Vec model...")
from gensim.models import Word2Vec
model = Word2Vec(
sentences=corpus,
vector_size=100,
window=5,
min_count=5,
workers=4,
epochs=5
)
print(f"Trained model with {len(model.wv)} words")
# Compare with pre-trained vectors
print("\nLoading pre-trained vectors for comparison...")
pretrained = api.load("glove-twitter-25")
# Test both models
test_word = "computer"
if test_word in model.wv and test_word in pretrained:
custom_similar = model.wv.most_similar(test_word, topn=3)
pretrained_similar = pretrained.most_similar(test_word, topn=3)
print(f"\nSimilar to '{test_word}':")
print(f"Custom model: {[word for word, score in custom_similar]}")
print(f"Pre-trained: {[word for word, score in pretrained_similar]}")Install with Tessl CLI
npx tessl i tessl/pypi-gensimdocs
evals
scenario-1
scenario-2
scenario-3
scenario-4
scenario-5
scenario-6
scenario-7
scenario-8
scenario-9