CtrlK
Blog · Docs · Log in · Get started
Tessl Logo

tessl/pypi-gensim

Python library for topic modelling, document indexing and similarity retrieval with large corpora

78

1.02x
Overview
Eval results
Files

docs/data-downloading.md

Data Downloading

Convenient API for downloading pre-trained models and datasets including Word2Vec, GloVe, FastText models, and text corpora. The downloader handles caching, version management, and integrity verification automatically.

Capabilities

Core Download Functions

Primary functions for downloading and loading models and datasets from the gensim-data repository.

def load(name: str, return_path: bool = False):
    """
    Download (if not already cached) and load a model or dataset.

    Parameters:
    - name: Name of the model or dataset to load
    - return_path: If True, return the local file path instead of the loaded object

    Returns:
    Loaded model/dataset object, or the file path when return_path is True

    Raises:
    Exception: If the model/dataset is not found or the download fails
    """

def info(name: str = None, show_only_latest: bool = True, name_only: bool = False):
    """
    Get information about available models and datasets.

    Parameters:
    - name: Specific model/dataset name to look up (optional)
    - show_only_latest: If True, hide outdated versions (applies only when name is None)
    - name_only: If True, return only the names of available models and corpora

    Returns:
    Dictionary with model/dataset information:
    - if name is None, info about all available items
    - if name is provided, detailed info about that specific item
    - if name_only is True, only the names
    """

Configuration Constants

Configuration values for the download system.

BASE_DIR: str
    """Default download directory (~/gensim-data by default).
    Can be overridden with GENSIM_DATA_DIR environment variable."""

DATA_LIST_URL: str
    """URL for the list of available models and datasets."""

DOWNLOAD_BASE_URL: str
    """Base URL for downloading models and datasets."""

Usage Examples

Loading Pre-trained Word Vectors

import gensim.downloader as api

# Load pre-trained GloVe vectors
glove_vectors = api.load("glove-twitter-25")
print(f"Loaded GloVe vectors: {len(glove_vectors)} words")

# Find similar words
similar_words = glove_vectors.most_similar("python", topn=5)
print(f"Words similar to 'python': {similar_words}")

# Get word vector
if "computer" in glove_vectors:
    vector = glove_vectors["computer"]
    print(f"'computer' vector shape: {vector.shape}")

# Calculate word similarity
if "computer" in glove_vectors and "technology" in glove_vectors:
    similarity = glove_vectors.similarity("computer", "technology")
    print(f"Similarity between 'computer' and 'technology': {similarity}")

Loading Text Datasets

# Load text8 dataset (Wikipedia dump)
text8_corpus = api.load("text8")
print(f"Loaded text8 dataset")

# text8 is an iterable of word lists
first_sentence = next(iter(text8_corpus))
print(f"First sentence length: {len(first_sentence)} words")
print(f"First 10 words: {first_sentence[:10]}")

# Use dataset for training models
from gensim.models import Word2Vec

# Train Word2Vec on the dataset
model = Word2Vec(text8_corpus, vector_size=100, window=5, min_count=5, workers=4)
print(f"Trained Word2Vec model with {len(model.wv)} words")

Getting Information About Available Data

# Get information about all available models and datasets
all_info = api.info()
print(f"Available items: {len(all_info)}")

# Show categories
for category in all_info:
    items = all_info[category]
    print(f"{category}: {len(items)} items")
    
    # Show first few items in each category
    for item_name in list(items.keys())[:3]:
        item_info = items[item_name]
        print(f"  - {item_name}: {item_info.get('description', 'No description')}")

# Get detailed information about a specific model
word2vec_info = api.info("word2vec-google-news-300")
print(f"\nWord2Vec Google News model info:")
print(f"Description: {word2vec_info.get('description')}")
print(f"Size: {word2vec_info.get('file_size')} bytes")
print(f"Vocabulary size: {word2vec_info.get('num_records')} words")

Working with Different Model Types

# Load different types of models
models_to_try = [
    "glove-wiki-gigaword-50",    # GloVe vectors
    "fasttext-wiki-news-subwords-300",  # FastText vectors
    "word2vec-google-news-300"   # Word2Vec vectors (large, may take time)
]

for model_name in models_to_try:
    try:
        # Get info first to check size
        model_info = api.info(model_name)
        file_size_mb = model_info.get('file_size', 0) / (1024 * 1024)
        
        print(f"\n{model_name}:")
        print(f"  Size: {file_size_mb:.1f} MB")
        print(f"  Description: {model_info.get('description', 'No description')}")
        
        # Only load smaller models for demonstration
        if file_size_mb < 100:  # Only load models smaller than 100MB
            vectors = api.load(model_name)
            print(f"  Loaded: {len(vectors)} word vectors")
            
            # Test with a common word
            if "computer" in vectors:
                similar = vectors.most_similar("computer", topn=3)
                print(f"  Similar to 'computer': {[word for word, score in similar]}")
        else:
            print(f"  Skipping (too large for demo)")
            
    except Exception as e:
        print(f"  Error loading {model_name}: {e}")

Loading Corpora for Model Training

# Available text corpora
corpora_to_try = [
    "text8",           # Wikipedia text
    "fake-news",       # Fake news dataset
    "lee_background_corpus"  # Lee background corpus
]

for corpus_name in corpora_to_try:
    try:
        print(f"\nLoading corpus: {corpus_name}")
        corpus = api.load(corpus_name)
        
        # Get first few documents to understand structure
        docs = []
        for i, doc in enumerate(corpus):
            docs.append(doc)
            if i >= 2:  # Just get first 3 documents
                break
        
        print(f"  Number of documents (sample): {len(docs)}")
        if docs:
            print(f"  First document type: {type(docs[0])}")
            if isinstance(docs[0], list):
                print(f"  First document length: {len(docs[0])} tokens")
                print(f"  First few tokens: {docs[0][:10]}")
        
    except Exception as e:
        print(f"  Error loading {corpus_name}: {e}")

Managing Download Cache

import os

# Check current download directory
print(f"Download directory: {api.BASE_DIR}")

# Check if directory exists and what's in it
if os.path.exists(api.BASE_DIR):
    items = os.listdir(api.BASE_DIR)
    print(f"Cached items: {len(items)}")
    for item in items[:5]:  # Show first 5
        item_path = os.path.join(api.BASE_DIR, item)
        if os.path.isdir(item_path):
            print(f"  {item}/ (directory)")
        else:
            size = os.path.getsize(item_path) / (1024 * 1024)
            print(f"  {item} ({size:.1f} MB)")
else:
    print("Download directory doesn't exist yet")

Using Return Path Option

# Get file path instead of loading the model
model_path = api.load("glove-twitter-25", return_path=True)
print(f"Model file path: {model_path}")

# You can then load it manually if needed
from gensim.models import KeyedVectors
vectors = KeyedVectors.load_word2vec_format(model_path)
print(f"Manually loaded vectors: {len(vectors)} words")

Error Handling and Validation

def safe_load_model(model_name, max_size_mb=50):
    """Load *model_name* via the downloader, refusing models over *max_size_mb*.

    Returns the loaded model, or None when the model is unknown,
    exceeds the size limit, or any step of lookup/download fails.
    """
    try:
        # Consult the catalog before triggering any download.
        metadata = api.info(model_name)
        if not metadata:
            print(f"Model '{model_name}' not found")
            return None

        megabytes = metadata.get('file_size', 0) / (1024 * 1024)
        if megabytes > max_size_mb:
            print(f"Model '{model_name}' is {megabytes:.1f} MB (exceeds {max_size_mb} MB limit)")
            return None

        print(f"Loading '{model_name}' ({megabytes:.1f} MB)...")
        loaded = api.load(model_name)
        print(f"Successfully loaded '{model_name}'")
        return loaded

    except Exception as exc:
        print(f"Error loading '{model_name}': {exc}")
        return None

# Test safe loading
model = safe_load_model("glove-twitter-25")
if model:
    print(f"Model has {len(model)} word vectors")

Finding Models by Category

def find_models_by_category(category_name):
    """Print every item registered under *category_name* in the data catalog.

    Lists each item's name, size in MB, and description; prints the set of
    valid categories when *category_name* is not recognized.
    """
    catalog = api.info()

    # Guard clause: unknown category — report and show what is available.
    if category_name not in catalog:
        print(f"Category '{category_name}' not found")
        print(f"Available categories: {list(catalog.keys())}")
        return

    print(f"\nModels in '{category_name}' category:")
    for entry_name, entry in catalog[category_name].items():
        megabytes = entry.get('file_size', 0) / (1024 * 1024)
        print(f"  {entry_name}")
        print(f"    Size: {megabytes:.1f} MB")
        print(f"    Description: {entry.get('description', 'No description')}")
        print()

# Find word embedding models
find_models_by_category("models")

# Find text corpora
find_models_by_category("corpora")

Integration with Model Training

# Download dataset and train a model
print("Loading training data...")
corpus = api.load("text8")

print("Training Word2Vec model...")
from gensim.models import Word2Vec

model = Word2Vec(
    sentences=corpus,
    vector_size=100,
    window=5,
    min_count=5,
    workers=4,
    epochs=5
)

print(f"Trained model with {len(model.wv)} words")

# Compare with pre-trained vectors
print("\nLoading pre-trained vectors for comparison...")
pretrained = api.load("glove-twitter-25")

# Test both models
test_word = "computer"
if test_word in model.wv and test_word in pretrained:
    custom_similar = model.wv.most_similar(test_word, topn=3)
    pretrained_similar = pretrained.most_similar(test_word, topn=3)
    
    print(f"\nSimilar to '{test_word}':")
    print(f"Custom model: {[word for word, score in custom_similar]}")
    print(f"Pre-trained: {[word for word, score in pretrained_similar]}")

Install with Tessl CLI

npx tessl i tessl/pypi-gensim

docs

corpus-management.md

data-downloading.md

index.md

mathematical-utilities.md

nlp-models.md

similarity-computations.md

text-preprocessing.md

tile.json