Repository of pre-trained NLP Transformer models: BERT & RoBERTa, GPT & GPT-2, Transformer-XL, XLNet and XLM
—
File handling utilities for downloading, caching, and managing pre-trained model files. These utilities handle automatic download of model weights, configurations, and tokenizer files from remote repositories with local caching support to avoid repeated downloads.
Main function for downloading and caching files from URLs or returning local file paths.
def cached_path(url_or_filename, cache_dir=None):
"""
Download and cache a file from a URL or return the path if it's a local file.
Parameters:
- url_or_filename (str): URL to download from or local file path
- cache_dir (str, optional): Directory to cache downloaded files
Defaults to PYTORCH_TRANSFORMERS_CACHE
Returns:
str: Path to the cached or local file
Raises:
EnvironmentError: If the file cannot be found or downloaded
"""Usage Examples:
from pytorch_transformers import cached_path
# Download and cache a model file
model_url = "https://huggingface.co/bert-base-uncased/resolve/main/pytorch_model.bin"
local_path = cached_path(model_url)
print(f"Model cached at: {local_path}")
# Use custom cache directory
custom_cache = "./my_cache"
config_url = "https://huggingface.co/bert-base-uncased/resolve/main/config.json"
config_path = cached_path(config_url, cache_dir=custom_cache)
# Return local file path unchanged
local_file = "./my_model.bin"
path = cached_path(local_file) # Returns "./my_model.bin"

Pre-defined cache directory paths used by the library for storing downloaded files.
PYTORCH_TRANSFORMERS_CACHE: str
# Default cache directory for pytorch-transformers
# Typically resolves to: ~/.cache/torch/pytorch_transformers/
PYTORCH_PRETRAINED_BERT_CACHE: str
# Legacy cache directory for backward compatibility with pytorch-pretrained-bert
# Typically resolves to: ~/.pytorch_pretrained_bert/

Usage Examples:
from pytorch_transformers import PYTORCH_TRANSFORMERS_CACHE, PYTORCH_PRETRAINED_BERT_CACHE
import os
# Check default cache locations
print(f"Default cache: {PYTORCH_TRANSFORMERS_CACHE}")
print(f"Legacy cache: {PYTORCH_PRETRAINED_BERT_CACHE}")
# List cached files
if os.path.exists(PYTORCH_TRANSFORMERS_CACHE):
cached_files = os.listdir(PYTORCH_TRANSFORMERS_CACHE)
print(f"Cached files: {len(cached_files)}")
for file in cached_files[:5]: # Show first 5 files
print(f" {file}")
# Clear cache (be careful!)
import shutil
# shutil.rmtree(PYTORCH_TRANSFORMERS_CACHE) # Uncomment to clear cache

Standard filenames used by the library for model components.
WEIGHTS_NAME: str = "pytorch_model.bin"
# Default filename for PyTorch model weights
CONFIG_NAME: str = "config.json"
# Default filename for model configuration files
TF_WEIGHTS_NAME: str = "model.ckpt"
# Default filename for TensorFlow model weights

Usage Examples:
from pytorch_transformers import WEIGHTS_NAME, CONFIG_NAME, TF_WEIGHTS_NAME
import os
# Check if model files exist in a directory
model_dir = "./my_model"
weights_path = os.path.join(model_dir, WEIGHTS_NAME)
config_path = os.path.join(model_dir, CONFIG_NAME)
if os.path.exists(weights_path):
print(f"Model weights found: {weights_path}")
if os.path.exists(config_path):
print(f"Model config found: {config_path}")
# Save model with standard names
model = BertModel.from_pretrained("bert-base-uncased")
model.save_pretrained(model_dir) # Creates pytorch_model.bin and config.json

When you load a pre-trained model for the first time, the library automatically downloads and caches the required files:
from pytorch_transformers import BertModel
# First time: Downloads and caches files
model = BertModel.from_pretrained("bert-base-uncased")
# Subsequent times: Uses cached files
model = BertModel.from_pretrained("bert-base-uncased") # Much faster!The cache directory contains subdirectories for different file types:
~/.cache/torch/pytorch_transformers/
├── 0123abc...def/ # Hash-based subdirectory
│ ├── pytorch_model.bin # Model weights
│ ├── config.json # Model configuration
│ └── tokenizer.json # Tokenizer files
├── 4567ghi...jkl/
│ └── vocab.txt # Vocabulary files
└── ...

Control caching behavior through environment variables:
# Set custom cache directory
export PYTORCH_TRANSFORMERS_CACHE="/path/to/my/cache"
# Disable caching (download to temp directory each time)
export PYTORCH_TRANSFORMERS_CACHE="/tmp"
# Use offline mode (only use cached files)
export HF_DATASETS_OFFLINE=1
export TRANSFORMERS_OFFLINE=1

import os
import shutil
from pytorch_transformers import PYTORCH_TRANSFORMERS_CACHE
def get_cache_size():
"""Get total size of cache directory in MB."""
if not os.path.exists(PYTORCH_TRANSFORMERS_CACHE):
return 0
total_size = 0
for dirpath, dirnames, filenames in os.walk(PYTORCH_TRANSFORMERS_CACHE):
for filename in filenames:
filepath = os.path.join(dirpath, filename)
total_size += os.path.getsize(filepath)
return total_size / (1024 * 1024) # Convert to MB
def clear_cache():
"""Clear all cached files."""
if os.path.exists(PYTORCH_TRANSFORMERS_CACHE):
shutil.rmtree(PYTORCH_TRANSFORMERS_CACHE)
print(f"Cache cleared: {PYTORCH_TRANSFORMERS_CACHE}")
def list_cached_models():
"""List all cached model directories."""
if not os.path.exists(PYTORCH_TRANSFORMERS_CACHE):
return []
cached_dirs = []
for item in os.listdir(PYTORCH_TRANSFORMERS_CACHE):
item_path = os.path.join(PYTORCH_TRANSFORMERS_CACHE, item)
if os.path.isdir(item_path):
# Check if it contains model files
has_weights = os.path.exists(os.path.join(item_path, "pytorch_model.bin"))
has_config = os.path.exists(os.path.join(item_path, "config.json"))
if has_weights or has_config:
cached_dirs.append(item)
return cached_dirs
# Usage
print(f"Cache size: {get_cache_size():.1f} MB")
print(f"Cached models: {len(list_cached_models())}")The caching utilities support HTTP proxies for downloading files in restricted network environments:
import os
# Set proxy environment variables
os.environ['HTTP_PROXY'] = 'http://proxy.company.com:8080'
os.environ['HTTPS_PROXY'] = 'https://proxy.company.com:8080'
# Download will now use proxy
from pytorch_transformers import BertModel
model = BertModel.from_pretrained("bert-base-uncased")import os
# Set download timeout (in seconds)
os.environ['HF_HUB_DOWNLOAD_TIMEOUT'] = '300' # 5 minutes
# For very slow connections
os.environ['HF_HUB_DOWNLOAD_TIMEOUT'] = '1800' # 30 minutes

When working in environments without internet access:
import os
# Enable offline mode - only use cached files
os.environ['HF_DATASETS_OFFLINE'] = '1'
os.environ['TRANSFORMERS_OFFLINE'] = '1'
try:
# This will only work if files are already cached
model = BertModel.from_pretrained("bert-base-uncased")
except OSError as e:
print(f"Model not in cache: {e}")The file utilities provide informative error messages for common issues:
from pytorch_transformers import cached_path
try:
# Invalid URL
path = cached_path("https://invalid-url.com/model.bin")
except EnvironmentError as e:
print(f"Download failed: {e}")
try:
# Local file doesn't exist
path = cached_path("./nonexistent_model.bin")
except EnvironmentError as e:
print(f"File not found: {e}")
try:
# Network issues
path = cached_path("https://valid-url.com/model.bin")
except EnvironmentError as e:
print(f"Network error: {e}")The file utilities are automatically used by all from_pretrained() methods:
# These all use cached_path internally
from pytorch_transformers import (
AutoModel, AutoTokenizer, AutoConfig,
BertModel, BertTokenizer, BertConfig
)
# Download and cache if needed
model = AutoModel.from_pretrained("bert-base-uncased")
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
config = AutoConfig.from_pretrained("bert-base-uncased")
# Custom cache directory for specific models
model = BertModel.from_pretrained(
"bert-base-uncased",
cache_dir="./my_bert_cache"
)

Install with Tessl CLI
npx tessl i tessl/pypi-pytorch-transformers