Repository of pre-trained NLP Transformer models: BERT & RoBERTa, GPT & GPT-2, Transformer-XL, XLNet and XLM
—
Core abstract base classes that define common interfaces shared by all models, tokenizers, and configurations in the pytorch-transformers library. These classes provide essential functionality for loading, saving, and managing pre-trained components.
Abstract base class for all transformer models, providing common functionality for model loading, saving, parameter management, and inference.
class PreTrainedModel:
@classmethod
def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs):
"""
Instantiate a pre-trained PyTorch model from a pre-trained model configuration.
Parameters:
- pretrained_model_name_or_path (str): Model name or local path
- config (PretrainedConfig, optional): Model configuration
- cache_dir (str, optional): Directory to cache downloaded files
- from_tf (bool, optional): Load from TensorFlow checkpoint
- force_download (bool, optional): Force re-download even if cached
- resume_download (bool, optional): Resume incomplete downloads
- proxies (dict, optional): HTTP proxy configuration
- output_loading_info (bool, optional): Return loading info dict
- use_auth_token (str/bool, optional): Authentication token for private models
- revision (str, optional): Git branch/tag/commit to use
- kwargs: Additional arguments passed to model constructor
Returns:
PreTrainedModel: Instance of the model class
"""
def save_pretrained(self, save_directory):
"""
Save model weights and configuration to a directory.
Parameters:
- save_directory (str): Directory to save model files
"""
def resize_token_embeddings(self, new_num_tokens=None):
"""
Resize token embeddings matrix of the model.
Parameters:
- new_num_tokens (int, optional): New vocabulary size
Returns:
torch.nn.Embedding: New embeddings matrix
"""
def prune_heads(self, heads_to_prune):
"""
Prune attention heads in the model.
Parameters:
- heads_to_prune (dict): Dictionary mapping layer to heads to prune
"""
def get_input_embeddings(self):
"""
Get the model's input embeddings.
Returns:
torch.nn.Module: Input embeddings layer
"""
def set_input_embeddings(self, value):
"""
Set the model's input embeddings.
Parameters:
- value (torch.nn.Module): New input embeddings layer
"""
def get_output_embeddings(self):
"""
Get the model's output embeddings.
Returns:
torch.nn.Module: Output embeddings layer
"""
def set_output_embeddings(self, new_embeddings):
"""
Set the model's output embeddings.
Parameters:
- new_embeddings (torch.nn.Module): New output embeddings layer
"""Usage Examples:
from pytorch_transformers import BertModel
import torch
# Load pre-trained model
model = BertModel.from_pretrained("bert-base-uncased")
# Save model
model.save_pretrained("./my-bert-model")
# Resize embeddings for new vocabulary
model.resize_token_embeddings(30000)
# Prune attention heads
heads_to_prune = {0: [0, 1], 1: [0]} # Prune heads 0,1 in layer 0 and head 0 in layer 1
model.prune_heads(heads_to_prune)
# Access embeddings
input_embeddings = model.get_input_embeddings()
print(f"Embedding dimensions: {input_embeddings.weight.shape}")
# Model inference
inputs = torch.randint(0, 1000, (1, 10)) # Random token IDs
outputs = model(inputs)

Abstract base class for all tokenizers, providing common tokenization interface and special token handling.
class PreTrainedTokenizer:
@classmethod
def from_pretrained(cls, pretrained_model_name_or_path, **kwargs):
"""
Instantiate a pre-trained tokenizer from a vocabulary file.
Parameters:
- pretrained_model_name_or_path (str): Model name or local path
- cache_dir (str, optional): Directory to cache downloaded files
- force_download (bool, optional): Force re-download even if cached
- resume_download (bool, optional): Resume incomplete downloads
- proxies (dict, optional): HTTP proxy configuration
- use_auth_token (str/bool, optional): Authentication token for private models
- revision (str, optional): Git branch/tag/commit to use
- kwargs: Additional arguments passed to tokenizer constructor
Returns:
PreTrainedTokenizer: Instance of the tokenizer class
"""
def save_pretrained(self, save_directory):
"""
Save tokenizer vocabulary and configuration to a directory.
Parameters:
- save_directory (str): Directory to save tokenizer files
"""
def tokenize(self, text, **kwargs):
"""
Tokenize a string into a list of tokens.
Parameters:
- text (str): Input text to tokenize
- kwargs: Additional tokenization arguments
Returns:
List[str]: List of tokens
"""
def encode(self, text, text_pair=None, add_special_tokens=True, max_length=None, **kwargs):
"""
Encode text into token IDs.
Parameters:
- text (str): Primary input text
- text_pair (str, optional): Secondary input text for sentence pairs
- add_special_tokens (bool): Whether to add special tokens
- max_length (int, optional): Maximum sequence length
- kwargs: Additional encoding arguments
Returns:
List[int]: List of token IDs
"""
def decode(self, token_ids, skip_special_tokens=False, clean_up_tokenization_spaces=True):
"""
Decode token IDs back to text.
Parameters:
- token_ids (List[int]): Token IDs to decode
- skip_special_tokens (bool): Whether to remove special tokens
- clean_up_tokenization_spaces (bool): Whether to clean up spaces
Returns:
str: Decoded text
"""
def convert_tokens_to_ids(self, tokens):
"""
Convert tokens to token IDs.
Parameters:
- tokens (List[str] or str): Token(s) to convert
Returns:
List[int] or int: Token ID(s)
"""
def convert_ids_to_tokens(self, ids):
"""
Convert token IDs to tokens.
Parameters:
- ids (List[int] or int): Token ID(s) to convert
Returns:
List[str] or str: Token(s)
"""
def __call__(self, text, text_pair=None, **kwargs):
"""
Main tokenization method with tensor output support.
Parameters:
- text (str or List[str]): Input text(s)
- text_pair (str or List[str], optional): Pair text(s)
- return_tensors (str, optional): Type of tensors to return ('pt', 'tf', 'np')
- padding (bool/str, optional): Padding strategy
- truncation (bool/str, optional): Truncation strategy
- max_length (int, optional): Maximum sequence length
- kwargs: Additional arguments
Returns:
Dict: Dictionary containing input_ids, attention_mask, etc.
"""Special Token Properties:
# Special tokens available on all tokenizers
bos_token: str # Beginning of sequence token
eos_token: str # End of sequence token
unk_token: str # Unknown token
sep_token: str # Separator token
pad_token: str # Padding token
cls_token: str # Classification token
mask_token: str # Mask token for masked language modeling
# Special token IDs
bos_token_id: int
eos_token_id: int
unk_token_id: int
sep_token_id: int
pad_token_id: int
cls_token_id: int
mask_token_id: int
# Vocabulary size
vocab_size: int

Usage Examples:
from pytorch_transformers import BertTokenizer
# Load tokenizer
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
# Basic tokenization
text = "Hello, how are you?"
tokens = tokenizer.tokenize(text)
print(f"Tokens: {tokens}")
# Encoding to IDs
token_ids = tokenizer.encode(text)
print(f"Token IDs: {token_ids}")
# Decoding back to text
decoded = tokenizer.decode(token_ids)
print(f"Decoded: {decoded}")
# Full preprocessing with tensors
inputs = tokenizer(
text,
return_tensors="pt",
padding=True,
truncation=True,
max_length=512
)
print(f"Input shape: {inputs['input_ids'].shape}")
# Access special tokens
print(f"CLS token: {tokenizer.cls_token}")
print(f"SEP token: {tokenizer.sep_token}")
print(f"PAD token ID: {tokenizer.pad_token_id}")
# Save tokenizer
tokenizer.save_pretrained("./my-tokenizer")Base configuration class for all model configurations, containing model hyperparameters and architecture specifications.
class PretrainedConfig:
@classmethod
def from_pretrained(cls, pretrained_model_name_or_path, **kwargs):
"""
Instantiate a PretrainedConfig from a pre-trained model configuration.
Parameters:
- pretrained_model_name_or_path (str): Model name or local path
- cache_dir (str, optional): Directory to cache downloaded files
- force_download (bool, optional): Force re-download even if cached
- resume_download (bool, optional): Resume incomplete downloads
- proxies (dict, optional): HTTP proxy configuration
- use_auth_token (str/bool, optional): Authentication token for private models
- revision (str, optional): Git branch/tag/commit to use
- kwargs: Additional configuration parameters
Returns:
PretrainedConfig: Instance of the configuration class
"""
def save_pretrained(self, save_directory):
"""
Save configuration to a directory.
Parameters:
- save_directory (str): Directory to save configuration file
"""
def to_dict(self):
"""
Serialize configuration to a Python dictionary.
Returns:
Dict: Configuration as dictionary
"""
def to_json_string(self):
"""
Serialize configuration to a JSON string.
Returns:
str: Configuration as JSON string
"""
@classmethod
def from_dict(cls, config_dict, **kwargs):
"""
Construct configuration from a dictionary.
Parameters:
- config_dict (Dict): Configuration dictionary
- kwargs: Additional parameters
Returns:
PretrainedConfig: Configuration instance
"""
@classmethod
def from_json_file(cls, json_file):
"""
Construct configuration from a JSON file.
Parameters:
- json_file (str): Path to JSON configuration file
Returns:
PretrainedConfig: Configuration instance
"""Usage Examples:
from pytorch_transformers import BertConfig
# Load configuration
config = BertConfig.from_pretrained("bert-base-uncased")
# Access configuration parameters
print(f"Hidden size: {config.hidden_size}")
print(f"Number of layers: {config.num_hidden_layers}")
print(f"Number of attention heads: {config.num_attention_heads}")
# Modify configuration
config.num_labels = 3 # For classification with 3 classes
# Save configuration
config.save_pretrained("./my-config")
# Convert to dictionary/JSON
config_dict = config.to_dict()
config_json = config.to_json_string()
# Create from dictionary
custom_config = BertConfig.from_dict({
"hidden_size": 512,
"num_hidden_layers": 6,
"num_attention_heads": 8
})

Core utility classes and functions for model parameter management and weight manipulation.
A 1D convolution layer implementation as used in GPT models, where weights are transposed compared to standard linear layers.
class Conv1D(nn.Module):
def __init__(self, nf, nx):
"""
Conv1D layer as defined by Radford et al. for OpenAI GPT (and also used in GPT-2).
Basically works like a Linear layer but the weights are transposed.
Parameters:
- nf (int): Size of output features
- nx (int): Size of input features
"""
def forward(self, x):
"""
Forward pass through the Conv1D layer.
Parameters:
- x (torch.Tensor): Input tensor
Returns:
torch.Tensor: Output tensor
"""Utility functions for pruning model layers to remove attention heads or reduce model size.
def prune_layer(layer, index, dim=None):
"""
Prune a Conv1D or nn.Linear layer to keep only entries in index.
Return the pruned layer as a new layer with requires_grad=True.
Used to remove heads.
Parameters:
- layer (nn.Module): Layer to prune (Conv1D or nn.Linear)
- index (torch.LongTensor): Indices of entries to keep
- dim (int, optional): Dimension along which to prune (default: 0 for Linear, 1 for Conv1D)
Returns:
nn.Module: New pruned layer
"""Usage Examples:
from pytorch_transformers import Conv1D, prune_layer
import torch
import torch.nn as nn
# Create a Conv1D layer
conv1d = Conv1D(768, 512) # 768 output features, 512 input features
input_tensor = torch.randn(32, 128, 512) # batch_size, seq_len, input_features
output = conv1d(input_tensor)
print(output.shape) # torch.Size([32, 128, 768])
# Prune a linear layer to keep only certain features
linear = nn.Linear(768, 12) # Original layer
indices_to_keep = torch.LongTensor([0, 2, 4, 6, 8, 10]) # Keep every other feature
pruned_linear = prune_layer(linear, indices_to_keep, dim=1)
print(f"Original: {linear.weight.shape}, Pruned: {pruned_linear.weight.shape}")File naming constants used throughout the library for consistent model serialization.
# Model weight files
WEIGHTS_NAME: str = "pytorch_model.bin"
CONFIG_NAME: str = "config.json"
TF_WEIGHTS_NAME: str = "model.ckpt"Usage Examples:
from pytorch_transformers import WEIGHTS_NAME, CONFIG_NAME, TF_WEIGHTS_NAME
import os
# Check for model files in a directory
model_dir = "./my-model"
weights_path = os.path.join(model_dir, WEIGHTS_NAME)
config_path = os.path.join(model_dir, CONFIG_NAME)
if os.path.exists(weights_path):
print(f"PyTorch weights found: {weights_path}")
if os.path.exists(config_path):
print(f"Config found: {config_path}")
# When loading TensorFlow weights
tf_weights_path = os.path.join(model_dir, TF_WEIGHTS_NAME + ".index")
if os.path.exists(tf_weights_path):
print(f"TensorFlow weights found: {tf_weights_path}")Install with Tessl CLI
npx tessl i tessl/pypi-pytorch-transformers