PyTorch implementations of transformer-based language models including BERT, OpenAI GPT, GPT-2, and Transformer-XL with pre-trained models, tokenizers, and utilities for NLP tasks
---
OpenAI GPT, GPT-2, and Transformer-XL model families with their configurations and specialized components for autoregressive language modeling, text generation, and extended context processing.
Original OpenAI GPT models with configuration and task-specific variants for language modeling and classification.
class OpenAIGPTConfig:
    """Configuration holding the hyperparameters of the OpenAI GPT architecture (API skeleton)."""

    def __init__(
        self,
        vocab_size_or_config_json_file=40478,
        n_positions=512,
        n_ctx=512,
        n_embd=768,
        n_layer=12,
        n_head=12,
        afn="gelu",
        resid_pdrop=0.1,
        embd_pdrop=0.1,
        attn_pdrop=0.1,
        layer_norm_epsilon=1e-5,
        initializer_range=0.02
    ):
        """
        Initialize OpenAI GPT configuration.

        Args:
            vocab_size_or_config_json_file (int or str): Vocabulary size or config path
            n_positions (int): Maximum position embeddings
            n_ctx (int): Context size
            n_embd (int): Embedding dimension
            n_layer (int): Number of transformer layers
            n_head (int): Number of attention heads
            afn (str): Activation function
            resid_pdrop (float): Residual dropout probability
            embd_pdrop (float): Embedding dropout probability
            attn_pdrop (float): Attention dropout probability
            layer_norm_epsilon (float): Layer normalization epsilon
            initializer_range (float): Weight initialization range
        """

    @classmethod
    def from_dict(cls, json_object):
        """Create configuration from dictionary."""

    @classmethod
    def from_json_file(cls, json_file):
        """Create configuration from JSON file."""

    def to_dict(self):
        """Convert to dictionary."""

    def to_json_string(self):
        """Convert to JSON string."""
"""Convert to JSON string."""class OpenAIGPTModel:
def __init__(self, config):
"""
Initialize OpenAI GPT base model.
Args:
config (OpenAIGPTConfig): Model configuration
"""
def forward(self, input_ids, position_ids=None, token_type_ids=None):
"""
Forward pass through GPT model.
Args:
input_ids (torch.Tensor): Token IDs of shape [batch_size, seq_len]
position_ids (torch.Tensor, optional): Position IDs
token_type_ids (torch.Tensor, optional): Token type IDs
Returns:
torch.Tensor: Hidden states of shape [batch_size, seq_len, hidden_size]
"""
@classmethod
def from_pretrained(cls, pretrained_model_name_or_path, cache_dir=None, **kwargs):
"""Load pre-trained OpenAI GPT model."""class OpenAIGPTLMHeadModel:
def __init__(self, config):
"""
Initialize OpenAI GPT with language modeling head.
Args:
config (OpenAIGPTConfig): Model configuration
"""
def forward(
self,
input_ids,
position_ids=None,
token_type_ids=None,
lm_labels=None
):
"""
Forward pass with language modeling head.
Args:
input_ids (torch.Tensor): Token IDs
position_ids (torch.Tensor, optional): Position IDs
token_type_ids (torch.Tensor, optional): Token type IDs
lm_labels (torch.Tensor, optional): Language modeling labels
Returns:
torch.Tensor: Language modeling logits or loss if labels provided
"""
@classmethod
def from_pretrained(cls, pretrained_model_name_or_path, cache_dir=None, **kwargs):
"""Load pre-trained model."""class OpenAIGPTDoubleHeadsModel:
def __init__(self, config):
"""
Initialize OpenAI GPT with both language modeling and classification heads.
Args:
config (OpenAIGPTConfig): Model configuration
"""
def forward(
self,
input_ids,
position_ids=None,
token_type_ids=None,
lm_labels=None,
multiple_choice_labels=None
):
"""
Forward pass with both heads.
Args:
input_ids (torch.Tensor): Token IDs
position_ids (torch.Tensor, optional): Position IDs
token_type_ids (torch.Tensor, optional): Token type IDs
lm_labels (torch.Tensor, optional): Language modeling labels
multiple_choice_labels (torch.Tensor, optional): Classification labels
Returns:
tuple: (lm_logits, classification_logits) or losses if labels provided
"""
@classmethod
def from_pretrained(cls, pretrained_model_name_or_path, cache_dir=None, **kwargs):
"""Load pre-trained model."""GPT-2 model family with improved architecture and byte-level BPE tokenization.
class GPT2Config:
    """Configuration holding the hyperparameters of the GPT-2 architecture (API skeleton)."""

    def __init__(
        self,
        vocab_size_or_config_json_file=50257,
        n_positions=1024,
        n_ctx=1024,
        n_embd=768,
        n_layer=12,
        n_head=12,
        n_inner=None,
        afn="gelu_new",
        resid_pdrop=0.1,
        embd_pdrop=0.1,
        attn_pdrop=0.1,
        layer_norm_epsilon=1e-5,
        initializer_range=0.02
    ):
        """
        Initialize GPT-2 configuration.

        Args:
            vocab_size_or_config_json_file (int or str): Vocabulary size or config path
            n_positions (int): Maximum position embeddings
            n_ctx (int): Context size
            n_embd (int): Embedding dimension
            n_layer (int): Number of layers
            n_head (int): Number of attention heads
            n_inner (int, optional): Inner dimension (defaults to 4 * n_embd)
            afn (str): Activation function
            resid_pdrop (float): Residual dropout
            embd_pdrop (float): Embedding dropout
            attn_pdrop (float): Attention dropout
            layer_norm_epsilon (float): Layer norm epsilon
            initializer_range (float): Initialization range
        """

    @classmethod
    def from_dict(cls, json_object):
        """Create from dictionary."""

    @classmethod
    def from_json_file(cls, json_file):
        """Create from JSON file."""

    def to_dict(self):
        """Convert to dictionary."""

    def to_json_string(self):
        """Convert to JSON string."""
"""Convert to JSON string."""class GPT2Model:
def __init__(self, config):
"""
Initialize GPT-2 base model.
Args:
config (GPT2Config): Model configuration
"""
def forward(self, input_ids, position_ids=None, token_type_ids=None):
"""
Forward pass through GPT-2.
Args:
input_ids (torch.Tensor): Token IDs
position_ids (torch.Tensor, optional): Position IDs
token_type_ids (torch.Tensor, optional): Token type IDs
Returns:
torch.Tensor: Hidden states
"""
@classmethod
def from_pretrained(cls, pretrained_model_name_or_path, cache_dir=None, **kwargs):
"""Load pre-trained GPT-2 model."""class GPT2LMHeadModel:
def __init__(self, config):
"""
Initialize GPT-2 with language modeling head.
Args:
config (GPT2Config): Model configuration
"""
def forward(
self,
input_ids,
position_ids=None,
token_type_ids=None,
lm_labels=None
):
"""
Forward pass with LM head.
Returns:
torch.Tensor: Language modeling logits or loss
"""
@classmethod
def from_pretrained(cls, pretrained_model_name_or_path, cache_dir=None, **kwargs):
"""Load pre-trained model."""class GPT2DoubleHeadsModel:
def __init__(self, config):
"""
Initialize GPT-2 with language modeling and classification heads.
Args:
config (GPT2Config): Model configuration
"""
def forward(
self,
input_ids,
position_ids=None,
token_type_ids=None,
lm_labels=None,
multiple_choice_labels=None
):
"""
Forward pass with both heads.
Returns:
tuple: (lm_logits, classification_logits) or losses
"""
@classmethod
def from_pretrained(cls, pretrained_model_name_or_path, cache_dir=None, **kwargs):
"""Load pre-trained model."""Transformer-XL models with extended context capability through recurrence mechanism and adaptive attention.
class TransfoXLConfig:
    """Configuration holding the hyperparameters of the Transformer-XL architecture (API skeleton)."""

    def __init__(
        self,
        vocab_size_or_config_json_file=267735,
        # NOTE(review): mutable default list — safe only if never mutated in place
        cutoffs=[20000, 40000, 200000],
        d_model=1024,
        d_embed=1024,
        n_head=16,
        d_head=64,
        d_inner=4096,
        div_val=4,
        pre_lnorm=False,
        n_layer=18,
        tgt_len=128,
        ext_len=0,
        mem_len=1600,
        clamp_len=1000,
        same_length=True,
        attn_type=0,
        sample_softmax=-1,
        adaptive=True,
        tie_weight=True,
        dropout=0.1,
        dropatt=0.0,
        untie_r=True,
        embd_init='normal',
        init='normal',
        init_range=0.01,
        proj_init_std=0.01,
        init_std=0.02
    ):
        """
        Initialize Transformer-XL configuration.

        Args:
            vocab_size_or_config_json_file (int or str): Vocabulary size or config path
            cutoffs (list): Adaptive softmax cutoffs
            d_model (int): Model dimension
            d_embed (int): Embedding dimension
            n_head (int): Number of attention heads
            d_head (int): Dimension per attention head
            d_inner (int): Inner feed-forward dimension
            div_val (int): Dimension reduction factor
            pre_lnorm (bool): Whether to use pre-layer normalization
            n_layer (int): Number of layers
            tgt_len (int): Target sequence length
            ext_len (int): Extended sequence length
            mem_len (int): Memory length
            clamp_len (int): Clamp length for positional encoding
            same_length (bool): Whether to use same length
            attn_type (int): Attention type
            sample_softmax (int): Sample softmax parameter
            adaptive (bool): Whether to use adaptive softmax
            tie_weight (bool): Whether to tie weights
            dropout (float): Dropout probability
            dropatt (float): Attention dropout
            untie_r (bool): Whether to untie relative position bias
            embd_init (str): Embedding initialization
            init (str): General initialization
            init_range (float): Initialization range
            proj_init_std (float): Projection initialization std
            init_std (float): Initialization std
        """

    @classmethod
    def from_dict(cls, json_object):
        """Create from dictionary."""

    @classmethod
    def from_json_file(cls, json_file):
        """Create from JSON file."""

    def to_dict(self):
        """Convert to dictionary."""

    def to_json_string(self):
        """Convert to JSON string."""
"""Convert to JSON string."""class TransfoXLModel:
def __init__(self, config):
"""
Initialize Transformer-XL base model.
Args:
config (TransfoXLConfig): Model configuration
"""
def forward(self, input_ids, mems=None):
"""
Forward pass with memory mechanism.
Args:
input_ids (torch.Tensor): Token IDs
mems (list, optional): Memory states from previous segments
Returns:
tuple: (hidden_states, new_mems) where:
- hidden_states (torch.Tensor): Output hidden states
- new_mems (list): Updated memory states
"""
@classmethod
def from_pretrained(cls, pretrained_model_name_or_path, cache_dir=None, **kwargs):
"""Load pre-trained Transformer-XL model."""class TransfoXLLMHeadModel:
def __init__(self, config):
"""
Initialize Transformer-XL with language modeling head.
Args:
config (TransfoXLConfig): Model configuration
"""
def forward(self, input_ids, labels=None, mems=None):
"""
Forward pass with LM head and memory.
Args:
input_ids (torch.Tensor): Token IDs
labels (torch.Tensor, optional): Language modeling labels
mems (list, optional): Memory states
Returns:
tuple: (prediction_scores, new_mems) or loss if labels provided
"""
@classmethod
def from_pretrained(cls, pretrained_model_name_or_path, cache_dir=None, **kwargs):
"""Load pre-trained model."""Functions to convert TensorFlow checkpoints to PyTorch format for each model family.
def load_tf_weights_in_openai_gpt(model, openai_checkpoint_folder_path):
    """
    Load TensorFlow OpenAI GPT checkpoint into PyTorch model.

    Args:
        model: PyTorch OpenAI GPT model
        openai_checkpoint_folder_path (str): Path to TF checkpoint folder

    Returns:
        PyTorch model with loaded weights
    """
def load_tf_weights_in_gpt2(model, gpt2_checkpoint_path):
    """
    Load TensorFlow GPT-2 checkpoint into PyTorch model.

    Args:
        model: PyTorch GPT-2 model
        gpt2_checkpoint_path (str): Path to TF checkpoint

    Returns:
        PyTorch model with loaded weights
    """
def load_tf_weights_in_transfo_xl(model, config, tf_path):
    """
    Load TensorFlow Transformer-XL checkpoint into PyTorch model.

    Args:
        model: PyTorch Transformer-XL model
        config (TransfoXLConfig): Model configuration
        tf_path (str): Path to TF checkpoint

    Returns:
        PyTorch model with loaded weights
    """
"""from pytorch_pretrained_bert import OpenAIGPTLMHeadModel, OpenAIGPTTokenizer
import torch
# Load model and tokenizer
model = OpenAIGPTLMHeadModel.from_pretrained('openai-gpt')
tokenizer = OpenAIGPTTokenizer.from_pretrained('openai-gpt')
# Prepare input
text = "The artificial intelligence will"
input_ids = torch.tensor([tokenizer.encode(text)])
# Generate text
model.eval()
with torch.no_grad():
outputs = model(input_ids)
predictions = outputs[0]
# Get next token probabilities
next_token_logits = predictions[0, -1, :]
next_token = torch.multinomial(torch.softmax(next_token_logits, dim=-1), 1)
# Decode next token
next_word = tokenizer.decode([next_token.item()])
print(f"Next word: {next_word}")from pytorch_pretrained_bert import GPT2Config, GPT2LMHeadModel
# Example: building GPT-2 from a custom configuration.
from pytorch_pretrained_bert import GPT2Config, GPT2LMHeadModel

# Create custom configuration.
# NOTE: the constructor's first parameter is vocab_size_or_config_json_file,
# not vocab_size — the original snippet used a keyword that does not exist
# in this API's documented signature.
config = GPT2Config(
    vocab_size_or_config_json_file=50257,
    n_positions=1024,
    n_embd=768,
    n_layer=12,
    n_head=12
)

# Initialize model with custom config
model = GPT2LMHeadModel(config)

# Or load pre-trained
model = GPT2LMHeadModel.from_pretrained('gpt2')
# Example: carrying Transformer-XL memory across segments.
# NOTE(review): indentation reconstructed from flattened source.
from pytorch_pretrained_bert import TransfoXLLMHeadModel, TransfoXLTokenizer
import torch

# Load model and tokenizer
model = TransfoXLLMHeadModel.from_pretrained('transfo-xl-wt103')
tokenizer = TransfoXLTokenizer.from_pretrained('transfo-xl-wt103')

# Process sequences with memory
sequence1 = "The weather today is beautiful and"
sequence2 = "sunny with clear blue skies."

# Encode sequences
input_ids_1 = torch.tensor([tokenizer.encode(sequence1)])
input_ids_2 = torch.tensor([tokenizer.encode(sequence2)])

# Forward pass with memory
model.eval()
with torch.no_grad():
    # Process first sequence
    outputs_1 = model(input_ids_1)
    mems = outputs_1[1]  # Extract memory states
    # Process second sequence with memory from first
    outputs_2 = model(input_ids_2, mems=mems)
    logits = outputs_2[0]
import torch
# Load double heads model
model = GPT2DoubleHeadsModel.from_pretrained('gpt2')
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
# Prepare input for both language modeling and classification
text = "This movie is great!"
input_ids = torch.tensor([tokenizer.encode(text)])
# Forward pass
model.eval()
with torch.no_grad():
outputs = model(input_ids)
lm_logits = outputs[0] # Language modeling logits
cls_logits = outputs[1] # Classification logits
print(f"LM logits shape: {lm_logits.shape}")
print(f"Classification logits shape: {cls_logits.shape}")Install with Tessl CLI
npx tessl i tessl/pypi-pytorch-pretrained-bert