Repository of pre-trained NLP Transformer models: BERT & RoBERTa, GPT & GPT-2, Transformer-XL, XLNet and XLM
—
GPT-2 (Generative Pre-trained Transformer 2) models for text generation and language modeling tasks. GPT-2 uses autoregressive (left-to-right) attention to generate coherent text by predicting the next token in a sequence.
Configuration class for GPT-2 models containing all hyperparameters and architecture specifications.
class GPT2Config(PretrainedConfig):
def __init__(
self,
vocab_size=50257,
n_positions=1024,
n_ctx=1024,
n_embd=768,
n_layer=12,
n_head=12,
n_inner=None,
activation_function="gelu_new",
resid_pdrop=0.1,
embd_pdrop=0.1,
attn_pdrop=0.1,
layer_norm_epsilon=1e-5,
initializer_range=0.02,
**kwargs
):
"""
Configuration for GPT-2 models.
Parameters:
- vocab_size (int): Vocabulary size
- n_positions (int): Maximum sequence length for positional embeddings
- n_ctx (int): Context size (same as n_positions)
- n_embd (int): Embedding dimensionality
- n_layer (int): Number of transformer blocks
- n_head (int): Number of attention heads per layer
- n_inner (int): Inner dimensionality in feed-forward (4 * n_embd if None)
- activation_function (str): Activation function ("gelu_new", "relu", "swish")
- resid_pdrop (float): Residual connection dropout probability
- embd_pdrop (float): Embedding dropout probability
- attn_pdrop (float): Attention dropout probability
- layer_norm_epsilon (float): Layer normalization epsilon
- initializer_range (float): Weight initialization range
"""Base GPT-2 model for generating contextualized representations and text generation.
class GPT2Model(PreTrainedModel):
def __init__(self, config):
"""
Initialize GPT-2 base model.
Parameters:
- config (GPT2Config): Model configuration
"""
def forward(
self,
input_ids=None,
past=None,
attention_mask=None,
token_type_ids=None,
position_ids=None,
head_mask=None,
inputs_embeds=None
):
"""
Forward pass through GPT-2 model.
Parameters:
- input_ids (torch.Tensor): Token IDs of shape (batch_size, sequence_length)
- past (Tuple[torch.Tensor]): Pre-computed hidden states for efficient generation
- attention_mask (torch.Tensor): Attention mask to avoid padding tokens
- token_type_ids (torch.Tensor): Segment token indices
- position_ids (torch.Tensor): Position indices
- head_mask (torch.Tensor): Mask to nullify selected heads
- inputs_embeds (torch.Tensor): Pre-computed embeddings
Returns:
BaseModelOutputWithPast: Object with last_hidden_state and past_key_values
"""Usage Example:
from pytorch_transformers import GPT2Model, GPT2Tokenizer
import torch
# Load model and tokenizer
model = GPT2Model.from_pretrained("gpt2")
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
# Prepare input
text = "The future of artificial intelligence is"
inputs = tokenizer(text, return_tensors="pt")
# Get model outputs
with torch.no_grad():
outputs = model(**inputs)
# Access representations
last_hidden_state = outputs.last_hidden_state # Shape: (1, seq_len, 768)
past_key_values = outputs.past_key_values # For efficient generation
print(f"Hidden state shape: {last_hidden_state.shape}")
print(f"Number of past layers: {len(past_key_values) if past_key_values else 0}")GPT-2 model with a language modeling head for text generation and language modeling tasks.
class GPT2LMHeadModel(PreTrainedModel):
def __init__(self, config):
"""
Initialize GPT-2 for language modeling.
Parameters:
- config (GPT2Config): Model configuration
"""
def forward(
self,
input_ids=None,
past=None,
attention_mask=None,
token_type_ids=None,
position_ids=None,
head_mask=None,
inputs_embeds=None,
labels=None
):
"""
Forward pass for language modeling.
Parameters:
- input_ids (torch.Tensor): Token IDs
- past (Tuple[torch.Tensor]): Pre-computed hidden states
- attention_mask (torch.Tensor): Attention mask
- token_type_ids (torch.Tensor): Segment token indices
- position_ids (torch.Tensor): Position indices
- head_mask (torch.Tensor): Head mask
- inputs_embeds (torch.Tensor): Pre-computed embeddings
- labels (torch.Tensor): Language modeling labels (shifted input_ids)
Returns:
CausalLMOutputWithPast: Object with loss, logits, and past_key_values
"""
def generate(
self,
input_ids=None,
max_length=20,
do_sample=False,
temperature=1.0,
top_k=0,
top_p=1.0,
repetition_penalty=1.0,
pad_token_id=None,
eos_token_id=None,
**kwargs
):
"""
Generate text using the language model.
Parameters:
- input_ids (torch.Tensor): Input token IDs as prompt
- max_length (int): Maximum length of generated sequence
- do_sample (bool): Whether to use sampling or greedy decoding
- temperature (float): Sampling temperature (higher = more random)
- top_k (int): Top-k sampling (0 = disabled)
- top_p (float): Nucleus sampling threshold (1.0 = disabled)
- repetition_penalty (float): Penalty for repeated tokens
- pad_token_id (int): Padding token ID
- eos_token_id (int): End-of-sequence token ID
Returns:
torch.Tensor: Generated token IDs
"""Usage Example:
from pytorch_transformers import GPT2LMHeadModel, GPT2Tokenizer
import torch
# Load model and tokenizer
model = GPT2LMHeadModel.from_pretrained("gpt2")
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
# Set pad token
tokenizer.pad_token = tokenizer.eos_token
# Generate text
prompt = "The future of artificial intelligence"
inputs = tokenizer.encode(prompt, return_tensors="pt")
# Generate with different strategies
with torch.no_grad():
# Greedy generation
greedy_output = model.generate(
inputs,
max_length=50,
do_sample=False,
pad_token_id=tokenizer.eos_token_id
)
# Sampling with temperature
sample_output = model.generate(
inputs,
max_length=50,
do_sample=True,
temperature=0.8,
top_k=50,
top_p=0.9,
pad_token_id=tokenizer.eos_token_id
)
# Decode generated text
greedy_text = tokenizer.decode(greedy_output[0], skip_special_tokens=True)
sample_text = tokenizer.decode(sample_output[0], skip_special_tokens=True)
print(f"Greedy: {greedy_text}")
print(f"Sampled: {sample_text}")GPT-2 model with both language modeling and classification heads for multi-task learning.
class GPT2DoubleHeadsModel(PreTrainedModel):
def __init__(self, config):
"""
Initialize GPT-2 with double heads.
Parameters:
- config (GPT2Config): Model configuration
"""
def forward(
self,
input_ids=None,
past=None,
attention_mask=None,
token_type_ids=None,
position_ids=None,
head_mask=None,
inputs_embeds=None,
mc_token_ids=None,
lm_labels=None,
mc_labels=None
):
"""
Forward pass for double heads model.
Parameters:
- input_ids (torch.Tensor): Token IDs
- past (Tuple[torch.Tensor]): Pre-computed hidden states
- attention_mask (torch.Tensor): Attention mask
- token_type_ids (torch.Tensor): Segment token indices
- position_ids (torch.Tensor): Position indices
- head_mask (torch.Tensor): Head mask
- inputs_embeds (torch.Tensor): Pre-computed embeddings
- mc_token_ids (torch.Tensor): Token IDs for classification head
- lm_labels (torch.Tensor): Language modeling labels
- mc_labels (torch.Tensor): Multiple choice labels
Returns:
GPT2DoubleHeadsModelOutput: Object with lm_loss, mc_loss, lm_logits, mc_logits, past_key_values
"""Byte-pair encoding (BPE) tokenizer for GPT-2 models.
class GPT2Tokenizer(PreTrainedTokenizer):
def __init__(
self,
vocab_file,
merges_file,
errors="replace",
unk_token="<|endoftext|>",
bos_token="<|endoftext|>",
eos_token="<|endoftext|>",
add_prefix_space=False,
**kwargs
):
"""
Initialize GPT-2 tokenizer.
Parameters:
- vocab_file (str): Path to vocabulary file
- merges_file (str): Path to BPE merges file
- errors (str): Error handling for encoding ("replace", "ignore", "strict")
- unk_token (str): Unknown token
- bos_token (str): Beginning of sequence token
- eos_token (str): End of sequence token
- add_prefix_space (bool): Whether to add space before tokenizing
"""
def encode(
self,
text,
add_special_tokens=True,
max_length=None,
stride=0,
truncation_strategy="longest_first",
**kwargs
):
"""
Encode text to token IDs using BPE.
Parameters:
- text (str): Input text to encode
- add_special_tokens (bool): Whether to add special tokens
- max_length (int): Maximum sequence length
- stride (int): Stride for sliding window
- truncation_strategy (str): How to truncate long sequences
Returns:
List[int]: List of token IDs
"""Usage Example:
from pytorch_transformers import GPT2Tokenizer
# Load tokenizer
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
# GPT-2 uses the same token for BOS, EOS, UNK, and PAD
print(f"Special token: {tokenizer.eos_token}") # <|endoftext|>
# Tokenize text
text = "Hello, how are you today?"
tokens = tokenizer.tokenize(text)
token_ids = tokenizer.encode(text)
print(f"Tokens: {tokens}")
print(f"Token IDs: {token_ids}")
# Decode back
decoded = tokenizer.decode(token_ids)
print(f"Decoded: {decoded}")
# Handle multiple sequences
texts = ["First sentence.", "Second sentence."]
encoded = tokenizer(
texts,
padding=True,
truncation=True,
return_tensors="pt"
)
print(f"Batch shape: {encoded['input_ids'].shape}")def load_tf_weights_in_gpt2(model, gpt2_checkpoint_path):
"""
Load TensorFlow GPT-2 checkpoint weights into a PyTorch GPT-2 model.
Parameters:
- model (GPT2Model): PyTorch GPT-2 model
- gpt2_checkpoint_path (str): Path to TensorFlow checkpoint directory
Returns:
GPT2Model: Model with loaded weights
"""GPT2_PRETRAINED_CONFIG_ARCHIVE_MAP: Dict[str, str]
# Maps model names to download URLs for configurations
GPT2_PRETRAINED_MODEL_ARCHIVE_MAP: Dict[str, str]
# Maps model names to download URLs for pre-trained weights
Available Pre-trained Models:
gpt2: 12-layer, 768-hidden, 12-heads, 117M parameters (small)
gpt2-medium: 24-layer, 1024-hidden, 16-heads, 345M parameters
gpt2-large: 36-layer, 1280-hidden, 20-heads, 762M parameters
gpt2-xl: 48-layer, 1600-hidden, 25-heads, 1558M parameters
GPT-2 models support various text generation strategies:
Greedy Decoding: Always selects the most likely next token
output = model.generate(input_ids, do_sample=False)
Sampling: Randomly samples from the probability distribution
output = model.generate(input_ids, do_sample=True, temperature=0.8)
Top-k Sampling: Samples from the k most likely tokens
output = model.generate(input_ids, do_sample=True, top_k=50)
Nucleus (Top-p) Sampling: Samples from tokens whose cumulative probability exceeds p
output = model.generate(input_ids, do_sample=True, top_p=0.9)
Combined Strategies: Use multiple techniques together
output = model.generate(
input_ids,
do_sample=True,
temperature=0.8,
top_k=50,
top_p=0.9,
repetition_penalty=1.1
)
Install with Tessl CLI
npx tessl i tessl/pypi-pytorch-transformers