PyTorch implementations of transformer-based language models including BERT, OpenAI GPT, GPT-2, and Transformer-XL with pre-trained models, tokenizers, and utilities for NLP tasks
—
Specialized optimizers with learning rate scheduling designed for transformer training, including BERT-specific and OpenAI-specific Adam variants with warmup schedules, weight decay corrections, and gradient clipping.
Adam optimizer with BERT-specific weight decay handling, learning rate scheduling, and gradient clipping designed for transformer fine-tuning.
class BertAdam:
def __init__(
self,
params,
lr,
warmup=-1,
t_total=-1,
schedule='warmup_linear',
b1=0.9,
b2=0.999,
e=1e-6,
weight_decay=0.01,
max_grad_norm=1.0
):
"""
Initialize BERT Adam optimizer.
Args:
params: Model parameters to optimize
lr (float): Learning rate (required)
warmup (float): Warmup proportion of total training steps (-1 for no warmup)
t_total (int): Total training steps (-1 for no scheduling)
schedule (str): Learning rate schedule type
b1 (float): Adam beta1 parameter
b2 (float): Adam beta2 parameter
e (float): Adam epsilon parameter
weight_decay (float): Weight decay coefficient
max_grad_norm (float): Maximum gradient norm for clipping
"""
def step(self, closure=None):
"""
Perform single optimization step.
Args:
closure (callable, optional): A closure that reevaluates model and returns loss
Returns:
Optional loss value if closure is provided
"""
def zero_grad(self):
"""Clear gradients of all optimized parameters."""
def state_dict(self):
"""
Return optimizer state as dictionary.
Returns:
dict: Optimizer state dictionary
"""
def load_state_dict(self, state_dict):
"""
Load optimizer state from dictionary.
Args:
state_dict (dict): Optimizer state dictionary
"""OpenAI's Adam optimizer variant with improved weight decay handling and learning rate scheduling.
class OpenAIAdam:
    """OpenAI's Adam optimizer variant with improved weight decay handling
    and learning rate scheduling.
    """

    def __init__(
        self,
        params,
        lr,
        schedule='warmup_linear',
        warmup=-1,
        t_total=-1,
        b1=0.9,
        b2=0.999,
        e=1e-8,
        weight_decay=0,
        vector_l2=False,
        max_grad_norm=-1,
        **kwargs
    ):
        """
        Initialize OpenAI Adam optimizer.

        Args:
            params: Model parameters to optimize
            lr (float): Learning rate (required)
            schedule (str): Learning rate schedule type
            warmup (float): Warmup proportion (-1 for no warmup)
            t_total (int): Total training steps (-1 for no scheduling)
            b1 (float): Adam beta1 parameter
            b2 (float): Adam beta2 parameter
            e (float): Adam epsilon parameter
            weight_decay (float): Weight decay coefficient
            vector_l2 (bool): Whether to apply L2 regularization to vectors only
            max_grad_norm (float): Maximum gradient norm (-1 for no clipping)
        """

    def step(self, closure=None):
        """Perform single optimization step."""

    def zero_grad(self):
        """Clear gradients of all optimized parameters."""

    def state_dict(self):
        """Return optimizer state as dictionary."""

    def load_state_dict(self, state_dict):
        """Load optimizer state from dictionary."""

from pytorch_pretrained_bert import BertForSequenceClassification, BertAdam
import torch

# Load model
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=3)

# Prepare optimizer parameters with weight decay.
# Bias and LayerNorm parameters are excluded from weight decay, per BERT convention.
param_optimizer = list(model.named_parameters())
no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
optimizer_grouped_parameters = [
    {
        'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
        'weight_decay': 0.01
    },
    {
        'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
        'weight_decay': 0.0
    }
]

# Setup BERT Adam optimizer: 10% of the 1000 steps used as linear warmup.
num_train_steps = 1000
optimizer = BertAdam(
    optimizer_grouped_parameters,
    lr=2e-5,
    warmup=0.1,
    t_total=num_train_steps
)

# Training loop.
# NOTE: assumes `train_dataloader` is defined elsewhere and yields batches
# with 'input_ids' and 'labels' tensors.
model.train()
for step, batch in enumerate(train_dataloader):
    # Forward pass
    loss = model(batch['input_ids'], labels=batch['labels'])[0]
    # Backward pass
    loss.backward()
    # Optimization step
    optimizer.step()
    optimizer.zero_grad()
    print(f"Step {step}, Loss: {loss.item()}")

from pytorch_pretrained_bert import OpenAIGPTLMHeadModel, OpenAIAdam
# Load model
model = OpenAIGPTLMHeadModel.from_pretrained('openai-gpt')

# Setup OpenAI Adam optimizer.
# NOTE: assumes `num_train_steps` and `train_dataloader` are defined elsewhere.
optimizer = OpenAIAdam(
    model.parameters(),
    lr=6.25e-5,
    warmup=0.002,
    t_total=num_train_steps,
    weight_decay=0.01,
    max_grad_norm=1.0
)

# Training with gradient clipping
for batch in train_dataloader:
    loss = model(batch['input_ids'], lm_labels=batch['labels'])[0]
    loss.backward()
    # Gradient clipping is handled automatically by OpenAIAdam
    optimizer.step()
    optimizer.zero_grad()

from pytorch_pretrained_bert import BertAdam
# Setup with custom parameters.
# NOTE: assumes `model` is defined elsewhere.
optimizer = BertAdam(
    model.parameters(),
    lr=1e-4,                    # Learning rate
    warmup=0.1,                 # 10% warmup
    t_total=5000,               # Total training steps
    schedule='warmup_cosine',   # Cosine decay after warmup
    b1=0.9,                     # Adam beta1
    b2=0.999,                   # Adam beta2
    e=1e-6,                     # Adam epsilon
    weight_decay=0.01,          # Weight decay
    max_grad_norm=1.0           # Gradient clipping
)

# Save and load optimizer state
optimizer_state = optimizer.state_dict()
# Later restore
optimizer.load_state_dict(optimizer_state)

from pytorch_pretrained_bert import BertForSequenceClassification, BertAdam, OpenAIAdam
import torch.optim as optim
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)
# Different optimizers for comparison
optimizers = {
'bert_adam': BertAdam(model.parameters(), lr=2e-5, warmup=0.1, t_total=1000),
'openai_adam': OpenAIAdam(model.parameters(), lr=2e-5, warmup=0.1, t_total=1000),
'standard_adam': optim.Adam(model.parameters(), lr=2e-5)
}
# Training comparison
for name, optimizer in optimizers.items():
print(f"Training with {name}")
model_copy = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)
for step, batch in enumerate(train_dataloader):
loss = model_copy(batch['input_ids'], labels=batch['labels'])[0]
loss.backward()
optimizer.step()
optimizer.zero_grad()
if step % 100 == 0:
print(f" Step {step}, Loss: {loss.item()}")Install with Tessl CLI
npx tessl i tessl/pypi-pytorch-pretrained-bert