PyTorch implementations of transformer-based language models including BERT, OpenAI GPT, GPT-2, and Transformer-XL with pre-trained models, tokenizers, and utilities for NLP tasks
—
Specialized optimizers with learning rate scheduling designed for transformer training, including BERT-specific and OpenAI-specific Adam variants with warmup schedules, weight decay corrections, and gradient clipping.
Adam optimizer with BERT-specific weight decay handling, learning rate scheduling, and gradient clipping designed for transformer fine-tuning.
class BertAdam:
def __init__(
self,
params,
lr,
warmup=-1,
t_total=-1,
schedule='warmup_linear',
b1=0.9,
b2=0.999,
e=1e-6,
weight_decay=0.01,
max_grad_norm=1.0
):
"""
Initialize BERT Adam optimizer.
Args:
params: Model parameters to optimize
lr (float): Learning rate (required)
warmup (float): Warmup proportion of total training steps (-1 for no warmup)
t_total (int): Total training steps (-1 for no scheduling)
schedule (str): Learning rate schedule type
b1 (float): Adam beta1 parameter
b2 (float): Adam beta2 parameter
e (float): Adam epsilon parameter
weight_decay (float): Weight decay coefficient
max_grad_norm (float): Maximum gradient norm for clipping
"""
def step(self, closure=None):
"""
Perform single optimization step.
Args:
closure (callable, optional): A closure that reevaluates model and returns loss
Returns:
Optional loss value if closure is provided
"""
def zero_grad(self):
"""Clear gradients of all optimized parameters."""
def state_dict(self):
"""
Return optimizer state as dictionary.
Returns:
dict: Optimizer state dictionary
"""
def load_state_dict(self, state_dict):
"""
Load optimizer state from dictionary.
Args:
state_dict (dict): Optimizer state dictionary
"""OpenAI's Adam optimizer variant with improved weight decay handling and learning rate scheduling.
class OpenAIAdam:
    """OpenAI's Adam optimizer variant with improved weight decay handling
    and learning rate scheduling.
    """

    def __init__(
        self,
        params,
        lr,
        schedule='warmup_linear',
        warmup=-1,
        t_total=-1,
        b1=0.9,
        b2=0.999,
        e=1e-8,
        weight_decay=0,
        vector_l2=False,
        max_grad_norm=-1,
        **kwargs
    ):
        """
        Initialize OpenAI Adam optimizer.

        Args:
            params: Model parameters to optimize
            lr (float): Learning rate (required)
            schedule (str): Learning rate schedule type
            warmup (float): Warmup proportion (-1 for no warmup)
            t_total (int): Total training steps (-1 for no scheduling)
            b1 (float): Adam beta1 parameter
            b2 (float): Adam beta2 parameter
            e (float): Adam epsilon parameter
            weight_decay (float): Weight decay coefficient
            vector_l2 (bool): Whether to apply L2 regularization to vectors only
            max_grad_norm (float): Maximum gradient norm (-1 for no clipping)
        """

    def step(self, closure=None):
        """Perform single optimization step."""

    def zero_grad(self):
        """Clear gradients of all optimized parameters."""

    def state_dict(self):
        """Return optimizer state as dictionary."""

    def load_state_dict(self, state_dict):
        """Load optimizer state from dictionary."""

from pytorch_pretrained_bert import BertForSequenceClassification, BertAdam
import torch

# Load model
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=3)

# Prepare optimizer parameters with weight decay.
# Bias and LayerNorm parameters are excluded from weight decay, per BERT convention.
param_optimizer = list(model.named_parameters())
no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
optimizer_grouped_parameters = [
    {
        'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
        'weight_decay': 0.01
    },
    {
        'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
        'weight_decay': 0.0
    }
]

# Setup BERT Adam optimizer: 10% of the 1000 steps used as linear warmup.
num_train_steps = 1000
optimizer = BertAdam(
    optimizer_grouped_parameters,
    lr=2e-5,
    warmup=0.1,
    t_total=num_train_steps
)

# Training loop.
# NOTE: assumes `train_dataloader` is defined elsewhere and yields batches
# with 'input_ids' and 'labels' tensors.
model.train()
for step, batch in enumerate(train_dataloader):
    # Forward pass
    loss = model(batch['input_ids'], labels=batch['labels'])[0]
    # Backward pass
    loss.backward()
    # Optimization step
    optimizer.step()
    optimizer.zero_grad()
    print(f"Step {step}, Loss: {loss.item()}")

from pytorch_pretrained_bert import OpenAIGPTLMHeadModel, OpenAIAdam
# Load model
model = OpenAIGPTLMHeadModel.from_pretrained('openai-gpt')

# Setup OpenAI Adam optimizer.
# NOTE: assumes `num_train_steps` and `train_dataloader` are defined elsewhere.
optimizer = OpenAIAdam(
    model.parameters(),
    lr=6.25e-5,
    warmup=0.002,
    t_total=num_train_steps,
    weight_decay=0.01,
    max_grad_norm=1.0
)

# Training with gradient clipping
for batch in train_dataloader:
    loss = model(batch['input_ids'], lm_labels=batch['labels'])[0]
    loss.backward()
    # Gradient clipping is handled automatically by OpenAIAdam
    optimizer.step()
    optimizer.zero_grad()

from pytorch_pretrained_bert import BertAdam
# Setup with custom parameters.
# NOTE: assumes `model` is defined elsewhere.
optimizer = BertAdam(
    model.parameters(),
    lr=1e-4,                    # Learning rate
    warmup=0.1,                 # 10% warmup
    t_total=5000,               # Total training steps
    schedule='warmup_cosine',   # Cosine decay after warmup
    b1=0.9,                     # Adam beta1
    b2=0.999,                   # Adam beta2
    e=1e-6,                     # Adam epsilon
    weight_decay=0.01,          # Weight decay
    max_grad_norm=1.0           # Gradient clipping
)

# Save and load optimizer state
optimizer_state = optimizer.state_dict()
# Later restore
optimizer.load_state_dict(optimizer_state)

from pytorch_pretrained_bert import BertForSequenceClassification, BertAdam, OpenAIAdam
import torch.optim as optim
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)
# Different optimizers for comparison
optimizers = {
'bert_adam': BertAdam(model.parameters(), lr=2e-5, warmup=0.1, t_total=1000),
'openai_adam': OpenAIAdam(model.parameters(), lr=2e-5, warmup=0.1, t_total=1000),
'standard_adam': optim.Adam(model.parameters(), lr=2e-5)
}
# Training comparison
for name, optimizer in optimizers.items():
print(f"Training with {name}")
model_copy = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)
for step, batch in enumerate(train_dataloader):
loss = model_copy(batch['input_ids'], labels=batch['labels'])[0]
loss.backward()
optimizer.step()
optimizer.zero_grad()
if step % 100 == 0:
print(f" Step {step}, Loss: {loss.item()}")Install with Tessl CLI
npx tessl i tessl/pypi-pytorch-pretrained-bert