Deep learning framework providing tensor computation with GPU acceleration and dynamic neural networks with automatic differentiation
Optimizers, learning rate schedulers, and related training utilities for updating model parameters. The torch.optim module provides optimization algorithms, learning rate scheduling strategies, and stochastic weight averaging utilities.
Optimization algorithms for updating model parameters during training.
class Optimizer:
"""Base class for all optimizers."""
def __init__(self, params, defaults): ...
def state_dict(self):
"""Return optimizer state dictionary."""
def load_state_dict(self, state_dict):
"""Load optimizer state."""
def zero_grad(self, set_to_none: bool = False):
"""Set gradients to zero."""
def step(self, closure=None):
"""Perform optimization step."""
def add_param_group(self, param_group):
"""Add parameter group."""Stochastic Gradient Descent and variants.
class SGD(Optimizer):
"""Stochastic Gradient Descent optimizer."""
def __init__(self, params, lr, momentum=0, dampening=0, weight_decay=0, nesterov=False):
"""
Parameters:
- params: Iterable of parameters or parameter groups
- lr: Learning rate
- momentum: Momentum factor (default: 0)
- dampening: Dampening for momentum (default: 0)
- weight_decay: Weight decay (L2 penalty) (default: 0)
- nesterov: Enable Nesterov momentum (default: False)
"""
def step(self, closure=None): ...
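A minimal sketch of one SGD update with momentum, weight decay, and Nesterov acceleration; the tiny model and random tensors below are placeholders for illustration only.
import torch
import torch.nn as nn
import torch.optim as optim
model = nn.Linear(4, 2)  # hypothetical tiny model
optimizer = optim.SGD(model.parameters(), lr=0.1, momentum=0.9, weight_decay=1e-4, nesterov=True)
x, y = torch.randn(8, 4), torch.randn(8, 2)
loss = nn.functional.mse_loss(model(x), y)
optimizer.zero_grad()  # clear old gradients
loss.backward()        # compute gradients
optimizer.step()       # apply the momentum-based update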
class ASGD(Optimizer):
"""Averaged Stochastic Gradient Descent."""
def __init__(self, params, lr=1e-2, lambd=1e-4, alpha=0.75, t0=1e6, weight_decay=0):
"""
Parameters:
- params: Iterable of parameters
- lr: Learning rate (default: 1e-2)
- lambd: Decay term (default: 1e-4)
- alpha: Power for eta update (default: 0.75)
- t0: Point at which to start averaging (default: 1e6)
- weight_decay: Weight decay (default: 0)
"""
def step(self, closure=None): ...
Adam and its variants for adaptive learning rates.
class Adam(Optimizer):
"""Adam optimizer."""
def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-8, weight_decay=0, amsgrad=False):
"""
Parameters:
- params: Iterable of parameters
- lr: Learning rate (default: 1e-3)
- betas: Coefficients for momentum and squared gradient averaging (default: (0.9, 0.999))
- eps: Term for numerical stability (default: 1e-8)
- weight_decay: Weight decay (default: 0)
- amsgrad: Use AMSGrad variant (default: False)
"""
def step(self, closure=None): ...
class AdamW(Optimizer):
"""AdamW optimizer with decoupled weight decay."""
def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-8, weight_decay=1e-2, amsgrad=False):
"""
Parameters:
- params: Iterable of parameters
- lr: Learning rate (default: 1e-3)
- betas: Coefficients for momentum and squared gradient averaging
- eps: Term for numerical stability
- weight_decay: Weight decay coefficient (default: 1e-2)
- amsgrad: Use AMSGrad variant
"""
def step(self, closure=None): ...
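A short comparison sketch: Adam folds weight_decay into the gradient as an L2 penalty, while AdamW applies the decay directly to the weights (decoupled); model is assumed to be an existing nn.Module.
import torch.optim as optim
adam = optim.Adam(model.parameters(), lr=1e-3, weight_decay=1e-2)    # L2 penalty added to gradients
adamw = optim.AdamW(model.parameters(), lr=1e-3, weight_decay=1e-2)  # decay applied to the weights directly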
class Adamax(Optimizer):
"""Adamax optimizer (Adam based on infinity norm)."""
def __init__(self, params, lr=2e-3, betas=(0.9, 0.999), eps=1e-8, weight_decay=0):
"""
Parameters:
- params: Iterable of parameters
- lr: Learning rate (default: 2e-3)
- betas: Coefficients for momentum and squared gradient averaging
- eps: Term for numerical stability
- weight_decay: Weight decay
"""
def step(self, closure=None): ...
class NAdam(Optimizer):
"""NAdam optimizer (Adam with Nesterov momentum)."""
def __init__(self, params, lr=2e-3, betas=(0.9, 0.999), eps=1e-8, weight_decay=0, momentum_decay=4e-3):
"""
Parameters:
- params: Iterable of parameters
- lr: Learning rate (default: 2e-3)
- betas: Coefficients for momentum and squared gradient averaging
- eps: Term for numerical stability
- weight_decay: Weight decay
- momentum_decay: Momentum decay
"""
def step(self, closure=None): ...
class RAdam(Optimizer):
"""RAdam optimizer (Rectified Adam)."""
def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-8, weight_decay=0):
"""
Parameters:
- params: Iterable of parameters
- lr: Learning rate (default: 1e-3)
- betas: Coefficients for momentum and squared gradient averaging
- eps: Term for numerical stability
- weight_decay: Weight decay
"""
def step(self, closure=None): ...
Optimizers that adapt learning rates based on gradient history.
class Adagrad(Optimizer):
"""Adagrad optimizer."""
def __init__(self, params, lr=1e-2, lr_decay=0, weight_decay=0, initial_accumulator_value=0, eps=1e-10):
"""
Parameters:
- params: Iterable of parameters
- lr: Learning rate (default: 1e-2)
- lr_decay: Learning rate decay (default: 0)
- weight_decay: Weight decay (default: 0)
- initial_accumulator_value: Initial value for accumulator
- eps: Term for numerical stability
"""
def step(self, closure=None): ...
class Adadelta(Optimizer):
"""Adadelta optimizer."""
def __init__(self, params, lr=1.0, rho=0.9, eps=1e-6, weight_decay=0):
"""
Parameters:
- params: Iterable of parameters
- lr: Coefficient that scales delta (default: 1.0)
- rho: Coefficient for squared gradient averaging (default: 0.9)
- eps: Term for numerical stability (default: 1e-6)
- weight_decay: Weight decay (default: 0)
"""
def step(self, closure=None): ...
class RMSprop(Optimizer):
"""RMSprop optimizer."""
def __init__(self, params, lr=1e-2, alpha=0.99, eps=1e-8, weight_decay=0, momentum=0, centered=False):
"""
Parameters:
- params: Iterable of parameters
- lr: Learning rate (default: 1e-2)
- alpha: Smoothing constant (default: 0.99)
- eps: Term for numerical stability (default: 1e-8)
- weight_decay: Weight decay (default: 0)
- momentum: Momentum factor (default: 0)
- centered: Compute centered RMSprop (default: False)
"""
def step(self, closure=None): ...
class Rprop(Optimizer):
"""Rprop optimizer."""
def __init__(self, params, lr=1e-2, etas=(0.5, 1.2), step_sizes=(1e-6, 50)):
"""
Parameters:
- params: Iterable of parameters
- lr: Learning rate (default: 1e-2)
- etas: Pair of (etaminus, etaplus) for multiplicative increase/decrease
- step_sizes: Pair of minimal and maximal allowed step sizes
"""
def step(self, closure=None): ...
Specialized optimization algorithms.
class LBFGS(Optimizer):
"""Limited-memory BFGS optimizer."""
def __init__(self, params, lr=1, max_iter=20, max_eval=None, tolerance_grad=1e-7,
tolerance_change=1e-9, history_size=100, line_search_fn=None):
"""
Parameters:
- params: Iterable of parameters
- lr: Learning rate (default: 1)
- max_iter: Maximum number of iterations per optimization step
- max_eval: Maximum number of function evaluations per step
- tolerance_grad: Termination tolerance on first order optimality
- tolerance_change: Termination tolerance on function/parameter changes
- history_size: Update history size
- line_search_fn: Line search function ('strong_wolfe' or None)
"""
def step(self, closure): ...
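LBFGS re-evaluates the objective several times per update, so step() requires a closure that clears gradients, recomputes the loss, and calls backward(); a sketch assuming model, criterion, x, and y already exist.
import torch
optimizer = torch.optim.LBFGS(model.parameters(), lr=1.0, history_size=10, line_search_fn='strong_wolfe')
def closure():
    optimizer.zero_grad()
    loss = criterion(model(x), y)
    loss.backward()
    return loss
optimizer.step(closure)  # LBFGS may call the closure multiple times internally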
class SparseAdam(Optimizer):
"""Adam optimizer for sparse tensors."""
def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-8):
"""
Parameters:
- params: Iterable of parameters
- lr: Learning rate (default: 1e-3)
- betas: Coefficients for momentum and squared gradient averaging
- eps: Term for numerical stability
"""
def step(self, closure=None): ...
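SparseAdam targets parameters that receive sparse gradients, most commonly embeddings created with sparse=True; a minimal sketch with made-up sizes.
import torch
import torch.nn as nn
embedding = nn.Embedding(10000, 64, sparse=True)  # backward produces sparse gradients
optimizer = torch.optim.SparseAdam(embedding.parameters(), lr=1e-3)
ids = torch.randint(0, 10000, (32,))
loss = embedding(ids).sum()  # illustrative loss
optimizer.zero_grad()
loss.backward()
optimizer.step()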
class Adafactor(Optimizer):
"""Adafactor optimizer for memory-efficient training."""
def __init__(self, params, lr=None, eps2=1e-30, clipping_threshold=1.0, decay_rate=-0.8,
beta1=None, weight_decay=0.0, scale_parameter=True, relative_step=True):
"""
Parameters:
- params: Iterable of parameters
- lr: Learning rate (None for automatic scaling)
- eps2: Regularization constant for second moment
- clipping_threshold: Threshold of root mean square of final gradient update
- decay_rate: Coefficient for moving average of squared gradient
- beta1: Coefficient for moving average of gradient
- weight_decay: Weight decay
- scale_parameter: Scale learning rate by root mean square of parameter
- relative_step: Set learning rate relative to current step
"""
def step(self, closure=None): ...
Learning rate scheduling strategies for training optimization.
class LRScheduler:
"""Base class for learning rate schedulers."""
def __init__(self, optimizer, last_epoch=-1, verbose=False): ...
def state_dict(self):
"""Return scheduler state dictionary."""
def load_state_dict(self, state_dict):
"""Load scheduler state."""
def get_last_lr(self):
"""Return last computed learning rates."""
def step(self, epoch=None):
"""Update learning rates."""
class StepLR(LRScheduler):
"""Decay learning rate by gamma every step_size epochs."""
def __init__(self, optimizer, step_size, gamma=0.1, last_epoch=-1, verbose=False):
"""
Parameters:
- optimizer: Wrapped optimizer
- step_size: Period of learning rate decay
- gamma: Multiplicative factor of learning rate decay (default: 0.1)
- last_epoch: Index of last epoch (default: -1)
- verbose: Print message on every update (default: False)
"""
class MultiStepLR(LRScheduler):
"""Decay learning rate by gamma at specified milestones."""
def __init__(self, optimizer, milestones, gamma=0.1, last_epoch=-1, verbose=False):
"""
Parameters:
- optimizer: Wrapped optimizer
- milestones: List of epoch indices for decay
- gamma: Multiplicative factor of learning rate decay
- last_epoch: Index of last epoch
- verbose: Print message on every update
"""
class ExponentialLR(LRScheduler):
"""Decay learning rate by gamma every epoch."""
def __init__(self, optimizer, gamma, last_epoch=-1, verbose=False):
"""
Parameters:
- optimizer: Wrapped optimizer
- gamma: Multiplicative factor of learning rate decay
- last_epoch: Index of last epoch
- verbose: Print message on every update
"""
class CosineAnnealingLR(LRScheduler):
"""Cosine annealing learning rate schedule."""
def __init__(self, optimizer, T_max, eta_min=0, last_epoch=-1, verbose=False):
"""
Parameters:
- optimizer: Wrapped optimizer
- T_max: Maximum number of iterations
- eta_min: Minimum learning rate (default: 0)
- last_epoch: Index of last epoch
- verbose: Print message on every update
"""
class CosineAnnealingWarmRestarts(LRScheduler):
"""Cosine annealing with warm restarts."""
def __init__(self, optimizer, T_0, T_mult=1, eta_min=0, last_epoch=-1, verbose=False):
"""
Parameters:
- optimizer: Wrapped optimizer
- T_0: Number of iterations for first restart
- T_mult: Factor to increase T_i after restart (default: 1)
- eta_min: Minimum learning rate (default: 0)
- last_epoch: Index of last epoch
- verbose: Print message on every update
"""
class ReduceLROnPlateau:
"""Reduce learning rate when metric stops improving."""
def __init__(self, optimizer, mode='min', factor=0.1, patience=10, verbose=False,
threshold=1e-4, threshold_mode='rel', cooldown=0, min_lr=0, eps=1e-8):
"""
Parameters:
- optimizer: Wrapped optimizer
- mode: 'min' or 'max' for metric improvement direction
- factor: Factor to reduce learning rate (default: 0.1)
- patience: Number of epochs with no improvement to wait
- verbose: Print message when reducing lr
- threshold: Threshold for measuring new optimum
- threshold_mode: 'rel' or 'abs' for threshold comparison
- cooldown: Number of epochs to wait before resuming normal operation
- min_lr: Lower bound on learning rate
- eps: Minimal decay applied to lr
"""
def step(self, metrics, epoch=None): ...
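Unlike the other schedulers, ReduceLROnPlateau.step() takes the monitored metric; a sketch assuming a validate() helper that returns the validation loss.
from torch.optim.lr_scheduler import ReduceLROnPlateau
scheduler = ReduceLROnPlateau(optimizer, mode='min', factor=0.5, patience=5)
for epoch in range(100):
    # ... train for one epoch ...
    val_loss = validate(model, val_loader, criterion)  # hypothetical helper
    scheduler.step(val_loss)  # reduce lr once val_loss stops improving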
class CyclicLR(LRScheduler):
"""Cyclical learning rate policy."""
def __init__(self, optimizer, base_lr, max_lr, step_size_up=2000, step_size_down=None,
mode='triangular', gamma=1.0, scale_fn=None, scale_mode='cycle', cycle_momentum=True,
base_momentum=0.8, max_momentum=0.9, last_epoch=-1, verbose=False):
"""
Parameters:
- optimizer: Wrapped optimizer
- base_lr: Lower learning rate boundary
- max_lr: Upper learning rate boundary
- step_size_up: Number of training iterations in increasing half
- step_size_down: Number of training iterations in decreasing half
- mode: 'triangular', 'triangular2', or 'exp_range'
- gamma: Constant in 'exp_range' scaling function
- scale_fn: Custom scaling policy function
- scale_mode: 'cycle' or 'iterations'
- cycle_momentum: Cycle momentum inversely to learning rate
- base_momentum: Lower momentum boundary
- max_momentum: Upper momentum boundary
- last_epoch: Index of last epoch
- verbose: Print message on every update
"""
class OneCycleLR(LRScheduler):
"""One cycle learning rate policy."""
def __init__(self, optimizer, max_lr, total_steps=None, epochs=None, steps_per_epoch=None,
pct_start=0.3, anneal_strategy='cos', cycle_momentum=True, base_momentum=0.85,
max_momentum=0.95, div_factor=25.0, final_div_factor=1e4, three_phase=False, last_epoch=-1, verbose=False):
"""
Parameters:
- optimizer: Wrapped optimizer
- max_lr: Upper learning rate boundary
- total_steps: Total number of steps in cycle
- epochs: Number of epochs (alternative to total_steps)
- steps_per_epoch: Steps per epoch (with epochs)
- pct_start: Percentage of cycle spent increasing learning rate
- anneal_strategy: 'cos' or 'linear' annealing strategy
- cycle_momentum: Cycle momentum inversely to learning rate
- base_momentum: Lower momentum boundary
- max_momentum: Upper momentum boundary
- div_factor: Determines initial learning rate (max_lr/div_factor)
- final_div_factor: Determines minimum learning rate (max_lr/(div_factor*final_div_factor))
- three_phase: Use three phase schedule
- last_epoch: Index of last epoch
- verbose: Print message on every update
"""Utilities for gradient manipulation and processing.
def clip_grad_norm_(parameters, max_norm, norm_type=2.0, error_if_nonfinite=False):
"""
Clip gradient norm of parameters.
Parameters:
- parameters: Iterable of parameters or single tensor
- max_norm: Maximum norm of gradients
- norm_type: Type of norm (default: 2.0)
- error_if_nonfinite: Raise error if total norm is NaN or inf
Returns:
Total norm of the parameters
"""
def clip_grad_value_(parameters, clip_value):
"""
Clip gradient values to specified range.
Parameters:
- parameters: Iterable of parameters or single tensor
- clip_value: Maximum absolute value for gradients
"""Utilities for stochastic weight averaging to improve generalization.
class AveragedModel(nn.Module):
"""Averaged model for stochastic weight averaging."""
def __init__(self, model, device=None, avg_fn=None, use_buffers=False):
"""
Parameters:
- model: Model to average
- device: Device to store averaged parameters
- avg_fn: Function to compute running average
- use_buffers: Whether to average buffers
"""
def update_parameters(self, model): ...
class SWALR(LRScheduler):
"""Learning rate scheduler for stochastic weight averaging."""
def __init__(self, optimizer, swa_lr, anneal_epochs=10, anneal_strategy='cos', last_epoch=-1):
"""
Parameters:
- optimizer: Wrapped optimizer
- swa_lr: SWA learning rate
- anneal_epochs: Number of epochs for annealing (default: 10)
- anneal_strategy: 'cos' or 'linear' annealing strategy
- last_epoch: Index of last epoch
"""import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
# Setup model, loss, and optimizer
model = nn.Sequential(
nn.Linear(784, 128),
nn.ReLU(),
nn.Linear(128, 10)
)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)
# Training loop
def train_epoch(model, dataloader, criterion, optimizer):
    model.train()
    total_loss = 0
    for batch_idx, (data, targets) in enumerate(dataloader):
        # Zero gradients
        optimizer.zero_grad()
        # Forward pass
        outputs = model(data)
        loss = criterion(outputs, targets)
        # Backward pass
        loss.backward()
        # Gradient clipping (optional)
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        # Update parameters
        optimizer.step()
        total_loss += loss.item()
    return total_loss / len(dataloader)
# Example usage
# train_loader = DataLoader(dataset, batch_size=32, shuffle=True)
# loss = train_epoch(model, train_loader, criterion, optimizer)
# print(f"Training loss: {loss:.4f}")import torch
import torch.optim as optim
from torch.optim.lr_scheduler import StepLR, ReduceLROnPlateau
# Setup optimizer and scheduler
optimizer = optim.SGD(model.parameters(), lr=0.1, momentum=0.9)
scheduler = StepLR(optimizer, step_size=30, gamma=0.1)
# Alternative: Reduce on plateau
# scheduler = ReduceLROnPlateau(optimizer, mode='min', factor=0.5, patience=5)
# Training loop with scheduler
for epoch in range(100):
    train_loss = train_epoch(model, train_loader, criterion, optimizer)
    val_loss = validate(model, val_loader, criterion)
    # Step scheduler
    scheduler.step()  # For StepLR
    # scheduler.step(val_loss)  # For ReduceLROnPlateau
    current_lr = optimizer.param_groups[0]['lr']
    print(f"Epoch {epoch}: Train Loss: {train_loss:.4f}, Val Loss: {val_loss:.4f}, LR: {current_lr:.6f}")
import torch
import torch.nn as nn
import torch.optim as optim
# Different learning rates for different parts of the model
model = nn.Sequential(
nn.Linear(784, 128),
nn.ReLU(),
nn.Linear(128, 10)
)
# Create parameter groups
params = [
{'params': model[0].parameters(), 'lr': 0.001}, # First layer
{'params': model[2].parameters(), 'lr': 0.01} # Last layer
]
optimizer = optim.Adam(params, weight_decay=1e-4)
# Training with different learning rates
for epoch in range(100):
    for batch_idx, (data, targets) in enumerate(train_loader):
        optimizer.zero_grad()
        outputs = model(data)
        loss = criterion(outputs, targets)
        loss.backward()
        optimizer.step()
import torch
import torch.nn as nn
import torch.optim as optim
from torch.optim.swa_utils import AveragedModel, SWALR
# Setup model and optimizer
model = nn.Sequential(nn.Linear(10, 5), nn.ReLU(), nn.Linear(5, 1))
optimizer = optim.SGD(model.parameters(), lr=0.1)
# Create averaged model and SWA scheduler
swa_model = AveragedModel(model)
swa_scheduler = SWALR(optimizer, swa_lr=0.05)
# Regular scheduler used before SWA starts (any standard scheduler works here)
regular_scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=100)
# Training with SWA
swa_start_epoch = 80
for epoch in range(100):
    train_loss = train_epoch(model, train_loader, criterion, optimizer)
    if epoch >= swa_start_epoch:
        swa_model.update_parameters(model)
        swa_scheduler.step()
    else:
        # Regular scheduler before SWA
        regular_scheduler.step()
    print(f"Epoch {epoch}: Loss: {train_loss:.4f}")
# Update SWA batch normalization statistics
torch.optim.swa_utils.update_bn(train_loader, swa_model)
# Use SWA model for inference
swa_model.eval()
import torch
import torch.optim as optim
from torch.optim.lr_scheduler import OneCycleLR
# Setup optimizer
optimizer = optim.SGD(model.parameters(), lr=0.1, momentum=0.9)
# One cycle scheduler
steps_per_epoch = len(train_loader)
scheduler = OneCycleLR(
optimizer,
max_lr=0.1,
epochs=100,
steps_per_epoch=steps_per_epoch,
pct_start=0.3,
div_factor=25,
final_div_factor=1e4
)
# Training loop
for epoch in range(100):
    for batch_idx, (data, targets) in enumerate(train_loader):
        optimizer.zero_grad()
        outputs = model(data)
        loss = criterion(outputs, targets)
        loss.backward()
        optimizer.step()
        # Step after each batch
        scheduler.step()
    print(f"Epoch {epoch}: LR: {optimizer.param_groups[0]['lr']:.6f}")
Install with Tessl CLI
npx tessl i tessl/pypi-torch