Engine of OpenMMLab projects: a PyTorch-based library for training deep learning models, providing large-scale training support, configuration management, and monitoring capabilities.
Comprehensive optimization framework supporting multiple optimizers, learning rate schedulers, momentum schedulers, automatic mixed precision, and gradient accumulation, with flexible optimization configurations for a wide range of training scenarios.
Wrapper classes that provide a unified interface over different optimization strategies.
class BaseOptimWrapper:
def __init__(self, optimizer, accumulative_counts: int = 1, clip_grad: dict = None):
"""
Base optimizer wrapper.
Parameters:
- optimizer: PyTorch optimizer instance
- accumulative_counts: Number of steps for gradient accumulation
- clip_grad: Gradient clipping configuration
"""
def update_params(self, loss):
"""
Update model parameters.
Parameters:
- loss: Loss tensor for backward pass
"""
def zero_grad(self):
"""Zero gradients."""
def step(self):
"""Optimizer step."""
def backward(self, loss):
"""
Backward pass.
Parameters:
- loss: Loss tensor
"""
def get_lr(self) -> dict:
"""
Get current learning rates.
Returns:
Dictionary of learning rates
"""
def get_momentum(self) -> dict:
"""
Get current momentum values.
Returns:
Dictionary of momentum values
"""
@property
def param_groups(self) -> list:
"""Parameter groups."""
class OptimWrapper(BaseOptimWrapper):
def __init__(self, optimizer, accumulative_counts: int = 1, clip_grad: dict = None):
"""
Standard optimizer wrapper.
Parameters:
- optimizer: PyTorch optimizer
- accumulative_counts: Gradient accumulation steps
- clip_grad: Gradient clipping config
"""
class AmpOptimWrapper(BaseOptimWrapper):
def __init__(self, loss_scale: str = 'dynamic', **kwargs):
"""
Automatic mixed precision optimizer wrapper.
Parameters:
- loss_scale: Loss scaling strategy ('dynamic' or float value)
- **kwargs: Base wrapper arguments
"""
def backward(self, loss):
"""Scaled backward pass for AMP."""
class ApexOptimWrapper(BaseOptimWrapper):
def __init__(self, **kwargs):
"""
Apex optimizer wrapper for FP16 training.
Parameters:
- **kwargs: Base wrapper arguments
"""
class OptimWrapperDict:
def __init__(self, **kwargs):
"""
Dictionary of optimizer wrappers for multi-optimizer training.
Parameters:
- **kwargs: Named optimizer wrapper configurations
"""
def update_params(self, loss_dict: dict):
"""
Update parameters for multiple optimizers.
Parameters:
- loss_dict: Dictionary of losses for each optimizer
"""
def zero_grad(self):
"""Zero gradients for all optimizers."""
def step(self):
"""Step for all optimizers."""Comprehensive collection of learning rate scheduling strategies.
class ConstantLR:
def __init__(self, factor: float = 1.0, **kwargs):
"""
Constant learning rate scheduler.
Parameters:
- factor: Multiplicative factor for learning rate
- **kwargs: Base scheduler arguments
"""
class CosineAnnealingLR:
def __init__(self, T_max: int, eta_min: float = 0, **kwargs):
"""
Cosine annealing learning rate scheduler.
Parameters:
- T_max: Maximum number of iterations
- eta_min: Minimum learning rate
- **kwargs: Base scheduler arguments
"""
class ExponentialLR:
def __init__(self, gamma: float, **kwargs):
"""
Exponential learning rate scheduler.
Parameters:
- gamma: Multiplicative factor of learning rate decay
- **kwargs: Base scheduler arguments
"""
class LinearLR:
def __init__(self, start_factor: float = 1.0, end_factor: float = 0.0, total_iters: int = 5, **kwargs):
"""
Linear learning rate scheduler.
Parameters:
- start_factor: Starting multiplicative factor
- end_factor: Ending multiplicative factor
- total_iters: Number of iterations for linear decay
- **kwargs: Base scheduler arguments
"""
class MultiStepLR:
def __init__(self, milestones: list, gamma: float = 0.1, **kwargs):
"""
Multi-step learning rate scheduler.
Parameters:
- milestones: List of epoch indices for LR decay
- gamma: Multiplicative factor of learning rate decay
- **kwargs: Base scheduler arguments
"""
class StepLR:
def __init__(self, step_size: int, gamma: float = 0.1, **kwargs):
"""
Step learning rate scheduler.
Parameters:
- step_size: Period of learning rate decay
- gamma: Multiplicative factor of learning rate decay
- **kwargs: Base scheduler arguments
"""
class OneCycleLR:
def __init__(self, max_lr: float, total_steps: int = None, epochs: int = None, steps_per_epoch: int = None, pct_start: float = 0.3, anneal_strategy: str = 'cos', cycle_momentum: bool = True, base_momentum: float = 0.85, max_momentum: float = 0.95, div_factor: float = 25.0, final_div_factor: float = 10000.0, **kwargs):
"""
One cycle learning rate scheduler.
Parameters:
- max_lr: Upper learning rate boundary in the cycle
- total_steps: Total number of steps
- epochs: Number of epochs
- steps_per_epoch: Steps per epoch
- pct_start: Percentage of cycle spent increasing learning rate
- anneal_strategy: Annealing strategy ('cos' or 'linear')
- cycle_momentum: Whether to cycle momentum
- base_momentum: Lower momentum boundary
- max_momentum: Upper momentum boundary
- div_factor: Initial learning rate divisor
- final_div_factor: Final learning rate divisor
- **kwargs: Base scheduler arguments
"""
class PolyLR:
def __init__(self, power: float = 1.0, min_lr: float = 0.0, **kwargs):
"""
Polynomial learning rate scheduler.
Parameters:
- power: Polynomial power
- min_lr: Minimum learning rate
- **kwargs: Base scheduler arguments
"""
class ReduceOnPlateauLR:
def __init__(self, mode: str = 'min', factor: float = 0.1, patience: int = 10, threshold: float = 1e-4, threshold_mode: str = 'rel', cooldown: int = 0, min_lr: float = 0, eps: float = 1e-8, **kwargs):
"""
Reduce on plateau learning rate scheduler.
Parameters:
- mode: 'min' or 'max'; whether the monitored metric should decrease or increase
- factor: Factor by which the learning rate is reduced
- patience: Number of epochs with no improvement before the learning rate is reduced
- threshold: Threshold for measuring improvement
- threshold_mode: 'rel' or 'abs' threshold mode
- cooldown: Number of epochs to wait before resuming normal operation after a reduction
- min_lr: Minimum learning rate
- eps: Minimum decay applied to learning rate
- **kwargs: Base scheduler arguments
"""Schedulers for momentum parameter in optimizers.
Schedulers for the momentum parameter in optimizers.

class ConstantMomentum:
def __init__(self, factor: float = 1.0, **kwargs):
"""
Constant momentum scheduler.
Parameters:
- factor: Multiplicative factor for momentum
- **kwargs: Base scheduler arguments
"""
class CosineAnnealingMomentum:
def __init__(self, T_max: int, eta_min: float = 0, **kwargs):
"""
Cosine annealing momentum scheduler.
Parameters:
- T_max: Maximum number of iterations
- eta_min: Minimum momentum
- **kwargs: Base scheduler arguments
"""
class ExponentialMomentum:
def __init__(self, gamma: float, **kwargs):
"""
Exponential momentum scheduler.
Parameters:
- gamma: Multiplicative factor of momentum decay
- **kwargs: Base scheduler arguments
"""
class LinearMomentum:
def __init__(self, start_factor: float = 1.0, end_factor: float = 0.0, total_iters: int = 5, **kwargs):
"""
Linear momentum scheduler.
Parameters:
- start_factor: Starting multiplicative factor
- end_factor: Ending multiplicative factor
- total_iters: Number of iterations
- **kwargs: Base scheduler arguments
"""
class MultiStepMomentum:
def __init__(self, milestones: list, gamma: float = 0.1, **kwargs):
"""
Multi-step momentum scheduler.
Parameters:
- milestones: List of epoch indices
- gamma: Multiplicative factor
- **kwargs: Base scheduler arguments
"""
class StepMomentum:
def __init__(self, step_size: int, gamma: float = 0.1, **kwargs):
"""
Step momentum scheduler.
Parameters:
- step_size: Period of momentum decay
- gamma: Multiplicative factor
- **kwargs: Base scheduler arguments
"""Generic parameter scheduling framework for any optimizer parameter.
Generic parameter scheduling framework for any optimizer parameter.

class _ParamScheduler:
def __init__(self, optimizer, param_name: str, **kwargs):
"""
Base parameter scheduler.
Parameters:
- optimizer: Optimizer instance
- param_name: Parameter name to schedule
- **kwargs: Scheduler arguments
"""
def step(self):
"""Execute scheduler step."""
def get_value(self) -> list:
"""
Get current parameter values.
Returns:
List of current parameter values
"""
class ConstantParamScheduler(_ParamScheduler):
def __init__(self, optimizer, param_name: str, factor: float = 1.0, **kwargs):
"""
Constant parameter scheduler.
Parameters:
- optimizer: Optimizer instance
- param_name: Parameter name
- factor: Multiplicative factor
- **kwargs: Base scheduler arguments
"""
class CosineAnnealingParamScheduler(_ParamScheduler):
def __init__(self, optimizer, param_name: str, T_max: int, eta_min: float = 0, **kwargs):
"""
Cosine annealing parameter scheduler.
Parameters:
- optimizer: Optimizer instance
- param_name: Parameter name
- T_max: Maximum iterations
- eta_min: Minimum parameter value
- **kwargs: Base scheduler arguments
"""Builder class for creating optimizer wrappers from configuration.
class DefaultOptimWrapperConstructor:
def __init__(self, optim_wrapper_cfg: dict, paramwise_cfg: dict = None):
"""
Default optimizer wrapper constructor.
Parameters:
- optim_wrapper_cfg: Optimizer wrapper configuration
- paramwise_cfg: Parameter-wise configuration
"""
def __call__(self, model) -> BaseOptimWrapper:
"""
Build optimizer wrapper for model.
Parameters:
- model: PyTorch model
Returns:
Optimizer wrapper instance
"""
def build_optim_wrapper(model, cfg: dict) -> BaseOptimWrapper:
"""
Build optimizer wrapper from configuration.
Parameters:
- model: PyTorch model
- cfg: Optimizer wrapper configuration
Returns:
Built optimizer wrapper
"""Implementation of ZeRO (Zero Redundancy Optimizer) for memory-efficient training.
Implementation of ZeRO (Zero Redundancy Optimizer) for memory-efficient training.

class ZeroRedundancyOptimizer:
def __init__(self, params, optimizer_class, process_group=None, parameters_as_bucket_view: bool = False, overlap_with_ddp: bool = False, **defaults):
"""
Zero Redundancy Optimizer wrapper.
Parameters:
- params: Model parameters
- optimizer_class: Base optimizer class
- process_group: Process group for distributed training
- parameters_as_bucket_view: Whether to use bucket view
- overlap_with_ddp: Whether to overlap with DDP
- **defaults: Default optimizer arguments
"""
def step(self, closure=None):
"""
Optimizer step with gradient synchronization.
Parameters:
- closure: Optional closure function
"""
def zero_grad(self):
"""Zero gradients across all processes."""
def consolidate_state_dict(self, to: int = 0):
"""
Consolidate optimizer state dictionary.
Parameters:
- to: Target rank for consolidation
"""import torch
import torch
from mmengine.optim import OptimWrapper
# Create model and optimizer
model = torch.nn.Linear(10, 1)
optimizer = torch.optim.SGD(model.parameters(), lr=0.01)
# Wrap the optimizer to add gradient accumulation and clipping
optim_wrapper = OptimWrapper(
    optimizer=optimizer,
    accumulative_counts=4,        # accumulate gradients over 4 steps
    clip_grad=dict(max_norm=1.0)  # clip gradient norm at 1.0
)
# Training step: update_params runs backward, clipping, accumulation, and step
data = torch.randn(8, 10)
loss = model(data).mean()
optim_wrapper.update_params(loss)

from mmengine.optim import AmpOptimWrapper
# Create an AMP optimizer wrapper with dynamic loss scaling
optim_wrapper = AmpOptimWrapper(
    optimizer=optimizer,
    loss_scale='dynamic',
    accumulative_counts=2
)
# Training with AMP; the backward pass is scaled automatically
loss = model(data).mean()
optim_wrapper.update_params(loss)

from mmengine.optim import CosineAnnealingLR, MultiStepLR
# Cosine annealing scheduler
scheduler_cfg = dict(
    type='CosineAnnealingLR',
    T_max=100,
    eta_min=1e-6,
    by_epoch=True,
    begin=0,
    end=100
)
# Multi-step scheduler (an alternative configuration)
scheduler_cfg = dict(
    type='MultiStepLR',
    milestones=[30, 60, 90],
    gamma=0.1,
    by_epoch=True
)
# Use in a runner configuration (dataloaders, work_dir, and other required
# Runner arguments omitted for brevity)
from mmengine.runner import Runner

runner = Runner(
    model=model,
    optim_wrapper=dict(
        optimizer=dict(type='SGD', lr=0.1, momentum=0.9),
        clip_grad=dict(max_norm=1.0)
    ),
    param_scheduler=scheduler_cfg
)

from mmengine.optim import OptimWrapperDict
# Multiple optimizer configuration (e.g. GAN training)
optim_wrapper_dict = OptimWrapperDict(
    generator=dict(
        optimizer=dict(type='Adam', lr=0.0002, betas=(0.5, 0.999))
    ),
    discriminator=dict(
        optimizer=dict(type='Adam', lr=0.0002, betas=(0.5, 0.999))
    )
)
# Training step with one loss per optimizer; gen_loss and disc_loss are
# the losses produced by the generator and discriminator forward passes
losses = {
    'generator': gen_loss,
    'discriminator': disc_loss
}
optim_wrapper_dict.update_params(losses)

from mmengine.optim import DefaultOptimWrapperConstructor
# Parameter-wise learning rate configuration
paramwise_cfg = dict(
    bias_lr_mult=2.0,     # 2x learning rate for bias parameters
    bias_decay_mult=0.0,  # no weight decay for bias parameters
    norm_decay_mult=0.0,  # no weight decay for normalization layers
    custom_keys={
        '.backbone': dict(lr_mult=0.1),  # 0.1x LR for backbone parameters
        '.head': dict(lr_mult=1.0)       # 1x LR for head parameters
    }
)
# Create the optimizer wrapper constructor
constructor = DefaultOptimWrapperConstructor(
    optim_wrapper_cfg=dict(
        type='OptimWrapper',
        optimizer=dict(type='SGD', lr=0.01, momentum=0.9)
    ),
    paramwise_cfg=paramwise_cfg
)
# Build the optimizer wrapper for the model
optim_wrapper = constructor(model)

from mmengine.optim import OneCycleLR
# One-cycle learning rate policy
scheduler_cfg = dict(
    type='OneCycleLR',
    max_lr=0.1,
    total_steps=1000,
    pct_start=0.3,
    anneal_strategy='cos',
    cycle_momentum=True,
    base_momentum=0.85,
    max_momentum=0.95
)
# Polynomial learning rate decay
poly_scheduler_cfg = dict(
    type='PolyLR',
    power=0.9,
    min_lr=1e-6,
    by_epoch=False,
    begin=0,
    end=1000
)

Install with Tessl CLI
npx tessl i tessl/pypi-mmengine