Foundational engine of OpenMMLab projects for training PyTorch deep learning models at scale, with unified configuration management and monitoring capabilities
A complete training orchestration system built around flexible runners that drive epoch-based and iteration-based training, validation, and testing loops, with built-in checkpointing and logging. The system exposes a unified interface for managing the entire training pipeline.
Central coordinator managing the entire training process with flexible configuration and automatic component initialization.
class Runner:
    def __init__(self, model, work_dir: str = None,
                 train_dataloader = None, val_dataloader = None, test_dataloader = None,
                 train_cfg: dict = None, val_cfg: dict = None, test_cfg: dict = None,
                 auto_scale_lr: dict = None, optim_wrapper = None, param_scheduler = None,
                 val_evaluator = None, test_evaluator = None,
                 default_hooks: dict = None, custom_hooks: list = None,
                 data_preprocessor = None, load_from: str = None, resume: bool = False,
                 launcher: str = 'none', env_cfg: dict = None,
                 log_processor = None, visualizer = None, default_scope: str = 'mmengine',
                 randomness: dict = None, experiment_name: str = None, cfg: dict = None):
"""
Initialize Runner with comprehensive training configuration.
Parameters:
- model: Model to train (torch.nn.Module or config dict)
- work_dir: Working directory for saving outputs
- train_dataloader: Training data loader
- val_dataloader: Validation data loader
- test_dataloader: Test data loader
- train_cfg: Training loop configuration
- val_cfg: Validation loop configuration
- test_cfg: Test loop configuration
- auto_scale_lr: Automatic learning rate scaling configuration
- optim_wrapper: Optimizer wrapper configuration
- param_scheduler: Parameter scheduler configuration
- val_evaluator: Validation evaluator configuration
- test_evaluator: Test evaluator configuration
- default_hooks: Default hooks configuration
- custom_hooks: Custom hooks list
- data_preprocessor: Data preprocessor configuration
- load_from: Checkpoint path to load
- resume: Whether to resume training
- launcher: Distributed launcher type
- env_cfg: Environment configuration
- log_processor: Log processor configuration
- visualizer: Visualizer configuration
- default_scope: Default registry scope
- randomness: Randomness configuration
- experiment_name: Experiment name
- cfg: Complete configuration object
"""
@classmethod
def from_cfg(cls, cfg) -> 'Runner':
"""
Create Runner from configuration.
Parameters:
- cfg: Configuration object or dict
Returns:
Initialized Runner instance
"""
def train(self):
"""Run training loop."""
def val(self):
"""Run validation loop."""
def test(self):
"""Run test loop."""
def call_hook(self, fn_name: str, **kwargs):
"""
Call hook method.
Parameters:
- fn_name: Hook method name
- **kwargs: Hook arguments
"""
def register_hook(self, hook, priority: str = 'NORMAL'):
"""
Register hook.
Parameters:
- hook: Hook instance or config
- priority: Hook priority
"""
def load_or_resume(self):
"""Load checkpoint or resume training."""
def save_checkpoint(self, out_dir: str, filename: str = None, file_client_args: dict = None, save_optimizer: bool = True, save_param_scheduler: bool = True, meta: dict = None, by_epoch: bool = True):
"""
Save checkpoint.
Parameters:
- out_dir: Output directory
- filename: Checkpoint filename
- file_client_args: File client arguments
- save_optimizer: Whether to save optimizer state
- save_param_scheduler: Whether to save scheduler state
- meta: Additional metadata
- by_epoch: Whether checkpoint is by epoch
"""
@property
def epoch(self) -> int:
"""Current epoch number."""
@property
def iter(self) -> int:
"""Current iteration number."""
@property
def max_epochs(self) -> int:
"""Maximum number of epochs."""
@property
def max_iters(self) -> int:
"""Maximum number of iterations."""Extended runner with additional flexibility for custom training workflows.

Extended runner with additional flexibility for custom training workflows.

class FlexibleRunner(Runner):
def __init__(self, **kwargs):
"""
Initialize FlexibleRunner with extended configuration options.
Parameters:
- **kwargs: Same as Runner plus additional flexibility options
"""
def run_loop(self, loop: 'BaseLoop'):
"""
Run custom training loop.
Parameters:
- loop: Loop instance to execute
"""Abstract base class for all training loops providing common interface and functionality.

Abstract base class for all training loops providing common interface and functionality.

class BaseLoop:
def __init__(self, runner, dataloader):
"""
Initialize base loop.
Parameters:
- runner: Runner instance
- dataloader: Data loader for the loop
"""
def run(self):
"""Execute the loop."""
@property
def iter(self) -> int:
"""Current iteration number."""
@property
def max_iters(self) -> int:
"""Maximum iterations for this loop."""Specialized training loops for different training strategies.

Specialized training loops for different training strategies.

class EpochBasedTrainLoop(BaseLoop):
def __init__(self, runner, dataloader, max_epochs: int, val_begin: int = 1, val_interval: int = 1, dynamic_intervals: list = None):
"""
Epoch-based training loop.
Parameters:
- runner: Runner instance
- dataloader: Training data loader
- max_epochs: Maximum number of epochs
- val_begin: Epoch to begin validation
- val_interval: Validation interval in epochs
- dynamic_intervals: Dynamic validation intervals
"""
def run_epoch(self):
"""Run one training epoch."""
def run_iter(self, idx: int, data_batch):
"""
Run one training iteration.
Parameters:
- idx: Iteration index
- data_batch: Input data batch
"""
class IterBasedTrainLoop(BaseLoop):
def __init__(self, runner, dataloader, max_iters: int, val_begin: int = 1, val_interval: int = 1, dynamic_intervals: list = None):
"""
Iteration-based training loop.
Parameters:
- runner: Runner instance
- dataloader: Training data loader
- max_iters: Maximum number of iterations
- val_begin: Iteration to begin validation
- val_interval: Validation interval in iterations
- dynamic_intervals: Dynamic validation intervals
"""
def run_iter(self, data_batch):
"""
Run one training iteration.
Parameters:
- data_batch: Input data batch
"""Loops for model evaluation during training or standalone testing.

Loops for model evaluation during training or standalone testing.

class ValLoop(BaseLoop):
def __init__(self, runner, dataloader, evaluator, fp16: bool = False):
"""
Validation loop.
Parameters:
- runner: Runner instance
- dataloader: Validation data loader
- evaluator: Evaluator for validation metrics
- fp16: Whether to use FP16 precision
"""
def run(self) -> dict:
"""
Run validation loop.
Returns:
Dictionary of validation metrics
"""
class TestLoop(BaseLoop):
def __init__(self, runner, dataloader, evaluator, fp16: bool = False):
"""
Test loop.
Parameters:
- runner: Runner instance
- dataloader: Test data loader
- evaluator: Evaluator for test metrics
- fp16: Whether to use FP16 precision
"""
def run(self) -> dict:
"""
Run test loop.
Returns:
Dictionary of test metrics
"""Comprehensive checkpoint loading and saving functionality.

Comprehensive checkpoint loading and saving functionality.

def load_checkpoint(filename: str, map_location: str = None, logger = None, revise_keys: list = None) -> dict:
"""
Load checkpoint from file.
Parameters:
- filename: Checkpoint file path
- map_location: Device to load checkpoint
- logger: Logger instance
- revise_keys: Keys to revise during loading
Returns:
Checkpoint dictionary
"""
def save_checkpoint(model, filename: str, optimizer = None, lr_scheduler = None, meta: dict = None, file_client_args: dict = None):
"""
Save checkpoint to file.
Parameters:
- model: Model to save
- filename: Output filename
- optimizer: Optimizer state to save
- lr_scheduler: Learning rate scheduler to save
- meta: Additional metadata
- file_client_args: File client arguments
"""
def weights_to_cpu(state_dict: dict) -> dict:
"""
Move weights to CPU.
Parameters:
- state_dict: Model state dictionary
Returns:
CPU state dictionary
"""
def get_state_dict(module, destination: dict = None, prefix: str = '', keep_vars: bool = False) -> dict:
"""
Get model state dictionary.
Parameters:
- module: PyTorch module
- destination: Destination dictionary
- prefix: Key prefix
- keep_vars: Whether to keep variables
Returns:
State dictionary
"""
def find_latest_checkpoint(path: str, suffix: str = 'pth') -> str:
"""
Find latest checkpoint in directory.
Parameters:
- path: Directory path
- suffix: Checkpoint file suffix
Returns:
Latest checkpoint path
"""Utilities for loading pre-trained models and model information.

Utilities for loading pre-trained models and model information.

def get_torchvision_models() -> list:
"""
Get list of available torchvision models.
Returns:
List of model names
"""
def get_external_models() -> list:
"""
Get list of available external models.
Returns:
List of external model names
"""
def get_mmcls_models() -> list:
"""
Get list of available MMClassification models.
Returns:
List of MMCls model names
"""
def get_deprecated_model_names() -> list:
"""
Get list of deprecated model names.
Returns:
List of deprecated model names
"""
class CheckpointLoader:
@staticmethod
def load_checkpoint(filename: str, map_location: str = None) -> dict:
"""
Load checkpoint with advanced options.
Parameters:
- filename: Checkpoint file path
- map_location: Device mapping
Returns:
Loaded checkpoint
"""Additional utilities for training management.

Additional utilities for training management.

def set_random_seed(seed: int, deterministic: bool = False, diff_rank_seed: bool = False):
"""
Set random seed for reproducibility.
Parameters:
- seed: Random seed value
- deterministic: Whether to use deterministic algorithms
- diff_rank_seed: Whether to use different seeds for different ranks
"""
def turn_on_activation_checkpointing(model, **kwargs):
"""
Enable activation checkpointing for memory efficiency.
Parameters:
- model: Model to apply checkpointing
- **kwargs: Checkpointing configuration
"""
def autocast(*args, **kwargs):
"""
Automatic mixed precision context manager.
Parameters:
- *args: Positional arguments
- **kwargs: Keyword arguments
Returns:
Autocast context manager
"""from mmengine import Runner, Config

Basic usage: build a Runner from a config file and start training.

from mmengine.runner import Runner
from mmengine import Config

# Load configuration
cfg = Config.fromfile('config.py')
# Create runner
runner = Runner.from_cfg(cfg)
# Start training
runner.train()

Custom configuration: construct a Runner directly with explicit components.

from mmengine.runner import Runner
# Create runner with custom configuration
runner = Runner(
model=model,
work_dir='./work_dir',
train_dataloader=train_loader,
val_dataloader=val_loader,
train_cfg=dict(type='EpochBasedTrainLoop', max_epochs=100),
optim_wrapper=dict(optimizer=dict(type='SGD', lr=0.1)),
)
# Run training
runner.train()

Checkpoint management: load, save, and locate checkpoints.

from mmengine.runner import load_checkpoint, save_checkpoint, find_latest_checkpoint
# Load checkpoint
checkpoint = load_checkpoint('model.pth', map_location='cpu')
# Save checkpoint with metadata
save_checkpoint(
model,
'checkpoint.pth',
optimizer=optimizer,
meta={'epoch': 10, 'best_acc': 0.95}
)
# Find latest checkpoint
latest_ckpt = find_latest_checkpoint('./checkpoints')

Custom hooks: subclass Hook and register it on the runner.

from mmengine.runner import Runner
from mmengine.hooks import Hook
class CustomHook(Hook):
def before_train_epoch(self, runner):
print(f"Starting epoch {runner.epoch}")
runner = Runner.from_cfg(cfg)
runner.register_hook(CustomHook(), priority='LOW')
runner.train()

Install with Tessl CLI

npx tessl i tessl/pypi-mmengine