tessl/pypi-mmengine

Engine of OpenMMLab projects: a PyTorch-based library for training deep learning models, with large-scale training support, configuration management, and monitoring capabilities


Training and Loops

A complete training orchestration system built around flexible runners that support epoch-based and iteration-based training, together with validation and test loops, built-in checkpointing, and logging. The system provides a unified interface for managing the entire training pipeline.

Capabilities

Runner Class

Central coordinator managing the entire training process with flexible configuration and automatic component initialization.

class Runner:
    def __init__(
        self,
        model,
        work_dir: str = None,
        train_dataloader=None,
        val_dataloader=None,
        test_dataloader=None,
        train_cfg: dict = None,
        val_cfg: dict = None,
        test_cfg: dict = None,
        auto_scale_lr: dict = None,
        optim_wrapper=None,
        param_scheduler=None,
        val_evaluator=None,
        test_evaluator=None,
        default_hooks: dict = None,
        custom_hooks: list = None,
        data_preprocessor=None,
        load_from: str = None,
        resume: bool = False,
        launcher: str = 'none',
        env_cfg: dict = None,
        log_processor=None,
        visualizer=None,
        default_scope: str = 'mmengine',
        randomness: dict = None,
        experiment_name: str = None,
        cfg: dict = None,
    ):
        """
        Initialize Runner with comprehensive training configuration.
        
        Parameters:
        - model: Model to train (torch.nn.Module or config dict)
        - work_dir: Working directory for saving outputs
        - train_dataloader: Training data loader
        - val_dataloader: Validation data loader
        - test_dataloader: Test data loader
        - train_cfg: Training loop configuration
        - val_cfg: Validation loop configuration
        - test_cfg: Test loop configuration
        - auto_scale_lr: Automatic learning rate scaling configuration
        - optim_wrapper: Optimizer wrapper configuration
        - param_scheduler: Parameter scheduler configuration
        - val_evaluator: Validation evaluator configuration
        - test_evaluator: Test evaluator configuration
        - default_hooks: Default hooks configuration
        - custom_hooks: Custom hooks list
        - data_preprocessor: Data preprocessor configuration
        - load_from: Checkpoint path to load
        - resume: Whether to resume training
        - launcher: Distributed launcher type
        - env_cfg: Environment configuration
        - log_processor: Log processor configuration
        - visualizer: Visualizer configuration
        - default_scope: Default registry scope
        - randomness: Randomness configuration
        - experiment_name: Experiment name
        - cfg: Complete configuration object
        """

    @classmethod
    def from_cfg(cls, cfg) -> 'Runner':
        """
        Create Runner from configuration.
        
        Parameters:
        - cfg: Configuration object or dict
        
        Returns:
        Initialized Runner instance
        """

    def train(self):
        """Run training loop."""

    def val(self):
        """Run validation loop."""

    def test(self):
        """Run test loop."""

    def call_hook(self, fn_name: str, **kwargs):
        """
        Call hook method.
        
        Parameters:
        - fn_name: Hook method name
        - **kwargs: Hook arguments
        """

    def register_hook(self, hook, priority: str = 'NORMAL'):
        """
        Register hook.
        
        Parameters:
        - hook: Hook instance or config
        - priority: Hook priority
        """

    def load_or_resume(self):
        """Load checkpoint or resume training."""

    def save_checkpoint(
        self,
        out_dir: str,
        filename: str = None,
        file_client_args: dict = None,
        save_optimizer: bool = True,
        save_param_scheduler: bool = True,
        meta: dict = None,
        by_epoch: bool = True,
    ):
        """
        Save checkpoint.
        
        Parameters:
        - out_dir: Output directory
        - filename: Checkpoint filename
        - file_client_args: File client arguments
        - save_optimizer: Whether to save optimizer state
        - save_param_scheduler: Whether to save scheduler state
        - meta: Additional metadata
        - by_epoch: Whether checkpoint is by epoch
        """

    @property
    def epoch(self) -> int:
        """Current epoch number."""

    @property
    def iter(self) -> int:
        """Current iteration number."""

    @property
    def max_epochs(self) -> int:
        """Maximum number of epochs."""

    @property
    def max_iters(self) -> int:
        """Maximum number of iterations."""

Flexible Runner

Extended runner with additional flexibility for custom training workflows.

class FlexibleRunner(Runner):
    def __init__(self, **kwargs):
        """
        Initialize FlexibleRunner with extended configuration options.
        
        Parameters:
        - **kwargs: Same as Runner plus additional flexibility options
        """

    def run_loop(self, loop: 'BaseLoop'):
        """
        Run custom training loop.
        
        Parameters:
        - loop: Loop instance to execute
        """

Base Loop Class

Abstract base class for all training loops providing common interface and functionality.

class BaseLoop:
    def __init__(self, runner, dataloader):
        """
        Initialize base loop.
        
        Parameters:
        - runner: Runner instance
        - dataloader: Data loader for the loop
        """

    def run(self):
        """Execute the loop."""

    @property
    def iter(self) -> int:
        """Current iteration number."""

    @property
    def max_iters(self) -> int:
        """Maximum iterations for this loop."""

Training Loops

Specialized training loops for different training strategies.

class EpochBasedTrainLoop(BaseLoop):
    def __init__(self, runner, dataloader, max_epochs: int, val_begin: int = 1, val_interval: int = 1, dynamic_intervals: list = None):
        """
        Epoch-based training loop.
        
        Parameters:
        - runner: Runner instance
        - dataloader: Training data loader
        - max_epochs: Maximum number of epochs
        - val_begin: Epoch to begin validation
        - val_interval: Validation interval in epochs
        - dynamic_intervals: Dynamic validation intervals
        """

    def run_epoch(self):
        """Run one training epoch."""

    def run_iter(self, idx: int, data_batch):
        """
        Run one training iteration.
        
        Parameters:
        - idx: Iteration index
        - data_batch: Input data batch
        """

class IterBasedTrainLoop(BaseLoop):
    def __init__(self, runner, dataloader, max_iters: int, val_begin: int = 1, val_interval: int = 1, dynamic_intervals: list = None):
        """
        Iteration-based training loop.
        
        Parameters:
        - runner: Runner instance
        - dataloader: Training data loader
        - max_iters: Maximum number of iterations
        - val_begin: Iteration to begin validation
        - val_interval: Validation interval in iterations
        - dynamic_intervals: Dynamic validation intervals
        """

    def run_iter(self, data_batch):
        """
        Run one training iteration.
        
        Parameters:
        - data_batch: Input data batch
        """

Validation and Test Loops

Loops for model evaluation during training or standalone testing.

class ValLoop(BaseLoop):
    def __init__(self, runner, dataloader, evaluator, fp16: bool = False):
        """
        Validation loop.
        
        Parameters:
        - runner: Runner instance
        - dataloader: Validation data loader
        - evaluator: Evaluator for validation metrics
        - fp16: Whether to use FP16 precision
        """

    def run(self) -> dict:
        """
        Run validation loop.
        
        Returns:
        Dictionary of validation metrics
        """

class TestLoop(BaseLoop):
    def __init__(self, runner, dataloader, evaluator, fp16: bool = False):
        """
        Test loop.
        
        Parameters:
        - runner: Runner instance
        - dataloader: Test data loader
        - evaluator: Evaluator for test metrics
        - fp16: Whether to use FP16 precision
        """

    def run(self) -> dict:
        """
        Run test loop.
        
        Returns:
        Dictionary of test metrics
        """

Checkpoint Management

Comprehensive checkpoint loading and saving functionality.

def load_checkpoint(filename: str, map_location: str = None, logger = None, revise_keys: list = None) -> dict:
    """
    Load checkpoint from file.
    
    Parameters:
    - filename: Checkpoint file path
    - map_location: Device to load checkpoint
    - logger: Logger instance
    - revise_keys: Keys to revise during loading
    
    Returns:
    Checkpoint dictionary
    """

def save_checkpoint(model, filename: str, optimizer = None, lr_scheduler = None, meta: dict = None, file_client_args: dict = None):
    """
    Save checkpoint to file.
    
    Parameters:
    - model: Model to save
    - filename: Output filename
    - optimizer: Optimizer state to save
    - lr_scheduler: Learning rate scheduler to save
    - meta: Additional metadata
    - file_client_args: File client arguments
    """

def weights_to_cpu(state_dict: dict) -> dict:
    """
    Move weights to CPU.
    
    Parameters:
    - state_dict: Model state dictionary
    
    Returns:
    CPU state dictionary
    """

def get_state_dict(module, destination: dict = None, prefix: str = '', keep_vars: bool = False) -> dict:
    """
    Get model state dictionary.
    
    Parameters:
    - module: PyTorch module
    - destination: Destination dictionary
    - prefix: Key prefix
    - keep_vars: Whether to keep variables
    
    Returns:
    State dictionary
    """

def find_latest_checkpoint(path: str, suffix: str = 'pth') -> str:
    """
    Find latest checkpoint in directory.
    
    Parameters:
    - path: Directory path
    - suffix: Checkpoint file suffix
    
    Returns:
    Latest checkpoint path
    """

Model Loading Utilities

Utilities for loading pre-trained models and model information.

def get_torchvision_models() -> list:
    """
    Get list of available torchvision models.
    
    Returns:
    List of model names
    """

def get_external_models() -> list:
    """
    Get list of available external models.
    
    Returns:
    List of external model names
    """

def get_mmcls_models() -> list:
    """
    Get list of available MMClassification models.
    
    Returns:
    List of MMCls model names
    """

def get_deprecated_model_names() -> list:
    """
    Get list of deprecated model names.
    
    Returns:
    List of deprecated model names
    """

class CheckpointLoader:
    @staticmethod
    def load_checkpoint(filename: str, map_location: str = None) -> dict:
        """
        Load checkpoint with advanced options.
        
        Parameters:
        - filename: Checkpoint file path
        - map_location: Device mapping
        
        Returns:
        Loaded checkpoint
        """

Training Utilities

Additional utilities for training management.

def set_random_seed(seed: int, deterministic: bool = False, diff_rank_seed: bool = False):
    """
    Set random seed for reproducibility.
    
    Parameters:
    - seed: Random seed value
    - deterministic: Whether to use deterministic algorithms
    - diff_rank_seed: Whether to use different seeds for different ranks
    """

def turn_on_activation_checkpointing(model, **kwargs):
    """
    Enable activation checkpointing for memory efficiency.
    
    Parameters:
    - model: Model to apply checkpointing
    - **kwargs: Checkpointing configuration
    """

def autocast(*args, **kwargs):
    """
    Automatic mixed precision context manager.
    
    Parameters:
    - *args: Positional arguments
    - **kwargs: Keyword arguments
    
    Returns:
    Autocast context manager
    """

Usage Examples

Basic Training Setup

from mmengine.config import Config
from mmengine.runner import Runner

# Load configuration
cfg = Config.fromfile('config.py')

# Create runner
runner = Runner.from_cfg(cfg)

# Start training
runner.train()

Custom Training Loop

from mmengine.runner import Runner

# Create runner with explicit components; the loop type is selected via train_cfg
runner = Runner(
    model=model,
    work_dir='./work_dir',
    train_dataloader=train_loader,
    val_dataloader=val_loader,
    val_cfg=dict(type='ValLoop'),
    val_evaluator=dict(type='Accuracy'),  # must be a metric registered in METRICS
    train_cfg=dict(type='EpochBasedTrainLoop', max_epochs=100, val_interval=1),
    optim_wrapper=dict(optimizer=dict(type='SGD', lr=0.1)),
)

# Run training
runner.train()

Checkpoint Operations

from mmengine.runner import find_latest_checkpoint, load_checkpoint, save_checkpoint

# Load checkpoint
checkpoint = load_checkpoint('model.pth', map_location='cpu')

# Save checkpoint with metadata
save_checkpoint(
    model, 
    'checkpoint.pth',
    optimizer=optimizer,
    meta={'epoch': 10, 'best_acc': 0.95}
)

# Find latest checkpoint
latest_ckpt = find_latest_checkpoint('./checkpoints')

Custom Hook Registration

from mmengine.runner import Runner
from mmengine.hooks import Hook

class CustomHook(Hook):
    def before_train_epoch(self, runner):
        print(f"Starting epoch {runner.epoch}")

runner = Runner.from_cfg(cfg)
runner.register_hook(CustomHook(), priority='LOW')
runner.train()

Install with Tessl CLI

npx tessl i tessl/pypi-mmengine
