CtrlK
BlogDocsLog inGet started
Tessl Logo

tessl/pypi-modelscope

ModelScope brings the notion of Model-as-a-Service to life with unified interfaces for state-of-the-art machine learning models.

Pending
Overview
Eval results
Files

training.mddocs/

Training Framework

ModelScope's training framework provides comprehensive tools for training and fine-tuning models across different domains. The framework supports epoch-based training with hooks, metrics, evaluation, and checkpoint management.

Capabilities

Epoch-Based Trainer

Main trainer class for epoch-based training workflows.

class EpochBasedTrainer:
    """
    Main epoch-based trainer for ModelScope models.

    Runs the epoch/iteration training loop and coordinates datasets,
    preprocessors, the optimizer/scheduler pair, hooks and checkpointing.
    """

    def __init__(
        self,
        model: Optional[Union[TorchModel, nn.Module, str]] = None,
        cfg_file: Optional[str] = None,
        cfg_modify_fn: Optional[Callable] = None,
        arg_parse_fn: Optional[Callable] = None,
        data_collator: Optional[Union[Callable, Dict[str, Callable]]] = None,
        train_dataset: Optional[Union[MsDataset, Dataset]] = None,
        eval_dataset: Optional[Union[MsDataset, Dataset]] = None,
        preprocessor: Optional[Union[Preprocessor, Dict[str, Preprocessor]]] = None,
        # Annotation fix: the default is (None, None), so both tuple members
        # must be Optional (PEP 484 disallows implicit Optional).
        optimizers: Tuple[Optional[torch.optim.Optimizer],
                          Optional[torch.optim.lr_scheduler._LRScheduler]] = (None, None),
        model_revision: Optional[str] = DEFAULT_MODEL_REVISION,
        seed: int = 42,
        callbacks: Optional[List[Hook]] = None,
        samplers: Optional[Union[Sampler, Dict[str, Sampler]]] = None,
        # Annotation fix: the default is None, so the type must be Optional.
        efficient_tuners: Optional[Union[Dict[str, TunerConfig], TunerConfig]] = None,
        **kwargs
    ):
        """
        Initialize trainer with model and training configuration.

        Parameters:
        - model: Model instance to train (TorchModel, nn.Module, or model identifier string)
        - cfg_file: Path to configuration file
        - cfg_modify_fn: Function to modify configuration dynamically
        - arg_parse_fn: Custom argument parsing function
        - data_collator: Data collation function(s) for batching
        - train_dataset: Training dataset (MsDataset or Dataset)
        - eval_dataset: Evaluation dataset (MsDataset or Dataset)
        - preprocessor: Data preprocessor(s) for input processing
        - optimizers: Tuple of (optimizer, lr_scheduler) instances; either
          member may be None
        - model_revision: Model revision/version (default: DEFAULT_MODEL_REVISION)
        - seed: Random seed for reproducibility (default: 42)
        - callbacks: List of training hooks/callbacks
        - samplers: Data sampler(s) for training and evaluation
        - efficient_tuners: Parameter-efficient tuning configurations
        - **kwargs: Additional trainer-specific parameters
        """

    def train(self):
        """
        Start the training process.
        """

    def evaluate(self, eval_dataset: Optional[Union[MsDataset, Dataset]] = None):
        """
        Evaluate model on evaluation dataset.

        Parameters:
        - eval_dataset: Dataset for evaluation (optional)

        Returns:
        Evaluation metrics dictionary
        """

    def save_checkpoint(self, checkpoint_dir: str):
        """
        Save training checkpoint.

        Parameters:
        - checkpoint_dir: Directory to save checkpoint
        """

    def load_checkpoint(self, checkpoint_path: str):
        """
        Load training checkpoint.

        Parameters:
        - checkpoint_path: Path to checkpoint file
        """

    def resume_training(self, checkpoint_path: str):
        """
        Resume training from checkpoint.

        Parameters:
        - checkpoint_path: Path to checkpoint file
        """

Training Arguments

Configuration class for training parameters and hyperparameters.

@dataclass(init=False)
class TrainingArgs(DatasetArgs, TrainArgs, ModelArgs):
    """Configuration container for training parameters.

    Aggregates every dataclass field declared on DatasetArgs, TrainArgs and
    ModelArgs; any subset of those fields may be supplied as keyword
    arguments to the constructor.
    """

    # When True, training configuration is taken from the model itself.
    use_model_config: bool = field(
        default=False,
        metadata={
            'help': 'Use the configuration of the model'
        }
    )

    def __init__(self, **kwargs):
        """Populate declared fields from keyword arguments.

        Only keys matching a declared dataclass field are applied; anything
        else is silently ignored here. Common keys include output_dir,
        max_epochs, learning_rate, train_batch_size, eval_batch_size,
        eval_strategy / save_strategy ('no', 'steps', 'epoch'),
        logging_steps, save_steps, eval_steps and use_model_config.
        """
        # Remember exactly which options the caller set explicitly.
        self.manual_args = list(kwargs)
        # Walk the declared fields in definition order and apply overrides.
        for spec in fields(self):
            if spec.name in kwargs:
                setattr(self, spec.name, kwargs[spec.name])
        self._unknown_args = {}

Hook System

Training hooks for customizing the training process at different stages.

class Hook:
    """Base class for trainer lifecycle hooks.

    Subclasses override any subset of the callbacks below. Every callback
    receives the trainer instance; the default implementations do nothing.
    """

    def before_run(self, trainer):
        """Invoked once, just before the training loop starts."""

    def after_run(self, trainer):
        """Invoked once, after the training loop has finished."""

    def before_epoch(self, trainer):
        """Invoked at the start of every epoch."""

    def after_epoch(self, trainer):
        """Invoked at the end of every epoch."""

    def before_iter(self, trainer):
        """Invoked before every training iteration."""

    def after_iter(self, trainer):
        """Invoked after every training iteration."""

class Priority:
    """
    Priority levels for hook execution order.

    NOTE(review): HIGHEST == 0 and LOWEST == 100, so lower numeric values
    appear to mean earlier execution — confirm against the hook registry.
    """
    HIGHEST = 0
    HIGH = 10
    NORMAL = 50
    LOW = 70
    LOWEST = 100

Dataset Builder

Utility functions for creating datasets from various sources.

def build_dataset_from_file(
    data_files: str,
    # Annotation fixes: both parameters default to None, so they must be
    # Optional (PEP 484 disallows implicit Optional).
    split: Optional[str] = None,
    cache_dir: Optional[str] = None,
    **kwargs
):
    """
    Build dataset from file paths.

    Parameters:
    - data_files: Path to data file(s)
    - split: Dataset split name (optional)
    - cache_dir: Directory for caching processed data (optional)
    - **kwargs: Additional dataset parameters

    Returns:
    Dataset instance
    """

def build_trainer(cfg: dict, default_args: Optional[dict] = None):
    """
    Build trainer from configuration.

    Parameters:
    - cfg: Trainer configuration dictionary
    - default_args: Default arguments to merge (optional; annotation fixed
      to Optional[dict] since the default is None)

    Returns:
    Trainer instance
    """

Specialized Trainers

Trainer subclasses that extend EpochBasedTrainer for particular task domains.

class NlpEpochBasedTrainer(EpochBasedTrainer):
    """Epoch-based trainer tailored to NLP tasks and text processing."""

class VecoTrainer(EpochBasedTrainer):
    """Epoch-based trainer specialized for Veco models."""

Usage Examples

Basic Training Setup

from modelscope import Model, EpochBasedTrainer, TrainingArgs
from modelscope import build_dataset_from_file

# Load a pre-trained model from the ModelScope hub by identifier
model = Model.from_pretrained('damo/nlp_structbert_base_chinese')

# Build training and evaluation datasets from local JSON files
train_dataset = build_dataset_from_file('train.json')
eval_dataset = build_dataset_from_file('eval.json')

# Configure training arguments
training_args = TrainingArgs(
    output_dir='./output',
    max_epochs=10,
    learning_rate=2e-5,
    train_batch_size=16,
    eval_batch_size=32,
    eval_strategy='epoch',  # evaluate once per epoch
    save_strategy='epoch',  # checkpoint once per epoch
    logging_steps=100
)

# Create trainer
# NOTE(review): `args` is not a named parameter of EpochBasedTrainer.__init__
# above — presumably it is consumed via **kwargs; confirm against the API.
trainer = EpochBasedTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset
)

# Start training
trainer.train()

Custom Training with Hooks

from modelscope import EpochBasedTrainer, Hook, Priority

class CustomLoggingHook(Hook):
    """Example hook that prints the loss every ``log_interval`` iterations."""

    def __init__(self, log_interval=100):
        self.log_interval = log_interval  # iterations between log lines
        self.step = 0                     # running iteration counter

    def after_iter(self, trainer):
        """Count the iteration; emit a log line on interval boundaries."""
        self.step += 1
        if self.step % self.log_interval != 0:
            return
        print(f"Step {self.step}: Loss = {trainer.loss}")

    def after_epoch(self, trainer):
        """Announce completion of the epoch."""
        print(f"Epoch {trainer.epoch} completed")

class ModelCheckpointHook(Hook):
    """Example hook that saves a checkpoint every ``save_interval`` epochs."""

    def __init__(self, save_interval=5):
        self.save_interval = save_interval  # epochs between checkpoints

    def after_epoch(self, trainer):
        """Persist a checkpoint when the epoch index hits the interval."""
        if trainer.epoch % self.save_interval != 0:
            return
        trainer.save_checkpoint(f'./checkpoints/epoch_{trainer.epoch}')

# Create trainer with custom hooks
# NOTE(review): `model`, `training_args` and `train_dataset` are reused from
# the "Basic Training Setup" example above.
trainer = EpochBasedTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset
)

# Register hooks with explicit priorities
# Priority.HIGH (10) is numerically lower than Priority.NORMAL (50), so the
# logging hook presumably runs first — TODO confirm ordering semantics.
trainer.register_hook(CustomLoggingHook(log_interval=50), Priority.HIGH)
trainer.register_hook(ModelCheckpointHook(save_interval=2), Priority.NORMAL)

# Start training
trainer.train()

Fine-tuning with Evaluation

from modelscope import Model, EpochBasedTrainer, TrainingArgs
# Fix: build_dataset_from_file was used below but never imported in this
# snippet, which would raise NameError if run standalone.
from modelscope import build_dataset_from_file

# Load model for fine-tuning
model = Model.from_pretrained('damo/nlp_bert_base_chinese')

# Prepare training and evaluation datasets
train_data = build_dataset_from_file('fine_tune_train.json')
eval_data = build_dataset_from_file('fine_tune_eval.json')

# Configure fine-tuning arguments
fine_tune_args = TrainingArgs(
    output_dir='./fine_tuned_model',
    max_epochs=5,
    learning_rate=1e-5,  # Lower learning rate for fine-tuning
    train_batch_size=8,
    eval_batch_size=16,
    eval_strategy='steps',  # evaluate every eval_steps steps
    eval_steps=200,
    save_strategy='steps',  # checkpoint every save_steps steps
    save_steps=500,
    load_best_model_at_end=True,
    metric_for_best_model='eval_accuracy',
    greater_is_better=True
)

# Create trainer
trainer = EpochBasedTrainer(
    model=model,
    args=fine_tune_args,
    train_dataset=train_data,
    eval_dataset=eval_data
)

# Train and evaluate
trainer.train()
final_metrics = trainer.evaluate()
print(f"Final evaluation metrics: {final_metrics}")

Resume Training from Checkpoint

from modelscope import EpochBasedTrainer, TrainingArgs

# Configure training arguments
# NOTE(review): resume_from_checkpoint here AND the explicit
# resume_training() call below look redundant — confirm whether one of the
# two mechanisms suffices.
training_args = TrainingArgs(
    output_dir='./continued_training',
    max_epochs=20,
    resume_from_checkpoint='./checkpoints/epoch_10'
)

# Create trainer
# NOTE(review): `model` and `train_dataset` come from the earlier examples.
trainer = EpochBasedTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset
)

# Resume training from a specific checkpoint file
trainer.resume_training('./checkpoints/epoch_10/checkpoint.pth')

Custom Trainer Implementation

from modelscope import EpochBasedTrainer

class CustomTrainer(EpochBasedTrainer):
    """Trainer subclass illustrating custom loss and evaluation overrides."""

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # Custom initialization

    def compute_loss(self, model, inputs):
        """
        Custom loss computation.

        Parameters:
        - model: Model instance
        - inputs: Batch inputs

        Returns:
        Loss tensor
        """
        # Forward pass feeds directly into the custom loss function.
        return custom_loss_function(model(inputs), inputs['labels'])

    def evaluate(self, eval_dataset=None):
        """Run the base evaluation, then append a custom metric."""
        results = super().evaluate(eval_dataset)
        results['custom_metric'] = self.compute_custom_metric()
        return results

# Use custom trainer
# NOTE(review): `model`, `training_args` and the datasets are reused from
# earlier examples.
trainer = CustomTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset
)

Multi-GPU Training

from modelscope import EpochBasedTrainer, TrainingArgs
import torch

# Check for multiple GPUs before enabling multi-GPU settings
if torch.cuda.device_count() > 1:
    print(f"Using {torch.cuda.device_count()} GPUs")
    
    # Configure for multi-GPU training
    training_args = TrainingArgs(
        output_dir='./multi_gpu_output',
        max_epochs=10,
        train_batch_size=32,  # Total batch size across all GPUs
        eval_batch_size=64,
        dataloader_num_workers=4,
        fp16=True,  # Mixed precision training
        gradient_accumulation_steps=2
    )
    
    # Create trainer
    # NOTE(review): `model` and the datasets come from earlier examples;
    # multi-GPU dispatch is presumably handled internally — confirm.
    trainer = EpochBasedTrainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset
    )
    
    trainer.train()

Learning Rate Scheduling

from modelscope import EpochBasedTrainer, TrainingArgs, Hook

class LearningRateSchedulerHook(Hook):
    """Example hook that advances an external LR scheduler once per epoch."""

    def __init__(self, scheduler):
        self.scheduler = scheduler  # torch LR scheduler to step

    def after_epoch(self, trainer):
        """Step the scheduler and report the resulting learning rate."""
        self.scheduler.step()
        current_lr = self.scheduler.get_last_lr()[0]
        print(f"Learning rate updated to: {current_lr}")

# Setup training with learning rate scheduling
import torch.optim as optim
from torch.optim.lr_scheduler import StepLR

# NOTE(review): `model`, `training_args` and `train_dataset` come from
# earlier examples.
trainer = EpochBasedTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset
)

# Create optimizer and scheduler
# NOTE(review): this optimizer is never passed to the trainer (e.g. via the
# `optimizers=(optimizer, scheduler)` parameter), so the hook below steps a
# scheduler attached to an optimizer the trainer may not be using — verify.
optimizer = optim.Adam(model.parameters(), lr=1e-4)
scheduler = StepLR(optimizer, step_size=3, gamma=0.5)  # halve LR every 3 epochs

# Register scheduler hook
trainer.register_hook(LearningRateSchedulerHook(scheduler))

trainer.train()

Install with Tessl CLI

npx tessl i tessl/pypi-modelscope

docs

datasets.md

export.md

hub.md

index.md

metrics.md

models.md

pipelines.md

preprocessors.md

training.md

utilities.md

tile.json