ModelScope brings the notion of Model-as-a-Service to life with unified interfaces for state-of-the-art machine learning models.
—
ModelScope's training framework provides comprehensive tools for training and fine-tuning models across different domains. The framework supports epoch-based training with hooks, metrics, evaluation, and checkpoint management.
Main trainer class for epoch-based training workflows.
class EpochBasedTrainer:
    """Main epoch-based trainer for ModelScope models.

    Drives epoch-based training workflows with support for hooks,
    metrics, evaluation, and checkpoint management.
    """

    def __init__(
        self,
        model: Optional[Union[TorchModel, nn.Module, str]] = None,
        cfg_file: Optional[str] = None,
        cfg_modify_fn: Optional[Callable] = None,
        arg_parse_fn: Optional[Callable] = None,
        data_collator: Optional[Union[Callable, Dict[str, Callable]]] = None,
        train_dataset: Optional[Union[MsDataset, Dataset]] = None,
        eval_dataset: Optional[Union[MsDataset, Dataset]] = None,
        preprocessor: Optional[Union[Preprocessor, Dict[str, Preprocessor]]] = None,
        optimizers: Tuple[torch.optim.Optimizer,
                          torch.optim.lr_scheduler._LRScheduler] = (None, None),
        model_revision: Optional[str] = DEFAULT_MODEL_REVISION,
        seed: int = 42,
        callbacks: Optional[List[Hook]] = None,
        samplers: Optional[Union[Sampler, Dict[str, Sampler]]] = None,
        efficient_tuners: Union[Dict[str, TunerConfig], TunerConfig] = None,
        **kwargs
    ):
        """Initialize the trainer with a model and training configuration.

        Parameters:
            model: Model instance to train (TorchModel, nn.Module,
                or model identifier string).
            cfg_file: Path to configuration file.
            cfg_modify_fn: Function to modify configuration dynamically.
            arg_parse_fn: Custom argument parsing function.
            data_collator: Data collation function(s) for batching.
            train_dataset: Training dataset (MsDataset or Dataset).
            eval_dataset: Evaluation dataset (MsDataset or Dataset).
            preprocessor: Data preprocessor(s) for input processing.
            optimizers: Tuple of (optimizer, lr_scheduler) instances.
            model_revision: Model revision/version
                (default: DEFAULT_MODEL_REVISION).
            seed: Random seed for reproducibility (default: 42).
            callbacks: List of training hooks/callbacks.
            samplers: Data sampler(s) for training and evaluation.
            efficient_tuners: Parameter-efficient tuning configurations.
            **kwargs: Additional trainer-specific parameters.
        """

    def train(self):
        """Start the training process."""

    def evaluate(self, eval_dataset=None):
        """Evaluate the model on an evaluation dataset.

        Parameters:
            eval_dataset: Dataset for evaluation (optional; falls back to
                the dataset supplied at construction time).

        Returns:
            Evaluation metrics dictionary.
        """

    def save_checkpoint(self, checkpoint_dir: str):
        """Save a training checkpoint.

        Parameters:
            checkpoint_dir: Directory to save the checkpoint into.
        """

    def load_checkpoint(self, checkpoint_path: str):
        """Load a training checkpoint.

        Parameters:
            checkpoint_path: Path to the checkpoint file.
        """

    def resume_training(self, checkpoint_path: str):
        """Resume training from a checkpoint.

        Parameters:
            checkpoint_path: Path to the checkpoint file.
        """


# Configuration class for training parameters and hyperparameters.
@dataclass(init=False)
class TrainingArgs(DatasetArgs, TrainArgs, ModelArgs):
    """Configuration container for training parameters.

    Inherits all dataclass fields from DatasetArgs, TrainArgs, and
    ModelArgs, so every parameter those parents declare may be passed
    as a keyword argument to ``__init__``.
    """

    # When True, fall back to the model's own configuration values.
    use_model_config: bool = field(
        default=False,
        metadata={
            'help': 'Use the configuration of the model'
        }
    )

    def __init__(self, **kwargs):
        """Initialize training arguments from keyword arguments.

        Parameters:
            **kwargs: Training configuration parameters, including:
                - output_dir: Directory for saving model and checkpoints
                - max_epochs: Maximum number of training epochs
                - learning_rate: Learning rate for optimizer
                - train_batch_size: Batch size for training
                - eval_batch_size: Batch size for evaluation
                - eval_strategy: Evaluation strategy ('no', 'steps', 'epoch')
                - save_strategy: Checkpoint saving strategy
                  ('no', 'steps', 'epoch')
                - logging_steps: Steps between logging outputs
                - save_steps: Steps between saving checkpoints
                - eval_steps: Steps between evaluations
                - use_model_config: Whether to use model configuration

        Note: This class uses dataclass fields and supports all
        parameters from the DatasetArgs, TrainArgs, and ModelArgs
        parent classes. Fields not passed in ``kwargs`` keep their
        declared defaults.
        """
        # Record which fields the caller set explicitly, so later logic
        # can distinguish user-provided values from dataclass defaults.
        self.manual_args = list(kwargs.keys())
        for f in fields(self):
            if f.name in kwargs:
                setattr(self, f.name, kwargs[f.name])
        self._unknown_args = {}


# Training hooks for customizing the training process at different stages.
class Hook:
    """Base class for training hooks.

    Subclasses override any of the stage callbacks below; each receives
    the trainer instance and the default implementation does nothing.
    """

    def before_run(self, trainer):
        """Called before training starts.

        Parameters:
            trainer: Trainer instance.
        """

    def after_run(self, trainer):
        """Called after training completes.

        Parameters:
            trainer: Trainer instance.
        """

    def before_epoch(self, trainer):
        """Called before each epoch.

        Parameters:
            trainer: Trainer instance.
        """

    def after_epoch(self, trainer):
        """Called after each epoch.

        Parameters:
            trainer: Trainer instance.
        """

    def before_iter(self, trainer):
        """Called before each iteration.

        Parameters:
            trainer: Trainer instance.
        """

    def after_iter(self, trainer):
        """Called after each iteration.

        Parameters:
            trainer: Trainer instance.
        """
class Priority:
    """Priority levels for hook execution order.

    Lower numeric values run earlier; hooks registered with HIGHEST (0)
    execute before those registered with LOWEST (100).
    """

    HIGHEST = 0
    HIGH = 10
    NORMAL = 50
    LOW = 70
    LOWEST = 100


# Utility functions for creating datasets from various sources.
def build_dataset_from_file(
    data_files: str,
    split: str = None,
    cache_dir: str = None,
    **kwargs
):
    """Build a dataset from file paths.

    Parameters:
        data_files: Path to data file(s).
        split: Dataset split name.
        cache_dir: Directory for caching processed data.
        **kwargs: Additional dataset parameters.

    Returns:
        Dataset instance.
    """
def build_trainer(cfg: dict, default_args: dict = None):
    """Build a trainer from a configuration.

    Parameters:
        cfg: Trainer configuration dictionary.
        default_args: Default arguments merged into the configuration.

    Returns:
        Trainer instance.
    """


# Domain-specific trainer implementations for specialized tasks.
class NlpEpochBasedTrainer(EpochBasedTrainer):
    """NLP-specific trainer with text processing optimizations."""
    pass
class VecoTrainer(EpochBasedTrainer):
    """Specialized trainer for Veco models."""
    pass


# Example: basic training workflow.
from modelscope import Model, EpochBasedTrainer, TrainingArgs
from modelscope import build_dataset_from_file

# Load pre-trained model.
model = Model.from_pretrained('damo/nlp_structbert_base_chinese')

# Build datasets from local files.
train_dataset = build_dataset_from_file('train.json')
eval_dataset = build_dataset_from_file('eval.json')

# Configure training arguments.
training_args = TrainingArgs(
    output_dir='./output',
    max_epochs=10,
    learning_rate=2e-5,
    train_batch_size=16,
    eval_batch_size=32,
    eval_strategy='epoch',
    save_strategy='epoch',
    logging_steps=100
)

# Create the trainer.
trainer = EpochBasedTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset
)

# Start training.
trainer.train()

# Example: customizing training with hooks.
from modelscope import EpochBasedTrainer, Hook, Priority
class CustomLoggingHook(Hook):
    """Hook that prints the loss every ``log_interval`` steps."""

    def __init__(self, log_interval=100):
        self.log_interval = log_interval
        self.step = 0

    def after_iter(self, trainer):
        self.step += 1
        if self.step % self.log_interval == 0:
            print(f"Step {self.step}: Loss = {trainer.loss}")

    def after_epoch(self, trainer):
        print(f"Epoch {trainer.epoch} completed")


class ModelCheckpointHook(Hook):
    """Hook that saves a checkpoint every ``save_interval`` epochs."""

    def __init__(self, save_interval=5):
        self.save_interval = save_interval

    def after_epoch(self, trainer):
        if trainer.epoch % self.save_interval == 0:
            trainer.save_checkpoint(f'./checkpoints/epoch_{trainer.epoch}')


# Create trainer with custom hooks.
trainer = EpochBasedTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset
)

# Register hooks; HIGH-priority hooks run before NORMAL ones.
trainer.register_hook(CustomLoggingHook(log_interval=50), Priority.HIGH)
trainer.register_hook(ModelCheckpointHook(save_interval=2), Priority.NORMAL)

# Start training.
trainer.train()

# Example: fine-tuning a pre-trained model.
from modelscope import Model, EpochBasedTrainer, TrainingArgs
# Load model for fine-tuning.
model = Model.from_pretrained('damo/nlp_bert_base_chinese')

# Prepare datasets.
train_data = build_dataset_from_file('fine_tune_train.json')
eval_data = build_dataset_from_file('fine_tune_eval.json')

# Configure fine-tuning arguments.
fine_tune_args = TrainingArgs(
    output_dir='./fine_tuned_model',
    max_epochs=5,
    learning_rate=1e-5,  # Lower learning rate for fine-tuning
    train_batch_size=8,
    eval_batch_size=16,
    eval_strategy='steps',
    eval_steps=200,
    save_strategy='steps',
    save_steps=500,
    load_best_model_at_end=True,
    metric_for_best_model='eval_accuracy',
    greater_is_better=True
)

# Create the trainer.
trainer = EpochBasedTrainer(
    model=model,
    args=fine_tune_args,
    train_dataset=train_data,
    eval_dataset=eval_data
)

# Train and evaluate.
trainer.train()
final_metrics = trainer.evaluate()
print(f"Final evaluation metrics: {final_metrics}")

# Example: resuming training from a checkpoint.
from modelscope import EpochBasedTrainer, TrainingArgs
# Configure training arguments for continued training.
training_args = TrainingArgs(
    output_dir='./continued_training',
    max_epochs=20,
    resume_from_checkpoint='./checkpoints/epoch_10'
)

# Create the trainer.
trainer = EpochBasedTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset
)

# Resume training from the saved checkpoint.
trainer.resume_training('./checkpoints/epoch_10/checkpoint.pth')

# Example: subclassing the trainer for custom behavior.
from modelscope import EpochBasedTrainer
class CustomTrainer(EpochBasedTrainer):
    """Trainer subclass with custom loss and evaluation logic."""

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # Custom initialization goes here.

    def compute_loss(self, model, inputs):
        """Compute a custom training loss.

        Parameters:
            model: Model instance.
            inputs: Batch inputs.

        Returns:
            Loss tensor.
        """
        outputs = model(inputs)
        # Custom loss calculation.
        loss = custom_loss_function(outputs, inputs['labels'])
        return loss

    def evaluate(self, eval_dataset=None):
        """Run the base evaluation and append custom metrics."""
        # Custom evaluation implementation.
        metrics = super().evaluate(eval_dataset)
        # Add custom metrics on top of the base result.
        custom_metric = self.compute_custom_metric()
        metrics['custom_metric'] = custom_metric
        return metrics


# Use the custom trainer.
trainer = CustomTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset
)

# Example: multi-GPU and mixed-precision training.
from modelscope import EpochBasedTrainer, TrainingArgs
import torch

# Check for multiple GPUs.
if torch.cuda.device_count() > 1:
    print(f"Using {torch.cuda.device_count()} GPUs")

# Configure for multi-GPU training.
training_args = TrainingArgs(
    output_dir='./multi_gpu_output',
    max_epochs=10,
    train_batch_size=32,  # Total batch size across all GPUs
    eval_batch_size=64,
    dataloader_num_workers=4,
    fp16=True,  # Mixed precision training
    gradient_accumulation_steps=2
)

# Create the trainer (will automatically use multiple GPUs).
trainer = EpochBasedTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset
)

trainer.train()

# Example: learning-rate scheduling via a hook.
from modelscope import EpochBasedTrainer, TrainingArgs, Hook
class LearningRateSchedulerHook(Hook):
    """Hook that steps an LR scheduler at the end of every epoch."""

    def __init__(self, scheduler):
        self.scheduler = scheduler

    def after_epoch(self, trainer):
        self.scheduler.step()
        current_lr = self.scheduler.get_last_lr()[0]
        print(f"Learning rate updated to: {current_lr}")


# Setup training with learning-rate scheduling.
import torch.optim as optim
from torch.optim.lr_scheduler import StepLR

trainer = EpochBasedTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset
)

# Create optimizer and scheduler.
optimizer = optim.Adam(model.parameters(), lr=1e-4)
scheduler = StepLR(optimizer, step_size=3, gamma=0.5)

# Register the scheduler hook.
trainer.register_hook(LearningRateSchedulerHook(scheduler))
trainer.train()

# Install with Tessl CLI:
#   npx tessl i tessl/pypi-modelscope