HuggingFace Accelerate is a PyTorch library that simplifies distributed and mixed-precision training by abstracting away the boilerplate code needed for multi-GPU, TPU, and mixed-precision setups.
npx @tessl/cli install tessl/pypi-accelerate@1.10.0

Accelerate provides a thin wrapper around PyTorch that lets users run their existing training scripts on any hardware configuration (single GPU, multi-GPU, TPU, or CPU) with minimal code changes - typically just a few added lines.
pip install accelerate

from accelerate import Accelerator

For specific functionality:
from accelerate import (
    Accelerator,
    PartialState,
    ParallelismConfig,
    cpu_offload,
    cpu_offload_with_hook,
    disk_offload,
    dispatch_model,
    init_empty_weights,
    init_on_device,
    load_checkpoint_and_dispatch,
    skip_first_batches,
    prepare_pippy,
    debug_launcher,
    notebook_launcher,
    find_executable_batch_size,
    infer_auto_device_map,
    load_checkpoint_in_model,
    synchronize_rng_states
)

from accelerate import Accelerator
import torch
import torch.nn as nn
from torch.optim import AdamW
from torch.utils.data import DataLoader
# Initialize accelerator
accelerator = Accelerator(mixed_precision="fp16")
# Define a toy dataset, model, optimizer, and dataloader
dataset = [{"input": torch.randn(10), "target": torch.randn(1)} for _ in range(64)]
model = nn.Linear(10, 1)
optimizer = AdamW(model.parameters(), lr=1e-4)
dataloader = DataLoader(dataset, batch_size=16)

# Prepare for distributed training (this is the key step)
model, optimizer, dataloader = accelerator.prepare(model, optimizer, dataloader)

# Training loop
for batch in dataloader:
    with accelerator.accumulate(model):
        outputs = model(batch['input'])
        loss = torch.nn.functional.mse_loss(outputs, batch['target'])
        accelerator.backward(loss)
        optimizer.step()
        optimizer.zero_grad()

# Save model
accelerator.save_model(model, "my_model")

Accelerate follows a modular design built around the key components described below.
This design allows Accelerate to work as a universal training wrapper that adapts to any hardware configuration while keeping the user's training code largely unchanged.
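For example, the same script runs unchanged on a laptop CPU or a multi-GPU node; a minimal sketch using only the Accelerator API:

from accelerate import Accelerator

accelerator = Accelerator()

# `accelerator.device` resolves to whatever hardware was configured
# (CPU, a single GPU, or one device per spawned process).
print(accelerator.device)

# Guard logging and checkpoint writing so only the main process performs them.
if accelerator.is_main_process:
    print(f"Running on {accelerator.num_processes} process(es)")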
The main Accelerator class and essential training functionality including mixed precision, gradient accumulation, and basic distributed operations.
class Accelerator:
    def __init__(
        self,
        device_placement: bool = True,
        split_batches: bool = False,
        mixed_precision: str | None = None,
        gradient_accumulation_steps: int = 1,
        cpu: bool = False,
        dataloader_config: DataLoaderConfiguration | None = None,
        deepspeed_plugin: DeepSpeedPlugin | dict[str, DeepSpeedPlugin] | None = None,
        fsdp_plugin: FullyShardedDataParallelPlugin | None = None,
        megatron_lm_plugin: MegatronLMPlugin | None = None,
        rng_types: list[str] | None = None,
        log_with: str | list[str] | None = None,
        project_dir: str | None = None,
        project_config: ProjectConfiguration | None = None,
        gradient_accumulation_plugin: GradientAccumulationPlugin | None = None,
        step_scheduler_with_optimizer: bool = True,
        kwargs_handlers: list[KwargsHandler] | None = None,
        dynamo_backend: str | None = None,
        dynamo_plugin: TorchDynamoPlugin | None = None,
        parallelism_config: ParallelismConfig | None = None,
        **kwargs
    ): ...

    def prepare(self, *args): ...
    def backward(self, loss, **kwargs): ...
    def gather(self, tensor): ...
    def save_model(self, model, save_directory: str, **kwargs): ...

Device management utilities for handling large models through CPU/disk offloading, device mapping, and efficient initialization strategies.
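A minimal sketch of the big-model loading pattern these utilities enable; the small Sequential stands in for a large model, and the checkpoint path is a hypothetical placeholder:

import torch.nn as nn
from accelerate import infer_auto_device_map, init_empty_weights, load_checkpoint_and_dispatch

# Build the model skeleton without allocating real memory: parameters live on
# the "meta" device until the checkpoint is loaded.
with init_empty_weights():
    model = nn.Sequential(nn.Linear(1024, 4096), nn.ReLU(), nn.Linear(4096, 1024))

# Work out which device each submodule should live on, then load the weights
# and scatter them according to that map.
device_map = infer_auto_device_map(model)
model = load_checkpoint_and_dispatch(
    model,
    checkpoint="path/to/checkpoint",  # hypothetical checkpoint file or folder
    device_map=device_map,
)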
def cpu_offload(
    model: torch.nn.Module,
    execution_device: torch.device | str | int | None = None,
    offload_buffers: bool = False,
    state_dict: dict[str, torch.Tensor] | None = None,
    preload_module_classes: list[str] | None = None
): ...

def cpu_offload_with_hook(
    model: torch.nn.Module,
    execution_device: torch.device | str | int | None = None,
    prev_module_hook: UserCpuOffloadHook | None = None
): ...

def disk_offload(
    model: torch.nn.Module,
    offload_dir: str | os.PathLike,
    execution_device: torch.device | str | int | None = None,
    offload_buffers: bool = False
): ...

def dispatch_model(
    model: torch.nn.Module,
    device_map: dict[str, torch.device | str | int] | None = None,
    main_device: torch.device | str | int | None = None,
    state_dict: dict[str, torch.Tensor] | None = None,
    strict: bool = False,
    preload_module_classes: list[str] | None = None
): ...

def init_empty_weights(include_buffers: bool = None): ...
def init_on_device(device: torch.device | str | int, include_buffers: bool = None): ...

def load_checkpoint_and_dispatch(
    model: torch.nn.Module,
    checkpoint: str | os.PathLike,
    device_map: dict[str, torch.device | str | int] | None = None,
    max_memory: dict[int | str, int | str] | None = None,
    no_split_module_classes: list[str] | None = None,
    strict: bool = False,
    dtype: torch.dtype | None = None
): ...

Low-level distributed communication primitives for gathering, broadcasting, and synchronizing data across processes.
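The Accelerator also exposes gather, reduce, and wait_for_everyone as methods; a minimal sketch of collecting a per-process metric:

import torch
from accelerate import Accelerator

accelerator = Accelerator()

# Each process computes a local metric tensor on its own device.
local_loss = torch.tensor([1.0], device=accelerator.device)

# Collect the values from every process (shape becomes [num_processes]).
all_losses = accelerator.gather(local_loss)

# Or reduce to a single averaged value across processes.
mean_loss = accelerator.reduce(local_loss, reduction="mean")

# Block until every process reaches this point, e.g. before saving.
accelerator.wait_for_everyone()
if accelerator.is_main_process:
    print(all_losses, mean_loss)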
def broadcast(tensor: torch.Tensor, from_process: int = 0): ...
def gather(tensor: torch.Tensor): ...
def reduce(tensor: torch.Tensor, reduction: str = "mean"): ...
def wait_for_everyone(): ...
def synchronize_rng_states(rng_types: list[str] | None = None): ...

Configuration classes and plugins for customizing distributed training behavior, including DeepSpeed, FSDP, and mixed precision settings.
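For example, a minimal sketch (values are illustrative, not recommendations, and DeepSpeed is assumed to be installed) that enables ZeRO stage 2 through a plugin:

from accelerate import Accelerator, DeepSpeedPlugin

ds_plugin = DeepSpeedPlugin(
    zero_stage=2,
    gradient_accumulation_steps=4,
    gradient_clipping=1.0,
)
accelerator = Accelerator(deepspeed_plugin=ds_plugin, mixed_precision="bf16")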
class DeepSpeedPlugin:
    def __init__(
        self,
        hf_ds_config: dict | str | None = None,
        gradient_accumulation_steps: int | None = None,
        gradient_clipping: float | None = None,
        zero_stage: int | None = None,
        **kwargs
    ): ...

class FullyShardedDataParallelPlugin:
    def __init__(
        self,
        sharding_strategy: int | None = None,
        backward_prefetch: int | None = None,
        mixed_precision_policy: MixedPrecision | None = None,
        **kwargs
    ): ...

Memory management, checkpointing, model utilities, and various helper functions for training workflows.
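For example, a minimal sketch of the find_executable_batch_size decorator, which retries the wrapped function with a halved batch size whenever it runs out of memory (the training body is left hypothetical):

from accelerate import find_executable_batch_size

@find_executable_batch_size(starting_batch_size=128)
def train(batch_size):
    # Hypothetical inner loop: build a dataloader with `batch_size` and train.
    # On an out-of-memory error, the decorator halves the batch size and
    # calls this function again.
    ...

train()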
def find_executable_batch_size(function, starting_batch_size: int = 128): ...
def infer_auto_device_map(
    model: torch.nn.Module,
    max_memory: dict[int | str, int | str] | None = None,
    no_split_module_classes: list[str] | None = None
): ...

def load_checkpoint_in_model(
    model: torch.nn.Module,
    checkpoint: str | os.PathLike,
    device_map: dict[str, torch.device | str | int] | None = None
): ...

Command-line tools for configuration, launching distributed training, memory estimation, and environment management.
accelerate config # Interactive configuration setup
accelerate launch # Launch distributed training
accelerate env # Display environment information
accelerate estimate-memory # Estimate memory requirements
accelerate test # Test distributed setup

DataLoader utilities for skipping batches and handling distributed data loading patterns.
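For example, a minimal sketch of resuming mid-epoch with skip_first_batches (the original dataloader is assumed to be built elsewhere):

from accelerate import Accelerator, skip_first_batches

accelerator = Accelerator()
train_dataloader = accelerator.prepare(train_dataloader)  # assumes a DataLoader defined elsewhere

# When resuming a run mid-epoch, skip the batches already consumed before the restart.
resumed_dataloader = skip_first_batches(train_dataloader, num_batches=100)
for batch in resumed_dataloader:
    ...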
def skip_first_batches(dataloader: torch.utils.data.DataLoader, num_batches: int): ...

Tools for launching distributed training from different environments, including notebooks and debugging scenarios.
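For example, a minimal sketch of launching a training function from a notebook cell on a hypothetical 2-GPU machine:

from accelerate import notebook_launcher

def training_function():
    # Hypothetical training body: inside it you would create an Accelerator
    # and run the usual prepare()/backward() loop.
    ...

# Spawn 2 processes from the notebook, one per GPU.
notebook_launcher(training_function, args=(), num_processes=2)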
def notebook_launcher(
    function,
    args: tuple = (),
    num_processes: int = None,
    mixed_precision: str = "no",
    use_port: str = "29500"
): ...

def debug_launcher(
    function,
    args: tuple = (),
    num_processes: int = 2
): ...

Pipeline parallelism utilities for large model inference.
def prepare_pippy(
    model: torch.nn.Module,
    split_points: str | list[str] | None = None,
    no_split_module_classes: list[str] | None = None
): ...

class PartialState:
    """Singleton class containing distributed training state."""
    device: torch.device
    distributed_type: DistributedType
    local_process_index: int
    process_index: int
    num_processes: int
    is_main_process: bool
    is_local_main_process: bool

    def wait_for_everyone(self): ...
    def split_between_processes(self, inputs, apply_padding: bool = False): ...
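For example, a minimal sketch that shards a list of inputs across processes with PartialState (the per-item work is a stand-in):

from accelerate import PartialState

state = PartialState()

# Each process receives its own shard of the inputs; apply_padding can be used
# to give every process the same number of items.
with state.split_between_processes(["cat", "dog", "bird"]) as shard:
    outputs = [item.upper() for item in shard]  # stand-in for real per-process work

state.wait_for_everyone()
if state.is_main_process:
    print(f"Finished on {state.num_processes} process(es)")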
class DataLoaderConfiguration:
    """Configuration for DataLoader behavior in distributed training."""
    split_batches: bool = False
    dispatch_batches: bool | None = None
    even_batches: bool = True
    use_seedable_sampler: bool = False
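For example, a minimal sketch that bundles dataloader options into a DataLoaderConfiguration and hands it to the Accelerator:

from accelerate import Accelerator
from accelerate.utils import DataLoaderConfiguration

# Configure how batches are split across processes, then pass the configuration
# object instead of individual keyword arguments.
dl_config = DataLoaderConfiguration(split_batches=True, even_batches=True)
accelerator = Accelerator(dataloader_config=dl_config)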
class ProjectConfiguration:
    """Configuration for project output and logging."""
    project_dir: str = "."
    logging_dir: str | None = None
    automatic_checkpoint_naming: bool = False
    total_limit: int | None = None
    iteration_checkpoints: bool = False
    save_every_n_steps: int | None = None
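For example, a minimal sketch that enables automatic checkpoint naming under a hypothetical output directory:

from accelerate import Accelerator
from accelerate.utils import ProjectConfiguration

project_config = ProjectConfiguration(
    project_dir="outputs",              # hypothetical output directory
    automatic_checkpoint_naming=True,
    total_limit=3,                      # keep only the most recent checkpoints
)
accelerator = Accelerator(project_config=project_config)
accelerator.save_state()                # writes a numbered checkpoint under project_dir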
from enum import Enum
class DistributedType(Enum):
    """Types of distributed training backends."""
    NO = "NO"
    MULTI_CPU = "MULTI_CPU"
    MULTI_GPU = "MULTI_GPU"
    MULTI_MLU = "MULTI_MLU"
    MULTI_NPU = "MULTI_NPU"
    MULTI_XPU = "MULTI_XPU"
    DEEPSPEED = "DEEPSPEED"
    FSDP = "FSDP"

class PrecisionType(Enum):
    """Mixed precision training types."""
    NO = "no"
    FP16 = "fp16"
    BF16 = "bf16"
    FP8 = "fp8"

class LoggerType(Enum):
    """Experiment tracking logger types."""
    TENSORBOARD = "tensorboard"
    WANDB = "wandb"
    COMET_ML = "comet_ml"
    MLFLOW = "mlflow"
    AIM = "aim"
    CLEARML = "clearml"
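For example, a minimal sketch that branches on the active backend via DistributedType:

from accelerate import Accelerator
from accelerate.utils import DistributedType

accelerator = Accelerator()
# Apply backend-specific logic, e.g. DeepSpeed-only configuration.
if accelerator.distributed_type == DistributedType.DEEPSPEED:
    print("Running under DeepSpeed")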
class GradientAccumulationPlugin:
    """Plugin for gradient accumulation configuration."""
    num_steps: int
    adjust_scheduler: bool = True
    sync_with_dataloader: bool = True
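For example, a minimal sketch that configures accumulation over 4 steps through the plugin instead of the plain gradient_accumulation_steps argument:

from accelerate import Accelerator
from accelerate.utils import GradientAccumulationPlugin

ga_plugin = GradientAccumulationPlugin(num_steps=4, sync_with_dataloader=True)
accelerator = Accelerator(gradient_accumulation_plugin=ga_plugin)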
class MegatronLMPlugin:
    """Plugin for Megatron-LM configuration."""
    tp_degree: int = 1
    pp_degree: int = 1
    num_micro_batches: int = 1
    sequence_parallelism: bool = False
    recompute_activations: bool = False
    use_distributed_optimizer: bool = False
class UserCpuOffloadHook:
    """Hook for managing CPU offloading behavior."""
    def offload(self): ...
    def remove(self): ...
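For example, a minimal sketch of chaining two offload hooks so two pipeline stages share one GPU; the Linear layers are stand-ins for real sub-models, and a CUDA device is assumed:

import torch.nn as nn
from accelerate import cpu_offload_with_hook

stage_1 = nn.Linear(512, 512)
stage_2 = nn.Linear(512, 512)

# Each stage is moved to the GPU when called and offloaded back to CPU when the
# next stage's hook takes over.
stage_1, hook_1 = cpu_offload_with_hook(stage_1, execution_device=0)
stage_2, hook_2 = cpu_offload_with_hook(stage_2, execution_device=0, prev_module_hook=hook_1)

# ... run stage_1 then stage_2 as usual ...

hook_2.offload()  # explicitly push the last stage back to CPU
hook_2.remove()   # detach the hooks when finished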
class TorchDynamoPlugin:
    """Plugin for PyTorch Dynamo configuration."""
    backend: str = "inductor"
    mode: str | None = None
    fullgraph: bool = False
    dynamic: bool | None = None
    options: dict | None = None
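For example, a minimal sketch that asks Accelerate to compile prepared models with the inductor backend:

from accelerate import Accelerator
from accelerate.utils import TorchDynamoPlugin

dynamo_plugin = TorchDynamoPlugin(backend="inductor", mode="max-autotune")
accelerator = Accelerator(dynamo_plugin=dynamo_plugin)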
class KwargsHandler:
    """Base class for handling additional configuration arguments."""
    pass
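Concrete handlers such as DistributedDataParallelKwargs (provided by the library, though not listed above) subclass KwargsHandler; a minimal sketch passing DDP options through to PyTorch:

from accelerate import Accelerator, DistributedDataParallelKwargs

# Kwargs handlers forward extra options to the underlying torch objects,
# here to torch.nn.parallel.DistributedDataParallel.
ddp_kwargs = DistributedDataParallelKwargs(find_unused_parameters=True)
accelerator = Accelerator(kwargs_handlers=[ddp_kwargs])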