HuggingFace Accelerate is a PyTorch library that simplifies distributed and mixed-precision training by abstracting away the boilerplate code needed for multi-GPU, TPU, and mixed-precision setups.
Quality: Pending — a best-practices review has not yet been performed.
Impact: Pending — no eval scenarios have been run.
Configuration classes and plugins for customizing distributed training behavior, including DeepSpeed integration, FSDP configuration, mixed precision settings, and other advanced training optimizations.
Base configuration objects for controlling distributed training behavior.
class DataLoaderConfiguration:
    """
    Configuration for DataLoader behavior in distributed training.

    Controls how data is distributed and processed across multiple processes.
    """

    def __init__(
        self,
        split_batches: bool = False,
        dispatch_batches: bool | None = None,
        even_batches: bool = True,
        use_seedable_sampler: bool = False,
        use_configured_sampler: bool = False,
        non_blocking: bool = False,
        gradient_accumulation_kwargs: dict | None = None,
    ):
        """
        Initialize DataLoader configuration.

        Parameters:
        - split_batches: Whether to split batches across processes
        - dispatch_batches: Whether to dispatch batches to processes
        - even_batches: Ensure all processes get same number of batches
        - use_seedable_sampler: Use seedable sampler for reproducibility
        - use_configured_sampler: Use custom sampler configuration
        - non_blocking: Use non-blocking data transfer
        - gradient_accumulation_kwargs: Additional gradient accumulation settings
        """
        # The original stub silently discarded every argument; persist the
        # settings so the configuration object actually carries them.
        self.split_batches = split_batches
        self.dispatch_batches = dispatch_batches
        self.even_batches = even_batches
        self.use_seedable_sampler = use_seedable_sampler
        self.use_configured_sampler = use_configured_sampler
        self.non_blocking = non_blocking
        self.gradient_accumulation_kwargs = gradient_accumulation_kwargs
class ProjectConfiguration:
    """
    Configuration for project output directories and logging behavior.
    """

    def __init__(
        self,
        project_dir: str = ".",
        logging_dir: str | None = None,
        automatic_checkpoint_naming: bool = False,
        total_limit: int | None = None,
        iteration_checkpoints: bool = False,
        save_on_each_node: bool = False,
    ):
        """
        Initialize project configuration.

        Parameters:
        - project_dir: Root directory for project outputs
        - logging_dir: Directory for log files (relative to project_dir)
        - automatic_checkpoint_naming: Auto-generate checkpoint names
        - total_limit: Maximum number of checkpoints to keep
        - iteration_checkpoints: Save checkpoints by iteration number
        - save_on_each_node: Save checkpoints on every node
        """
        # The original stub discarded every argument; store them for callers.
        self.project_dir = project_dir
        # NOTE(review): docs describe logging_dir as relative to project_dir;
        # the join is presumably done by the consumer — stored as given here.
        self.logging_dir = logging_dir
        self.automatic_checkpoint_naming = automatic_checkpoint_naming
        self.total_limit = total_limit
        self.iteration_checkpoints = iteration_checkpoints
        self.save_on_each_node = save_on_each_node
class GradientAccumulationPlugin:
    """
    Plugin for configuring gradient accumulation behavior.
    """

    def __init__(
        self,
        num_steps: int | None = None,
        adjust_scheduler: bool = True,
        sync_with_dataloader: bool = True,
    ):
        """
        Initialize gradient accumulation plugin.

        Parameters:
        - num_steps: Number of steps to accumulate gradients
        - adjust_scheduler: Adjust scheduler for accumulation steps
        - sync_with_dataloader: Sync accumulation with dataloader length
        """
        # The original stub discarded every argument; store them so the
        # plugin actually carries its configuration.
        self.num_steps = num_steps
        self.adjust_scheduler = adjust_scheduler
        self.sync_with_dataloader = sync_with_dataloader
# Configuration for DeepSpeed distributed training integration.
class DeepSpeedPlugin:
    """
    Plugin for DeepSpeed distributed training configuration.

    Provides integration with Microsoft DeepSpeed for memory-efficient
    training with ZeRO optimizer states, gradient partitioning, and
    parameter offloading.
    """

    def __init__(
        self,
        hf_ds_config: dict | str | None = None,
        gradient_accumulation_steps: int | None = None,
        gradient_clipping: float | None = None,
        zero_stage: int | None = None,
        is_train_batch_min: bool = True,
        auto_wrap_policy: bool | None = None,
        offload_optimizer_device: str | None = None,
        offload_param_device: str | None = None,
        offload_optimizer_nvme_path: str | None = None,
        offload_param_nvme_path: str | None = None,
        zero3_init_flag: bool | None = None,
        zero3_save_16bit_model: bool | None = None,
        **kwargs,
    ):
        """
        Initialize DeepSpeed plugin configuration.

        Parameters:
        - hf_ds_config: DeepSpeed configuration dict or path to config file
        - gradient_accumulation_steps: Number of gradient accumulation steps
        - gradient_clipping: Gradient clipping threshold
        - zero_stage: ZeRO optimization stage (0, 1, 2, or 3)
        - is_train_batch_min: Whether train_batch_size is minimum per device
        - auto_wrap_policy: Automatic model wrapping policy
        - offload_optimizer_device: Device for optimizer state offloading
        - offload_param_device: Device for parameter offloading
        - offload_optimizer_nvme_path: NVMe path for optimizer offloading
        - offload_param_nvme_path: NVMe path for parameter offloading
        - zero3_init_flag: Enable ZeRO-3 initialization optimizations
        - zero3_save_16bit_model: Save model in 16-bit precision with ZeRO-3
        - kwargs: Extra options, kept verbatim for forwarding to DeepSpeed
        """
        # The original stub discarded every argument; store the settings.
        self.hf_ds_config = hf_ds_config
        self.gradient_accumulation_steps = gradient_accumulation_steps
        self.gradient_clipping = gradient_clipping
        self.zero_stage = zero_stage
        self.is_train_batch_min = is_train_batch_min
        self.auto_wrap_policy = auto_wrap_policy
        self.offload_optimizer_device = offload_optimizer_device
        self.offload_param_device = offload_param_device
        self.offload_optimizer_nvme_path = offload_optimizer_nvme_path
        self.offload_param_nvme_path = offload_param_nvme_path
        self.zero3_init_flag = zero3_init_flag
        self.zero3_save_16bit_model = zero3_save_16bit_model
        self.kwargs = kwargs
# Configuration for PyTorch Fully Sharded Data Parallel training.
class FullyShardedDataParallelPlugin:
    """
    Plugin for PyTorch FSDP (Fully Sharded Data Parallel) configuration.

    Enables memory-efficient training by sharding model parameters,
    gradients, and optimizer states across multiple GPUs.
    """

    def __init__(
        self,
        sharding_strategy: int | None = None,
        backward_prefetch: int | None = None,
        mixed_precision_policy: MixedPrecision | None = None,
        auto_wrap_policy: ModuleWrapPolicy | None = None,
        cpu_offload: CPUOffload | None = None,
        ignored_modules: list[torch.nn.Module] | None = None,
        state_dict_type: str | None = None,
        state_dict_config: dict | None = None,
        optim_state_dict_config: dict | None = None,
        limit_all_gathers: bool = True,
        use_orig_params: bool = True,
        param_init_fn: callable | None = None,
        sync_module_states: bool = True,
        forward_prefetch: bool = False,
        activation_checkpointing: bool = False,
    ):
        """
        Initialize FSDP plugin configuration.

        Parameters:
        - sharding_strategy: Parameter sharding strategy
        - backward_prefetch: Backward pass prefetching strategy
        - mixed_precision_policy: Mixed precision configuration
        - auto_wrap_policy: Automatic module wrapping policy
        - cpu_offload: CPU offloading configuration
        - ignored_modules: Modules to exclude from FSDP wrapping
        - state_dict_type: Type of state dict to use
        - state_dict_config: State dict configuration
        - optim_state_dict_config: Optimizer state dict configuration
        - limit_all_gathers: Limit simultaneous all-gather operations
        - use_orig_params: Use original parameter references
        - param_init_fn: Custom parameter initialization function
        - sync_module_states: Synchronize module states across ranks
        - forward_prefetch: Enable forward pass prefetching
        - activation_checkpointing: Enable activation checkpointing
        """
        # The original stub discarded every argument; store the settings.
        self.sharding_strategy = sharding_strategy
        self.backward_prefetch = backward_prefetch
        self.mixed_precision_policy = mixed_precision_policy
        self.auto_wrap_policy = auto_wrap_policy
        self.cpu_offload = cpu_offload
        self.ignored_modules = ignored_modules
        self.state_dict_type = state_dict_type
        self.state_dict_config = state_dict_config
        self.optim_state_dict_config = optim_state_dict_config
        self.limit_all_gathers = limit_all_gathers
        self.use_orig_params = use_orig_params
        self.param_init_fn = param_init_fn
        self.sync_module_states = sync_module_states
        self.forward_prefetch = forward_prefetch
        self.activation_checkpointing = activation_checkpointing
# Configuration classes for different mixed precision training modes.
class AutocastKwargs:
    """
    Configuration for PyTorch autocast mixed precision.
    """

    def __init__(
        self,
        enabled: bool = True,
        cache_enabled: bool | None = None,
    ):
        """
        Initialize autocast configuration.

        Parameters:
        - enabled: Whether to enable autocast
        - cache_enabled: Whether to enable autocast caching
        """
        # The original stub discarded both arguments; store the settings.
        self.enabled = enabled
        self.cache_enabled = cache_enabled
class GradScalerKwargs:
    """
    Configuration for gradient scaling in mixed precision training.
    """

    def __init__(
        self,
        init_scale: float = 65536.0,
        growth_factor: float = 2.0,
        backoff_factor: float = 0.5,
        growth_interval: int = 2000,
        enabled: bool = True,
    ):
        """
        Initialize gradient scaler configuration.

        Parameters:
        - init_scale: Initial scaling factor
        - growth_factor: Factor to multiply scale by when no overflow
        - backoff_factor: Factor to multiply scale by when overflow detected
        - growth_interval: Number of steps between scale increases
        - enabled: Whether gradient scaling is enabled
        """
        # The original stub discarded every argument; store the settings.
        self.init_scale = init_scale
        self.growth_factor = growth_factor
        self.backoff_factor = backoff_factor
        self.growth_interval = growth_interval
        self.enabled = enabled
class FP8RecipeKwargs:
    """
    Configuration for FP8 (8-bit floating point) training.
    """

    def __init__(
        self,
        backend: str = "TE",
        use_autocast: bool = True,
        fp8_format: str = "HYBRID",
        amax_history_len: int = 1024,
        amax_compute_algo: str = "most_recent",
    ):
        """
        Initialize FP8 training configuration.

        Parameters:
        - backend: FP8 backend to use ("TE" for Transformer Engine)
        - use_autocast: Whether to use autocast with FP8
        - fp8_format: FP8 format specification
        - amax_history_len: Length of amax history for scaling
        - amax_compute_algo: Algorithm for computing amax values
        """
        # The original stub discarded every argument; store the settings.
        self.backend = backend
        self.use_autocast = use_autocast
        self.fp8_format = fp8_format
        self.amax_history_len = amax_history_len
        self.amax_compute_algo = amax_compute_algo
# Configuration for PyTorch compilation and optimization features.
class TorchDynamoPlugin:
    """
    Plugin for PyTorch Dynamo compilation configuration.

    Enables torch.compile optimizations for faster training and inference.
    """

    def __init__(
        self,
        backend: str = "inductor",
        mode: str | None = None,
        fullgraph: bool = False,
        dynamic: bool | None = None,
        options: dict | None = None,
        disable: bool = False,
    ):
        """
        Initialize Torch Dynamo plugin.

        Parameters:
        - backend: Compilation backend ("inductor", "aot_eager", etc.)
        - mode: Compilation mode ("default", "reduce-overhead", "max-autotune")
        - fullgraph: Whether to require full graph compilation
        - dynamic: Enable dynamic shape compilation
        - options: Additional backend-specific options
        - disable: Whether to disable compilation
        """
        # The original stub discarded every argument; store the settings.
        self.backend = backend
        self.mode = mode
        self.fullgraph = fullgraph
        self.dynamic = dynamic
        self.options = options
        self.disable = disable
class TorchTensorParallelPlugin:
    """
    Plugin for PyTorch tensor parallelism configuration.
    """

    def __init__(
        self,
        tensor_parallel_degree: int = 1,
        parallelize_plan: dict | None = None,
    ):
        """
        Initialize tensor parallel plugin.

        Parameters:
        - tensor_parallel_degree: Degree of tensor parallelism
        - parallelize_plan: Custom parallelization plan
        """
        # The original stub discarded both arguments; store the settings.
        self.tensor_parallel_degree = tensor_parallel_degree
        self.parallelize_plan = parallelize_plan
# Configuration classes for model quantization techniques.
class BnbQuantizationConfig:
    """
    Configuration for Bitsandbytes quantization.

    Enables 4-bit and 8-bit quantization for memory-efficient training.
    """

    def __init__(
        self,
        load_in_8bit: bool = False,
        load_in_4bit: bool = False,
        llm_int8_threshold: float = 6.0,
        llm_int8_skip_modules: list[str] | None = None,
        llm_int8_enable_fp32_cpu_offload: bool = False,
        llm_int8_has_fp16_weight: bool = False,
        bnb_4bit_compute_dtype: torch.dtype | None = None,
        bnb_4bit_quant_type: str = "fp4",
        bnb_4bit_use_double_quant: bool = False,
        bnb_4bit_quant_storage: torch.dtype | None = None,
    ):
        """
        Initialize Bitsandbytes quantization configuration.

        Parameters:
        - load_in_8bit: Enable 8-bit quantization
        - load_in_4bit: Enable 4-bit quantization
        - llm_int8_threshold: Threshold for int8 quantization
        - llm_int8_skip_modules: Modules to skip during quantization
        - llm_int8_enable_fp32_cpu_offload: Enable FP32 CPU offloading
        - llm_int8_has_fp16_weight: Whether model has FP16 weights
        - bnb_4bit_compute_dtype: Compute dtype for 4-bit operations
        - bnb_4bit_quant_type: 4-bit quantization type ("fp4" or "nf4")
        - bnb_4bit_use_double_quant: Enable double quantization
        - bnb_4bit_quant_storage: Storage dtype for quantized weights
        """
        # The original stub discarded every argument; store the settings.
        self.load_in_8bit = load_in_8bit
        self.load_in_4bit = load_in_4bit
        self.llm_int8_threshold = llm_int8_threshold
        self.llm_int8_skip_modules = llm_int8_skip_modules
        self.llm_int8_enable_fp32_cpu_offload = llm_int8_enable_fp32_cpu_offload
        self.llm_int8_has_fp16_weight = llm_int8_has_fp16_weight
        self.bnb_4bit_compute_dtype = bnb_4bit_compute_dtype
        self.bnb_4bit_quant_type = bnb_4bit_quant_type
        self.bnb_4bit_use_double_quant = bnb_4bit_use_double_quant
        self.bnb_4bit_quant_storage = bnb_4bit_quant_storage
# Configuration for distributed process group initialization.
class InitProcessGroupKwargs:
    """
    Configuration for distributed process group initialization.
    """

    def __init__(
        self,
        init_method: str | None = None,
        timeout: int = 1800,
        backend: str | None = None,
    ):
        """
        Initialize process group configuration.

        Parameters:
        - init_method: Method for process group initialization
        - timeout: Timeout for initialization (seconds)
        - backend: Distributed backend to use
        """
        # The original stub discarded every argument; store the settings.
        self.init_method = init_method
        self.timeout = timeout
        self.backend = backend
class DistributedDataParallelKwargs:
    """
    Configuration for PyTorch DistributedDataParallel wrapper.
    """

    def __init__(
        self,
        dim: int = 0,
        broadcast_buffers: bool = True,
        bucket_cap_mb: int = 25,
        find_unused_parameters: bool = False,
        check_reduction: bool = False,
        gradient_as_bucket_view: bool = False,
        static_graph: bool = False,
        comm_hook: callable | None = None,
        comm_state_option: str | None = None,
    ):
        """
        Initialize DDP configuration.

        Parameters:
        - dim: Dimension for gradient reduction
        - broadcast_buffers: Whether to broadcast buffers
        - bucket_cap_mb: Bucket size for gradient communication (MB)
        - find_unused_parameters: Find unused parameters during backward
        - check_reduction: Check gradient reduction correctness
        - gradient_as_bucket_view: Use gradient as bucket view
        - static_graph: Whether computation graph is static
        - comm_hook: Custom communication hook
        - comm_state_option: Communication state configuration
        """
        # The original stub discarded every argument; store the settings.
        self.dim = dim
        self.broadcast_buffers = broadcast_buffers
        self.bucket_cap_mb = bucket_cap_mb
        self.find_unused_parameters = find_unused_parameters
        self.check_reduction = check_reduction
        self.gradient_as_bucket_view = gradient_as_bucket_view
        self.static_graph = static_graph
        self.comm_hook = comm_hook
        self.comm_state_option = comm_state_option
# Usage example: combining core configuration objects.
# (Reconstructed from an extraction-fused line; indentation restored.)
from accelerate import (
    Accelerator,
    DataLoaderConfiguration,
    ProjectConfiguration,
    GradientAccumulationPlugin,
)

# Configure data loading behavior
dataloader_config = DataLoaderConfiguration(
    split_batches=True,
    even_batches=True,
    use_seedable_sampler=True,
)

# Configure project outputs
project_config = ProjectConfiguration(
    project_dir="./experiments",
    logging_dir="logs",
    automatic_checkpoint_naming=True,
    total_limit=5,
)

# Configure gradient accumulation
grad_accumulation = GradientAccumulationPlugin(
    num_steps=4,
    adjust_scheduler=True,
)

# Initialize accelerator with configurations
accelerator = Accelerator(
    mixed_precision="fp16",
    dataloader_config=dataloader_config,
    project_config=project_config,
    gradient_accumulation_plugin=grad_accumulation,
)
# Usage example: DeepSpeed integration.
# (Reconstructed from extraction-fused lines; the trailing Accelerator call
# was fused onto the next snippet's first line in the scraped source.)
from accelerate import Accelerator, DeepSpeedPlugin

# Define DeepSpeed configuration
deepspeed_config = {
    "train_batch_size": 16,
    "gradient_accumulation_steps": 4,
    "optimizer": {
        "type": "Adam",
        "params": {"lr": 1e-4},
    },
    "zero_optimization": {
        "stage": 2,
        "offload_optimizer": {"device": "cpu"},
        "overlap_comm": True,
        "contiguous_gradients": True,
    },
    "fp16": {"enabled": True},
}

# Create DeepSpeed plugin
deepspeed_plugin = DeepSpeedPlugin(
    hf_ds_config=deepspeed_config,
    zero_stage=2,
    gradient_accumulation_steps=4,
    gradient_clipping=1.0,
)

# Initialize accelerator with DeepSpeed
accelerator = Accelerator(deepspeed_plugin=deepspeed_plugin)
# Usage example: FSDP configuration.
# (Reconstructed from extraction-fused lines; the trailing Accelerator call
# was fused onto the next snippet's first line in the scraped source.)
from accelerate import Accelerator, FullyShardedDataParallelPlugin
from torch.distributed.fsdp import ShardingStrategy, BackwardPrefetch

# Configure FSDP plugin
fsdp_plugin = FullyShardedDataParallelPlugin(
    sharding_strategy=ShardingStrategy.FULL_SHARD,
    backward_prefetch=BackwardPrefetch.BACKWARD_PRE,
    cpu_offload=None,  # Keep on GPU
    mixed_precision_policy=None,  # Use default
    auto_wrap_policy=None,  # Auto-detect
    limit_all_gathers=True,
    use_orig_params=True,
    sync_module_states=True,
)

# Initialize with FSDP
accelerator = Accelerator(fsdp_plugin=fsdp_plugin)
# Usage example: advanced mixed precision handlers.
# (Reconstructed from an extraction-fused line; indentation restored.)
from accelerate import (
    Accelerator,
    AutocastKwargs,
    GradScalerKwargs,
    FP8RecipeKwargs,
)

# Configure autocast behavior
autocast_kwargs = AutocastKwargs(
    enabled=True,
    cache_enabled=False,
)

# Configure gradient scaling
scaler_kwargs = GradScalerKwargs(
    init_scale=2**16,
    growth_factor=2.0,
    backoff_factor=0.5,
    growth_interval=2000,
)

# Configure FP8 training (if supported)
fp8_kwargs = FP8RecipeKwargs(
    backend="TE",
    use_autocast=True,
    fp8_format="HYBRID",
)

# Initialize with advanced mixed precision
accelerator = Accelerator(
    mixed_precision="fp16",
    kwargs_handlers=[autocast_kwargs, scaler_kwargs],
)
# Usage example: Bitsandbytes quantization.
# (Reconstructed from extraction-fused lines; indentation restored.)
from accelerate import Accelerator, BnbQuantizationConfig
import torch

# Configure 4-bit quantization
bnb_config = BnbQuantizationConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
)

# Note: Quantization is typically applied during model loading
# rather than through Accelerator initialization.
# NOTE(review): AutoModelForCausalLM is never imported in this snippet —
# it presumably requires `from transformers import AutoModelForCausalLM`.
model = AutoModelForCausalLM.from_pretrained(
    "model_name",
    quantization_config=bnb_config,
    device_map="auto",
)

# Install with Tessl CLI:
#   npx tessl i tessl/pypi-accelerate