HuggingFace Accelerate is a PyTorch library that simplifies distributed and mixed-precision training by abstracting away the boilerplate code needed for multi-GPU, TPU, and mixed-precision setups.
Quality: Pending — Does it follow best practices?
Impact: Pending — No eval scenarios have been run.
Device management utilities for handling large models that exceed single device memory through CPU/disk offloading, automatic device mapping, and efficient initialization strategies. These utilities enable training and inference with models that would otherwise be impossible to run.
Functions for offloading model parameters to CPU memory when not in use, automatically moving them to GPU during forward/backward passes.
def cpu_offload(
    model: torch.nn.Module,
    execution_device: torch.device | None = None,
    offload_buffers: bool = False,
    state_dict: dict[str, torch.Tensor] | None = None,
    preload_module_classes: list[str] | None = None
):
    """
    Offload model to CPU with automatic device management hooks.

    Model parameters are moved to CPU and automatically transferred to the
    execution device during the forward pass, then moved back to CPU.

    Parameters:
    - model: Model to offload to CPU
    - execution_device: Device to use during computation (default: auto-detect)
    - offload_buffers: Whether to also offload buffer tensors
    - state_dict: Optional state dict to use for model parameters
    - preload_module_classes: Module classes to preload on execution device

    Returns:
    Model with CPU offloading hooks attached
    """
def cpu_offload_with_hook(
    model: torch.nn.Module,
    execution_device: torch.device | str | int | None = None,
    prev_module_hook: UserCpuOffloadHook | None = None
):
    """
    Advanced CPU offloading with custom hook chaining.

    Provides more control over offloading behavior and allows chaining
    multiple offloading hooks for complex model architectures.

    Parameters:
    - model: Model to offload
    - execution_device: Computation device
    - prev_module_hook: Previous hook in the chain

    Returns:
    Tuple of (model, hook) for hook chaining
    """

Functions for offloading model parameters to disk storage for extremely large models that exceed total system memory.
def disk_offload(
    model: torch.nn.Module,
    offload_dir: str | os.PathLike,
    execution_device: torch.device | str | int | None = None,
    offload_buffers: bool = False
):
    """
    Offload model parameters to disk storage.

    Parameters are saved to disk and loaded on-demand during computation.
    Slower than CPU offloading but enables handling arbitrarily large models.

    Parameters:
    - model: Model to offload to disk
    - offload_dir: Directory to store offloaded parameters
    - execution_device: Device for computation
    - offload_buffers: Whether to offload buffer tensors

    Returns:
    Model with disk offloading hooks
    """

Functions for automatically distributing model layers across multiple devices based on memory constraints and performance considerations.
def dispatch_model(
    model: torch.nn.Module,
    device_map: dict[str, torch.device | str | int] | None = None,
    main_device: torch.device | str | int | None = None,
    state_dict: dict[str, torch.Tensor] | None = None,
    strict: bool = False,
    preload_module_classes: list[str] | None = None
):
    """
    Dispatch model layers across multiple devices.

    Automatically places model components on specified devices and sets up
    hooks for moving tensors between devices during the forward pass.

    Parameters:
    - model: Model to dispatch across devices
    - device_map: Mapping of layer names to devices
    - main_device: Primary device for model execution
    - state_dict: Optional state dict to load
    - strict: Whether to strictly enforce device mapping
    - preload_module_classes: Module classes to preload

    Returns:
    Model with device dispatch hooks configured
    """
def infer_auto_device_map(
    model: torch.nn.Module,
    max_memory: dict[int | str, int | str] | None = None,
    no_split_module_classes: list[str] | None = None,
    dtype: torch.dtype | str | None = None,
    special_dtypes: dict[str, torch.dtype | str] | None = None,
    verbose: bool = False
):
    """
    Automatically infer an optimal device mapping for a model.

    Analyzes the model architecture and memory constraints to determine
    the best placement of layers across available devices.

    Parameters:
    - model: Model to analyze
    - max_memory: Maximum memory per device (dict of device_id: memory)
    - no_split_module_classes: Module types that shouldn't be split
    - dtype: Data type for memory calculation
    - special_dtypes: Special data types for specific parameters
    - verbose: Whether to print mapping details

    Returns:
    Dict mapping layer names to optimal devices
    """

Functions for memory-efficient model initialization, particularly useful for large models.
def init_empty_weights(include_buffers: bool | None = None):
    """
    Context manager for initializing models with empty tensors.

    Creates model structure without allocating memory for parameters,
    enabling initialization of models larger than available memory.

    Parameters:
    - include_buffers: Whether to initialize buffers as empty too
      (None selects the library default)

    Returns:
    Context manager for empty weight initialization
    """
def init_on_device(
    device: torch.device | str,
    include_buffers: bool = False
):
    """
    Context manager to initialize a model directly on the specified device.

    Avoids creating tensors on CPU first, reducing memory usage and
    improving initialization speed for large models.

    Parameters:
    - device: Target device for initialization
    - include_buffers: Whether to initialize buffers on the device

    Returns:
    Context manager for device-specific initialization
    """

Functions for loading and managing model checkpoints with device mapping support.
def load_checkpoint_and_dispatch(
    model: torch.nn.Module,
    checkpoint: str | os.PathLike,
    device_map: dict[str, torch.device | str | int] | None = None,
    max_memory: dict[int | str, int | str] | None = None,
    no_split_module_classes: list[str] | None = None,
    dtype: torch.dtype | str | None = None,
    offload_folder: str | os.PathLike | None = None,
    offload_state_dict: bool = False,
    strict: bool = False
):
    """
    Load a checkpoint and dispatch the model across devices.

    Combines checkpoint loading with automatic device mapping and
    offloading for large models that exceed device memory.

    Parameters:
    - model: Model to load the checkpoint into
    - checkpoint: Path to checkpoint file
    - device_map: Manual device mapping (optional)
    - max_memory: Memory constraints per device
    - no_split_module_classes: Modules that shouldn't be split
    - dtype: Data type for parameters
    - offload_folder: Directory for offloaded parameters
    - offload_state_dict: Whether to offload the full state dict
    - strict: Strict checkpoint loading

    Returns:
    Model with loaded checkpoint and device mapping applied
    """
def load_checkpoint_in_model(
    model: torch.nn.Module,
    checkpoint: str | os.PathLike,
    device_map: dict[str, torch.device | str | int] | None = None,
    offload_folder: str | os.PathLike | None = None,
    dtype: torch.dtype | str | None = None,
    offload_state_dict: bool = False,
    offload_buffers: bool = False,
    keep_in_fp32_modules: list[str] | None = None,
    strict: bool = False
):
    """
    Load a checkpoint into a model with advanced offloading options.

    Provides fine-grained control over checkpoint loading with support
    for mixed precision, selective offloading, and memory optimization.

    Parameters:
    - model: Target model
    - checkpoint: Checkpoint path
    - device_map: Device placement mapping
    - offload_folder: Offloading directory
    - dtype: Target data type
    - offload_state_dict: Offload the entire state dict
    - offload_buffers: Offload buffer tensors
    - keep_in_fp32_modules: Modules to keep in FP32
    - strict: Strict loading mode

    Returns:
    List of missing and unexpected keys from the checkpoint
    """

from accelerate import cpu_offload
import torch
import torch.nn as nn
# Create large model
model = nn.Sequential(
    nn.Linear(10000, 10000),
    nn.ReLU(),
    nn.Linear(10000, 1000)
)
# Enable CPU offloading - model parameters move to CPU when not in use
model = cpu_offload(model, execution_device="cuda:0")
# Model automatically moves parameters to GPU during forward pass
with torch.no_grad():
    output = model(torch.randn(32, 10000, device="cuda:0"))

from accelerate import infer_auto_device_map, dispatch_model
import torch
# Define memory constraints (in bytes or human-readable format)
max_memory = {
    0: "10GB",     # GPU 0 has 10GB available
    1: "10GB",     # GPU 1 has 10GB available
    "cpu": "50GB"  # 50GB CPU memory available
}
# Automatically determine optimal device placement
device_map = infer_auto_device_map(
    model,
    max_memory=max_memory,
    no_split_module_classes=["LlamaDecoderLayer", "GPT2Block"]
)
# Apply the device mapping
model = dispatch_model(model, device_map=device_map)

from accelerate import init_empty_weights, load_checkpoint_and_dispatch
# Initialize model without allocating memory
with init_empty_weights():
    model = MyLargeModel(config)
# Load checkpoint with automatic device mapping
model = load_checkpoint_and_dispatch(
    model,
    checkpoint="path/to/checkpoint.bin",
    device_map="auto",
    max_memory={0: "15GB", 1: "15GB", "cpu": "50GB"},
    offload_folder="./offload_weights"
)

from accelerate import disk_offload
import tempfile
# Create temporary directory for offloaded weights
with tempfile.TemporaryDirectory() as temp_dir:
    # Offload model to disk - enables models larger than total RAM
    model = disk_offload(
        model,
        offload_dir=temp_dir,
        execution_device="cuda:0"
    )
    # Model parameters are loaded from disk on-demand
    output = model(input_tensor)

from accelerate import init_empty_weights, load_checkpoint_and_dispatch
# Initialize model structure without memory allocation
with init_empty_weights(include_buffers=True):
    model = AutoModel.from_pretrained(
        "microsoft/DialoGPT-large",
        torch_dtype=torch.float16
    )
# Load and dispatch with automatic device mapping
model = load_checkpoint_and_dispatch(
    model,
    "path/to/sharded/checkpoint",
    device_map="auto",
    max_memory={0: "12GB", "cpu": "30GB"},
    dtype=torch.float16,
    offload_folder="./offload"
)

Install with Tessl CLI:
npx tessl i tessl/pypi-accelerate