CtrlK
Blog · Docs · Log in · Get started
Tessl Logo

tessl/pypi-accelerate

HuggingFace Accelerate is a PyTorch library that simplifies distributed and mixed-precision training by abstracting away the boilerplate code needed for multi-GPU, TPU, and mixed-precision setups.

Pending

Quality

Pending

Does it follow best practices?

Impact

Pending

No eval scenarios have been run

Overview
Eval results
Files

docs/utilities.md

Utilities

Memory management, checkpointing, model utilities, and various helper functions for training workflows. These utilities provide essential functionality for efficient training, model management, and system optimization.

Capabilities

Memory Management

Functions for optimizing memory usage during training and inference.

def find_executable_batch_size(
    function: callable,
    starting_batch_size: int = 128
):
    """
    Automatically find the largest executable batch size for a function.
    
    Starts at ``starting_batch_size`` and retries the function, halving the
    batch size after each out-of-memory error until it executes successfully,
    useful for maximizing hardware utilization.
    
    Parameters:
    - function: Function to test with different batch sizes
    - starting_batch_size: Initial batch size to try
    
    Returns:
    A wrapped version of ``function`` that, when called, runs it with the
    largest batch size that executes without out-of-memory errors
    """

def release_memory(*objects):
    """
    Release memory from specified objects and trigger garbage collection.
    
    Parameters:
    - *objects: Objects to delete and release memory from
    """

Model Utilities

Functions for model introspection, manipulation, and memory analysis.

def infer_auto_device_map(
    model: torch.nn.Module,
    max_memory: dict[int | str, int | str] | None = None,
    no_split_module_classes: list[str] | None = None,
    dtype: torch.dtype | str | None = None,
    special_dtypes: dict[str, torch.dtype | str] | None = None,
    verbose: bool = False
):
    """
    Automatically infer optimal device mapping for a model.
    
    Analyzes model size and available memory to determine the best
    placement of layers across devices.
    
    Parameters:
    - model: Model to analyze
    - max_memory: Maximum memory per device
    - no_split_module_classes: Module classes that shouldn't be split
    - dtype: Data type for memory calculations
    - special_dtypes: Special dtypes for specific parameters
    - verbose: Print detailed mapping information
    
    Returns:
    Dictionary mapping layer names to devices
    """

def get_balanced_memory(
    model: torch.nn.Module,
    max_memory: dict[int | str, int | str] | None = None,
    no_split_module_classes: list[str] | None = None,
    dtype: torch.dtype | None = None,
    low_zero: bool = False
):
    """
    Calculate balanced memory distribution for model across devices.
    
    Parameters:
    - model: Model to analyze
    - max_memory: Memory constraints per device
    - no_split_module_classes: Modules to keep together
    - dtype: Data type for calculations
    - low_zero: Minimize memory allocated to device 0 (e.g. to leave room
      for generation outputs)
    
    Returns:
    Balanced memory allocation across devices
    """

def compute_module_sizes(
    model: torch.nn.Module,
    dtype: torch.dtype | None = None
):
    """
    Compute memory size of each module in the model.
    
    Parameters:
    - model: Model to analyze
    - dtype: Data type for size calculations
    
    Returns:
    Dictionary mapping module names to memory sizes in bytes
    """

def get_max_memory(max_memory: dict[int | str, int | str] | None = None):
    """
    Get maximum available memory per device.
    
    Parameters:
    - max_memory: User-specified memory limits
    
    Returns:
    Dictionary of available memory per device
    """

def has_offloaded_params(model: torch.nn.Module):
    """
    Check if model has any offloaded parameters.
    
    Parameters:
    - model: Model to check
    
    Returns:
    Boolean indicating presence of offloaded parameters
    """

Checkpointing and State Management

Functions for saving and loading training state and model checkpoints.

def save_accelerator_state(
    output_dir: str | os.PathLike,
    safe_serialization: bool = True
):
    """
    Save complete Accelerator training state.
    
    Saves model, optimizer, scheduler, and RNG states for complete
    training resumption.
    
    Parameters:
    - output_dir: Directory to save state files
    - safe_serialization: Use safetensors format when possible
    """

def load_accelerator_state(input_dir: str | os.PathLike):
    """
    Load complete Accelerator training state.
    
    Parameters:
    - input_dir: Directory containing saved state files
    """

def save_custom_state(
    obj,
    path: str | os.PathLike,
    process_index: int = 0,
    scaler: callable | None = None
):
    """
    Save custom object state with process coordination.
    
    Parameters:
    - obj: Object to save
    - path: Path to save object
    - process_index: Process responsible for saving
    - scaler: Optional scaling function
    """

def load_custom_state(
    path: str | os.PathLike,
    process_index: int = 0,
    scaler: callable | None = None
):
    """
    Load custom object state.
    
    Parameters:
    - path: Path to load object from
    - process_index: Process responsible for loading
    - scaler: Optional scaling function
    
    Returns:
    Loaded object
    """

def load_checkpoint_in_model(
    model: torch.nn.Module,
    checkpoint: str | os.PathLike,
    device_map: dict[str, torch.device | str | int] | None = None,
    offload_folder: str | os.PathLike | None = None,
    dtype: torch.dtype | None = None,
    offload_state_dict: bool = False,
    offload_buffers: bool = False,
    keep_in_fp32_modules: list[str] | None = None,
    strict: bool = False
):
    """
    Load checkpoint into model with advanced options.
    
    Parameters:
    - model: Model to load checkpoint into
    - checkpoint: Path to checkpoint file
    - device_map: Device placement mapping
    - offload_folder: Directory for offloaded weights
    - dtype: Target data type
    - offload_state_dict: Offload entire state dict
    - offload_buffers: Offload buffer tensors
    - keep_in_fp32_modules: Modules to keep in FP32
    - strict: Strict checkpoint loading
    
    Returns:
    Tuple of (missing_keys, unexpected_keys)
    """

Random State Management

Functions for managing random number generation across distributed processes.

def set_seed(seed: int, device_specific: bool = False):
    """
    Set random seed across all processes and libraries.
    
    Sets seeds for PyTorch, NumPy, Python random, and other libraries
    to ensure reproducible results across distributed training.
    
    Parameters:
    - seed: Random seed value
    - device_specific: Use device-specific seeding for different results per device
    """

def synchronize_rng_states(
    rng_types: list[str] | None = None,
    generator: torch.Generator | None = None
):
    """
    Synchronize random number generator states across processes.
    
    Parameters:
    - rng_types: Types of RNG to sync ("torch", "cuda", "xla")
    - generator: Specific generator to synchronize
    """

def synchronize_rng_state(
    rng_type: str | None = None,
    generator: torch.Generator | None = None
):
    """
    Synchronize single RNG state across processes.
    
    Parameters:
    - rng_type: Type of RNG to synchronize
    - generator: Specific generator to use
    """

Model Parameter Management

Functions for managing model parameters, tied weights, and device placement.

def find_tied_parameters(model: torch.nn.Module):
    """
    Find tied (shared) parameters in model.
    
    Parameters:
    - model: Model to analyze
    
    Returns:
    List of parameter groups that share the same tensor
    """

def check_tied_parameters_on_same_device(model: torch.nn.Module):
    """
    Verify that tied parameters are on the same device.
    
    Parameters:
    - model: Model to check
    
    Returns:
    Boolean indicating if all tied parameters are properly placed
    """

def retie_parameters(
    model: torch.nn.Module,
    tied_params: list[list[str]]
):
    """
    Re-establish parameter tying after model loading.
    
    Parameters:
    - model: Model with parameters to retie
    - tied_params: List of parameter groups to tie together
    """

def set_module_tensor_to_device(
    module: torch.nn.Module,
    tensor_name: str,
    device: torch.device | str | int,
    value: torch.Tensor | None = None,
    dtype: torch.dtype | None = None
):
    """
    Set specific tensor in module to device with optional value/dtype.
    
    Parameters:
    - module: Module containing the tensor
    - tensor_name: Name of tensor to modify
    - device: Target device
    - value: Optional new tensor value
    - dtype: Optional target dtype
    """

def align_module_device(
    module: torch.nn.Module,
    execution_device: torch.device | str | int
):
    """
    Align module device with execution device.
    
    Parameters:
    - module: Module to align
    - execution_device: Target execution device
    """

File I/O and Serialization

General-purpose functions for saving and loading objects with device awareness.

def save(
    obj,
    path: str | os.PathLike,
    save_on_each_node: bool = False,
    safe_serialization: bool = False
):
    """
    Save object with distributed training awareness.
    
    Parameters:
    - obj: Object to save
    - path: Save path
    - save_on_each_node: Save on each node instead of just main process
    - safe_serialization: Use safetensors format when possible
    """

def load(
    path: str | os.PathLike,
    map_location: str | torch.device | None = None,
    **kwargs
):
    """
    Load object with device mapping support.
    
    Parameters:
    - path: Path to load from
    - map_location: Device mapping for tensors
    - **kwargs: Additional arguments for loading
    
    Returns:
    Loaded object
    """

def clean_state_dict_for_safetensors(state_dict: dict):
    """
    Clean state dict for safetensors serialization.
    
    Removes incompatible elements and prepares dict for safetensors format.
    
    Parameters:
    - state_dict: State dictionary to clean
    
    Returns:
    Cleaned state dictionary
    """

Environment and Import Detection

Functions for detecting available libraries and hardware capabilities.

def is_cuda_available():
    """Check if CUDA is available."""

def is_mps_available():
    """Check if Apple MPS is available."""

def is_xpu_available():
    """Check if Intel XPU is available."""

def is_hpu_available():
    """Check if Habana HPU is available."""

def is_npu_available():
    """Check if NPU is available."""

def is_deepspeed_available():
    """Check if DeepSpeed is available."""

def is_transformers_available():
    """Check if Transformers library is available."""

def is_datasets_available():
    """Check if Datasets library is available."""

def is_wandb_available():
    """Check if Weights & Biases is available."""

def is_tensorboard_available():
    """Check if TensorBoard is available."""

def is_comet_ml_available():
    """Check if Comet ML is available."""

def is_mlflow_available():
    """Check if MLflow is available."""

def is_bnb_available():
    """Check if Bitsandbytes is available."""

def is_4bit_bnb_available():
    """Check if 4-bit Bitsandbytes quantization is available."""

def is_8bit_bnb_available():
    """Check if 8-bit Bitsandbytes quantization is available."""

def is_torch_xla_available():
    """Check if Torch XLA is available."""

def is_rich_available():
    """Check if Rich formatting library is available."""

System Utilities

General system and process management utilities.

def wait_for_everyone():
    """
    Global synchronization barrier across all processes.
    """

def extract_model_from_parallel(
    model: torch.nn.Module,
    keep_fp32_wrapper: bool = True
):
    """
    Extract original model from parallel training wrappers.
    
    Parameters:
    - model: Wrapped model
    - keep_fp32_wrapper: Whether to keep mixed precision wrapper
    
    Returns:
    Unwrapped model
    """

def merge_dicts(dict1: dict, dict2: dict):
    """
    Merge two dictionaries recursively.
    
    Parameters:
    - dict1: First dictionary
    - dict2: Second dictionary
    
    Returns:
    Merged dictionary
    """

def get_pretty_name(obj):
    """
    Get human-readable name for object.
    
    Parameters:
    - obj: Object to get name for
    
    Returns:
    Pretty string representation
    """

def write_basic_config(
    mixed_precision: str = "no",
    save_location: str = "default"
):
    """
    Write basic Accelerate configuration file.
    
    Parameters:
    - mixed_precision: Mixed precision mode
    - save_location: Where to save config ("default" or custom path)
    """

def convert_bytes(size_bytes: int):
    """
    Convert bytes to human-readable format.
    
    Parameters:
    - size_bytes: Size in bytes
    
    Returns:
    Human-readable size string (e.g., "1.5 GB")
    """

Usage Examples

Automatic Batch Size Finding

from accelerate import find_executable_batch_size
import torch

def training_function(batch_size):
    # Your training code here
    model = MyModel()
    optimizer = torch.optim.Adam(model.parameters())
    
    # Simulate training step
    for _ in range(10):
        batch = torch.randn(batch_size, 784)
        loss = model(batch).sum()
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

# Wrap the training function so it retries with smaller batch sizes on OOM;
# calling the wrapper runs training at the largest batch size that fits
training_function = find_executable_batch_size(training_function, starting_batch_size=128)
training_function()

Model Memory Analysis

from accelerate import (
    compute_module_sizes,
    get_balanced_memory,
    infer_auto_device_map
)

# Analyze model memory usage
module_sizes = compute_module_sizes(model, dtype=torch.float16)
print("Memory usage per module:")
for name, size in module_sizes.items():
    print(f"{name}: {size / 1024**3:.2f} GB")

# Get balanced memory allocation
max_memory = {0: "10GB", 1: "10GB", "cpu": "30GB"}  # GPU keys are ints; "cpu" is a string
balanced_memory = get_balanced_memory(
    model,
    max_memory=max_memory,
    no_split_module_classes=["LlamaDecoderLayer"]
)

# Infer optimal device mapping
device_map = infer_auto_device_map(
    model,
    max_memory=balanced_memory,
    no_split_module_classes=["LlamaDecoderLayer"],
    verbose=True
)

Advanced Checkpointing

from accelerate import (
    save_accelerator_state,
    load_accelerator_state,
    save_custom_state,
    load_custom_state
)

# Save complete training state
accelerator = Accelerator()
model, optimizer, dataloader = accelerator.prepare(model, optimizer, dataloader)

# After some training...
save_accelerator_state("./checkpoint-1000", safe_serialization=True)

# Save custom objects
training_metadata = {
    "epoch": 5,
    "best_loss": 0.1234,
    "learning_rates": [0.001, 0.0005, 0.0001]
}
save_custom_state(training_metadata, "./checkpoint-1000/metadata.pkl")

# Later, load everything back
load_accelerator_state("./checkpoint-1000")
metadata = load_custom_state("./checkpoint-1000/metadata.pkl")

Random State Management

from accelerate import set_seed, synchronize_rng_states

# Set reproducible seed across all processes
set_seed(42, device_specific=False)

# Synchronize RNG states for consistency
synchronize_rng_states(["torch", "cuda"])

# Training with consistent randomness
for epoch in range(num_epochs):
    # All processes will generate the same random augmentations
    for batch in dataloader:
        augmented_batch = apply_random_augmentation(batch)
        # ... training code

Parameter Management

from accelerate import (
    find_tied_parameters,
    check_tied_parameters_on_same_device,
    retie_parameters
)

# Find tied parameters in model
tied_params = find_tied_parameters(model)
print("Tied parameter groups:", tied_params)

# Check if tied parameters are properly placed
if not check_tied_parameters_on_same_device(model):
    print("Warning: Tied parameters are not on the same device!")

# Re-tie parameters after loading from checkpoint
retie_parameters(model, tied_params)

System Integration

from accelerate import (
    is_cuda_available,
    is_deepspeed_available,
    write_basic_config,
    convert_bytes
)

# Check system capabilities
print(f"CUDA available: {is_cuda_available()}")
print(f"DeepSpeed available: {is_deepspeed_available()}")

# Create basic configuration
if is_cuda_available():
    write_basic_config(mixed_precision="fp16")
else:
    write_basic_config(mixed_precision="no")

# Memory usage reporting
model_size = sum(p.numel() * p.element_size() for p in model.parameters())
print(f"Model size: {convert_bytes(model_size)}")

Install with Tessl CLI

npx tessl i tessl/pypi-accelerate

docs

big-modeling.md

cli-commands.md

configuration.md

core-training.md

distributed-operations.md

index.md

utilities.md

tile.json