CtrlK
Blog Docs Log in Get started
Tessl Logo

tessl/pypi-emcee

The Python ensemble sampling toolkit for affine-invariant MCMC

Pending
Overview
Eval results
Files

docs/backends.md

Storage Backends

emcee provides flexible storage backends for persisting MCMC chains and sampling results. Backends enable efficient storage, retrieval, and analysis of sampling data, supporting both in-memory and file-based storage with features like compression and resumable sampling.

Capabilities

Backend Base Class

The foundation for all storage backends, providing common interface and in-memory storage.

class Backend:
    def __init__(self, dtype=None):
        """
        Initialize backend.
        
        Args:
            dtype: Data type for stored arrays (default: np.float64)
        """
    
    def reset(self, nwalkers: int, ndim: int):
        """
        Clear backend state and prepare for new sampling.
        
        Args:
            nwalkers: Number of walkers in ensemble
            ndim: Number of dimensions in parameter space
        """
    
    def has_blobs(self):
        """
        Check if backend stores blob data.
        
        Returns:
            bool: True if blobs are stored
        """
    
    def get_chain(self, flat: bool = False, thin: int = 1, discard: int = 0):
        """
        Retrieve stored MCMC chain.
        
        Args:
            flat: Flatten chain across ensemble dimension
            thin: Take every thin steps
            discard: Discard first discard steps as burn-in
            
        Returns:
            ndarray: Chain data [steps, nwalkers, ndim] or [steps*nwalkers, ndim] if flat
        """
    
    def get_log_prob(self, flat: bool = False, thin: int = 1, discard: int = 0):
        """
        Retrieve log probability values.
        
        Returns:
            ndarray: Log probabilities [steps, nwalkers] or [steps*nwalkers] if flat
        """
    
    def get_blobs(self, flat: bool = False, thin: int = 1, discard: int = 0):
        """
        Retrieve blob data if available.
        
        Returns:
            ndarray or None: Blob data if stored
        """
    
    def save_step(self, state, accepted):
        """
        Store a sampling step.
        
        Args:
            state: Current ensemble state
            accepted: Boolean array of accepted proposals
        """

HDF5 Backend

File-based backend using HDF5 format for persistent storage with compression and metadata support.

class HDFBackend(Backend):
    def __init__(self, filename: str, name: str = "mcmc", read_only: bool = False):
        """
        Initialize HDF5 backend.
        
        Args:
            filename: Path to HDF5 file
            name: Group name within HDF5 file
            read_only: Open file in read-only mode
        """
    
    @property
    def filename(self):
        """Get the HDF5 filename."""
    
    @property
    def name(self):
        """Get the group name."""
    
    @property
    def iteration(self):
        """Get current iteration count."""
    
    @property
    def shape(self):
        """Get chain shape (nwalkers, ndim)."""
    
    def get_autocorr_time(self, **kwargs):
        """
        Compute autocorrelation time from stored chain.
        
        Returns:
            ndarray: Autocorrelation times for each parameter
        """

class TempHDFBackend:
    def __init__(self, **kwargs):
        """
        Temporary HDF5 backend that creates a temporary file.
        
        Args:
            **kwargs: Arguments passed to HDFBackend
        """

Backend Utilities

Functions for working with multiple backends and testing.

def get_test_backends():
    """
    Get list of available backends for testing.
    
    Returns:
        list: Available backend classes
    """

Usage Examples

In-Memory Backend (Default)

import emcee
import numpy as np

def log_prob(theta):
    return -0.5 * np.sum(theta**2)

# Default backend is in-memory
sampler = emcee.EnsembleSampler(32, 2, log_prob)

# Or explicitly specify
backend = emcee.backends.Backend()
sampler = emcee.EnsembleSampler(32, 2, log_prob, backend=backend)

# Run sampling
pos = np.random.randn(32, 2)
sampler.run_mcmc(pos, 1000)

# Access results
chain = sampler.get_chain()
log_prob_vals = sampler.get_log_prob()

HDF5 Backend for Persistent Storage

from emcee.backends import HDFBackend

# Create HDF5 backend
filename = "mcmc_results.h5"
backend = HDFBackend(filename)

sampler = emcee.EnsembleSampler(32, 2, log_prob, backend=backend)

# Run sampling - results saved to file
sampler.run_mcmc(pos, 1000)

# Results are automatically saved
print(f"Chain shape: {backend.shape}")
print(f"Iterations completed: {backend.iteration}")

Resuming from HDF5 Backend

# Resume sampling from existing file
backend = HDFBackend(filename, read_only=False)

# Check existing progress
print(f"Previous iterations: {backend.iteration}")
previous_chain = backend.get_chain()

# Resume from last state
if backend.iteration > 0:
    last_state = backend.get_last_sample()
    sampler = emcee.EnsembleSampler(32, 2, log_prob, backend=backend)
    
    # Continue sampling
    sampler.run_mcmc(last_state, 500)  # Additional 500 steps

Multiple Sampling Runs in Same File

# Use different group names for multiple runs
backend1 = HDFBackend("results.h5", name="run1")
backend2 = HDFBackend("results.h5", name="run2")

# First run
sampler1 = emcee.EnsembleSampler(32, 2, log_prob, backend=backend1)
sampler1.run_mcmc(pos, 1000)

# Second run with different parameters
sampler2 = emcee.EnsembleSampler(32, 2, log_prob, backend=backend2)
sampler2.run_mcmc(pos, 1000)

# Access results from specific runs
chain1 = backend1.get_chain()
chain2 = backend2.get_chain()

Temporary HDF5 Backend

from emcee.backends import TempHDFBackend

# Creates temporary file that's automatically cleaned up
with TempHDFBackend() as backend:
    sampler = emcee.EnsembleSampler(32, 2, log_prob, backend=backend)
    sampler.run_mcmc(pos, 1000)
    
    # Use results while in context
    chain = backend.get_chain()
    # File is automatically deleted when context exits

Backend with Blob Data

def log_prob_with_blobs(theta):
    log_p = -0.5 * np.sum(theta**2)
    # Return additional metadata as blobs
    blobs = {"energy": np.sum(theta**2), "step_size": np.linalg.norm(theta)}
    return log_p, blobs

# Backend automatically handles blobs
backend = HDFBackend("results_with_blobs.h5")
sampler = emcee.EnsembleSampler(32, 2, log_prob_with_blobs, backend=backend)

sampler.run_mcmc(pos, 1000)

# Access blob data
blobs = backend.get_blobs()
print(f"Blob keys: {blobs.dtype.names}")

Analyzing Stored Results

# Load existing results for analysis
backend = HDFBackend("results.h5", read_only=True)

# Get chain with burn-in removal
chain = backend.get_chain(discard=200, flat=True)
log_prob_vals = backend.get_log_prob(discard=200, flat=True)

# Compute autocorrelation time
tau = backend.get_autocorr_time()
print(f"Autocorrelation time: {tau}")

# Thin chain based on autocorrelation
thin_factor = int(2 * np.max(tau))
thinned_chain = backend.get_chain(discard=200, thin=thin_factor, flat=True)

Custom Backend Configuration

# Backend with specific data type
backend = emcee.backends.Backend(dtype=np.float32)

# HDF5 with compression (requires h5py)
import h5py
backend = HDFBackend("compressed.h5")
# HDF5 compression is automatically applied when available

Backend Inspection

# Check backend properties
backend = HDFBackend("results.h5")

print(f"Backend type: {type(backend).__name__}")
print(f"Has blobs: {backend.has_blobs()}")
print(f"Chain shape: {backend.shape}")
print(f"Iterations: {backend.iteration}")

# Access raw HDF5 file (advanced usage)
with h5py.File(backend.filename, 'r') as f:
    print(f"HDF5 groups: {list(f.keys())}")
    print(f"Chain dataset shape: {f[backend.name]['chain'].shape}")

Install with Tessl CLI

npx tessl i tessl/pypi-emcee

docs

autocorr.md

backends.md

ensemble-sampling.md

index.md

moves.md

state.md

tile.json