CtrlK
Blog · Docs · Log in · Get started
Tessl Logo

tessl/pypi-pymc3

Probabilistic Programming in Python: Bayesian Modeling and Probabilistic Machine Learning with Theano

68

0.94x
Overview
Eval results
Files

docs/data-handling.md

Data Handling

PyMC3 provides specialized data structures and utilities for handling observed data, minibatch processing, and generator-based data sources. These tools enable efficient memory usage and streaming data processing in Bayesian models.

Capabilities

Data Container

The primary data container for observed variables in PyMC3 models.

class Data:
    """
    Data container for observed variables with mutable and shared data support.
    
    Creates a shared variable that can be updated during sampling or between
    model fits, enabling out-of-sample prediction and data augmentation.
    
    Parameters:
    - name: str, name for the data variable
    - value: array-like, initial data values
    - dims: tuple, named dimensions for the data
    - export_index_as_coords: bool, export index as coordinates
    """

Minibatch Processing

Efficient minibatch processing for large datasets and stochastic variational inference.

class Minibatch:
    """
    Multidimensional minibatch container for stochastic inference.
    
    Enables efficient processing of large datasets by sampling random
    minibatches during each iteration, essential for scalable variational
    inference and large dataset handling.
    
    Parameters:
    - data: array-like, full dataset to sample from
    - batch_size: int or list, size of minibatches or list of sizes with random seeds
    - dtype: str, data type for minibatch arrays
    - broadcastable: tuple, broadcasting pattern (defaults to (False,) * ndim)
    - name: str, name for minibatch variable (default "Minibatch")
    - random_seed: int, random seed for minibatch sampling
    - update_shared_f: callable, function to update underlying shared variable
    """

def align_minibatches(*minibatches):
    """
    Align multiple minibatch variables to sample consistent indices.
    
    Ensures that multiple minibatch variables sample the same data points
    in each iteration, maintaining consistency across related datasets.
    
    Parameters:
    - minibatches: Minibatch variables to align
    
    Returns:
    - tuple: aligned minibatch variables
    """

Generator Adaptation

Support for generator-based data sources and streaming data processing.

class GeneratorAdapter:
    """
    Adapter for generator-based data sources in PyMC3 models.
    
    Converts Python generators into PyMC3-compatible tensor variables,
    enabling streaming data processing and infinite data sources with
    automatic type inference from the first generated item.
    
    Parameters:
    - generator: Python generator yielding data arrays
    
    Methods:
    - make_variable(gop, name): create tensor variable from generator
    - set_gen(gen): update underlying generator
    - set_default(value): set default value for variable
    """

Data Utilities

Helper functions for data loading and management.

def get_data(filename):
    """
    Load package data files for examples and testing.
    
    Retrieves data files from the PyMC3 package or downloads from
    remote repository if not available locally.
    
    Parameters:
    - filename: str, name of data file to load
    
    Returns:
    - BytesIO: file-like object containing data
    """

Usage Examples

Basic Data Container

import pymc3 as pm
import numpy as np

# Create mutable data container (pm.Data must be called inside a model context)
data = np.random.randn(100)

with pm.Model() as model:
    shared_data = pm.Data('shared_data', data)

    mu = pm.Normal('mu', 0, 1)
    sigma = pm.HalfNormal('sigma', 1)
    
    # Use shared data in likelihood
    obs = pm.Normal('obs', mu, sigma, observed=shared_data)
    
    trace = pm.sample(1000)

# Update data for out-of-sample prediction
# (pm.set_data also requires the model context)
new_data = np.random.randn(50)
with model:
    pm.set_data({'shared_data': new_data})

# Generate predictions with new data
with model:
    post_pred = pm.sample_posterior_predictive(trace)

Minibatch Processing for Large Datasets

import pymc3 as pm
import numpy as np

# Large dataset
N = 10000
X = np.random.randn(N, 5)
y = np.random.randn(N)

# Create minibatches
batch_size = 100
X_batch = pm.Minibatch(X, batch_size=batch_size)
y_batch = pm.Minibatch(y, batch_size=batch_size)

# Align minibatches to sample consistent indices
X_batch, y_batch = pm.align_minibatches(X_batch, y_batch)

with pm.Model() as model:
    # Model parameters
    beta = pm.Normal('beta', 0, 1, shape=5)
    sigma = pm.HalfNormal('sigma', 1)
    
    # Minibatch likelihood
    mu = pm.math.dot(X_batch, beta)
    obs = pm.Normal('obs', mu, sigma, observed=y_batch, total_size=N)
    
    # Use ADVI for large dataset
    approx = pm.fit(n=10000, method='advi')
    trace = approx.sample(1000)

Generator-Based Data Processing

import pymc3 as pm
import numpy as np

def data_generator():
    """Generator yielding streaming data batches."""
    while True:
        yield np.random.randn(10, 3)

# Create generator adapter
gen = data_generator()
adapter = pm.GeneratorAdapter(gen)

with pm.Model() as model:
    # Create variable from generator
    # NOTE: per the signature documented above, make_variable expects the
    # generator op (`gop`) as its first argument — verify against the installed
    # PyMC3 version; pm.generator(gen) is the usual convenience wrapper.
    data_var = adapter.make_variable(name='streaming_data')
    
    # Model using streaming data
    mu = pm.Normal('mu', 0, 1, shape=3)
    sigma = pm.HalfNormal('sigma', 1)
    obs = pm.Normal('obs', mu, sigma, observed=data_var)
    
    # Note: Special handling needed for generator-based sampling

Data Loading Utilities

import pymc3 as pm
import pandas as pd

# Load package data
data_file = pm.get_data('coal.csv')
coal_data = pd.read_csv(data_file)

# Use in model
with pm.Model() as model:
    # Process loaded data
    years = coal_data['year'].values
    disasters = pm.Data('disasters', coal_data['disasters'].values)
    
    # Build time series model
    lambda1 = pm.Exponential('lambda1', 1)
    lambda2 = pm.Exponential('lambda2', 1)
    # Switchpoint must be in calendar-year units so it is comparable
    # with `years` in the switch below (not an index in 0..len(years))
    tau = pm.DiscreteUniform('tau', years.min(), years.max())
    
    rate = pm.math.switch(years < tau, lambda1, lambda2)
    obs = pm.Poisson('obs', rate, observed=disasters)
    
    trace = pm.sample(2000)

Dynamic Data Updates

import pymc3 as pm
import numpy as np

# Initial training data
X_train = np.random.randn(100, 3)
y_train = np.random.randn(100)

# Create shared variables inside the model context (pm.Data requires one)
with pm.Model() as model:
    X_shared = pm.Data('X_shared', X_train)
    y_shared = pm.Data('y_shared', y_train)

    # Model parameters
    beta = pm.Normal('beta', 0, 1, shape=3)
    sigma = pm.HalfNormal('sigma', 1)
    
    # Model definition
    mu = pm.math.dot(X_shared, beta)
    obs = pm.Normal('obs', mu, sigma, observed=y_shared)
    
    # Fit to initial data
    trace = pm.sample(1000)

# Update with new data batches
for batch_idx in range(5):
    X_new = np.random.randn(20, 3)
    y_new = np.random.randn(20)
    
    # Update shared variables (pm.set_data requires the model context)
    with model:
        pm.set_data({'X_shared': X_new, 'y_shared': y_new})
    
    # Continue sampling or refit
    with model:
        trace_batch = pm.sample(200, start=trace[-1])

Install with Tessl CLI

npx tessl i tessl/pypi-pymc3

docs

data-handling.md

distributions.md

gaussian-processes.md

glm.md

index.md

math-functions.md

modeling.md

sampling.md

stats-plots.md

step-methods.md

variational.md

tile.json