Probabilistic Programming in Python: Bayesian Modeling and Probabilistic Machine Learning with Theano
68
PyMC3 provides specialized data structures and utilities for handling observed data, minibatch processing, and generator-based data sources. These tools enable efficient memory usage and streaming data processing in Bayesian models.
The primary data container for observed variables in PyMC3 models.
class Data:
"""
Data container for observed variables with mutable and shared data support.
Creates a shared variable that can be updated during sampling or between
model fits, enabling out-of-sample prediction and data augmentation.
Parameters:
- name: str, name for the data variable
- value: array-like, initial data values
- dims: tuple, named dimensions for the data
- export_index_as_coords: bool, export index as coordinates
"""Efficient minibatch processing for large datasets and stochastic variational inference.
class Minibatch:
"""
Multidimensional minibatch container for stochastic inference.
Enables efficient processing of large datasets by sampling random
minibatches during each iteration, essential for scalable variational
inference and large dataset handling.
Parameters:
- data: array-like, full dataset to sample from
- batch_size: int or list, size of minibatches or list of sizes with random seeds
- dtype: str, data type for minibatch arrays
- broadcastable: tuple, broadcasting pattern (defaults to (False,) * ndim)
- name: str, name for minibatch variable (default "Minibatch")
- random_seed: int, random seed for minibatch sampling
- update_shared_f: callable, function to update underlying shared variable
"""
def align_minibatches(*minibatches):
"""
Align multiple minibatch variables to sample consistent indices.
Ensures that multiple minibatch variables sample the same data points
in each iteration, maintaining consistency across related datasets.
Parameters:
- minibatches: Minibatch variables to align
Returns:
- tuple: aligned minibatch variables
"""Support for generator-based data sources and streaming data processing.
class GeneratorAdapter:
"""
Adapter for generator-based data sources in PyMC3 models.
Converts Python generators into PyMC3-compatible tensor variables,
enabling streaming data processing and infinite data sources with
automatic type inference from the first generated item.
Parameters:
- generator: Python generator yielding data arrays
Methods:
- make_variable(gop, name): create tensor variable from generator
- set_gen(gen): update underlying generator
- set_default(value): set default value for variable
"""Helper functions for data loading and management.
def get_data(filename):
"""
Load package data files for examples and testing.
Retrieves data files from the PyMC3 package or downloads from
remote repository if not available locally.
Parameters:
- filename: str, name of data file to load
Returns:
- BytesIO: file-like object containing data
"""import pymc3 as pm
import numpy as np
# Create mutable data container
data = np.random.randn(100)
shared_data = pm.Data('shared_data', data)
with pm.Model() as model:
mu = pm.Normal('mu', 0, 1)
sigma = pm.HalfNormal('sigma', 1)
# Use shared data in likelihood
obs = pm.Normal('obs', mu, sigma, observed=shared_data)
trace = pm.sample(1000)
# Update data for out-of-sample prediction
new_data = np.random.randn(50)
pm.set_data({'shared_data': new_data})
# Generate predictions with new data
with model:
post_pred = pm.sample_posterior_predictive(trace)import pymc3 as pm
import numpy as np

# --- Minibatch processing for stochastic variational inference ---
# Large synthetic dataset.
N = 10000
X = np.random.randn(N, 5)
y = np.random.randn(N)

# Random minibatches of `batch_size` rows are drawn on each evaluation.
batch_size = 100
X_batch = pm.Minibatch(X, batch_size=batch_size)
y_batch = pm.Minibatch(y, batch_size=batch_size)

# Align minibatches so both variables sample the same row indices
# in every iteration, keeping X and y rows paired.
X_batch, y_batch = pm.align_minibatches(X_batch, y_batch)

with pm.Model() as model:
    # Regression coefficients and noise scale.
    beta = pm.Normal('beta', 0, 1, shape=5)
    sigma = pm.HalfNormal('sigma', 1)

    # total_size tells PyMC3 how to rescale the minibatch likelihood
    # to the full dataset size during inference.
    mu = pm.math.dot(X_batch, beta)
    obs = pm.Normal('obs', mu, sigma, observed=y_batch, total_size=N)

    # ADVI scales to large datasets via stochastic minibatch gradients.
    approx = pm.fit(n=10000, method='advi')

trace = approx.sample(1000)

import pymc3 as pm
import numpy as np

# --- Generator-based streaming data sources ---
def data_generator():
    """Generator yielding streaming data batches of shape (10, 3)."""
    while True:
        yield np.random.randn(10, 3)

# Wrap the generator so PyMC3 can treat it as a tensor source; dtype
# and shape are inferred from the first generated item.
gen = data_generator()
adapter = pm.GeneratorAdapter(gen)

with pm.Model() as model:
    # Create a tensor variable backed by the generator.
    data_var = adapter.make_variable(name='streaming_data')

    # Model using streaming data.
    mu = pm.Normal('mu', 0, 1, shape=3)
    sigma = pm.HalfNormal('sigma', 1)
    obs = pm.Normal('obs', mu, sigma, observed=data_var)
    # Note: Special handling needed for generator-based sampling

import pymc3 as pm
import pandas as pd

# --- Loading packaged example data ---
# get_data returns a file-like object for a dataset shipped with PyMC3.
data_file = pm.get_data('coal.csv')
coal_data = pd.read_csv(data_file)

with pm.Model() as model:
    years = coal_data['year'].values
    disasters = pm.Data('disasters', coal_data['disasters'].values)

    # Change-point model: two Poisson rates with a switch at year tau.
    lambda1 = pm.Exponential('lambda1', 1)
    lambda2 = pm.Exponential('lambda2', 1)
    # tau is compared against calendar years below, so it must range
    # over the observed years (the original used an index 0..len(years),
    # which never overlaps values like 1851..1962).
    tau = pm.DiscreteUniform('tau', years.min(), years.max())
    # lambda1 before the change point, lambda2 after.
    rate = pm.math.switch(years < tau, lambda1, lambda2)
    obs = pm.Poisson('obs', rate, observed=disasters)
    trace = pm.sample(2000)

import pymc3 as pm
import numpy as np
# Initial training data
X_train = np.random.randn(100, 3)
y_train = np.random.randn(100)
# Create shared variables
X_shared = pm.Data('X_shared', X_train)
y_shared = pm.Data('y_shared', y_train)
with pm.Model() as model:
# Model parameters
beta = pm.Normal('beta', 0, 1, shape=3)
sigma = pm.HalfNormal('sigma', 1)
# Model definition
mu = pm.math.dot(X_shared, beta)
obs = pm.Normal('obs', mu, sigma, observed=y_shared)
# Fit to initial data
trace = pm.sample(1000)
# Update with new data batches
for batch_idx in range(5):
X_new = np.random.randn(20, 3)
y_new = np.random.randn(20)
# Update shared variables
pm.set_data({'X_shared': X_new, 'y_shared': y_new})
# Continue sampling or refit
with model:
trace_batch = pm.sample(200, start=trace[-1])Install with Tessl CLI
npx tessl i tessl/pypi-pymc3docs
evals
scenario-1
scenario-2
scenario-3
scenario-4
scenario-5
scenario-6
scenario-7
scenario-8
scenario-9
scenario-10