Probabilistic Programming in Python: Bayesian Modeling and Probabilistic Machine Learning with Theano
PyMC3's variational inference module provides scalable approximate inference methods that optimize a tractable approximation to the posterior distribution. These methods trade some accuracy for computational efficiency, making Bayesian inference feasible for large datasets and complex models.
High-level interface for variational inference with automatic method selection and optimization.
def fit(n=10000, method='advi', model=None, random_seed=None,
start=None, inf_kwargs=None, **kwargs):
"""
Fit model using variational inference.
Main interface for approximate inference using various VI methods.
Automatically handles method selection, initialization, and optimization.
Parameters:
- n: int, number of optimization iterations
- method: str or Inference class, VI method ('advi', 'fullrank_advi', 'svgd', 'asvgd', 'nfvi')
- model: Model, model to fit (current context if None)
- random_seed: int, random seed for reproducibility
- start: dict, initial parameter values
- inf_kwargs: dict, arguments passed to inference method
- kwargs: additional arguments for specific methods
Returns:
- Approximation: fitted approximation object
Example:
with pm.Model() as model:
mu = pm.Normal('mu', 0, 1)
y_obs = pm.Normal('y_obs', mu, 1, observed=data)
# Fit using ADVI
approx = pm.fit(n=20000, method='advi')
# Sample from approximation
trace = approx.sample(1000)
"""
def sample_approx(approx, draws=1000, include_transformed=True):
"""
Sample from fitted approximation.
Parameters:
- approx: Approximation, fitted approximation object
- draws: int, number of samples to draw
- include_transformed: bool, include transformed variables
Returns:
- MultiTrace: samples from approximation
"""Core variational inference algorithms with different approximation strategies.
class ADVI:
"""
Automatic Differentiation Variational Inference.
Mean-field variational inference using automatic differentiation
for gradient-based optimization. Assumes independence between
parameters in the approximating distribution.
"""
def __init__(self, local_rv=None, model=None, cost_part_grad_scale=1,
scale_cost_to_minibatch=False, random_seed=None,
start=None, **kwargs):
"""
Initialize ADVI inference.
Parameters:
- local_rv: dict, local random variables for minibatch inference
- model: Model, model to approximate
- cost_part_grad_scale: float, scaling for cost gradient
- scale_cost_to_minibatch: bool, scale cost to minibatch size
- random_seed: int, random seed
- start: dict, starting parameter values
"""
def fit(self, n=10000, score=None, callbacks=None,
progressbar=True, **kwargs):
"""
Fit the approximation.
Parameters:
- n: int, number of optimization iterations
- score: bool, compute ELBO score during fitting
- callbacks: list, callback functions for monitoring
- progressbar: bool, display progress bar
Returns:
- Approximation: fitted approximation
"""
class FullRankADVI:
"""
Full-rank Automatic Differentiation Variational Inference.
Variational inference with full covariance structure in the
approximating distribution, capturing correlations between parameters.
"""
def __init__(self, local_rv=None, model=None, cost_part_grad_scale=1,
scale_cost_to_minibatch=False, random_seed=None,
start=None, **kwargs):
"""Initialize full-rank ADVI inference."""
class SVGD:
"""
Stein Variational Gradient Descent.
Non-parametric variational method that uses a set of particles
to approximate the posterior through deterministic updates
guided by Stein's method.
"""
def __init__(self, n_particles=100, jitter=1e-6, model=None,
start=None, random_seed=None, **kwargs):
"""
Initialize SVGD inference.
Parameters:
- n_particles: int, number of particles
- jitter: float, noise for numerical stability
- model: Model, model to approximate
- start: dict, initial particle positions
- random_seed: int, random seed
"""
class ASVGD:
"""
Amortized Stein Variational Gradient Descent.
Extension of SVGD with amortization for handling multiple
related inference problems efficiently.
"""
class NFVI:
"""
Normalizing Flow Variational Inference.
Variational inference using normalizing flows to create
flexible approximating distributions beyond simple parametric forms.
"""
def __init__(self, flow='planar', model=None, random_seed=None,
**kwargs):
"""
Initialize normalizing flow VI.
Parameters:
- flow: str or Flow, type of normalizing flow
- model: Model, model to approximate
- random_seed: int, random seed
"""
class ImplicitGradient:
"""
Implicit-gradient variational inference.
Base class for methods whose updates are defined implicitly (for example,
particle-based methods such as SVGD and ASVGD) rather than through an
explicit ELBO gradient.
"""
class Inference:
"""
Base class for all variational inference methods.
Provides common interface and functionality for different
VI algorithms including optimization, callbacks, and diagnostics.
"""
@property
def approx(self):
"""Current approximation object."""
@property
def hist(self):
"""Optimization history."""
def fit(self, n, **kwargs):
"""Run the optimization loop for n iterations and return the fitted approximation."""
class KLqp:
"""
Kullback-Leibler divergence minimization.
General framework for KL-divergence based variational inference
with customizable divergence measures and optimization strategies.
"""Different parametric forms for approximating posterior distributions.
class MeanField:
"""
Mean-field Gaussian approximation.
Independent normal distributions for each parameter,
assuming no correlations in the approximating distribution.
"""
def __init__(self, local_rv=None, model=None, cost_part_grad_scale=1,
scale_cost_to_minibatch=False, random_seed=None,
start=None):
"""Initialize mean-field approximation."""
def sample(self, draws=1000, include_transformed=True):
"""
Sample from approximation.
Parameters:
- draws: int, number of samples
- include_transformed: bool, include transformed variables
Returns:
- MultiTrace: samples from approximation
"""
@property
def mean(self):
"""Mean of approximating distribution."""
@property
def std(self):
"""Standard deviation of approximating distribution."""
def sample_node(self, node, size=None, more_replacements=None):
"""Sample specific model node."""
class FullRank:
"""
Full-rank Gaussian approximation.
Multivariate normal distribution with full covariance matrix,
capturing correlations between parameters in approximation.
"""
def __init__(self, local_rv=None, model=None, cost_part_grad_scale=1,
scale_cost_to_minibatch=False, random_seed=None,
start=None):
"""Initialize full-rank approximation."""
@property
def cov(self):
"""Covariance matrix of approximating distribution."""
@property
def L_chol(self):
"""Cholesky factor of covariance matrix."""
class Empirical:
"""
Empirical approximation using sample particles.
Non-parametric approximation represented by a set of
weighted samples, used in particle-based methods like SVGD.
"""
def __init__(self, trace=None, size=None, jitter=1e-6,
local_rv=None, model=None, **kwargs):
"""
Initialize empirical approximation.
Parameters:
- trace: MultiTrace, initial particles
- size: int, number of particles
- jitter: float, noise for numerical stability
- local_rv: dict, local random variables
- model: Model, model context
"""
@property
def histogram(self):
"""Histogram representation of samples."""
class NormalizingFlow:
"""
Normalizing flow approximation.
Flexible approximation using invertible transformations
to map simple base distribution to complex target.
"""
def __init__(self, flow='planar', local_rv=None, model=None,
**kwargs):
"""
Initialize normalizing flow approximation.
Parameters:
- flow: str or Flow, flow architecture
- local_rv: dict, local random variables
- model: Model, model context
"""
class Approximation:
"""
Base class for all approximations.
Provides common interface for sampling, evaluation, and
manipulation of approximating distributions.
"""
def sample(self, draws=1000, **kwargs):
"""Sample from approximation."""
def sample_vp(self, draws=1000, hide_transformed=True):
"""Sample variational parameters."""
def apply_replacements(self, node, more_replacements=None):
"""Apply parameter replacements to computational graph."""
@property
def logq(self):
"""Log-probability of approximating distribution."""
@property
def logq_norm(self):
"""Normalized log-probability."""
@property
def shared_params(self):
"""Shared parameter tensors."""
class Group:
"""
Variable grouping for approximation.
Groups related variables together for joint approximation,
useful for hierarchical models and structured approximations.
"""
def __init__(self, group, vfam=None, params=None):
"""
Initialize variable group.
Parameters:
- group: list, variables to group together
- vfam: str, variational family for group
- params: dict, group-specific parameters
"""Stochastic optimization algorithms for variational parameter updates.
def adam(loss_or_grads, params, learning_rate=0.001, beta1=0.9,
beta2=0.999, epsilon=1e-8):
"""
Adam optimizer for variational parameters.
Adaptive moment estimation with bias correction
for efficient optimization of variational objectives.
Parameters:
- loss_or_grads: loss function or gradients
- params: list, parameters to optimize
- learning_rate: float, learning rate
- beta1: float, exponential decay rate for first moment
- beta2: float, exponential decay rate for second moment
- epsilon: float, numerical stability constant
Returns:
- list: parameter updates
"""
def adamax(loss_or_grads, params, learning_rate=0.002, beta1=0.9,
beta2=0.999, epsilon=1e-8):
"""
Adamax optimizer (Adam with infinity norm).
Parameters:
- loss_or_grads: loss function or gradients
- params: list, parameters to optimize
- learning_rate: float, learning rate
- beta1: float, exponential decay rate for first moment
- beta2: float, exponential decay rate for second raw moment
- epsilon: float, numerical stability constant
Returns:
- list: parameter updates
"""
def sgd(loss_or_grads, params, learning_rate):
"""
Stochastic gradient descent optimizer.
Parameters:
- loss_or_grads: loss function or gradients
- params: list, parameters to optimize
- learning_rate: float, learning rate
Returns:
- list: parameter updates
"""
def momentum(loss_or_grads, params, learning_rate, momentum=0.9):
"""
SGD with momentum optimizer.
Parameters:
- loss_or_grads: loss function or gradients
- params: list, parameters to optimize
- learning_rate: float, learning rate
- momentum: float, momentum coefficient
Returns:
- list: parameter updates
"""
def nesterov_momentum(loss_or_grads, params, learning_rate, momentum=0.9):
"""
SGD with Nesterov momentum optimizer.
Parameters:
- loss_or_grads: loss function or gradients
- params: list, parameters to optimize
- learning_rate: float, learning rate
- momentum: float, momentum coefficient
Returns:
- list: parameter updates
"""
def adagrad(loss_or_grads, params, learning_rate=1.0, epsilon=1e-6):
"""
Adagrad optimizer with adaptive learning rates.
Parameters:
- loss_or_grads: loss function or gradients
- params: list, parameters to optimize
- learning_rate: float, initial learning rate
- epsilon: float, numerical stability constant
Returns:
- list: parameter updates
"""
def adagrad_window(loss_or_grads, params, learning_rate=1.0,
epsilon=1e-6, n_win=10):
"""
Adagrad with windowed accumulation.
Parameters:
- loss_or_grads: loss function or gradients
- params: list, parameters to optimize
- learning_rate: float, initial learning rate
- epsilon: float, numerical stability constant
- n_win: int, window size for gradient accumulation
Returns:
- list: parameter updates
"""
def adadelta(loss_or_grads, params, learning_rate=1.0, rho=0.95,
epsilon=1e-6):
"""
Adadelta optimizer with parameter-specific learning rates.
Parameters:
- loss_or_grads: loss function or gradients
- params: list, parameters to optimize
- learning_rate: float, learning rate scaling factor
- rho: float, decay rate for moving averages
- epsilon: float, numerical stability constant
Returns:
- list: parameter updates
"""
def rmsprop(loss_or_grads, params, learning_rate=0.001, rho=0.9,
epsilon=1e-6):
"""
RMSprop optimizer with adaptive learning rates.
Parameters:
- loss_or_grads: loss function or gradients
- params: list, parameters to optimize
- learning_rate: float, learning rate
- rho: float, decay rate for moving average
- epsilon: float, numerical stability constant
Returns:
- list: parameter updates
"""
def apply_momentum(updates, params=None, momentum=0.9):
"""
Apply momentum to parameter updates.
Parameters:
- updates: dict, parameter updates
- params: list, parameters (inferred if None)
- momentum: float, momentum coefficient
Returns:
- dict: momentum-adjusted updates
"""
def apply_nesterov_momentum(updates, params=None, momentum=0.9):
"""
Apply Nesterov momentum to parameter updates.
Parameters:
- updates: dict, parameter updates
- params: list, parameters (inferred if None)
- momentum: float, momentum coefficient
Returns:
- dict: Nesterov momentum-adjusted updates
"""
def norm_constraint(tensor, max_norm, norm_axes=None, epsilon=1e-7):
"""
Apply norm constraint to gradients.
Parameters:
- tensor: tensor to constrain
- max_norm: float, maximum allowed norm
- norm_axes: tuple, axes for norm computation
- epsilon: float, numerical stability
Returns:
- tensor: norm-constrained tensor
"""
def total_norm_constraint(tensor_vars, max_norm, epsilon=1e-7,
return_norm=False):
"""
Apply total norm constraint across multiple tensors.
Parameters:
- tensor_vars: list, tensors to constrain
- max_norm: float, maximum total norm
- epsilon: float, numerical stability
- return_norm: bool, return computed norm
Returns:
- list or tuple: constrained tensors and optionally norm
"""Stein variational inference and related particle-based methods.
class Stein:
"""
Stein method for particle-based inference.
Base class for Stein variational methods that use
reproducing kernel Hilbert spaces for particle updates.
"""
def __init__(self, approx=None, kernel='rbf', **kwargs):
"""
Initialize Stein method.
Parameters:
- approx: Approximation, empirical approximation
- kernel: str or callable, kernel function
"""
def updates(self, obj, **kwargs):
"""Compute Stein variational updates."""import pymc3 as pm
import numpy as np
import theano.tensor as tt
# Simple model with ADVI
with pm.Model() as simple_model:
mu = pm.Normal('mu', mu=0, sigma=10)
sigma = pm.HalfNormal('sigma', sigma=5)
y_obs = pm.Normal('y_obs', mu=mu, sigma=sigma, observed=data)
# Fit using ADVI
approx = pm.fit(n=20000, method='advi')
# Sample from approximation
trace_vi = approx.sample(2000)
# Check convergence
print(f"ELBO: {approx.hist[-1]:.2f}")# Full-rank ADVI for correlated parameters
with pm.Model() as corr_model:
# Correlated parameters
theta = pm.MvNormal('theta',
mu=np.zeros(3),
cov=np.eye(3),
shape=3)
# Transform to create correlation
x = tt.dot(theta, correlation_matrix)
y_obs = pm.Normal('y_obs', mu=x.sum(), sigma=1, observed=target)
# Full-rank to capture correlations
approx = pm.fit(n=30000, method='fullrank_advi')
# Access covariance structure
cov_matrix = approx.cov.eval()
print("Posterior covariance:\n", cov_matrix)# SVGD for complex posteriors
with pm.Model() as complex_model:
# Multi-modal posterior setup
mu1 = pm.Normal('mu1', mu=-2, sigma=1)
mu2 = pm.Normal('mu2', mu=2, sigma=1)
# Create multi-modality
mixture = pm.Mixture('mixture',
w=np.array([0.3, 0.7]),
comp_dists=[pm.Normal.dist(mu=mu1, sigma=0.5),
pm.Normal.dist(mu=mu2, sigma=0.5)],
observed=mixture_data)
# SVGD with particles
approx = pm.fit(n=10000,
method='svgd',
inf_kwargs={'n_particles': 200})
# Analyze particle distribution
particles = approx.sample(5000)
print("Particle means:", particles['mu1'].mean(), particles['mu2'].mean())# Normalizing flow for flexible approximation
with pm.Model() as flow_model:
# Skewed posterior
x = pm.SkewNormal('x', mu=0, sigma=1, alpha=5)
y_obs = pm.Normal('y_obs', mu=x**2, sigma=0.5, observed=skewed_data)
# Normalizing flow approximation
approx = pm.fit(n=25000, method='nfvi',
inf_kwargs={'flow': 'planar'})
# Sample from flexible approximation
trace_flow = approx.sample(2000)

# Large dataset with minibatch VI
batch_size = 500
n_data = len(large_dataset)
# Minibatch containers
x_minibatch = pm.Minibatch(X_large, batch_size=batch_size)
y_minibatch = pm.Minibatch(y_large, batch_size=batch_size)
with pm.Model() as minibatch_model:
# Model parameters
weights = pm.Normal('weights', mu=0, sigma=1, shape=n_features)
intercept = pm.Normal('intercept', mu=0, sigma=10)
# Predictions with minibatch
mu = intercept + tt.dot(x_minibatch, weights)
# Scaled likelihood
y_obs = pm.Normal('y_obs', mu=mu, sigma=1,
observed=y_minibatch,
total_size=n_data)
# Minibatch ADVI
approx = pm.fit(n=50000,
method='advi',
inf_kwargs={'scale_cost_to_minibatch': True})

# Custom optimization with monitoring
def elbo_callback(approx, losses, i):
"""Monitor optimization progress; callbacks receive the approximation, the score history so far, and the iteration number."""
if i % 1000 == 0:
print(f"Iteration {i}: loss = {losses[-1]:.2f}")
# Custom diagnostics
samples = approx.sample(100)
mean_est = {var: samples[var].mean()
for var in samples.varnames}
print(f"Current estimates: {mean_est}")
with pm.Model() as callback_model:
theta = pm.Beta('theta', alpha=2, beta=3)
y_obs = pm.Binomial('y_obs', n=20, p=theta, observed=successes)
# Custom ADVI with callbacks
inference = pm.ADVI()
approx = pm.fit(n=15000,
method=inference,
callbacks=[elbo_callback])

# Hierarchical model with VI
n_groups = 10
with pm.Model() as hierarchical_vi:
# Hyperpriors
mu_mu = pm.Normal('mu_mu', mu=0, sigma=10)
sigma_mu = pm.HalfNormal('sigma_mu', sigma=5)
# Group-level parameters
group_mu = pm.Normal('group_mu',
mu=mu_mu,
sigma=sigma_mu,
shape=n_groups)
# Observations
y_obs = pm.Normal('y_obs',
mu=group_mu[group_idx],
sigma=1,
observed=hierarchical_data)
# Hierarchical ADVI
approx = pm.fit(n=25000, method='advi')
# Analyze group-level effects
trace_hier = approx.sample(2000)
group_effects = trace_hier['group_mu']
print("Group effect means:", group_effects.mean(axis=0))
print("Group effect stds:", group_effects.std(axis=0))# Compare models using VI
models = {}
approx_results = {}
# Model 1: Simple linear
with pm.Model() as model1:
alpha = pm.Normal('alpha', 0, 10)
beta = pm.Normal('beta', 0, 10)
sigma = pm.HalfNormal('sigma', 1)
mu = alpha + beta * x_data
y_obs = pm.Normal('y_obs', mu=mu, sigma=sigma, observed=y_data)
models['linear'] = model1
approx_results['linear'] = pm.fit(n=20000)
# Model 2: Quadratic
with pm.Model() as model2:
alpha = pm.Normal('alpha', 0, 10)
beta1 = pm.Normal('beta1', 0, 10)
beta2 = pm.Normal('beta2', 0, 10)
sigma = pm.HalfNormal('sigma', 1)
mu = alpha + beta1 * x_data + beta2 * x_data**2
y_obs = pm.Normal('y_obs', mu=mu, sigma=sigma, observed=y_data)
models['quadratic'] = model2
approx_results['quadratic'] = pm.fit(n=20000)
# Compare final losses (approx.hist stores the negative ELBO, so lower is better)
for name, approx in approx_results.items():
final_loss = approx.hist[-1]
print(f"{name} model loss (-ELBO): {final_loss:.2f}")

# Detailed approximation analysis
with pm.Model() as analysis_model:
mu = pm.Normal('mu', mu=0, sigma=5)
tau = pm.Gamma('tau', alpha=2, beta=1)
y_obs = pm.Normal('y_obs', mu=mu, tau=tau, observed=obs_data)
# Fit approximation
approx = pm.fit(n=30000, method='fullrank_advi')
# Extract approximation parameters
mean_params = approx.mean.eval()
std_params = approx.std.eval()
cov_params = approx.cov.eval()
print("Variational means:", mean_params)
print("Variational stds:", std_params)
print("Variational covariance:\n", cov_params)
# Sample and compare to MCMC
vi_samples = approx.sample(5000)
mcmc_samples = pm.sample(2000, tune=1000, chains=2)
# Diagnostic comparison
import arviz as az
# az.compare needs pointwise log-likelihood values, which the VI samples do not
# provide, so compare posterior summaries from the two methods instead
print("VI summary:\n", az.summary(vi_samples))
print("MCMC summary:\n", az.summary(mcmc_samples))