Variational Inference

PyMC3's variational inference module provides scalable approximate inference methods that optimize a tractable approximation to the posterior distribution. These methods trade some accuracy for computational efficiency, making Bayesian inference feasible for large datasets and complex models.

Capabilities

Main Inference Function

High-level interface for variational inference that constructs the requested method, initializes it, and runs the optimization.

def fit(n=10000, method='advi', model=None, random_seed=None, 
        start=None, inf_kwargs=None, **kwargs):
    """
    Fit model using variational inference.
    
    Main interface for approximate inference using various VI methods.
    Constructs the requested inference method, initializes it, and runs the optimization.
    
    Parameters:
    - n: int, number of optimization iterations
    - method: str or Inference class, VI method ('advi', 'fullrank_advi', 'svgd', 'asvgd', 'nfvi')
    - model: Model, model to fit (current context if None)
    - random_seed: int, random seed for reproducibility
    - start: dict, initial parameter values
    - inf_kwargs: dict, arguments passed to inference method
    - kwargs: additional arguments for specific methods
    
    Returns:
    - Approximation: fitted approximation object
    
    Example:
    with pm.Model() as model:
        mu = pm.Normal('mu', 0, 1)
        y_obs = pm.Normal('y_obs', mu, 1, observed=data)

        # Fit using ADVI (requires an active model context or model=...)
        approx = pm.fit(n=20000, method='advi')

    # Sample from approximation
    trace = approx.sample(1000)
    """

def sample_approx(approx, draws=1000, include_transformed=True):
    """
    Sample from fitted approximation.
    
    Parameters:
    - approx: Approximation, fitted approximation object
    - draws: int, number of samples to draw
    - include_transformed: bool, include transformed variables
    
    Returns:
    - MultiTrace: samples from approximation
    """

Inference Classes

Core variational inference algorithms with different approximation strategies.

class ADVI:
    """
    Automatic Differentiation Variational Inference.
    
    Mean-field variational inference using automatic differentiation
    for gradient-based optimization. Assumes independence between
    parameters in the approximating distribution.
    """
    
    def __init__(self, local_rv=None, model=None, cost_part_grad_scale=1, 
                 scale_cost_to_minibatch=False, random_seed=None, 
                 start=None, **kwargs):
        """
        Initialize ADVI inference.
        
        Parameters:
        - local_rv: dict, local random variables for minibatch inference
        - model: Model, model to approximate
        - cost_part_grad_scale: float, scaling for cost gradient
        - scale_cost_to_minibatch: bool, scale cost to minibatch size
        - random_seed: int, random seed
        - start: dict, starting parameter values
        """
    
    def fit(self, n=10000, score=None, callbacks=None, 
            progressbar=True, **kwargs):
        """
        Fit the approximation.
        
        Parameters:
        - n: int, number of optimization iterations
        - score: bool, compute ELBO score during fitting
        - callbacks: list, callback functions for monitoring
        - progressbar: bool, display progress bar
        
        Returns:
        - Approximation: fitted approximation
        """

class FullRankADVI:
    """
    Full-rank Automatic Differentiation Variational Inference.
    
    Variational inference with full covariance structure in the
    approximating distribution, capturing correlations between parameters.
    """
    
    def __init__(self, local_rv=None, model=None, cost_part_grad_scale=1,
                 scale_cost_to_minibatch=False, random_seed=None, 
                 start=None, **kwargs):
        """Initialize full-rank ADVI inference."""

class SVGD:
    """
    Stein Variational Gradient Descent.
    
    Non-parametric variational method that uses a set of particles
    to approximate the posterior through deterministic updates
    guided by Stein's method.
    """
    
    def __init__(self, n_particles=100, jitter=1e-6, model=None, 
                 start=None, random_seed=None, **kwargs):
        """
        Initialize SVGD inference.
        
        Parameters:
        - n_particles: int, number of particles
        - jitter: float, noise for numerical stability
        - model: Model, model to approximate
        - start: dict, initial particle positions
        - random_seed: int, random seed
        """

class ASVGD:
    """
    Amortized Stein Variational Gradient Descent.
    
    Extension of SVGD with amortization for handling multiple
    related inference problems efficiently.
    """

class NFVI:
    """
    Normalizing Flow Variational Inference.
    
    Variational inference using normalizing flows to create
    flexible approximating distributions beyond simple parametric forms.
    """
    
    def __init__(self, flow='planar', model=None, random_seed=None, 
                 **kwargs):
        """
        Initialize normalizing flow VI.
        
        Parameters:
        - flow: str or Flow, type of normalizing flow
        - model: Model, model to approximate  
        - random_seed: int, random seed
        """

class ImplicitGradient:
    """
    Implicit gradient variational inference.
    
    Base class for methods whose updates come from implicit
    (operator-based) gradients rather than an explicit ELBO objective,
    such as SVGD and ASVGD.
    """

class Inference:
    """
    Base class for all variational inference methods.
    
    Provides common interface and functionality for different
    VI algorithms including optimization, callbacks, and diagnostics.
    """
    
    @property
    def approx(self):
        """Current approximation object."""
    
    @property 
    def hist(self):
        """Optimization history."""
    
    def fit(self, n, **kwargs):
        """Abstract method for fitting approximation."""

class KLqp:
    """
    Kullback-Leibler divergence minimization.
    
    General framework for KL-divergence based variational inference
    with customizable divergence measures and optimization strategies.
    """

Approximation Classes

Different parametric forms for approximating posterior distributions.

class MeanField:
    """
    Mean-field Gaussian approximation.
    
    Independent normal distributions for each parameter,
    assuming no correlations in the approximating distribution.
    """
    
    def __init__(self, local_rv=None, model=None, cost_part_grad_scale=1,
                 scale_cost_to_minibatch=False, random_seed=None,
                 start=None):
        """Initialize mean-field approximation."""
    
    def sample(self, draws=1000, include_transformed=True):
        """
        Sample from approximation.
        
        Parameters:
        - draws: int, number of samples
        - include_transformed: bool, include transformed variables
        
        Returns:
        - MultiTrace: samples from approximation
        """
    
    @property
    def mean(self):
        """Mean of approximating distribution."""
    
    @property
    def std(self):
        """Standard deviation of approximating distribution."""
    
    def sample_node(self, node, size=None, more_replacements=None):
        """Sample specific model node."""

class FullRank:
    """
    Full-rank Gaussian approximation.
    
    Multivariate normal distribution with full covariance matrix,
    capturing correlations between parameters in approximation.
    """
    
    def __init__(self, local_rv=None, model=None, cost_part_grad_scale=1,
                 scale_cost_to_minibatch=False, random_seed=None,
                 start=None):
        """Initialize full-rank approximation."""
    
    @property
    def cov(self):
        """Covariance matrix of approximating distribution."""
    
    @property
    def L_chol(self):
        """Cholesky factor of covariance matrix."""

class Empirical:
    """
    Empirical approximation using sample particles.
    
    Non-parametric approximation represented by a set of
    weighted samples, used in particle-based methods like SVGD.
    """
    
    def __init__(self, trace=None, size=None, jitter=1e-6, 
                 local_rv=None, model=None, **kwargs):
        """
        Initialize empirical approximation.
        
        Parameters:
        - trace: MultiTrace, initial particles
        - size: int, number of particles
        - jitter: float, noise for numerical stability
        - local_rv: dict, local random variables
        - model: Model, model context
        """
    
    @property
    def histogram(self):
        """Histogram representation of samples."""

class NormalizingFlow:
    """
    Normalizing flow approximation.
    
    Flexible approximation using invertible transformations
    to map simple base distribution to complex target.
    """
    
    def __init__(self, flow='planar', local_rv=None, model=None, 
                 **kwargs):
        """
        Initialize normalizing flow approximation.
        
        Parameters:
        - flow: str or Flow, flow architecture
        - local_rv: dict, local random variables
        - model: Model, model context
        """

class Approximation:
    """
    Base class for all approximations.
    
    Provides common interface for sampling, evaluation, and
    manipulation of approximating distributions.
    """
    
    def sample(self, draws=1000, **kwargs):
        """Sample from approximation."""
    
    def sample_vp(self, draws=1000, hide_transformed=True):
        """Sample variational parameters."""
        
    def apply_replacements(self, node, more_replacements=None):
        """Apply parameter replacements to computational graph."""
    
    @property
    def logq(self):
        """Log-probability of approximating distribution."""
    
    @property
    def logq_norm(self):
        """Normalized log-probability."""
    
    @property
    def shared_params(self):
        """Shared parameter tensors."""

class Group:
    """
    Variable grouping for approximation.
    
    Groups related variables together for joint approximation,
    useful for hierarchical models and structured approximations.
    """
    
    def __init__(self, group, vfam=None, params=None):
        """
        Initialize variable group.
        
        Parameters:
        - group: list, variables to group together
        - vfam: str, variational family for group
        - params: dict, group-specific parameters
        """

Optimization Methods

Stochastic optimization algorithms for variational parameter updates.

def adam(loss_or_grads, params, learning_rate=0.001, beta1=0.9, 
         beta2=0.999, epsilon=1e-8):
    """
    Adam optimizer for variational parameters.
    
    Adaptive moment estimation with bias correction
    for efficient optimization of variational objectives.
    
    Parameters:
    - loss_or_grads: loss function or gradients
    - params: list, parameters to optimize
    - learning_rate: float, learning rate
    - beta1: float, exponential decay rate for first moment
    - beta2: float, exponential decay rate for second moment  
    - epsilon: float, numerical stability constant
    
    Returns:
    - list: parameter updates
    """

def adamax(loss_or_grads, params, learning_rate=0.002, beta1=0.9, 
           beta2=0.999, epsilon=1e-8):
    """
    Adamax optimizer (Adam with infinity norm).
    
    Parameters:
    - loss_or_grads: loss function or gradients
    - params: list, parameters to optimize
    - learning_rate: float, learning rate
    - beta1: float, exponential decay rate for first moment
    - beta2: float, exponential decay rate for second raw moment
    - epsilon: float, numerical stability constant
    
    Returns:
    - list: parameter updates
    """

def sgd(loss_or_grads, params, learning_rate):
    """
    Stochastic gradient descent optimizer.
    
    Parameters:
    - loss_or_grads: loss function or gradients  
    - params: list, parameters to optimize
    - learning_rate: float, learning rate
    
    Returns:
    - list: parameter updates
    """

def momentum(loss_or_grads, params, learning_rate, momentum=0.9):
    """
    SGD with momentum optimizer.
    
    Parameters:
    - loss_or_grads: loss function or gradients
    - params: list, parameters to optimize  
    - learning_rate: float, learning rate
    - momentum: float, momentum coefficient
    
    Returns:
    - list: parameter updates
    """

def nesterov_momentum(loss_or_grads, params, learning_rate, momentum=0.9):
    """
    SGD with Nesterov momentum optimizer.
    
    Parameters:
    - loss_or_grads: loss function or gradients
    - params: list, parameters to optimize
    - learning_rate: float, learning rate
    - momentum: float, momentum coefficient
    
    Returns:
    - list: parameter updates
    """

def adagrad(loss_or_grads, params, learning_rate=1.0, epsilon=1e-6):
    """
    Adagrad optimizer with adaptive learning rates.
    
    Parameters:
    - loss_or_grads: loss function or gradients
    - params: list, parameters to optimize
    - learning_rate: float, initial learning rate
    - epsilon: float, numerical stability constant
    
    Returns:
    - list: parameter updates
    """

def adagrad_window(loss_or_grads, params, learning_rate=1.0, 
                   epsilon=1e-6, n_win=10):
    """
    Adagrad with windowed accumulation.
    
    Parameters:
    - loss_or_grads: loss function or gradients
    - params: list, parameters to optimize
    - learning_rate: float, initial learning rate
    - epsilon: float, numerical stability constant
    - n_win: int, window size for gradient accumulation
    
    Returns:
    - list: parameter updates
    """

def adadelta(loss_or_grads, params, learning_rate=1.0, rho=0.95, 
             epsilon=1e-6):
    """
    Adadelta optimizer with parameter-specific learning rates.
    
    Parameters:
    - loss_or_grads: loss function or gradients
    - params: list, parameters to optimize
    - learning_rate: float, learning rate scaling factor
    - rho: float, decay rate for moving averages
    - epsilon: float, numerical stability constant
    
    Returns:
    - list: parameter updates
    """

def rmsprop(loss_or_grads, params, learning_rate=0.001, rho=0.9, 
            epsilon=1e-6):
    """
    RMSprop optimizer with adaptive learning rates.
    
    Parameters:
    - loss_or_grads: loss function or gradients
    - params: list, parameters to optimize
    - learning_rate: float, learning rate
    - rho: float, decay rate for moving average
    - epsilon: float, numerical stability constant
    
    Returns:
    - list: parameter updates
    """

def apply_momentum(updates, params=None, momentum=0.9):
    """
    Apply momentum to parameter updates.
    
    Parameters:
    - updates: dict, parameter updates
    - params: list, parameters (inferred if None)
    - momentum: float, momentum coefficient
    
    Returns:
    - dict: momentum-adjusted updates
    """

def apply_nesterov_momentum(updates, params=None, momentum=0.9):
    """
    Apply Nesterov momentum to parameter updates.
    
    Parameters:
    - updates: dict, parameter updates  
    - params: list, parameters (inferred if None)
    - momentum: float, momentum coefficient
    
    Returns:
    - dict: Nesterov momentum-adjusted updates
    """

def norm_constraint(tensor, max_norm, norm_axes=None, epsilon=1e-7):
    """
    Apply norm constraint to gradients.
    
    Parameters:
    - tensor: tensor to constrain
    - max_norm: float, maximum allowed norm
    - norm_axes: tuple, axes for norm computation
    - epsilon: float, numerical stability
    
    Returns:
    - tensor: norm-constrained tensor
    """

def total_norm_constraint(tensor_vars, max_norm, epsilon=1e-7, 
                         return_norm=False):
    """
    Apply total norm constraint across multiple tensors.
    
    Parameters:
    - tensor_vars: list, tensors to constrain
    - max_norm: float, maximum total norm
    - epsilon: float, numerical stability
    - return_norm: bool, return computed norm
    
    Returns:
    - list or tuple: constrained tensors and optionally norm
    """

Stein Methods

Stein variational inference and related particle-based methods.

class Stein:
    """
    Stein method for particle-based inference.
    
    Base class for Stein variational methods that use
    reproducing kernel Hilbert spaces for particle updates.
    """
    
    def __init__(self, approx=None, kernel='rbf', **kwargs):
        """
        Initialize Stein method.
        
        Parameters:
        - approx: Approximation, empirical approximation
        - kernel: str or callable, kernel function
        """
    
    def updates(self, obj, **kwargs):
        """Compute Stein variational updates."""

Usage Examples

Basic ADVI

import pymc3 as pm
import numpy as np
import theano.tensor as tt

# Simple model with ADVI
with pm.Model() as simple_model:
    mu = pm.Normal('mu', mu=0, sigma=10)
    sigma = pm.HalfNormal('sigma', sigma=5)
    y_obs = pm.Normal('y_obs', mu=mu, sigma=sigma, observed=data)
    
    # Fit using ADVI
    approx = pm.fit(n=20000, method='advi')
    
    # Sample from approximation
    trace_vi = approx.sample(2000)
    
    # Check convergence (hist records the loss, i.e. the negative ELBO, per iteration)
    print(f"Final loss (-ELBO): {approx.hist[-1]:.2f}")

Full-Rank ADVI

# Full-rank ADVI for correlated parameters
with pm.Model() as corr_model:
    # Correlated parameters
    theta = pm.MvNormal('theta', 
                       mu=np.zeros(3), 
                       cov=np.eye(3), 
                       shape=3)
    
    # Transform to create correlation
    x = tt.dot(theta, correlation_matrix)
    
    y_obs = pm.Normal('y_obs', mu=x.sum(), sigma=1, observed=target)
    
    # Full-rank to capture correlations
    approx = pm.fit(n=30000, method='fullrank_advi')
    
    # Access covariance structure
    cov_matrix = approx.cov.eval()
    print("Posterior covariance:\n", cov_matrix)

Stein Variational Gradient Descent

# SVGD for complex posteriors
with pm.Model() as complex_model:
    # Multi-modal posterior setup
    mu1 = pm.Normal('mu1', mu=-2, sigma=1)
    mu2 = pm.Normal('mu2', mu=2, sigma=1)
    
    # Create multi-modality
    mixture = pm.Mixture('mixture', 
                        w=np.array([0.3, 0.7]),
                        comp_dists=[pm.Normal.dist(mu=mu1, sigma=0.5),
                                   pm.Normal.dist(mu=mu2, sigma=0.5)],
                        observed=mixture_data)
    
    # SVGD with particles
    approx = pm.fit(n=10000, 
                   method='svgd', 
                   inf_kwargs={'n_particles': 200})
    
    # Analyze particle distribution
    particles = approx.sample(5000)
    print("Particle means:", particles['mu1'].mean(), particles['mu2'].mean())

Normalizing Flow VI

# Normalizing flow for flexible approximation
with pm.Model() as flow_model:
    # Skewed posterior
    x = pm.SkewNormal('x', mu=0, sigma=1, alpha=5)
    y_obs = pm.Normal('y_obs', mu=x**2, sigma=0.5, observed=skewed_data)
    
    # Normalizing flow approximation
    approx = pm.fit(n=25000, method='nfvi',
                   inf_kwargs={'flow': 'planar'})
    
    # Sample from flexible approximation
    trace_flow = approx.sample(2000)

Minibatch Variational Inference

# Large dataset with minibatch VI
batch_size = 500
n_data = len(large_dataset)

# Minibatch containers
x_minibatch = pm.Minibatch(X_large, batch_size=batch_size)
y_minibatch = pm.Minibatch(y_large, batch_size=batch_size)

with pm.Model() as minibatch_model:
    # Model parameters
    weights = pm.Normal('weights', mu=0, sigma=1, shape=n_features)
    intercept = pm.Normal('intercept', mu=0, sigma=10)
    
    # Predictions with minibatch
    mu = intercept + tt.dot(x_minibatch, weights)
    
    # Scaled likelihood
    y_obs = pm.Normal('y_obs', mu=mu, sigma=1, 
                     observed=y_minibatch,
                     total_size=n_data)
    
    # Minibatch ADVI
    approx = pm.fit(n=50000, 
                   method='advi',
                   inf_kwargs={'scale_cost_to_minibatch': True})

Custom Optimization and Callbacks

# Custom optimization with monitoring
def elbo_callback(approx, losses, i):
    """Custom callback for monitoring the loss (negative ELBO)."""
    # PyMC3 calls each callback with the current approximation, the loss
    # history so far, and the iteration index.
    if i % 1000 == 0:
        print(f"Iteration {i}: loss = {losses[-1]:.2f}")

        # Custom diagnostics
        samples = approx.sample(100)
        mean_est = {var: samples[var].mean()
                   for var in samples.varnames}
        print(f"Current estimates: {mean_est}")

with pm.Model() as callback_model:
    theta = pm.Beta('theta', alpha=2, beta=3)
    y_obs = pm.Binomial('y_obs', n=20, p=theta, observed=successes)
    
    # Custom ADVI with callbacks
    inference = pm.ADVI()
    approx = pm.fit(n=15000, 
                   method=inference,
                   callbacks=[elbo_callback])

Hierarchical VI

# Hierarchical model with VI
n_groups = 10

with pm.Model() as hierarchical_vi:
    # Hyperpriors
    mu_mu = pm.Normal('mu_mu', mu=0, sigma=10)
    sigma_mu = pm.HalfNormal('sigma_mu', sigma=5)
    
    # Group-level parameters
    group_mu = pm.Normal('group_mu', 
                        mu=mu_mu, 
                        sigma=sigma_mu, 
                        shape=n_groups)
    
    # Observations
    y_obs = pm.Normal('y_obs', 
                     mu=group_mu[group_idx], 
                     sigma=1, 
                     observed=hierarchical_data)
    
    # Hierarchical ADVI
    approx = pm.fit(n=25000, method='advi')
    
    # Analyze group-level effects
    trace_hier = approx.sample(2000)
    group_effects = trace_hier['group_mu']
    
    print("Group effect means:", group_effects.mean(axis=0))
    print("Group effect stds:", group_effects.std(axis=0))

Model Comparison with VI

# Compare models using VI
models = {}
approx_results = {}

# Model 1: Simple linear
with pm.Model() as model1:
    alpha = pm.Normal('alpha', 0, 10)
    beta = pm.Normal('beta', 0, 10)
    sigma = pm.HalfNormal('sigma', 1)
    
    mu = alpha + beta * x_data
    y_obs = pm.Normal('y_obs', mu=mu, sigma=sigma, observed=y_data)
    
models['linear'] = model1
approx_results['linear'] = pm.fit(n=20000, model=model1)

# Model 2: Quadratic  
with pm.Model() as model2:
    alpha = pm.Normal('alpha', 0, 10)
    beta1 = pm.Normal('beta1', 0, 10)
    beta2 = pm.Normal('beta2', 0, 10)
    sigma = pm.HalfNormal('sigma', 1)
    
    mu = alpha + beta1 * x_data + beta2 * x_data**2
    y_obs = pm.Normal('y_obs', mu=mu, sigma=sigma, observed=y_data)
    
models['quadratic'] = model2  
approx_results['quadratic'] = pm.fit(n=20000, model=model2)

# Compare final losses (hist stores the negative ELBO, so lower is better)
for name, approx in approx_results.items():
    final_loss = approx.hist[-1]
    print(f"{name} model final loss (-ELBO): {final_loss:.2f}")

Advanced Approximation Analysis

# Detailed approximation analysis
with pm.Model() as analysis_model:
    mu = pm.Normal('mu', mu=0, sigma=5)
    tau = pm.Gamma('tau', alpha=2, beta=1)
    y_obs = pm.Normal('y_obs', mu=mu, tau=tau, observed=obs_data)
    
    # Fit approximation
    approx = pm.fit(n=30000, method='fullrank_advi')
    
    # Extract approximation parameters
    mean_params = approx.mean.eval()
    std_params = approx.std.eval()
    cov_params = approx.cov.eval()
    
    print("Variational means:", mean_params)
    print("Variational stds:", std_params)
    print("Variational covariance:\n", cov_params)
    
    # Sample and compare to MCMC
    vi_samples = approx.sample(5000)
    mcmc_samples = pm.sample(2000, tune=1000, chains=2)
    
    # Compare posterior summaries from VI and MCMC
    import arviz as az
    print("VI:\n", az.summary(vi_samples, var_names=['mu', 'tau']))
    print("MCMC:\n", az.summary(mcmc_samples, var_names=['mu', 'tau']))
