Probabilistic Programming in Python: Bayesian Modeling and Probabilistic Machine Learning with Theano
68
PyMC3's GLM module provides a high-level interface for generalized linear models with automatic handling of design matrices, link functions, and family-specific distributions. This streamlines the construction of regression models for various response types including continuous, binary, count, and categorical data.
High-level classes for building generalized linear models with automatic configuration.
class GLM:
    """
    Generalized Linear Model with automatic family and link-function handling.

    Provides a high-level interface for GLM construction that automatically
    handles design-matrix creation, the appropriate likelihood distribution,
    link function, and parameter priors based on the specified family.
    """
    def __init__(self, y, X, labels=None, intercept=True, family=None,
                 priors=None, **kwargs):
        """
        Initialize the GLM.

        Parameters
        ----------
        y : array
            Response variable.
        X : array
            Design matrix or predictor variables.
        labels : list, optional
            Variable names for the predictors.
        intercept : bool
            Whether to include an intercept term (default True).
        family : GLMFamily, optional
            Response distribution family (e.g. Normal, Binomial, Poisson).
        priors : dict, optional
            Prior specifications for parameters, keyed by variable name.
        **kwargs
            Additional arguments for family-specific configuration.

        Returns
        -------
        GLM
            Configured GLM object.

        Examples
        --------
        # Linear regression
        glm = pm.GLM(y=height, X=weight, family=pm.glm.families.Normal())
        # Logistic regression
        glm = pm.GLM(y=success, X=predictors, family=pm.glm.families.Binomial())
        # Poisson regression
        glm = pm.GLM(y=counts, X=features, family=pm.glm.families.Poisson())
        """
    def fit(self, draws=1000, **kwargs):
        """
        Fit the GLM using MCMC sampling.

        Parameters
        ----------
        draws : int
            Number of posterior samples (default 1000).
        **kwargs
            Additional arguments passed through to ``pm.sample()``.

        Returns
        -------
        InferenceData
            Posterior samples and sampling diagnostics.
        """
    def predict(self, X_new, trace=None, **kwargs):
        """
        Generate predictions for new data.

        Parameters
        ----------
        X_new : array
            New predictor values.
        trace : InferenceData, optional
            Posterior samples to draw predictions from.
        **kwargs
            Additional prediction arguments.

        Returns
        -------
        dict
            Predictive samples keyed by variable name.
        """
class LinearComponent:
    """
    Linear component builder for GLM construction.

    Handles design-matrix creation, variable naming, and prior
    specification for the linear predictor in GLM models.
    """
    def __init__(self, x, intercept=True, labels=None, priors=None):
        """
        Initialize the linear component.

        Parameters
        ----------
        x : array
            Predictor matrix.
        intercept : bool
            Whether to include an intercept column (default True).
        labels : list, optional
            Names for the predictor variables.
        priors : dict, optional
            Prior distributions for the coefficients, keyed by label.

        Returns
        -------
        LinearComponent
            Configured linear component object.
        """
    def build(self, name=''):
        """
        Build the linear-predictor variables.

        Parameters
        ----------
        name : str
            Prefix for the created variable names.

        Returns
        -------
        TensorVariable
            The assembled linear predictor.
        """

# Distribution families for different response types available in pm.glm.families.
class Normal:
    """
    Normal family for continuous responses (linear regression).

    Uses the identity link function and a normal likelihood for
    modeling continuous response variables.
    """
    def __init__(self, priors=None):
        """
        Initialize the Normal family.

        Parameters
        ----------
        priors : dict, optional
            Prior specifications, e.g. ``{'sigma': prior_dist}``.
        """
    @property
    def link(self):
        """Identity link function: η = μ."""
    @property
    def likelihood(self):
        """Normal likelihood distribution."""
class Binomial:
    """
    Binomial family for binary and binomial responses.

    Uses the logit link function and a binomial likelihood for
    modeling binary outcomes or success/failure counts.
    """
    def __init__(self, n=1, priors=None):
        """
        Initialize the Binomial family.

        Parameters
        ----------
        n : int or array
            Number of trials per observation (1 for binary outcomes).
        priors : dict, optional
            Prior specifications.
        """
    @property
    def link(self):
        """Logit link function: η = log(p/(1-p))."""
class Poisson:
    """
    Poisson family for count responses.

    Uses the log link function and a Poisson likelihood for
    modeling count data and rate processes.
    """
    def __init__(self, priors=None):
        """
        Initialize the Poisson family.

        Parameters
        ----------
        priors : dict, optional
            Prior specifications.
        """
    @property
    def link(self):
        """Log link function: η = log(λ)."""
class NegativeBinomial:
    """
    Negative binomial family for overdispersed count data.

    Uses the log link with an additional dispersion parameter for
    modeling count data whose variance exceeds its mean.
    """
    def __init__(self, priors=None):
        """
        Initialize the NegativeBinomial family.

        Parameters
        ----------
        priors : dict, optional
            Prior specifications, e.g. ``{'alpha': prior_dist}`` for the
            dispersion parameter.
        """
class Gamma:
    """
    Gamma family for positive continuous responses.

    Uses the log link function and a gamma likelihood for
    modeling positive continuous variables such as durations or costs.
    """
    def __init__(self, priors=None):
        """
        Initialize the Gamma family.

        Parameters
        ----------
        priors : dict, optional
            Prior specifications, e.g. ``{'alpha': prior_dist}`` for the
            shape parameter.
        """
class StudentT:
    """
    Student's t family for robust regression.

    Uses the identity link with a Student's t likelihood for
    robust modeling of continuous responses containing outliers.
    """
    def __init__(self, priors=None):
        """
        Initialize the StudentT family.

        Parameters
        ----------
        priors : dict, optional
            Prior specifications, e.g.
            ``{'nu': prior_dist, 'sigma': prior_dist}``.
        """

import pymc3 as pm
import numpy as np
import pandas as pd

# --- Example: GLM linear regression -----------------------------------------
# Generate sample data: 3 predictors with known coefficients plus Gaussian noise.
np.random.seed(42)
n = 100
X = np.random.randn(n, 3)
true_coef = np.array([1.5, -2.0, 0.5])
true_intercept = 0.3
y = true_intercept + X @ true_coef + np.random.normal(0, 0.5, n)

# GLM linear regression
with pm.Model() as linear_glm:
    # Automatic GLM construction: Normal family gives an identity link.
    glm = pm.GLM(y=y, X=X,
                 labels=['x1', 'x2', 'x3'],
                 family=pm.glm.families.Normal())
    # Sample from the posterior
    trace = pm.sample(1000, tune=1000)
    # Predictions for unseen predictor values
    X_new = np.random.randn(20, 3)
    predictions = glm.predict(X_new, trace=trace)

# Manual prior specification: override the automatic defaults per coefficient.
with pm.Model() as linear_glm_priors:
    # Custom priors (the Laplace prior on x2 acts as L1-style regularization)
    priors = {
        'Intercept': pm.Normal('Intercept', mu=0, sigma=10),
        'x1': pm.Normal('x1', mu=0, sigma=5),
        'x2': pm.Laplace('x2', mu=0, b=1),  # Regularized prior
        'x3': pm.Normal('x3', mu=0, sigma=5),
        'sigma': pm.HalfNormal('sigma', sigma=2)
    }
    glm = pm.GLM(y=y, X=X,
                 labels=['x1', 'x2', 'x3'],
                 family=pm.glm.families.Normal(),
                 priors=priors)
    trace = pm.sample(1000, tune=1000)

# Binary classification data
n_samples = 200
X_log = np.random.randn(n_samples, 4)
true_coef_log = np.array([1.2, -0.8, 0.5, -1.1])
true_intercept_log = -0.3
# Generate binary outcomes by passing the linear predictor through a logistic.
linear_pred = true_intercept_log + X_log @ true_coef_log
prob = 1 / (1 + np.exp(-linear_pred))  # Logistic function
y_binary = np.random.binomial(1, prob)

# Logistic regression GLM (Binomial family defaults to n=1, i.e. Bernoulli)
with pm.Model() as logistic_glm:
    glm = pm.GLM(y=y_binary, X=X_log,
                 labels=['feature_1', 'feature_2', 'feature_3', 'feature_4'],
                 family=pm.glm.families.Binomial())
    trace = pm.sample(1000, tune=1000)
    # Prediction probabilities for held-out points
    X_test = np.random.randn(50, 4)
    pred_samples = glm.predict(X_test, trace=trace)
    # Extract probabilities by averaging the posterior-predictive draws
    prob_samples = pred_samples['y']  # Bernoulli samples
    prob_mean = prob_samples.mean(axis=0)

# Multi-trial binomial: each observation records successes out of n_trials.
n_trials = 20
y_binomial = np.random.binomial(n_trials, prob)
with pm.Model() as binomial_glm:
    glm = pm.GLM(y=y_binomial, X=X_log,
                 labels=['feature_1', 'feature_2', 'feature_3', 'feature_4'],
                 family=pm.glm.families.Binomial(n=n_trials))
    trace = pm.sample(1000, tune=1000)

# Count data generation
X_count = np.random.randn(150, 2)
true_coef_count = np.array([0.8, -0.6])
true_intercept_count = 1.2
# Generate count outcomes: exponentiate the linear predictor to get a rate.
log_rate = true_intercept_count + X_count @ true_coef_count
rate = np.exp(log_rate)
y_counts = np.random.poisson(rate)

# Poisson regression GLM (log link)
with pm.Model() as poisson_glm:
    glm = pm.GLM(y=y_counts, X=X_count,
                 labels=['exposure', 'treatment'],
                 family=pm.glm.families.Poisson())
    trace = pm.sample(1000, tune=1000)
    # Prediction of rates for specific predictor combinations
    X_count_new = np.array([[0.5, 1.0], [-1.0, 0.0], [1.5, -0.5]])
    count_pred = glm.predict(X_count_new, trace=trace)
    # Expected counts: mean over posterior-predictive draws
    expected_counts = count_pred['y'].mean(axis=0)
    print("Expected counts:", expected_counts)

# Overdispersed count data
X_nb = np.random.randn(120, 3)
true_coef_nb = np.array([0.5, -0.3, 0.7])
log_mu = 2.0 + X_nb @ true_coef_nb
mu = np.exp(log_mu)
# Overdispersion parameter: smaller alpha means more variance relative to mean.
alpha_true = 2.0
y_nb = np.random.negative_binomial(n=alpha_true,
                                   p=alpha_true/(alpha_true + mu))

with pm.Model() as nb_glm:
    # Custom prior for the dispersion parameter
    priors = {
        'alpha': pm.Gamma('alpha', alpha=1, beta=1)  # Dispersion parameter
    }
    glm = pm.GLM(y=y_nb, X=X_nb,
                 labels=['var1', 'var2', 'var3'],
                 family=pm.glm.families.NegativeBinomial(),
                 priors=priors)
    trace = pm.sample(1000, tune=1000)

# Check for overdispersion by inspecting the posterior of alpha
alpha_post = trace.posterior['alpha'].values.flatten()
print(f"Overdispersion parameter (mean): {alpha_post.mean():.3f}")

# Positive continuous response (e.g., waiting times, costs)
X_gamma = np.random.randn(100, 2)
true_coef_gamma = np.array([0.4, -0.2])
log_scale = 1.0 + X_gamma @ true_coef_gamma
scale = np.exp(log_scale)
# Gamma-distributed response; dividing scale by shape keeps the mean at `scale`.
shape = 2.0
y_gamma = np.random.gamma(shape=shape, scale=scale/shape)

with pm.Model() as gamma_glm:
    # Prior for the gamma shape parameter
    priors = {
        'alpha': pm.Gamma('alpha', alpha=1, beta=1)  # Shape parameter
    }
    glm = pm.GLM(y=y_gamma, X=X_gamma,
                 labels=['duration_factor', 'cost_driver'],
                 family=pm.glm.families.Gamma(),
                 priors=priors)
    trace = pm.sample(1000, tune=1000)

# Data with outliers
X_robust = np.random.randn(80, 2)
true_coef_robust = np.array([1.0, -0.5])
y_clean = 2.0 + X_robust @ true_coef_robust
# Add outliers: perturb a random 10% of observations with large noise.
outlier_idx = np.random.choice(80, size=8, replace=False)
y_robust = y_clean.copy()
y_robust[outlier_idx] += np.random.normal(0, 5, len(outlier_idx))

with pm.Model() as robust_glm:
    # Student's t likelihood for robustness; low nu allows heavy tails.
    priors = {
        'nu': pm.Gamma('nu', alpha=2, beta=0.1),  # Degrees of freedom
        'sigma': pm.HalfNormal('sigma', sigma=2)
    }
    glm = pm.GLM(y=y_robust, X=X_robust,
                 labels=['x1', 'x2'],
                 family=pm.glm.families.StudentT(),
                 priors=priors)
    trace = pm.sample(1000, tune=1000)

# Compare with normal GLM
# NOTE(review): pm.GLM and pm.sample normally require an active model context;
# this comparison presumably belongs inside its own `with pm.Model():` block —
# confirm against the original example.
glm_normal = pm.GLM(y=y_robust, X=X_robust,
                    family=pm.glm.families.Normal())
trace_normal = pm.sample(1000, tune=1000)

# Hierarchical/mixed effects structure using GLM components
n_groups = 5
n_per_group = 30
group_labels = np.repeat(range(n_groups), n_per_group)
n_total = len(group_labels)
# Group-level predictors
X_group = np.random.randn(n_total, 2)
# Individual-level predictors
X_individual = np.random.randn(n_total, 1)

with pm.Model() as mixed_glm:
    # Fixed effects for individual-level predictors, built via LinearComponent
    individual_component = pm.glm.LinearComponent(
        x=X_individual,
        labels=['individual_pred'],
        priors={'individual_pred': pm.Normal('individual_pred', 0, 1)}
    )
    individual_pred = individual_component.build('individual')
    # Random intercepts per group (partial pooling via shared sd hyperprior)
    group_intercept_sd = pm.HalfNormal('group_intercept_sd', sigma=1)
    group_intercepts = pm.Normal('group_intercepts',
                                 mu=0,
                                 sigma=group_intercept_sd,
                                 shape=n_groups)
    # Random slopes per group for one predictor
    group_slope_sd = pm.HalfNormal('group_slope_sd', sigma=1)
    group_slopes = pm.Normal('group_slopes',
                             mu=0,
                             sigma=group_slope_sd,
                             shape=n_groups)
    # Combined linear predictor: fixed effect + group intercept + group slope
    mu = (individual_pred +
          group_intercepts[group_labels] +
          group_slopes[group_labels] * X_group[:, 0])
    # Observation noise
    sigma = pm.HalfNormal('sigma', sigma=1)
    # Response variable
    # NOTE(review): observed= is pure random noise here, so the example only
    # illustrates model structure, not recovery of true parameters.
    y_mixed = pm.Normal('y_mixed', mu=mu, sigma=sigma, observed=np.random.randn(n_total))
    trace = pm.sample(1000, tune=1000)

# Model with interaction effects
n_interact = 200
X1 = np.random.randn(n_interact)
X2 = np.random.binomial(1, 0.5, n_interact)  # Binary predictor
X_interact = np.column_stack([X1, X2, X1 * X2])  # Include interaction column
# True coefficients including interaction
true_coef_interact = np.array([1.0, 0.5, -0.8])  # main effects + interaction
y_interact_true = 0.2 + X_interact @ true_coef_interact
y_interact = y_interact_true + np.random.normal(0, 0.3, n_interact)

with pm.Model() as interaction_glm:
    # The interaction enters as just another column of the design matrix.
    glm = pm.GLM(y=y_interact, X=X_interact,
                 labels=['continuous', 'binary', 'interaction'],
                 family=pm.glm.families.Normal())
    trace = pm.sample(1000, tune=1000)

# Analyze the posterior of the interaction coefficient
interaction_coef = trace.posterior['interaction'].values.flatten()
print(f"Interaction effect: {interaction_coef.mean():.3f} ± {interaction_coef.std():.3f}")

# Compare different GLM families for same data
y_comparison = y_counts  # Use count data from earlier example
X_comparison = X_count
models_comparison = {}

# Poisson model
with pm.Model() as model_poisson:
    glm_pois = pm.GLM(y=y_comparison, X=X_comparison,
                      family=pm.glm.families.Poisson())
    trace_pois = pm.sample(1000, tune=1000, target_accept=0.9)
models_comparison['Poisson'] = trace_pois

# Negative binomial model (allows for overdispersion)
with pm.Model() as model_nb:
    glm_nb = pm.GLM(y=y_comparison, X=X_comparison,
                    family=pm.glm.families.NegativeBinomial())
    trace_nb = pm.sample(1000, tune=1000, target_accept=0.9)
models_comparison['NegBinom'] = trace_nb

# Model comparison using WAIC
import arviz as az
comparison = az.compare(models_comparison, ic='waic')
print("Model comparison:")
print(comparison)

# Install with Tessl CLI
npx tessl i tessl/pypi-pymc3docs
evals
scenario-1
scenario-2
scenario-3
scenario-4
scenario-5
scenario-6
scenario-7
scenario-8
scenario-9
scenario-10