Probabilistic Programming in Python: Bayesian Modeling and Probabilistic Machine Learning with Theano
68
PyMC3's GLM module provides a high-level interface for generalized linear models with automatic handling of design matrices, link functions, and family-specific distributions. This streamlines the construction of regression models for various response types including continuous, binary, count, and categorical data.
High-level classes for building generalized linear models with automatic configuration.
class GLM:
    """
    Generalized Linear Model with automatic family and link-function handling.

    Provides a high-level interface for GLM construction that automatically
    handles design-matrix creation, the appropriate likelihood distribution,
    link function, and parameter priors based on the specified family.
    """
    def __init__(self, y, X, labels=None, intercept=True, family=None,
                 priors=None, **kwargs):
        """
        Initialize the GLM.

        Parameters
        ----------
        y : array
            Response variable.
        X : array
            Design matrix or predictor variables.
        labels : list, optional
            Variable names for the predictors.
        intercept : bool
            Whether to include an intercept term (default True).
        family : GLMFamily, optional
            Response distribution family (e.g. Normal, Binomial, Poisson).
        priors : dict, optional
            Prior specifications for parameters, keyed by variable name.
        **kwargs
            Additional arguments for family-specific configuration.

        Returns
        -------
        GLM
            Configured GLM object.

        Examples
        --------
        # Linear regression
        glm = pm.GLM(y=height, X=weight, family=pm.glm.families.Normal())
        # Logistic regression
        glm = pm.GLM(y=success, X=predictors, family=pm.glm.families.Binomial())
        # Poisson regression
        glm = pm.GLM(y=counts, X=features, family=pm.glm.families.Poisson())
        """
    def fit(self, draws=1000, **kwargs):
        """
        Fit the GLM using MCMC sampling.

        Parameters
        ----------
        draws : int
            Number of posterior samples (default 1000).
        **kwargs
            Additional arguments passed through to ``pm.sample()``.

        Returns
        -------
        InferenceData
            Posterior samples and sampling diagnostics.
        """
    def predict(self, X_new, trace=None, **kwargs):
        """
        Generate predictions for new data.

        Parameters
        ----------
        X_new : array
            New predictor values.
        trace : InferenceData, optional
            Posterior samples to draw predictions from.
        **kwargs
            Additional prediction arguments.

        Returns
        -------
        dict
            Predictive samples keyed by variable name.
        """
class LinearComponent:
    """
    Linear component builder for GLM construction.

    Handles design-matrix creation, variable naming, and prior
    specification for the linear predictor in GLM models.
    """
    def __init__(self, x, intercept=True, labels=None, priors=None):
        """
        Initialize the linear component.

        Parameters
        ----------
        x : array
            Predictor matrix.
        intercept : bool
            Whether to include an intercept column (default True).
        labels : list, optional
            Names for the predictor variables.
        priors : dict, optional
            Prior distributions for the coefficients, keyed by label.

        Returns
        -------
        LinearComponent
            Configured linear component object.
        """
    def build(self, name=''):
        """
        Build the linear-predictor variables.

        Parameters
        ----------
        name : str
            Prefix for the created variable names.

        Returns
        -------
        TensorVariable
            The assembled linear predictor.
        """

# Distribution families for different response types available in pm.glm.families.
class Normal:
    """
    Normal family for continuous responses (linear regression).

    Uses the identity link function and a normal likelihood for
    modeling continuous response variables.
    """
    def __init__(self, priors=None):
        """
        Initialize the Normal family.

        Parameters
        ----------
        priors : dict, optional
            Prior specifications, e.g. ``{'sigma': prior_dist}``.
        """
    @property
    def link(self):
        """Identity link function: η = μ."""
    @property
    def likelihood(self):
        """Normal likelihood distribution."""
class Binomial:
    """
    Binomial family for binary and binomial responses.

    Uses the logit link function and a binomial likelihood for
    modeling binary outcomes or success/failure counts.
    """
    def __init__(self, n=1, priors=None):
        """
        Initialize the Binomial family.

        Parameters
        ----------
        n : int or array
            Number of trials per observation (1 for binary outcomes).
        priors : dict, optional
            Prior specifications.
        """
    @property
    def link(self):
        """Logit link function: η = log(p/(1-p))."""
class Poisson:
    """
    Poisson family for count responses.

    Uses the log link function and a Poisson likelihood for
    modeling count data and rate processes.
    """
    def __init__(self, priors=None):
        """
        Initialize the Poisson family.

        Parameters
        ----------
        priors : dict, optional
            Prior specifications.
        """
    @property
    def link(self):
        """Log link function: η = log(λ)."""
class NegativeBinomial:
    """
    Negative binomial family for overdispersed count data.

    Uses the log link with an additional dispersion parameter for
    modeling count data whose variance exceeds its mean.
    """
    def __init__(self, priors=None):
        """
        Initialize the NegativeBinomial family.

        Parameters
        ----------
        priors : dict, optional
            Prior specifications, e.g. ``{'alpha': prior_dist}`` for the
            dispersion parameter.
        """
class Gamma:
    """
    Gamma family for positive continuous responses.

    Uses the log link function and a gamma likelihood for
    modeling positive continuous variables such as durations or costs.
    """
    def __init__(self, priors=None):
        """
        Initialize the Gamma family.

        Parameters
        ----------
        priors : dict, optional
            Prior specifications, e.g. ``{'alpha': prior_dist}`` for the
            shape parameter.
        """
class StudentT:
    """
    Student's t family for robust regression.

    Uses the identity link with a Student's t likelihood for
    robust modeling of continuous responses containing outliers.
    """
    def __init__(self, priors=None):
        """
        Initialize the StudentT family.

        Parameters
        ----------
        priors : dict, optional
            Prior specifications, e.g.
            ``{'nu': prior_dist, 'sigma': prior_dist}``.
        """

import pymc3 as pm
import numpy as np
import pandas as pd

# --- Example: GLM linear regression -----------------------------------------
# Generate sample data: 3 predictors with known coefficients plus Gaussian noise.
np.random.seed(42)
n = 100
X = np.random.randn(n, 3)
true_coef = np.array([1.5, -2.0, 0.5])
true_intercept = 0.3
y = true_intercept + X @ true_coef + np.random.normal(0, 0.5, n)

# GLM linear regression
with pm.Model() as linear_glm:
    # Automatic GLM construction: Normal family gives an identity link.
    glm = pm.GLM(y=y, X=X,
                 labels=['x1', 'x2', 'x3'],
                 family=pm.glm.families.Normal())
    # Sample from the posterior
    trace = pm.sample(1000, tune=1000)
    # Predictions for unseen predictor values
    X_new = np.random.randn(20, 3)
    predictions = glm.predict(X_new, trace=trace)

# Manual prior specification: override the automatic defaults per coefficient.
with pm.Model() as linear_glm_priors:
    # Custom priors (the Laplace prior on x2 acts as L1-style regularization)
    priors = {
        'Intercept': pm.Normal('Intercept', mu=0, sigma=10),
        'x1': pm.Normal('x1', mu=0, sigma=5),
        'x2': pm.Laplace('x2', mu=0, b=1),  # Regularized prior
        'x3': pm.Normal('x3', mu=0, sigma=5),
        'sigma': pm.HalfNormal('sigma', sigma=2)
    }
    glm = pm.GLM(y=y, X=X,
                 labels=['x1', 'x2', 'x3'],
                 family=pm.glm.families.Normal(),
                 priors=priors)
    trace = pm.sample(1000, tune=1000)

# Binary classification data
n_samples = 200
X_log = np.random.randn(n_samples, 4)
true_coef_log = np.array([1.2, -0.8, 0.5, -1.1])
true_intercept_log = -0.3
# Generate binary outcomes by passing the linear predictor through a logistic.
linear_pred = true_intercept_log + X_log @ true_coef_log
prob = 1 / (1 + np.exp(-linear_pred))  # Logistic function
y_binary = np.random.binomial(1, prob)

# Logistic regression GLM (Binomial family defaults to n=1, i.e. Bernoulli)
with pm.Model() as logistic_glm:
    glm = pm.GLM(y=y_binary, X=X_log,
                 labels=['feature_1', 'feature_2', 'feature_3', 'feature_4'],
                 family=pm.glm.families.Binomial())
    trace = pm.sample(1000, tune=1000)
    # Prediction probabilities for held-out points
    X_test = np.random.randn(50, 4)
    pred_samples = glm.predict(X_test, trace=trace)
    # Extract probabilities by averaging the posterior-predictive draws
    prob_samples = pred_samples['y']  # Bernoulli samples
    prob_mean = prob_samples.mean(axis=0)

# Multi-trial binomial: each observation records successes out of n_trials.
n_trials = 20
y_binomial = np.random.binomial(n_trials, prob)
with pm.Model() as binomial_glm:
    glm = pm.GLM(y=y_binomial, X=X_log,
                 labels=['feature_1', 'feature_2', 'feature_3', 'feature_4'],
                 family=pm.glm.families.Binomial(n=n_trials))
    trace = pm.sample(1000, tune=1000)

# Count data generation
X_count = np.random.randn(150, 2)
true_coef_count = np.array([0.8, -0.6])
true_intercept_count = 1.2
# Generate count outcomes: exponentiate the linear predictor to get a rate.
log_rate = true_intercept_count + X_count @ true_coef_count
rate = np.exp(log_rate)
y_counts = np.random.poisson(rate)

# Poisson regression GLM (log link)
with pm.Model() as poisson_glm:
    glm = pm.GLM(y=y_counts, X=X_count,
                 labels=['exposure', 'treatment'],
                 family=pm.glm.families.Poisson())
    trace = pm.sample(1000, tune=1000)
    # Prediction of rates for specific predictor combinations
    X_count_new = np.array([[0.5, 1.0], [-1.0, 0.0], [1.5, -0.5]])
    count_pred = glm.predict(X_count_new, trace=trace)
    # Expected counts: mean over posterior-predictive draws
    expected_counts = count_pred['y'].mean(axis=0)
    print("Expected counts:", expected_counts)

# Overdispersed count data
X_nb = np.random.randn(120, 3)
true_coef_nb = np.array([0.5, -0.3, 0.7])
log_mu = 2.0 + X_nb @ true_coef_nb
mu = np.exp(log_mu)
# Overdispersion parameter: smaller alpha means more variance relative to mean.
alpha_true = 2.0
y_nb = np.random.negative_binomial(n=alpha_true,
                                   p=alpha_true/(alpha_true + mu))

with pm.Model() as nb_glm:
    # Custom prior for the dispersion parameter
    priors = {
        'alpha': pm.Gamma('alpha', alpha=1, beta=1)  # Dispersion parameter
    }
    glm = pm.GLM(y=y_nb, X=X_nb,
                 labels=['var1', 'var2', 'var3'],
                 family=pm.glm.families.NegativeBinomial(),
                 priors=priors)
    trace = pm.sample(1000, tune=1000)

# Check for overdispersion by inspecting the posterior of alpha
alpha_post = trace.posterior['alpha'].values.flatten()
print(f"Overdispersion parameter (mean): {alpha_post.mean():.3f}")

# Positive continuous response (e.g., waiting times, costs)
X_gamma = np.random.randn(100, 2)
true_coef_gamma = np.array([0.4, -0.2])
log_scale = 1.0 + X_gamma @ true_coef_gamma
scale = np.exp(log_scale)
# Gamma-distributed response; dividing scale by shape keeps the mean at `scale`.
shape = 2.0
y_gamma = np.random.gamma(shape=shape, scale=scale/shape)

with pm.Model() as gamma_glm:
    # Prior for the gamma shape parameter
    priors = {
        'alpha': pm.Gamma('alpha', alpha=1, beta=1)  # Shape parameter
    }
    glm = pm.GLM(y=y_gamma, X=X_gamma,
                 labels=['duration_factor', 'cost_driver'],
                 family=pm.glm.families.Gamma(),
                 priors=priors)
    trace = pm.sample(1000, tune=1000)

# Data with outliers
X_robust = np.random.randn(80, 2)
true_coef_robust = np.array([1.0, -0.5])
y_clean = 2.0 + X_robust @ true_coef_robust
# Add outliers: perturb a random 10% of observations with large noise.
outlier_idx = np.random.choice(80, size=8, replace=False)
y_robust = y_clean.copy()
y_robust[outlier_idx] += np.random.normal(0, 5, len(outlier_idx))

with pm.Model() as robust_glm:
    # Student's t likelihood for robustness; low nu allows heavy tails.
    priors = {
        'nu': pm.Gamma('nu', alpha=2, beta=0.1),  # Degrees of freedom
        'sigma': pm.HalfNormal('sigma', sigma=2)
    }
    glm = pm.GLM(y=y_robust, X=X_robust,
                 labels=['x1', 'x2'],
                 family=pm.glm.families.StudentT(),
                 priors=priors)
    trace = pm.sample(1000, tune=1000)

# Compare with normal GLM
# NOTE(review): pm.GLM and pm.sample normally require an active model context;
# this comparison presumably belongs inside its own `with pm.Model():` block —
# confirm against the original example.
glm_normal = pm.GLM(y=y_robust, X=X_robust,
                    family=pm.glm.families.Normal())
trace_normal = pm.sample(1000, tune=1000)

# Hierarchical/mixed effects structure using GLM components
n_groups = 5
n_per_group = 30
group_labels = np.repeat(range(n_groups), n_per_group)
n_total = len(group_labels)
# Group-level predictors
X_group = np.random.randn(n_total, 2)
# Individual-level predictors
X_individual = np.random.randn(n_total, 1)

with pm.Model() as mixed_glm:
    # Fixed effects for individual-level predictors, built via LinearComponent
    individual_component = pm.glm.LinearComponent(
        x=X_individual,
        labels=['individual_pred'],
        priors={'individual_pred': pm.Normal('individual_pred', 0, 1)}
    )
    individual_pred = individual_component.build('individual')
    # Random intercepts per group (partial pooling via shared sd hyperprior)
    group_intercept_sd = pm.HalfNormal('group_intercept_sd', sigma=1)
    group_intercepts = pm.Normal('group_intercepts',
                                 mu=0,
                                 sigma=group_intercept_sd,
                                 shape=n_groups)
    # Random slopes per group for one predictor
    group_slope_sd = pm.HalfNormal('group_slope_sd', sigma=1)
    group_slopes = pm.Normal('group_slopes',
                             mu=0,
                             sigma=group_slope_sd,
                             shape=n_groups)
    # Combined linear predictor: fixed effect + group intercept + group slope
    mu = (individual_pred +
          group_intercepts[group_labels] +
          group_slopes[group_labels] * X_group[:, 0])
    # Observation noise
    sigma = pm.HalfNormal('sigma', sigma=1)
    # Response variable
    # NOTE(review): observed= is pure random noise here, so the example only
    # illustrates model structure, not recovery of true parameters.
    y_mixed = pm.Normal('y_mixed', mu=mu, sigma=sigma, observed=np.random.randn(n_total))
    trace = pm.sample(1000, tune=1000)

# Model with interaction effects
n_interact = 200
X1 = np.random.randn(n_interact)
X2 = np.random.binomial(1, 0.5, n_interact)  # Binary predictor
X_interact = np.column_stack([X1, X2, X1 * X2])  # Include interaction column
# True coefficients including interaction
true_coef_interact = np.array([1.0, 0.5, -0.8])  # main effects + interaction
y_interact_true = 0.2 + X_interact @ true_coef_interact
y_interact = y_interact_true + np.random.normal(0, 0.3, n_interact)

with pm.Model() as interaction_glm:
    # The interaction enters as just another column of the design matrix.
    glm = pm.GLM(y=y_interact, X=X_interact,
                 labels=['continuous', 'binary', 'interaction'],
                 family=pm.glm.families.Normal())
    trace = pm.sample(1000, tune=1000)

# Analyze the posterior of the interaction coefficient
interaction_coef = trace.posterior['interaction'].values.flatten()
print(f"Interaction effect: {interaction_coef.mean():.3f} ± {interaction_coef.std():.3f}")

# Compare different GLM families for same data
y_comparison = y_counts  # Use count data from earlier example
X_comparison = X_count
models_comparison = {}

# Poisson model
with pm.Model() as model_poisson:
    glm_pois = pm.GLM(y=y_comparison, X=X_comparison,
                      family=pm.glm.families.Poisson())
    trace_pois = pm.sample(1000, tune=1000, target_accept=0.9)
models_comparison['Poisson'] = trace_pois

# Negative binomial model (allows for overdispersion)
with pm.Model() as model_nb:
    glm_nb = pm.GLM(y=y_comparison, X=X_comparison,
                    family=pm.glm.families.NegativeBinomial())
    trace_nb = pm.sample(1000, tune=1000, target_accept=0.9)
models_comparison['NegBinom'] = trace_nb

# Model comparison using WAIC
import arviz as az
comparison = az.compare(models_comparison, ic='waic')
print("Model comparison:")
print(comparison)

# Install with Tessl CLI
npx tessl i tessl/pypi-pymc3docs
evals
scenario-1
scenario-2
scenario-3
scenario-4
scenario-5
scenario-6
scenario-7
scenario-8
scenario-9
scenario-10