0
# Generalized Linear Models
1
2
PyMC3's GLM module provides a high-level interface for generalized linear models with automatic handling of design matrices, link functions, and family-specific distributions. This streamlines the construction of regression models for various response types including continuous, binary, count, and categorical data.
3
4
## Capabilities
5
6
### GLM Classes
7
8
High-level classes for building generalized linear models with automatic configuration.
9
10
```python { .api }
11
class GLM:
12
"""
13
Generalized Linear Model with automatic family and link function handling.
14
15
Provides a high-level interface for GLM construction that automatically
16
handles design matrix creation, appropriate distributions, link functions,
17
and parameter priors based on the specified family.
18
"""
19
20
def __init__(self, y, X, labels=None, intercept=True, family=None,
21
priors=None, **kwargs):
22
"""
23
Initialize GLM.
24
25
Parameters:
26
- y: array, response variable
27
- X: array, design matrix or predictor variables
28
- labels: list, variable names for predictors
29
- intercept: bool, include intercept term
30
- family: GLMFamily, response distribution family
31
- priors: dict, prior specifications for parameters
32
- kwargs: additional arguments for family-specific configuration
33
34
Returns:
35
- GLM: configured GLM object
36
37
Example:
38
# Linear regression
39
glm = pm.GLM(y=height, X=weight, family=pm.glm.families.Normal())
40
41
# Logistic regression
42
glm = pm.GLM(y=success, X=predictors, family=pm.glm.families.Binomial())
43
44
# Poisson regression
45
glm = pm.GLM(y=counts, X=features, family=pm.glm.families.Poisson())
46
"""
47
48
def fit(self, draws=1000, **kwargs):
49
"""
50
Fit GLM using MCMC sampling.
51
52
Parameters:
53
- draws: int, number of posterior samples
54
- kwargs: arguments passed to pm.sample()
55
56
Returns:
57
- InferenceData: posterior samples and diagnostics
58
"""
59
60
def predict(self, X_new, trace=None, **kwargs):
61
"""
62
Generate predictions for new data.
63
64
Parameters:
65
- X_new: array, new predictor values
66
- trace: InferenceData, posterior samples for prediction
67
- kwargs: additional prediction arguments
68
69
Returns:
70
- dict: predictive samples
71
"""
72
73
class LinearComponent:
74
"""
75
Linear component builder for GLM construction.
76
77
Handles design matrix creation, variable naming, and prior
78
specification for linear predictors in GLM models.
79
"""
80
81
def __init__(self, x, intercept=True, labels=None, priors=None):
82
"""
83
Initialize linear component.
84
85
Parameters:
86
- x: array, predictor matrix
87
- intercept: bool, include intercept
88
- labels: list, predictor names
89
- priors: dict, prior distributions for coefficients
90
91
Returns:
92
- LinearComponent: linear component object
93
"""
94
95
def build(self, name=''):
96
"""
97
Build linear predictor variables.
98
99
Parameters:
100
- name: str, prefix for variable names
101
102
Returns:
103
- TensorVariable: linear predictor
104
"""
105
```
106
107
### GLM Families
108
109
Distribution families for different response types available in `pm.glm.families`.
110
111
```python { .api }
112
class Normal:
113
"""
114
Normal family for continuous responses (linear regression).
115
116
Uses identity link function and normal distribution for
117
modeling continuous response variables.
118
"""
119
120
def __init__(self, priors=None):
121
"""
122
Initialize Normal family.
123
124
Parameters:
125
- priors: dict, prior specifications {'sigma': prior_dist}
126
"""
127
128
@property
129
def link(self):
130
"""Identity link function: η = μ"""
131
132
@property
133
def likelihood(self):
134
"""Normal likelihood distribution"""
135
136
class Binomial:
137
"""
138
Binomial family for binary and binomial responses.
139
140
Uses logit link function and binomial distribution for
141
modeling binary outcomes or success/failure counts.
142
"""
143
144
def __init__(self, n=1, priors=None):
145
"""
146
Initialize Binomial family.
147
148
Parameters:
149
- n: int or array, number of trials (1 for binary)
150
- priors: dict, prior specifications
151
"""
152
153
@property
154
def link(self):
155
"""Logit link function: η = log(p/(1-p))"""
156
157
class Poisson:
158
"""
159
Poisson family for count responses.
160
161
Uses log link function and Poisson distribution for
162
modeling count data and rate processes.
163
"""
164
165
def __init__(self, priors=None):
166
"""
167
Initialize Poisson family.
168
169
Parameters:
170
- priors: dict, prior specifications
171
"""
172
173
@property
174
def link(self):
175
"""Log link function: η = log(λ)"""
176
177
class NegativeBinomial:
178
"""
179
Negative binomial family for overdispersed count data.
180
181
Uses log link with additional dispersion parameter for
182
modeling count data with variance greater than mean.
183
"""
184
185
def __init__(self, priors=None):
186
"""
187
Initialize NegativeBinomial family.
188
189
Parameters:
190
- priors: dict, prior specifications {'alpha': prior_dist}
191
"""
192
193
class Gamma:
194
"""
195
Gamma family for positive continuous responses.
196
197
Uses log link function and gamma distribution for
198
modeling positive continuous variables like duration, cost.
199
"""
200
201
def __init__(self, priors=None):
202
"""
203
Initialize Gamma family.
204
205
Parameters:
206
- priors: dict, prior specifications {'alpha': prior_dist}
207
"""
208
209
class StudentT:
210
"""
211
Student's t family for robust regression.
212
213
Uses identity link with Student's t distribution for
214
robust modeling of continuous responses with outliers.
215
"""
216
217
def __init__(self, priors=None):
218
"""
219
Initialize StudentT family.
220
221
Parameters:
222
- priors: dict, prior specifications {'nu': prior_dist, 'sigma': prior_dist}
223
"""
224
```
225
226
## Usage Examples
227
228
### Linear Regression with GLM
229
230
```python
231
import pymc3 as pm
232
import numpy as np
233
import pandas as pd
234
235
# Generate sample data
236
np.random.seed(42)
237
n = 100
238
X = np.random.randn(n, 3)
239
true_coef = np.array([1.5, -2.0, 0.5])
240
true_intercept = 0.3
241
y = true_intercept + X @ true_coef + np.random.normal(0, 0.5, n)
242
243
# GLM linear regression
244
with pm.Model() as linear_glm:
245
# Automatic GLM construction
246
glm = pm.GLM(y=y, X=X,
247
labels=['x1', 'x2', 'x3'],
248
family=pm.glm.families.Normal())
249
250
# Sample
251
trace = pm.sample(1000, tune=1000)
252
253
# Predictions
254
X_new = np.random.randn(20, 3)
255
predictions = glm.predict(X_new, trace=trace)
256
257
# Manual prior specification
258
with pm.Model() as linear_glm_priors:
259
# Custom priors
260
priors = {
261
'Intercept': pm.Normal('Intercept', mu=0, sigma=10),
262
'x1': pm.Normal('x1', mu=0, sigma=5),
263
'x2': pm.Laplace('x2', mu=0, b=1), # Regularized prior
264
'x3': pm.Normal('x3', mu=0, sigma=5),
265
'sigma': pm.HalfNormal('sigma', sigma=2)
266
}
267
268
glm = pm.GLM(y=y, X=X,
269
labels=['x1', 'x2', 'x3'],
270
family=pm.glm.families.Normal(),
271
priors=priors)
272
273
trace = pm.sample(1000, tune=1000)
274
```
275
276
### Logistic Regression
277
278
```python
279
# Binary classification data
280
n_samples = 200
281
X_log = np.random.randn(n_samples, 4)
282
true_coef_log = np.array([1.2, -0.8, 0.5, -1.1])
283
true_intercept_log = -0.3
284
285
# Generate binary outcomes
286
linear_pred = true_intercept_log + X_log @ true_coef_log
287
prob = 1 / (1 + np.exp(-linear_pred)) # Logistic function
288
y_binary = np.random.binomial(1, prob)
289
290
# Logistic regression GLM
291
with pm.Model() as logistic_glm:
292
glm = pm.GLM(y=y_binary, X=X_log,
293
labels=['feature_1', 'feature_2', 'feature_3', 'feature_4'],
294
family=pm.glm.families.Binomial())
295
296
trace = pm.sample(1000, tune=1000)
297
298
# Prediction probabilities
299
X_test = np.random.randn(50, 4)
300
pred_samples = glm.predict(X_test, trace=trace)
301
302
# Extract probabilities
303
prob_samples = pred_samples['y'] # Bernoulli samples
304
prob_mean = prob_samples.mean(axis=0)
305
306
# Multi-trial binomial
307
n_trials = 20
308
y_binomial = np.random.binomial(n_trials, prob)
309
310
with pm.Model() as binomial_glm:
311
glm = pm.GLM(y=y_binomial, X=X_log,
312
labels=['feature_1', 'feature_2', 'feature_3', 'feature_4'],
313
family=pm.glm.families.Binomial(n=n_trials))
314
315
trace = pm.sample(1000, tune=1000)
316
```
317
318
### Poisson Regression
319
320
```python
321
# Count data generation
322
X_count = np.random.randn(150, 2)
323
true_coef_count = np.array([0.8, -0.6])
324
true_intercept_count = 1.2
325
326
# Generate count outcomes
327
log_rate = true_intercept_count + X_count @ true_coef_count
328
rate = np.exp(log_rate)
329
y_counts = np.random.poisson(rate)
330
331
# Poisson regression GLM
332
with pm.Model() as poisson_glm:
333
glm = pm.GLM(y=y_counts, X=X_count,
334
labels=['exposure', 'treatment'],
335
family=pm.glm.families.Poisson())
336
337
trace = pm.sample(1000, tune=1000)
338
339
# Prediction of rates
340
X_count_new = np.array([[0.5, 1.0], [-1.0, 0.0], [1.5, -0.5]])
341
count_pred = glm.predict(X_count_new, trace=trace)
342
343
# Expected counts
344
expected_counts = count_pred['y'].mean(axis=0)
345
print("Expected counts:", expected_counts)
346
```
347
348
### Negative Binomial Regression for Overdispersed Counts
349
350
```python
351
# Overdispersed count data
352
X_nb = np.random.randn(120, 3)
353
true_coef_nb = np.array([0.5, -0.3, 0.7])
354
log_mu = 2.0 + X_nb @ true_coef_nb
355
mu = np.exp(log_mu)
356
357
# Overdispersion parameter
358
alpha_true = 2.0
359
y_nb = np.random.negative_binomial(n=alpha_true,
360
p=alpha_true/(alpha_true + mu))
361
362
with pm.Model() as nb_glm:
363
# Custom priors for dispersion
364
priors = {
365
'alpha': pm.Gamma('alpha', alpha=1, beta=1) # Dispersion parameter
366
}
367
368
glm = pm.GLM(y=y_nb, X=X_nb,
369
labels=['var1', 'var2', 'var3'],
370
family=pm.glm.families.NegativeBinomial(),
371
priors=priors)
372
373
trace = pm.sample(1000, tune=1000)
374
375
# Check for overdispersion
376
alpha_post = trace.posterior['alpha'].values.flatten()
377
print(f"Overdispersion parameter (mean): {alpha_post.mean():.3f}")
378
```
379
380
### Gamma Regression for Positive Continuous Data
381
382
```python
383
# Positive continuous response (e.g., waiting times, costs)
384
X_gamma = np.random.randn(100, 2)
385
true_coef_gamma = np.array([0.4, -0.2])
386
log_scale = 1.0 + X_gamma @ true_coef_gamma
387
scale = np.exp(log_scale)
388
389
# Gamma-distributed response
390
shape = 2.0
391
y_gamma = np.random.gamma(shape=shape, scale=scale/shape)
392
393
with pm.Model() as gamma_glm:
394
# Priors for shape parameter
395
priors = {
396
'alpha': pm.Gamma('alpha', alpha=1, beta=1) # Shape parameter
397
}
398
399
glm = pm.GLM(y=y_gamma, X=X_gamma,
400
labels=['duration_factor', 'cost_driver'],
401
family=pm.glm.families.Gamma(),
402
priors=priors)
403
404
trace = pm.sample(1000, tune=1000)
405
```
406
407
### Robust Regression with Student's t
408
409
```python
410
# Data with outliers
411
X_robust = np.random.randn(80, 2)
412
true_coef_robust = np.array([1.0, -0.5])
413
y_clean = 2.0 + X_robust @ true_coef_robust
414
415
# Add outliers
416
outlier_idx = np.random.choice(80, size=8, replace=False)
417
y_robust = y_clean.copy()
418
y_robust[outlier_idx] += np.random.normal(0, 5, len(outlier_idx))
419
420
with pm.Model() as robust_glm:
421
# Student's t for robustness
422
priors = {
423
'nu': pm.Gamma('nu', alpha=2, beta=0.1), # Degrees of freedom
424
'sigma': pm.HalfNormal('sigma', sigma=2)
425
}
426
427
glm = pm.GLM(y=y_robust, X=X_robust,
428
labels=['x1', 'x2'],
429
family=pm.glm.families.StudentT(),
430
priors=priors)
431
432
trace = pm.sample(1000, tune=1000)
433
434
# Compare with normal GLM
435
glm_normal = pm.GLM(y=y_robust, X=X_robust,
436
family=pm.glm.families.Normal())
437
trace_normal = pm.sample(1000, tune=1000)
438
```
439
440
### Mixed Effects GLM Structure
441
442
```python
443
# Hierarchical/mixed effects structure using GLM components
444
n_groups = 5
445
n_per_group = 30
446
group_labels = np.repeat(range(n_groups), n_per_group)
447
n_total = len(group_labels)
448
449
# Group-level predictors
450
X_group = np.random.randn(n_total, 2)
451
452
# Individual-level predictors
453
X_individual = np.random.randn(n_total, 1)
454
455
with pm.Model() as mixed_glm:
456
# Fixed effects for individual-level predictors
457
individual_component = pm.glm.LinearComponent(
458
x=X_individual,
459
labels=['individual_pred'],
460
priors={'individual_pred': pm.Normal('individual_pred', 0, 1)}
461
)
462
individual_pred = individual_component.build('individual')
463
464
# Random intercepts per group
465
group_intercept_sd = pm.HalfNormal('group_intercept_sd', sigma=1)
466
group_intercepts = pm.Normal('group_intercepts',
467
mu=0,
468
sigma=group_intercept_sd,
469
shape=n_groups)
470
471
# Random slopes per group for one predictor
472
group_slope_sd = pm.HalfNormal('group_slope_sd', sigma=1)
473
group_slopes = pm.Normal('group_slopes',
474
mu=0,
475
sigma=group_slope_sd,
476
shape=n_groups)
477
478
# Combined linear predictor
479
mu = (individual_pred +
480
group_intercepts[group_labels] +
481
group_slopes[group_labels] * X_group[:, 0])
482
483
# Observation noise
484
sigma = pm.HalfNormal('sigma', sigma=1)
485
486
# Generate response data
487
y_mixed = pm.Normal('y_mixed', mu=mu, sigma=sigma, observed=np.random.randn(n_total))
488
489
trace = pm.sample(1000, tune=1000)
490
```
491
492
### GLM with Interaction Terms
493
494
```python
495
# Model with interaction effects
496
n_interact = 200
497
X1 = np.random.randn(n_interact)
498
X2 = np.random.binomial(1, 0.5, n_interact) # Binary predictor
499
X_interact = np.column_stack([X1, X2, X1 * X2]) # Include interaction
500
501
# True coefficients including interaction
502
true_coef_interact = np.array([1.0, 0.5, -0.8]) # main effects + interaction
503
y_interact_true = 0.2 + X_interact @ true_coef_interact
504
y_interact = y_interact_true + np.random.normal(0, 0.3, n_interact)
505
506
with pm.Model() as interaction_glm:
507
glm = pm.GLM(y=y_interact, X=X_interact,
508
labels=['continuous', 'binary', 'interaction'],
509
family=pm.glm.families.Normal())
510
511
trace = pm.sample(1000, tune=1000)
512
513
# Analyze interaction effect
514
interaction_coef = trace.posterior['interaction'].values.flatten()
515
print(f"Interaction effect: {interaction_coef.mean():.3f} ± {interaction_coef.std():.3f}")
516
```
517
518
### GLM Model Comparison
519
520
```python
521
# Compare different GLM families for same data
522
y_comparison = y_counts # Use count data from earlier example
523
X_comparison = X_count
524
525
models_comparison = {}
526
527
# Poisson model
528
with pm.Model() as model_poisson:
529
glm_pois = pm.GLM(y=y_comparison, X=X_comparison,
530
family=pm.glm.families.Poisson())
531
trace_pois = pm.sample(1000, tune=1000, target_accept=0.9)
532
533
models_comparison['Poisson'] = trace_pois
534
535
# Negative binomial model
536
with pm.Model() as model_nb:
537
glm_nb = pm.GLM(y=y_comparison, X=X_comparison,
538
family=pm.glm.families.NegativeBinomial())
539
trace_nb = pm.sample(1000, tune=1000, target_accept=0.9)
540
541
models_comparison['NegBinom'] = trace_nb
542
543
# Model comparison using WAIC
544
import arviz as az
545
comparison = az.compare(models_comparison, ic='waic')
546
print("Model comparison:")
547
print(comparison)
548
```