or run

npx @tessl/cli init
Log in

Version

Tile

Overview

Evals

Files

Files

docs

data-handling.mddistributions.mdgaussian-processes.mdglm.mdindex.mdmath-functions.mdmodeling.mdsampling.mdstats-plots.mdstep-methods.mdvariational.md

glm.mddocs/

0

# Generalized Linear Models

1

2

PyMC3's GLM module provides a high-level interface for generalized linear models with automatic handling of design matrices, link functions, and family-specific distributions. This streamlines the construction of regression models for various response types including continuous, binary, count, and categorical data.

3

4

## Capabilities

5

6

### GLM Classes

7

8

High-level classes for building generalized linear models with automatic configuration.

9

10

```python { .api }

11

class GLM:

12

"""

13

Generalized Linear Model with automatic family and link function handling.

14

15

Provides a high-level interface for GLM construction that automatically

16

handles design matrix creation, appropriate distributions, link functions,

17

and parameter priors based on the specified family.

18

"""

19

20

def __init__(self, y, X, labels=None, intercept=True, family=None,

21

priors=None, **kwargs):

22

"""

23

Initialize GLM.

24

25

Parameters:

26

- y: array, response variable

27

- X: array, design matrix or predictor variables

28

- labels: list, variable names for predictors

29

- intercept: bool, include intercept term

30

- family: GLMFamily, response distribution family

31

- priors: dict, prior specifications for parameters

32

- kwargs: additional arguments for family-specific configuration

33

34

Returns:

35

- GLM: configured GLM object

36

37

Example:

38

# Linear regression

39

glm = pm.GLM(y=height, X=weight, family=pm.glm.families.Normal())

40

41

# Logistic regression

42

glm = pm.GLM(y=success, X=predictors, family=pm.glm.families.Binomial())

43

44

# Poisson regression

45

glm = pm.GLM(y=counts, X=features, family=pm.glm.families.Poisson())

46

"""

47

48

def fit(self, draws=1000, **kwargs):

49

"""

50

Fit GLM using MCMC sampling.

51

52

Parameters:

53

- draws: int, number of posterior samples

54

- kwargs: arguments passed to pm.sample()

55

56

Returns:

57

- InferenceData: posterior samples and diagnostics

58

"""

59

60

def predict(self, X_new, trace=None, **kwargs):

61

"""

62

Generate predictions for new data.

63

64

Parameters:

65

- X_new: array, new predictor values

66

- trace: InferenceData, posterior samples for prediction

67

- kwargs: additional prediction arguments

68

69

Returns:

70

- dict: predictive samples

71

"""

72

73

class LinearComponent:

74

"""

75

Linear component builder for GLM construction.

76

77

Handles design matrix creation, variable naming, and prior

78

specification for linear predictors in GLM models.

79

"""

80

81

def __init__(self, x, intercept=True, labels=None, priors=None):

82

"""

83

Initialize linear component.

84

85

Parameters:

86

- x: array, predictor matrix

87

- intercept: bool, include intercept

88

- labels: list, predictor names

89

- priors: dict, prior distributions for coefficients

90

91

Returns:

92

- LinearComponent: linear component object

93

"""

94

95

def build(self, name=''):

96

"""

97

Build linear predictor variables.

98

99

Parameters:

100

- name: str, prefix for variable names

101

102

Returns:

103

- TensorVariable: linear predictor

104

"""

105

```

106

107

### GLM Families

108

109

Distribution families for different response types available in `pm.glm.families`.

110

111

```python { .api }

112

class Normal:

113

"""

114

Normal family for continuous responses (linear regression).

115

116

Uses identity link function and normal distribution for

117

modeling continuous response variables.

118

"""

119

120

def __init__(self, priors=None):

121

"""

122

Initialize Normal family.

123

124

Parameters:

125

- priors: dict, prior specifications {'sigma': prior_dist}

126

"""

127

128

@property

129

def link(self):

130

"""Identity link function: η = μ"""

131

132

@property

133

def likelihood(self):

134

"""Normal likelihood distribution"""

135

136

class Binomial:

137

"""

138

Binomial family for binary and binomial responses.

139

140

Uses logit link function and binomial distribution for

141

modeling binary outcomes or success/failure counts.

142

"""

143

144

def __init__(self, n=1, priors=None):

145

"""

146

Initialize Binomial family.

147

148

Parameters:

149

- n: int or array, number of trials (1 for binary)

150

- priors: dict, prior specifications

151

"""

152

153

@property

154

def link(self):

155

"""Logit link function: η = log(p/(1-p))"""

156

157

class Poisson:

158

"""

159

Poisson family for count responses.

160

161

Uses log link function and Poisson distribution for

162

modeling count data and rate processes.

163

"""

164

165

def __init__(self, priors=None):

166

"""

167

Initialize Poisson family.

168

169

Parameters:

170

- priors: dict, prior specifications

171

"""

172

173

@property

174

def link(self):

175

"""Log link function: η = log(λ)"""

176

177

class NegativeBinomial:

178

"""

179

Negative binomial family for overdispersed count data.

180

181

Uses log link with additional dispersion parameter for

182

modeling count data with variance greater than mean.

183

"""

184

185

def __init__(self, priors=None):

186

"""

187

Initialize NegativeBinomial family.

188

189

Parameters:

190

- priors: dict, prior specifications {'alpha': prior_dist}

191

"""

192

193

class Gamma:

194

"""

195

Gamma family for positive continuous responses.

196

197

Uses log link function and gamma distribution for

198

modeling positive continuous variables like duration, cost.

199

"""

200

201

def __init__(self, priors=None):

202

"""

203

Initialize Gamma family.

204

205

Parameters:

206

- priors: dict, prior specifications {'alpha': prior_dist}

207

"""

208

209

class StudentT:

210

"""

211

Student's t family for robust regression.

212

213

Uses identity link with Student's t distribution for

214

robust modeling of continuous responses with outliers.

215

"""

216

217

def __init__(self, priors=None):

218

"""

219

Initialize StudentT family.

220

221

Parameters:

222

- priors: dict, prior specifications {'nu': prior_dist, 'sigma': prior_dist}

223

"""

224

```

225

226

## Usage Examples

227

228

### Linear Regression with GLM

229

230

```python

231

import pymc3 as pm

232

import numpy as np

233

import pandas as pd

234

235

# Generate sample data

236

np.random.seed(42)

237

n = 100

238

X = np.random.randn(n, 3)

239

true_coef = np.array([1.5, -2.0, 0.5])

240

true_intercept = 0.3

241

y = true_intercept + X @ true_coef + np.random.normal(0, 0.5, n)

242

243

# GLM linear regression

244

with pm.Model() as linear_glm:

245

# Automatic GLM construction

246

glm = pm.GLM(y=y, X=X,

247

labels=['x1', 'x2', 'x3'],

248

family=pm.glm.families.Normal())

249

250

# Sample

251

trace = pm.sample(1000, tune=1000)

252

253

# Predictions

254

X_new = np.random.randn(20, 3)

255

predictions = glm.predict(X_new, trace=trace)

256

257

# Manual prior specification

258

with pm.Model() as linear_glm_priors:

259

# Custom priors

260

priors = {

261

'Intercept': pm.Normal('Intercept', mu=0, sigma=10),

262

'x1': pm.Normal('x1', mu=0, sigma=5),

263

'x2': pm.Laplace('x2', mu=0, b=1), # Regularized prior

264

'x3': pm.Normal('x3', mu=0, sigma=5),

265

'sigma': pm.HalfNormal('sigma', sigma=2)

266

}

267

268

glm = pm.GLM(y=y, X=X,

269

labels=['x1', 'x2', 'x3'],

270

family=pm.glm.families.Normal(),

271

priors=priors)

272

273

trace = pm.sample(1000, tune=1000)

274

```

275

276

### Logistic Regression

277

278

```python

279

# Binary classification data

280

n_samples = 200

281

X_log = np.random.randn(n_samples, 4)

282

true_coef_log = np.array([1.2, -0.8, 0.5, -1.1])

283

true_intercept_log = -0.3

284

285

# Generate binary outcomes

286

linear_pred = true_intercept_log + X_log @ true_coef_log

287

prob = 1 / (1 + np.exp(-linear_pred)) # Logistic function

288

y_binary = np.random.binomial(1, prob)

289

290

# Logistic regression GLM

291

with pm.Model() as logistic_glm:

292

glm = pm.GLM(y=y_binary, X=X_log,

293

labels=['feature_1', 'feature_2', 'feature_3', 'feature_4'],

294

family=pm.glm.families.Binomial())

295

296

trace = pm.sample(1000, tune=1000)

297

298

# Prediction probabilities

299

X_test = np.random.randn(50, 4)

300

pred_samples = glm.predict(X_test, trace=trace)

301

302

# Extract probabilities

303

prob_samples = pred_samples['y'] # Bernoulli samples

304

prob_mean = prob_samples.mean(axis=0)

305

306

# Multi-trial binomial

307

n_trials = 20

308

y_binomial = np.random.binomial(n_trials, prob)

309

310

with pm.Model() as binomial_glm:

311

glm = pm.GLM(y=y_binomial, X=X_log,

312

labels=['feature_1', 'feature_2', 'feature_3', 'feature_4'],

313

family=pm.glm.families.Binomial(n=n_trials))

314

315

trace = pm.sample(1000, tune=1000)

316

```

317

318

### Poisson Regression

319

320

```python

321

# Count data generation

322

X_count = np.random.randn(150, 2)

323

true_coef_count = np.array([0.8, -0.6])

324

true_intercept_count = 1.2

325

326

# Generate count outcomes

327

log_rate = true_intercept_count + X_count @ true_coef_count

328

rate = np.exp(log_rate)

329

y_counts = np.random.poisson(rate)

330

331

# Poisson regression GLM

332

with pm.Model() as poisson_glm:

333

glm = pm.GLM(y=y_counts, X=X_count,

334

labels=['exposure', 'treatment'],

335

family=pm.glm.families.Poisson())

336

337

trace = pm.sample(1000, tune=1000)

338

339

# Prediction of rates

340

X_count_new = np.array([[0.5, 1.0], [-1.0, 0.0], [1.5, -0.5]])

341

count_pred = glm.predict(X_count_new, trace=trace)

342

343

# Expected counts

344

expected_counts = count_pred['y'].mean(axis=0)

345

print("Expected counts:", expected_counts)

346

```

347

348

### Negative Binomial Regression for Overdispersed Counts

349

350

```python

351

# Overdispersed count data

352

X_nb = np.random.randn(120, 3)

353

true_coef_nb = np.array([0.5, -0.3, 0.7])

354

log_mu = 2.0 + X_nb @ true_coef_nb

355

mu = np.exp(log_mu)

356

357

# Overdispersion parameter

358

alpha_true = 2.0

359

y_nb = np.random.negative_binomial(n=alpha_true,

360

p=alpha_true/(alpha_true + mu))

361

362

with pm.Model() as nb_glm:

363

# Custom priors for dispersion

364

priors = {

365

'alpha': pm.Gamma('alpha', alpha=1, beta=1) # Dispersion parameter

366

}

367

368

glm = pm.GLM(y=y_nb, X=X_nb,

369

labels=['var1', 'var2', 'var3'],

370

family=pm.glm.families.NegativeBinomial(),

371

priors=priors)

372

373

trace = pm.sample(1000, tune=1000)

374

375

# Check for overdispersion

376

alpha_post = trace.posterior['alpha'].values.flatten()

377

print(f"Overdispersion parameter (mean): {alpha_post.mean():.3f}")

378

```

379

380

### Gamma Regression for Positive Continuous Data

381

382

```python

383

# Positive continuous response (e.g., waiting times, costs)

384

X_gamma = np.random.randn(100, 2)

385

true_coef_gamma = np.array([0.4, -0.2])

386

log_scale = 1.0 + X_gamma @ true_coef_gamma

387

scale = np.exp(log_scale)

388

389

# Gamma-distributed response

390

shape = 2.0

391

y_gamma = np.random.gamma(shape=shape, scale=scale/shape)

392

393

with pm.Model() as gamma_glm:

394

# Priors for shape parameter

395

priors = {

396

'alpha': pm.Gamma('alpha', alpha=1, beta=1) # Shape parameter

397

}

398

399

glm = pm.GLM(y=y_gamma, X=X_gamma,

400

labels=['duration_factor', 'cost_driver'],

401

family=pm.glm.families.Gamma(),

402

priors=priors)

403

404

trace = pm.sample(1000, tune=1000)

405

```

406

407

### Robust Regression with Student's t

408

409

```python

410

# Data with outliers

411

X_robust = np.random.randn(80, 2)

412

true_coef_robust = np.array([1.0, -0.5])

413

y_clean = 2.0 + X_robust @ true_coef_robust

414

415

# Add outliers

416

outlier_idx = np.random.choice(80, size=8, replace=False)

417

y_robust = y_clean.copy()

418

y_robust[outlier_idx] += np.random.normal(0, 5, len(outlier_idx))

419

420

with pm.Model() as robust_glm:

421

# Student's t for robustness

422

priors = {

423

'nu': pm.Gamma('nu', alpha=2, beta=0.1), # Degrees of freedom

424

'sigma': pm.HalfNormal('sigma', sigma=2)

425

}

426

427

glm = pm.GLM(y=y_robust, X=X_robust,

428

labels=['x1', 'x2'],

429

family=pm.glm.families.StudentT(),

430

priors=priors)

431

432

trace = pm.sample(1000, tune=1000)

433

434

# Compare with normal GLM

435

glm_normal = pm.GLM(y=y_robust, X=X_robust,

436

family=pm.glm.families.Normal())

437

trace_normal = pm.sample(1000, tune=1000)

438

```

439

440

### Mixed Effects GLM Structure

441

442

```python

443

# Hierarchical/mixed effects structure using GLM components

444

n_groups = 5

445

n_per_group = 30

446

group_labels = np.repeat(range(n_groups), n_per_group)

447

n_total = len(group_labels)

448

449

# Group-level predictors

450

X_group = np.random.randn(n_total, 2)

451

452

# Individual-level predictors

453

X_individual = np.random.randn(n_total, 1)

454

455

with pm.Model() as mixed_glm:

456

# Fixed effects for individual-level predictors

457

individual_component = pm.glm.LinearComponent(

458

x=X_individual,

459

labels=['individual_pred'],

460

priors={'individual_pred': pm.Normal('individual_pred', 0, 1)}

461

)

462

individual_pred = individual_component.build('individual')

463

464

# Random intercepts per group

465

group_intercept_sd = pm.HalfNormal('group_intercept_sd', sigma=1)

466

group_intercepts = pm.Normal('group_intercepts',

467

mu=0,

468

sigma=group_intercept_sd,

469

shape=n_groups)

470

471

# Random slopes per group for one predictor

472

group_slope_sd = pm.HalfNormal('group_slope_sd', sigma=1)

473

group_slopes = pm.Normal('group_slopes',

474

mu=0,

475

sigma=group_slope_sd,

476

shape=n_groups)

477

478

# Combined linear predictor

479

mu = (individual_pred +

480

group_intercepts[group_labels] +

481

group_slopes[group_labels] * X_group[:, 0])

482

483

# Observation noise

484

sigma = pm.HalfNormal('sigma', sigma=1)

485

486

# Generate response data

487

y_mixed = pm.Normal('y_mixed', mu=mu, sigma=sigma, observed=np.random.randn(n_total))

488

489

trace = pm.sample(1000, tune=1000)

490

```

491

492

### GLM with Interaction Terms

493

494

```python

495

# Model with interaction effects

496

n_interact = 200

497

X1 = np.random.randn(n_interact)

498

X2 = np.random.binomial(1, 0.5, n_interact) # Binary predictor

499

X_interact = np.column_stack([X1, X2, X1 * X2]) # Include interaction

500

501

# True coefficients including interaction

502

true_coef_interact = np.array([1.0, 0.5, -0.8]) # main effects + interaction

503

y_interact_true = 0.2 + X_interact @ true_coef_interact

504

y_interact = y_interact_true + np.random.normal(0, 0.3, n_interact)

505

506

with pm.Model() as interaction_glm:

507

glm = pm.GLM(y=y_interact, X=X_interact,

508

labels=['continuous', 'binary', 'interaction'],

509

family=pm.glm.families.Normal())

510

511

trace = pm.sample(1000, tune=1000)

512

513

# Analyze interaction effect

514

interaction_coef = trace.posterior['interaction'].values.flatten()

515

print(f"Interaction effect: {interaction_coef.mean():.3f} ± {interaction_coef.std():.3f}")

516

```

517

518

### GLM Model Comparison

519

520

```python

521

# Compare different GLM families for same data

522

y_comparison = y_counts # Use count data from earlier example

523

X_comparison = X_count

524

525

models_comparison = {}

526

527

# Poisson model

528

with pm.Model() as model_poisson:

529

glm_pois = pm.GLM(y=y_comparison, X=X_comparison,

530

family=pm.glm.families.Poisson())

531

trace_pois = pm.sample(1000, tune=1000, target_accept=0.9)

532

533

models_comparison['Poisson'] = trace_pois

534

535

# Negative binomial model

536

with pm.Model() as model_nb:

537

glm_nb = pm.GLM(y=y_comparison, X=X_comparison,

538

family=pm.glm.families.NegativeBinomial())

539

trace_nb = pm.sample(1000, tune=1000, target_accept=0.9)

540

541

models_comparison['NegBinom'] = trace_nb

542

543

# Model comparison using WAIC

544

import arviz as az

545

comparison = az.compare(models_comparison, ic='waic')

546

print("Model comparison:")

547

print(comparison)

548

```