0
# Utility Functions
1
2
Helper functions for generating test data, creating balanced designs, and programmatically constructing formulas. These utilities support common tasks in statistical modeling and experimental design.
3
4
## Capabilities
5
6
### Balanced Factorial Design Generation
7
8
Creates simple balanced factorial designs for testing and experimentation.
9
10
```python { .api }
11
def balanced(**kwargs):
12
"""
13
Create balanced factorial designs for testing.
14
15
Given factor names and number of levels for each, generates a balanced factorial
16
design as a data dictionary. Useful for creating test data with all combinations
17
of factor levels.
18
19
Parameters:
20
- **kwargs: factor_name=num_levels pairs specifying factors and their level counts
21
- repeat (int): Number of replications of the complete design (default: 1)
22
23
Returns:
24
dict: Data dictionary with factor names as keys and level lists as values
25
"""
26
```
27
28
#### Usage Examples
29
30
```python
31
import patsy
import numpy as np
32
33
# Simple 2x3 factorial design
34
data = patsy.balanced(treatment=2, dose=3)
35
print(data)
36
# {'treatment': ['treatment1', 'treatment1', 'treatment1',
37
# 'treatment2', 'treatment2', 'treatment2'],
38
# 'dose': ['dose1', 'dose2', 'dose3', 'dose1', 'dose2', 'dose3']}
39
40
# Multiple factors
41
data = patsy.balanced(group=2, time=3, condition=2)
42
print(f"Total combinations: {len(data['group'])}") # 2*3*2 = 12 combinations
43
44
# With replication
45
data = patsy.balanced(treatment=2, dose=2, repeat=3)
46
print(f"Total observations: {len(data['treatment'])}") # 2*2*3 = 12 observations
47
48
# Use in design matrix construction
49
design = patsy.dmatrix("C(treatment) * C(dose)", data)
50
print(f"Design matrix shape: {design.shape}")
51
52
# Complete model with balanced design
53
y_data = [i + np.random.normal(0, 0.1) for i in range(len(data['treatment']))]
54
data['y'] = y_data
55
y, X = patsy.dmatrices("y ~ C(treatment) * C(dose)", data)
56
```
57
58
### Demo Data Generation
59
60
Creates simple categorical and numerical demo data for testing formulas and models.
61
62
```python { .api }
63
def demo_data(*names, nlevels=2, min_rows=5):
64
"""
65
Create simple categorical/numerical demo data.
66
67
Variable names starting with 'a'-'m' become categorical with specified levels.
68
Names starting with 'p'-'z' become numerical (normal distribution).
69
Creates balanced design for categorical variables with at least min_rows observations.
70
71
Parameters:
72
- *names: Variable names to create
73
- nlevels (int): Number of levels for categorical variables (default: 2)
74
- min_rows (int): Minimum number of data rows to generate (default: 5)
75
76
Returns:
77
dict: Data dictionary with variable names as keys
78
79
Notes:
80
- Categorical variables: names starting with 'a' through 'm'
81
- Numerical variables: names starting with 'p' through 'z'
82
- Uses fixed random seed for reproducible numerical data
83
"""
84
```
85
86
#### Usage Examples
87
88
```python
89
import patsy
90
import numpy as np
91
92
# Mixed categorical and numerical variables
93
data = patsy.demo_data("group", "condition", "score", "time")
94
print("Variables created:")
95
for name, values in data.items():
96
print(f" {name}: {type(values[0]).__name__} - {len(values)} observations")
97
98
# Categorical variables (a-m)
99
cat_data = patsy.demo_data("factor_a", "factor_b", "group")
100
print("Categorical levels:")
101
for name, values in cat_data.items():
102
print(f" {name}: {set(values)}")
103
104
# Numerical variables (p-z)
105
num_data = patsy.demo_data("x", "y", "z", "score", "time")
106
print("Numerical data types:")
107
for name, values in num_data.items():
108
if isinstance(values, np.ndarray):
109
print(f" {name}: mean={np.mean(values):.2f}, std={np.std(values):.2f}")
110
111
# Custom parameters
112
data = patsy.demo_data("group", "x", "y", nlevels=4, min_rows=20)
113
print(f"Group levels: {set(data['group'])}")
114
print(f"Data size: {len(data['x'])} rows")
115
116
# Use with formula construction
117
y, X = patsy.dmatrices("y ~ C(group) + x", data)
118
print(f"Design matrix shape: {X.shape}")
119
120
# Reproducible data (same seed)
121
data1 = patsy.demo_data("x", "y")
122
data2 = patsy.demo_data("x", "y")
123
print("Reproducible:", np.array_equal(data1["x"], data2["x"]))
124
```
125
126
### Programmatic Factor Construction
127
128
A factor class for programmatically constructing formulas without string parsing.
129
130
```python { .api }
131
class LookupFactor:
132
"""
133
Simple factor class that looks up named entries in data.
134
135
Useful for programmatically constructing formulas and as an example
136
of the factor protocol. Provides more control than string-based formulas.
137
"""
138
def __init__(self, varname, force_categorical=False, contrast=None, levels=None):
139
"""
140
Create a lookup factor.
141
142
Parameters:
143
- varname (str): Variable name for data lookup
144
- force_categorical (bool): Treat as categorical regardless of data type
145
- contrast: Contrast coding scheme (requires force_categorical=True)
146
- levels: Explicit categorical levels (requires force_categorical=True)
147
"""
148
```
149
150
#### Usage Examples
151
152
```python
153
import patsy
154
from patsy import LookupFactor, ModelDesc, Term
155
import pandas as pd
156
157
# Sample data
158
data = pd.DataFrame({
159
'x': [1, 2, 3, 4, 5],
160
'group': ['A', 'B', 'A', 'B', 'A'],
161
'y': [2, 4, 6, 8, 10]
162
})
163
164
# Basic lookup factor
165
x_factor = LookupFactor("x")
166
group_factor = LookupFactor("group")
167
168
# Programmatically construct model description
169
# Equivalent to "y ~ x + group"
170
outcome_term = Term([LookupFactor("y")])
171
predictor_terms = [
172
Term([]), # Intercept
173
Term([LookupFactor("x")]),
174
Term([LookupFactor("group")])
175
]
176
177
model_desc = ModelDesc([outcome_term], predictor_terms)
178
179
# Build design matrices from programmatic model
180
y, X = patsy.dmatrices(model_desc, data)
181
print("Programmatic model shape:", X.shape)
182
183
# Force categorical treatment
184
categorical_factor = LookupFactor("x", force_categorical=True)
185
cat_term = Term([categorical_factor])
186
cat_model = ModelDesc([], [Term([]), cat_term])
187
design = patsy.dmatrix(cat_model, data)
188
print("Forced categorical columns:", design.design_info.column_names)
189
190
# With custom contrast
191
from patsy import Sum
192
contrast_factor = LookupFactor("group", force_categorical=True, contrast=Sum())
193
contrast_term = Term([contrast_factor])
194
contrast_model = ModelDesc([], [Term([]), contrast_term])
195
contrast_design = patsy.dmatrix(contrast_model, data)
196
print("Custom contrast columns:", contrast_design.design_info.column_names)
197
198
# With explicit levels
199
levels_factor = LookupFactor("group", force_categorical=True, levels=['B', 'A'])
200
levels_term = Term([levels_factor])
201
levels_model = ModelDesc([], [Term([]), levels_term])
202
levels_design = patsy.dmatrix(levels_model, data)
203
print("Custom levels columns:", levels_design.design_info.column_names)
204
```
205
206
## Integration with Other Patsy Features
207
208
### Balanced Designs with Complex Models
209
210
```python
211
import patsy
212
import numpy as np
213
from sklearn.linear_model import LinearRegression
214
215
# Create complex balanced design
216
data = patsy.balanced(treatment=3, dose=2, gender=2, repeat=5)
217
218
# Add outcome variable with realistic effects
219
np.random.seed(42)
220
y_values = []
221
for t, d, g in zip(data['treatment'], data['dose'], data['gender']):
222
# Simulate treatment and dose effects
223
effect = {'treatment1': 0, 'treatment2': 2, 'treatment3': 4}[t]
224
effect += {'dose1': 0, 'dose2': 1}[d]
225
effect += {'gender1': 0, 'gender2': 0.5}[g]
226
y_values.append(effect + np.random.normal(0, 0.5))
227
228
data['response'] = y_values
229
230
# Analyze with full factorial model
231
y, X = patsy.dmatrices("response ~ C(treatment) * C(dose) * C(gender)", data)
232
print(f"Full factorial design: {X.shape}")
233
234
# Fit model
235
model = LinearRegression(fit_intercept=False)
236
model.fit(X, y.ravel())
237
print(f"Model R²: {model.score(X, y.ravel()):.3f}")
238
```
239
240
### Demo Data for Testing Transformations
241
242
```python
243
import patsy
244
245
# Generate data for testing various transformations
246
data = patsy.demo_data("group", "x", "y", "z", nlevels=3, min_rows=30)
247
248
# Test spline transformations
249
spline_design = patsy.dmatrix("bs(x, df=4)", data)
250
print(f"B-spline design: {spline_design.shape}")
251
252
# Test interactions with categorical
253
interaction_design = patsy.dmatrix("C(group) * x", data)
254
print(f"Interaction design: {interaction_design.shape}")
255
256
# Test stateful transforms
257
standardized_design = patsy.dmatrix("standardize(x) + standardize(y)", data)
258
print(f"Standardized design: {standardized_design.shape}")
259
260
# Complete mixed-effects style model
261
complex_y, complex_X = patsy.dmatrices(
262
"z ~ C(group) + bs(x, df=3) + standardize(y)",
263
data
264
)
265
print(f"Complex model: {complex_X.shape}")
266
```
267
268
### Programmatic Model Construction
269
270
```python
271
import patsy
272
from patsy import LookupFactor, ModelDesc, Term, INTERCEPT
273
274
# Function to build models programmatically
275
def build_model(outcome, predictors, interactions=None):
    """Build a ModelDesc programmatically.

    Parameters:
    - outcome (str): Name of the response variable to look up in the data
    - predictors (iterable of str): Names of the main-effect variables
    - interactions (iterable of tuples of str, optional): Each tuple names the
      variables whose interaction is added as one term (pairs or higher order)

    Returns:
    ModelDesc: Model with the outcome on the left-hand side and
    intercept + main effects + interactions on the right-hand side
    """
    # Left-hand side: a single term that looks up the outcome variable
    outcome_term = Term([LookupFactor(outcome)])

    # INTERCEPT is already a Term (the zero-factor term), so it goes into the
    # term list directly; wrapping it in Term([...]) would treat it as a factor.
    pred_terms = [INTERCEPT]

    # Main effects: one single-factor term per predictor
    pred_terms.extend(Term([LookupFactor(pred)]) for pred in predictors)

    # Interactions: one multi-factor term per tuple of variable names
    for combo in interactions or ():
        pred_terms.append(Term([LookupFactor(name) for name in combo]))

    return ModelDesc([outcome_term], pred_terms)
294
295
# Use the function
296
data = patsy.demo_data("group", "condition", "x", "y", "response")
297
298
# Build model: response ~ group + condition + x + group:condition
299
model = build_model(
300
outcome="response",
301
predictors=["group", "condition", "x"],
302
interactions=[("group", "condition")]
303
)
304
305
y, X = patsy.dmatrices(model, data)
306
print(f"Programmatic model: {X.shape}")
307
print("Columns:", X.design_info.column_names)
308
```
309
310
## Advanced Utility Patterns
311
312
### Custom Data Generation
313
314
```python
import patsy
import numpy as np

315
def create_experiment_data(n_subjects, n_conditions, n_timepoints):
    """Create a simulated experimental data set on a balanced design.

    Parameters:
    - n_subjects (int): Number of subject levels
    - n_conditions (int): Number of condition levels (any positive count)
    - n_timepoints (int): Number of timepoint levels

    Returns:
    dict: The balanced design ('subject', 'condition', 'timepoint') plus a
    'measurement' list with one simulated value per row

    Notes:
    - Seeds NumPy's global random state with 42 so the output is reproducible
    """
    # Fully crossed subject x condition x timepoint structure
    design = patsy.balanced(
        subject=n_subjects,
        condition=n_conditions,
        timepoint=n_timepoints,
    )

    def level_index(label, factor):
        # Levels are named "<factor><k>" with k starting at 1 (e.g. 'dose2')
        return int(label[len(factor):])

    np.random.seed(42)
    measurements = []
    for subj, cond, time in zip(design['subject'], design['condition'], design['timepoint']):
        # Simulate individual differences and condition effects
        subject_effect = level_index(subj, 'subject') * 0.1
        # Condition k contributes k - 1, so this works for any n_conditions
        # instead of failing on levels beyond a fixed lookup table.
        condition_effect = level_index(cond, 'condition') - 1
        time_effect = level_index(time, 'timepoint') * 0.2

        measurement = subject_effect + condition_effect + time_effect + np.random.normal(0, 0.3)
        measurements.append(measurement)

    design['measurement'] = measurements
    return design
339
340
# Use custom data generation
341
exp_data = create_experiment_data(10, 3, 4)
342
print(f"Experimental data: {len(exp_data['measurement'])} observations")
343
344
# Analyze with an additive fixed-effects formula (subject is not modeled here)
345
y, X = patsy.dmatrices("measurement ~ C(condition) + C(timepoint)", exp_data)
346
print(f"Analysis design: {X.shape}")
347
```