Tessl Tile for pypi/patsy@1.0.0

or run

npx @tessl/cli init

Version

Tile

Overview

Evals

Files

docs

builtins.md categorical.md contrasts.md high-level.md index.md matrix-building.md splines.md transforms.md utilities.md

utilities.mddocs/

0
# Utility Functions
1

2
Helper functions for generating test data, creating balanced designs, and programmatically constructing formulas. These utilities support common tasks in statistical modeling and experimental design.
3

4
## Capabilities
5

6
### Balanced Factorial Design Generation
7

8
Creates simple balanced factorial designs for testing and experimentation.
9

10
```python { .api }
11
def balanced(**kwargs):
12
    """
13
    Create balanced factorial designs for testing.
14

15
    Given factor names and number of levels for each, generates a balanced factorial
16
    design as a data dictionary. Useful for creating test data with all combinations
17
    of factor levels.
18

19
    Parameters:
20
    - **kwargs: factor_name=num_levels pairs specifying factors and their level counts
21
    - repeat (int): Number of replications of the complete design (default: 1)
22

23
    Returns:
24
    dict: Data dictionary with factor names as keys and level lists as values
25
    """
26
```
27

28
#### Usage Examples
29

30
```python
31
import patsy
import numpy as np
32

33
# Simple 2x3 factorial design
34
data = patsy.balanced(treatment=2, dose=3)
35
print(data)
36
# {'dose': ['dose1', 'dose1', 'dose2', 'dose2', 'dose3', 'dose3'],
37
#  'treatment': ['treatment1', 'treatment2', 'treatment1',
38
#                'treatment2', 'treatment1', 'treatment2']}
39

40
# Multiple factors
41
data = patsy.balanced(group=2, time=3, condition=2)
42
print(f"Total combinations: {len(data['group'])}")  # 2*3*2 = 12 combinations
43

44
# With replication
45
data = patsy.balanced(treatment=2, dose=2, repeat=3)
46
print(f"Total observations: {len(data['treatment'])}")  # 2*2*3 = 12 observations
47

48
# Use in design matrix construction
49
design = patsy.dmatrix("C(treatment) * C(dose)", data)
50
print(f"Design matrix shape: {design.shape}")
51

52
# Complete model with balanced design
53
y_data = [i + np.random.normal(0, 0.1) for i in range(len(data['treatment']))]
54
data['y'] = y_data
55
y, X = patsy.dmatrices("y ~ C(treatment) * C(dose)", data)
56
```
57

58
### Demo Data Generation
59

60
Creates simple categorical and numerical demo data for testing formulas and models.
61

62
```python { .api }
63
def demo_data(*names, nlevels=2, min_rows=5):
64
    """
65
    Create simple categorical/numerical demo data.
66

67
    Variable names starting with 'a'-'m' become categorical with specified levels.
68
    Names starting with 'p'-'z' become numerical (normal distribution).
69
    Creates balanced design for categorical variables with at least min_rows observations.
70

71
    Parameters:
72
    - *names: Variable names to create
73
    - nlevels (int): Number of levels for categorical variables (default: 2)
74
    - min_rows (int): Minimum number of data rows to generate (default: 5)
75

76
    Returns:
77
    dict: Data dictionary with variable names as keys
78
    
79
    Notes:
80
    - Categorical variables: names starting with 'a' through 'm'
81
    - Numerical variables: names starting with 'p' through 'z'  
82
    - Uses fixed random seed for reproducible numerical data
83
    """
84
```
85

86
#### Usage Examples
87

88
```python
89
import patsy
90
import numpy as np
91

92
# Mixed categorical and numerical variables
93
data = patsy.demo_data("group", "condition", "score", "time")
94
print("Variables created:")
95
for name, values in data.items():
96
    print(f"  {name}: {type(values[0]).__name__} - {len(values)} observations")
97

98
# Categorical variables (a-m)
99
cat_data = patsy.demo_data("factor_a", "factor_b", "group")
100
print("Categorical levels:")
101
for name, values in cat_data.items():
102
    print(f"  {name}: {set(values)}")
103

104
# Numerical variables (p-z)  
105
num_data = patsy.demo_data("x", "y", "z", "score", "time")
106
print("Numerical data types:")
107
for name, values in num_data.items():
108
    if isinstance(values, np.ndarray):
109
        print(f"  {name}: mean={np.mean(values):.2f}, std={np.std(values):.2f}")
110

111
# Custom parameters
112
data = patsy.demo_data("group", "x", "y", nlevels=4, min_rows=20)
113
print(f"Group levels: {set(data['group'])}")
114
print(f"Data size: {len(data['x'])} rows")
115

116
# Use with formula construction
117
y, X = patsy.dmatrices("y ~ C(group) + x", data)
118
print(f"Design matrix shape: {X.shape}")
119

120
# Reproducible data (same seed)
121
data1 = patsy.demo_data("x", "y")
122
data2 = patsy.demo_data("x", "y")
123
print("Reproducible:", np.array_equal(data1["x"], data2["x"]))
124
```
125

126
### Programmatic Factor Construction
127

128
A factor class for programmatically constructing formulas without string parsing.
129

130
```python { .api }
131
class LookupFactor:
132
    """
133
    Simple factor class that looks up named entries in data.
134
    
135
    Useful for programmatically constructing formulas and as an example
136
    of the factor protocol. Provides more control than string-based formulas.
137
    """
138
    def __init__(self, varname, force_categorical=False, contrast=None, levels=None):
139
        """
140
        Create a lookup factor.
141
        
142
        Parameters:
143
        - varname (str): Variable name for data lookup
144
        - force_categorical (bool): Treat as categorical regardless of data type
145
        - contrast: Contrast coding scheme (requires force_categorical=True)
146
        - levels: Explicit categorical levels (requires force_categorical=True)
147
        """
148
```
149

150
#### Usage Examples
151

152
```python
153
import patsy
154
from patsy import LookupFactor, ModelDesc, Term
155
import pandas as pd
156

157
# Sample data
158
data = pd.DataFrame({
159
    'x': [1, 2, 3, 4, 5],
160
    'group': ['A', 'B', 'A', 'B', 'A'],
161
    'y': [2, 4, 6, 8, 10]
162
})
163

164
# Basic lookup factor
165
x_factor = LookupFactor("x")
166
group_factor = LookupFactor("group")
167

168
# Programmatically construct model description
169
# Equivalent to "y ~ x + group"
170
outcome_term = Term([LookupFactor("y")])
171
predictor_terms = [
172
    Term([]),  # Intercept
173
    Term([LookupFactor("x")]),
174
    Term([LookupFactor("group")])
175
]
176

177
model_desc = ModelDesc([outcome_term], predictor_terms)
178

179
# Build design matrices from programmatic model
180
y, X = patsy.dmatrices(model_desc, data)
181
print("Programmatic model shape:", X.shape)
182

183
# Force categorical treatment
184
categorical_factor = LookupFactor("x", force_categorical=True)
185
cat_term = Term([categorical_factor])
186
cat_model = ModelDesc([], [Term([]), cat_term])
187
design = patsy.dmatrix(cat_model, data)
188
print("Forced categorical columns:", design.design_info.column_names)
189

190
# With custom contrast
191
from patsy import Sum
192
contrast_factor = LookupFactor("group", force_categorical=True, contrast=Sum())
193
contrast_term = Term([contrast_factor])
194
contrast_model = ModelDesc([], [Term([]), contrast_term])
195
contrast_design = patsy.dmatrix(contrast_model, data)
196
print("Custom contrast columns:", contrast_design.design_info.column_names)
197

198
# With explicit levels
199
levels_factor = LookupFactor("group", force_categorical=True, levels=['B', 'A'])
200
levels_term = Term([levels_factor])  
201
levels_model = ModelDesc([], [Term([]), levels_term])
202
levels_design = patsy.dmatrix(levels_model, data)
203
print("Custom levels columns:", levels_design.design_info.column_names)
204
```
205

206
## Integration with Other Patsy Features
207

208
### Balanced Designs with Complex Models
209

210
```python
211
import patsy
212
import numpy as np
213
from sklearn.linear_model import LinearRegression
214

215
# Create complex balanced design
216
data = patsy.balanced(treatment=3, dose=2, gender=2, repeat=5)
217

218
# Add outcome variable with realistic effects
219
np.random.seed(42)
220
y_values = []
221
for t, d, g in zip(data['treatment'], data['dose'], data['gender']):
222
    # Simulate treatment and dose effects
223
    effect = {'treatment1': 0, 'treatment2': 2, 'treatment3': 4}[t]
224
    effect += {'dose1': 0, 'dose2': 1}[d]  
225
    effect += {'gender1': 0, 'gender2': 0.5}[g]
226
    y_values.append(effect + np.random.normal(0, 0.5))
227

228
data['response'] = y_values
229

230
# Analyze with full factorial model
231
y, X = patsy.dmatrices("response ~ C(treatment) * C(dose) * C(gender)", data)
232
print(f"Full factorial design: {X.shape}")
233

234
# Fit model
235
model = LinearRegression(fit_intercept=False)
236
model.fit(X, y.ravel())
237
print(f"Model R²: {model.score(X, y.ravel()):.3f}")
238
```
239

240
### Demo Data for Testing Transformations
241

242
```python
243
import patsy
244

245
# Generate data for testing various transformations
246
data = patsy.demo_data("group", "x", "y", "z", nlevels=3, min_rows=30)
247

248
# Test spline transformations
249
spline_design = patsy.dmatrix("bs(x, df=4)", data)
250
print(f"B-spline design: {spline_design.shape}")
251

252
# Test interactions with categorical
253
interaction_design = patsy.dmatrix("C(group) * x", data)
254
print(f"Interaction design: {interaction_design.shape}")
255

256
# Test stateful transforms
257
standardized_design = patsy.dmatrix("standardize(x) + standardize(y)", data)
258
print(f"Standardized design: {standardized_design.shape}")
259

260
# Combined model: categorical, spline, and standardized terms together
261
complex_y, complex_X = patsy.dmatrices(
262
    "z ~ C(group) + bs(x, df=3) + standardize(y)", 
263
    data
264
)
265
print(f"Complex model: {complex_X.shape}")
266
```
267

268
### Programmatic Model Construction
269

270
```python
271
import patsy
272
from patsy import LookupFactor, ModelDesc, Term, INTERCEPT
273

274
# Function to build models programmatically
275
def build_model(outcome, predictors, interactions=None):
276
    """Build ModelDesc programmatically"""
277
    # Outcome term
278
    outcome_term = Term([LookupFactor(outcome)])
279
    
280
    # Predictor terms starting with intercept
281
    pred_terms = [INTERCEPT]
282
    
283
    # Add main effects
284
    for pred in predictors:
285
        pred_terms.append(Term([LookupFactor(pred)]))
286
    
287
    # Add interactions if specified
288
    if interactions:
289
        for pred1, pred2 in interactions:
290
            interaction_term = Term([LookupFactor(pred1), LookupFactor(pred2)])
291
            pred_terms.append(interaction_term)
292
    
293
    return ModelDesc([outcome_term], pred_terms)
294

295
# Use the function
296
data = patsy.demo_data("group", "condition", "x", "y", "response")
297

298
# Build model: response ~ group + condition + x + group:condition
299
model = build_model(
300
    outcome="response",
301
    predictors=["group", "condition", "x"],
302
    interactions=[("group", "condition")]
303
)
304

305
y, X = patsy.dmatrices(model, data)
306
print(f"Programmatic model: {X.shape}")
307
print("Columns:", X.design_info.column_names)
308
```
309

310
## Advanced Utility Patterns
311

312
### Custom Data Generation
313

314
```python
import patsy
import numpy as np

315
def create_experiment_data(n_subjects, n_conditions, n_timepoints):
316
    """Create realistic experimental data structure"""
317
    
318
    # Use balanced design for experimental structure
319
    design = patsy.balanced(
320
        subject=n_subjects,
321
        condition=n_conditions, 
322
        timepoint=n_timepoints
323
    )
324
    
325
    # Add realistic measurement data
326
    np.random.seed(42)
327
    measurements = []
328
    for subj, cond, time in zip(design['subject'], design['condition'], design['timepoint']):
329
        # Simulate individual differences and condition effects
330
        subject_effect = int(subj.replace('subject', '')) * 0.1
331
        condition_effect = int(cond.replace('condition', '')) - 1
332
        time_effect = int(time.replace('timepoint', '')) * 0.2
333
        
334
        measurement = subject_effect + condition_effect + time_effect + np.random.normal(0, 0.3)
335
        measurements.append(measurement)
336
    
337
    design['measurement'] = measurements
338
    return design
339

340
# Use custom data generation
341
exp_data = create_experiment_data(10, 3, 4)
342
print(f"Experimental data: {len(exp_data['measurement'])} observations")
343

344
# Analyze with an additive fixed-effects formula (condition + timepoint)
345
y, X = patsy.dmatrices("measurement ~ C(condition) + C(timepoint)", exp_data)
346
print(f"Analysis design: {X.shape}")
347
```

Version

Tile

Files

utilities.md docs/

utilities.mddocs/