or run

npx @tessl/cli init
Log in

Version

Tile

Overview

Evals

Files

Files

docs

builtins.md categorical.md contrasts.md high-level.md index.md matrix-building.md splines.md transforms.md utilities.md

utilities.md docs/

0

# Utility Functions

1

2

Helper functions for generating test data, creating balanced designs, and programmatically constructing formulas. These utilities support common tasks in statistical modeling and experimental design.

3

4

## Capabilities

5

6

### Balanced Factorial Design Generation

7

8

Creates simple balanced factorial designs for testing and experimentation.

9

10

```python { .api }

11

def balanced(**kwargs):

12

"""

13

Create balanced factorial designs for testing.

14

15

Given factor names and number of levels for each, generates a balanced factorial

16

design as a data dictionary. Useful for creating test data with all combinations

17

of factor levels.

18

19

Parameters:

20

- **kwargs: factor_name=num_levels pairs specifying factors and their level counts

21

- repeat (int): Number of replications of the complete design (default: 1)

22

23

Returns:

24

dict: Data dictionary with factor names as keys and level lists as values

25

"""

26

```

27

28

#### Usage Examples

29

30

```python

31

import patsy

32

33

# Simple 2x3 factorial design

34

data = patsy.balanced(treatment=2, dose=3)

35

print(data)

36

# {'dose': ['dose1', 'dose1', 'dose2', 'dose2', 'dose3', 'dose3'],

37

# 'treatment': ['treatment1', 'treatment2', 'treatment1',

38

# 'treatment2', 'treatment1', 'treatment2']}

39

40

# Multiple factors

41

data = patsy.balanced(group=2, time=3, condition=2)

42

print(f"Total combinations: {len(data['group'])}") # 2*3*2 = 12 combinations

43

44

# With replication

45

data = patsy.balanced(treatment=2, dose=2, repeat=3)

46

print(f"Total observations: {len(data['treatment'])}") # 2*2*3 = 12 observations

47

48

# Use in design matrix construction

49

design = patsy.dmatrix("C(treatment) * C(dose)", data)

50

print(f"Design matrix shape: {design.shape}")

51

52

# Complete model with balanced design

53

y_data = [i + np.random.normal(0, 0.1) for i in range(len(data['treatment']))]

54

data['y'] = y_data

55

y, X = patsy.dmatrices("y ~ C(treatment) * C(dose)", data)

56

```

57

58

### Demo Data Generation

59

60

Creates simple categorical and numerical demo data for testing formulas and models.

61

62

```python { .api }

63

def demo_data(*names, nlevels=2, min_rows=5):

64

"""

65

Create simple categorical/numerical demo data.

66

67

Variable names starting with 'a'-'m' become categorical with specified levels.

68

Names starting with 'p'-'z' become numerical (normal distribution).

69

Creates balanced design for categorical variables with at least min_rows observations.

70

71

Parameters:

72

- *names: Variable names to create

73

- nlevels (int): Number of levels for categorical variables (default: 2)

74

- min_rows (int): Minimum number of data rows to generate (default: 5)

75

76

Returns:

77

dict: Data dictionary with variable names as keys

78

79

Notes:

80

- Categorical variables: names starting with 'a' through 'm'

81

- Numerical variables: names starting with 'p' through 'z'

82

- Uses fixed random seed for reproducible numerical data

83

"""

84

```

85

86

#### Usage Examples

87

88

```python

89

import patsy

90

import numpy as np

91

92

# Mixed categorical and numerical variables

93

data = patsy.demo_data("group", "condition", "score", "time")

94

print("Variables created:")

95

for name, values in data.items():

96

print(f" {name}: {type(values[0]).__name__} - {len(values)} observations")

97

98

# Categorical variables (a-m)

99

cat_data = patsy.demo_data("factor_a", "factor_b", "group")

100

print("Categorical levels:")

101

for name, values in cat_data.items():

102

print(f" {name}: {set(values)}")

103

104

# Numerical variables (p-z)

105

num_data = patsy.demo_data("x", "y", "z", "score", "time")

106

print("Numerical data types:")

107

for name, values in num_data.items():

108

if isinstance(values, np.ndarray):

109

print(f" {name}: mean={np.mean(values):.2f}, std={np.std(values):.2f}")

110

111

# Custom parameters

112

data = patsy.demo_data("group", "x", "y", nlevels=4, min_rows=20)

113

print(f"Group levels: {set(data['group'])}")

114

print(f"Data size: {len(data['x'])} rows")

115

116

# Use with formula construction

117

y, X = patsy.dmatrices("y ~ C(group) + x", data)

118

print(f"Design matrix shape: {X.shape}")

119

120

# Reproducible data (same seed)

121

data1 = patsy.demo_data("x", "y")

122

data2 = patsy.demo_data("x", "y")

123

print("Reproducible:", np.array_equal(data1["x"], data2["x"]))

124

```

125

126

### Programmatic Factor Construction

127

128

A factor class for programmatically constructing formulas without string parsing.

129

130

```python { .api }

131

class LookupFactor:

132

"""

133

Simple factor class that looks up named entries in data.

134

135

Useful for programmatically constructing formulas and as an example

136

of the factor protocol. Provides more control than string-based formulas.

137

"""

138

def __init__(self, varname, force_categorical=False, contrast=None, levels=None):

139

"""

140

Create a lookup factor.

141

142

Parameters:

143

- varname (str): Variable name for data lookup

144

- force_categorical (bool): Treat as categorical regardless of data type

145

- contrast: Contrast coding scheme (requires force_categorical=True)

146

- levels: Explicit categorical levels (requires force_categorical=True)

147

"""

148

```

149

150

#### Usage Examples

151

152

```python

153

import patsy

154

from patsy import LookupFactor, ModelDesc, Term

155

import pandas as pd

156

157

# Sample data

158

data = pd.DataFrame({

159

'x': [1, 2, 3, 4, 5],

160

'group': ['A', 'B', 'A', 'B', 'A'],

161

'y': [2, 4, 6, 8, 10]

162

})

163

164

# Basic lookup factor

165

x_factor = LookupFactor("x")

166

group_factor = LookupFactor("group")

167

168

# Programmatically construct model description

169

# Equivalent to "y ~ x + group"

170

outcome_term = Term([LookupFactor("y")])

171

predictor_terms = [

172

Term([]), # Intercept

173

Term([LookupFactor("x")]),

174

Term([LookupFactor("group")])

175

]

176

177

model_desc = ModelDesc([outcome_term], predictor_terms)

178

179

# Build design matrices from programmatic model

180

y, X = patsy.dmatrices(model_desc, data)

181

print("Programmatic model shape:", X.shape)

182

183

# Force categorical treatment

184

categorical_factor = LookupFactor("x", force_categorical=True)

185

cat_term = Term([categorical_factor])

186

cat_model = ModelDesc([], [Term([]), cat_term])

187

design = patsy.dmatrix(cat_model, data)

188

print("Forced categorical columns:", design.design_info.column_names)

189

190

# With custom contrast

191

from patsy import Sum

192

contrast_factor = LookupFactor("group", force_categorical=True, contrast=Sum())

193

contrast_term = Term([contrast_factor])

194

contrast_model = ModelDesc([], [Term([]), contrast_term])

195

contrast_design = patsy.dmatrix(contrast_model, data)

196

print("Custom contrast columns:", contrast_design.design_info.column_names)

197

198

# With explicit levels

199

levels_factor = LookupFactor("group", force_categorical=True, levels=['B', 'A'])

200

levels_term = Term([levels_factor])

201

levels_model = ModelDesc([], [Term([]), levels_term])

202

levels_design = patsy.dmatrix(levels_model, data)

203

print("Custom levels columns:", levels_design.design_info.column_names)

204

```

205

206

## Integration with Other Patsy Features

207

208

### Balanced Designs with Complex Models

209

210

```python

211

import patsy

212

import numpy as np

213

from sklearn.linear_model import LinearRegression

214

215

# Create complex balanced design

216

data = patsy.balanced(treatment=3, dose=2, gender=2, repeat=5)

217

218

# Add outcome variable with realistic effects

219

np.random.seed(42)

220

y_values = []

221

for t, d, g in zip(data['treatment'], data['dose'], data['gender']):

222

# Simulate treatment and dose effects

223

effect = {'treatment1': 0, 'treatment2': 2, 'treatment3': 4}[t]

224

effect += {'dose1': 0, 'dose2': 1}[d]

225

effect += {'gender1': 0, 'gender2': 0.5}[g]

226

y_values.append(effect + np.random.normal(0, 0.5))

227

228

data['response'] = y_values

229

230

# Analyze with full factorial model

231

y, X = patsy.dmatrices("response ~ C(treatment) * C(dose) * C(gender)", data)

232

print(f"Full factorial design: {X.shape}")

233

234

# Fit model

235

model = LinearRegression(fit_intercept=False)

236

model.fit(X, y.ravel())

237

print(f"Model R²: {model.score(X, y.ravel()):.3f}")

238

```

239

240

### Demo Data for Testing Transformations

241

242

```python

243

import patsy

244

245

# Generate data for testing various transformations

246

data = patsy.demo_data("group", "x", "y", "z", nlevels=3, min_rows=30)

247

248

# Test spline transformations

249

spline_design = patsy.dmatrix("bs(x, df=4)", data)

250

print(f"B-spline design: {spline_design.shape}")

251

252

# Test interactions with categorical

253

interaction_design = patsy.dmatrix("C(group) * x", data)

254

print(f"Interaction design: {interaction_design.shape}")

255

256

# Test stateful transforms

257

standardized_design = patsy.dmatrix("standardize(x) + standardize(y)", data)

258

print(f"Standardized design: {standardized_design.shape}")

259

260

# Complete model combining categorical, spline, and standardized terms

261

complex_y, complex_X = patsy.dmatrices(

262

"z ~ C(group) + bs(x, df=3) + standardize(y)",

263

data

264

)

265

print(f"Complex model: {complex_X.shape}")

266

```

267

268

### Programmatic Model Construction

269

270

```python

271

import patsy

272

from patsy import LookupFactor, ModelDesc, Term, INTERCEPT

273

274

# Function to build models programmatically

275

def build_model(outcome, predictors, interactions=None):

276

"""Build ModelDesc programmatically"""

277

# Outcome term

278

outcome_term = Term([LookupFactor(outcome)])

279

280

# Predictor terms starting with intercept

281

pred_terms = [INTERCEPT]  # INTERCEPT is already Term([]); do not wrap it in another Term

282

283

# Add main effects

284

for pred in predictors:

285

pred_terms.append(Term([LookupFactor(pred)]))

286

287

# Add interactions if specified

288

if interactions:

289

for pred1, pred2 in interactions:

290

interaction_term = Term([LookupFactor(pred1), LookupFactor(pred2)])

291

pred_terms.append(interaction_term)

292

293

return ModelDesc([outcome_term], pred_terms)

294

295

# Use the function

296

data = patsy.demo_data("group", "condition", "x", "y", "response")

297

298

# Build model: response ~ group + condition + x + group:condition

299

model = build_model(

300

outcome="response",

301

predictors=["group", "condition", "x"],

302

interactions=[("group", "condition")]

303

)

304

305

y, X = patsy.dmatrices(model, data)

306

print(f"Programmatic model: {X.shape}")

307

print("Columns:", X.design_info.column_names)

308

```

309

310

## Advanced Utility Patterns

311

312

### Custom Data Generation

313

314

```python

315

def create_experiment_data(n_subjects, n_conditions, n_timepoints):

316

"""Create realistic experimental data structure"""

317

318

# Use balanced design for experimental structure

319

design = patsy.balanced(

320

subject=n_subjects,

321

condition=n_conditions,

322

timepoint=n_timepoints

323

)

324

325

# Add realistic measurement data

326

np.random.seed(42)

327

measurements = []

328

for subj, cond, time in zip(design['subject'], design['condition'], design['timepoint']):

329

# Simulate individual differences and condition effects

330

subject_effect = int(subj.replace('subject', '')) * 0.1

331

condition_effect = {'condition1': 0, 'condition2': 1, 'condition3': 2}[cond]

332

time_effect = int(time.replace('timepoint', '')) * 0.2

333

334

measurement = subject_effect + condition_effect + time_effect + np.random.normal(0, 0.3)

335

measurements.append(measurement)

336

337

design['measurement'] = measurements

338

return design

339

340

# Use custom data generation

341

exp_data = create_experiment_data(10, 3, 4)

342

print(f"Experimental data: {len(exp_data['measurement'])} observations")

343

344

# Analyze with a fixed-effects formula (condition and timepoint as categorical factors)

345

y, X = patsy.dmatrices("measurement ~ C(condition) + C(timepoint)", exp_data)

346

print(f"Analysis design: {X.shape}")

347

```