or run

npx @tessl/cli init
Log in

Version

Tile

Overview

Evals

Files

Files

docs

builtins.md categorical.md contrasts.md high-level.md index.md matrix-building.md splines.md transforms.md utilities.md

docs/matrix-building.md

0

# Design Matrix Building

1

2

Lower-level functions for constructing design matrices from parsed terms, providing more control over the matrix building process than the high-level interface. These functions form the core of Patsy's formula interpretation machinery.

3

4

## Capabilities

5

6

### Design Matrix Builders Construction

7

8

Creates design matrix builder objects from term lists; the resulting builders can then be used to construct matrices from data.

9

10

```python { .api }

11

def design_matrix_builders(termlists, data_iter_maker, eval_env, NA_action="drop"):

12

"""

13

Construct DesignMatrixBuilder objects from term lists.

14

15

This is one of Patsy's fundamental functions, providing the core formula

16

interpretation machinery along with build_design_matrices().

17

18

Parameters:

19

- termlists: List of term lists, where each term list contains Term objects

20

specifying a design matrix structure

21

- data_iter_maker: Zero-argument callable returning iterator over dict-like data objects

22

- eval_env: EvalEnvironment for variable lookup and evaluation

23

- NA_action (str or NAAction): Strategy for handling missing data: "drop", "raise", or an NAAction instance

24

25

Returns:

26

List of DesignMatrixBuilder objects, one for each input term list

27

"""

28

```

29

30

#### Usage Examples

31

32

```python

33

import patsy

34

from patsy import ModelDesc, Term, EvalEnvironment

35

from patsy.desc import INTERCEPT

36

import pandas as pd

37

38

# Prepare data iterator maker

39

data = pd.DataFrame({

40

'x': [1, 2, 3, 4, 5],

41

'y': [2, 4, 6, 8, 10],

42

'group': ['A', 'B', 'A', 'B', 'A']

43

})

44

45

def data_iter_maker():

46

yield data

47

48

# Create term lists manually (usually done by formula parsing)

49

# This is typically internal to Patsy, but shown for completeness

50

terms_y = [Term([INTERCEPT])] # Just intercept for outcome

51

terms_x = [Term([INTERCEPT]), Term(['x']), Term(['group'])] # Intercept + predictors

52

53

termlists = [terms_y, terms_x]

54

eval_env = EvalEnvironment.capture()

55

56

# Build design matrix builders

57

builders = patsy.design_matrix_builders(termlists, data_iter_maker, eval_env)

58

print(f"Number of builders: {len(builders)}")

59

```

60

61

### Design Matrix Construction

62

63

Constructs actual design matrices from pre-built design matrix builder objects.

64

65

```python { .api }

66

def build_design_matrices(builders, data, NA_action="drop", return_type="matrix", dtype=float):

67

"""

68

Construct design matrices from DesignMatrixBuilder objects.

69

70

This is one of Patsy's fundamental functions, working together with

71

design_matrix_builders() to form the core formula interpretation API.

72

73

Parameters:

74

- builders: List of DesignMatrixBuilder objects (from design_matrix_builders)

75

- data: Dict-like object containing data for matrix construction

76

- NA_action (str or NAAction): Strategy for handling missing data: "drop", "raise", or an NAAction instance

77

- return_type (str): "matrix" for numpy arrays, "dataframe" for pandas DataFrames

78

- dtype: Floating-point data type for the resulting matrices, e.g. for single or extended precision (default: float)

79

80

Returns:

81

List of design matrices (DesignMatrix objects or DataFrames)

82

"""

83

```

84

85

#### Usage Examples

86

87

```python

88

import patsy

89

import pandas as pd

90

import numpy as np

91

92

# Using the builders from the previous example

93

# (In practice, you'd usually get builders from the high-level interface)

94

95

# New data for matrix construction

96

new_data = pd.DataFrame({

97

'x': [1.5, 2.5, 3.5],

98

'y': [3, 5, 7],

99

'group': ['A', 'B', 'A']

100

})

101

102

# Build matrices using the pre-constructed builders

103

matrices = patsy.build_design_matrices(builders, new_data)

104

print(f"Number of matrices: {len(matrices)}")

105

print(f"Outcome matrix shape: {matrices[0].shape}")

106

print(f"Predictor matrix shape: {matrices[1].shape}")

107

108

# With different return type

109

matrices_df = patsy.build_design_matrices(builders, new_data, return_type="dataframe")

110

print("DataFrame columns:", matrices_df[1].columns.tolist())

111

112

# With different data type

113

matrices_int = patsy.build_design_matrices(builders, new_data, dtype=np.int32)

114

```

115

116

## Integration with High-Level Interface

117

118

The high-level functions call these matrix-building functions behind the scenes:

119

120

```python

121

import patsy

122

123

# High-level interface (what users typically use)

124

y, X = patsy.dmatrices("y ~ x + C(group)", data)

125

126

# Is roughly equivalent to this lower-level process:

127

# 1. Parse formula to create ModelDesc

128

# 2. Extract term lists from ModelDesc

129

# 3. Create data iterator

130

# 4. Call design_matrix_builders()

131

# 5. Call build_design_matrices()

132

```

133

134

## Advanced Usage Patterns

135

136

### Incremental Processing with Builders

137

138

```python

139

import patsy

140

141

# Create builders for incremental processing

142

def large_data_iter():

143

# Simulate large dataset in chunks

144

for i in range(0, 10000, 1000):

145

chunk_data = {

146

'x': list(range(i, i+1000)),

147

'y': [j*2 for j in range(i, i+1000)]

148

}

149

yield chunk_data

150

151

# Parse formula to get model description

152

model_desc = patsy.ModelDesc.from_formula("y ~ x")

153

eval_env = patsy.EvalEnvironment.capture()

154

155

# Extract term lists

156

lhs_termlist = [model_desc.lhs_termlist] if model_desc.lhs_termlist else []

157

rhs_termlist = [model_desc.rhs_termlist]

158

termlists = lhs_termlist + rhs_termlist

159

160

# Create builders

161

builders = patsy.design_matrix_builders(termlists, large_data_iter, eval_env)

162

163

# Use builders on new data

164

new_data = {'x': [5000, 5001, 5002], 'y': [10000, 10002, 10004]}

165

matrices = patsy.build_design_matrices(builders, new_data)

166

```

167

168

### Custom Missing Data Handling

169

170

```python

171

import patsy

172

from patsy.missing import NAAction

173

174

# Custom NA action

175

class CustomNAAction(NAAction):

176

def handle_NA(self, values, is_NA, origins):

177

# Custom logic for handling missing values

178

# This is a simplified example

179

return values[~is_NA], origins[~is_NA] if origins is not None else None

180

181

custom_na_action = CustomNAAction()

182

183

# Use with matrix builders

184

builders = patsy.design_matrix_builders(

185

termlists,

186

data_iter_maker,

187

eval_env,

188

NA_action=custom_na_action

189

)

190

191

matrices = patsy.build_design_matrices(

192

builders,

193

data,

194

NA_action=custom_na_action

195

)

196

```

197

198

### Reusing Builders for Different Data

199

200

```python

201

import patsy

202

import pandas as pd

203

204

# Original training data

205

train_data = pd.DataFrame({

206

'x': [1, 2, 3, 4, 5],

207

'y': [2, 4, 6, 8, 10],

208

'group': ['A', 'B', 'A', 'B', 'A']

209

})

210

211

# Create builders from training data

212

def train_iter():

213

yield train_data

214

215

model_desc = patsy.ModelDesc.from_formula("y ~ x + C(group)")

216

eval_env = patsy.EvalEnvironment.capture()

217

termlists = [[model_desc.lhs_termlist], [model_desc.rhs_termlist]]

218

219

builders = patsy.design_matrix_builders(termlists, train_iter, eval_env)

220

221

# Test data (different size, potentially different factor levels)

222

test_data = pd.DataFrame({

223

'x': [1.5, 2.5, 3.5, 4.5],

224

'y': [3, 5, 7, 9],

225

'group': ['A', 'B', 'A', 'C'] # Note: 'C' is a new level

226

})

227

228

# Build matrices for test data using same builders

229

try:

230

test_matrices = patsy.build_design_matrices(builders, test_data)

231

print("Test matrices built successfully")

232

except Exception as e:

233

print(f"Error with new factor level: {e}")

234

# Handle new factor levels appropriately

235

```

236

237

## Builder Objects and Metadata

238

239

### DesignMatrixBuilder Properties

240

241

```python

242

# Builders contain metadata about the design matrix structure

243

builder = builders[1] # Predictor matrix builder

244

245

# Access design information

246

print("Column names:", builder.design_info.column_names)

247

print("Terms:", builder.design_info.terms)

248

print("Factor infos:", [fi.factor.code for fi in builder.design_info.factor_infos])

249

250

# Check for stateful transforms

251

for factor_info in builder.design_info.factor_infos:

252

if hasattr(factor_info.factor, 'memorize_chunk'):

253

print(f"Stateful factor: {factor_info.factor}")

254

```

255

256

### Matrix Metadata

257

258

```python

259

# Built matrices contain rich metadata

260

matrix = matrices[1] # Predictor matrix

261

262

if hasattr(matrix, 'design_info'):

263

print("Matrix column names:", matrix.design_info.column_names)

264

print("Matrix shape:", matrix.shape)

265

print("Terms per column:", matrix.design_info.column_name_indexes)

266

```

267

268

## Error Handling and Debugging

269

270

### Common Issues

271

272

```python

273

# Factor level mismatches

274

try:

275

matrices = patsy.build_design_matrices(builders, data_with_new_levels)

276

except patsy.PatsyError as e:

277

print(f"Factor level error: {e}")

278

# Handle appropriately (drop new levels, add to design, etc.)

279

280

# Missing data issues

281

try:

282

matrices = patsy.build_design_matrices(builders, data_with_nas, NA_action="raise")

283

except patsy.PatsyError as e:

284

print(f"Missing data error: {e}")

285

# Switch to different NA_action or preprocess data

286

```

287

288

### Debugging Matrix Construction

289

290

```python

291

# Inspect intermediate results

292

print("Builder design infos:")

293

for i, builder in enumerate(builders):

294

print(f"Builder {i}: {builder.design_info.column_names}")

295

296

# Check data types and shapes

297

for i, matrix in enumerate(matrices):

298

print(f"Matrix {i}: shape={matrix.shape}, dtype={matrix.dtype}")

299

if hasattr(matrix, 'design_info'):

300

print(f" Columns: {matrix.design_info.column_names}")

301

```