A Python package for describing statistical models and for building design matrices.
—
Lower-level functions for constructing design matrices from parsed terms, providing more control over the matrix building process than the high-level interface. These functions form the core of Patsy's formula interpretation machinery.
Creates design matrix builder objects from term lists, which can then be used to construct matrices from data.
def design_matrix_builders(termlists, data_iter_maker, eval_env, NA_action="drop"):
"""
Construct DesignMatrixBuilder objects from term lists.
This is one of Patsy's fundamental functions, providing the core formula
interpretation machinery along with build_design_matrices().
Parameters:
- termlists: List of term lists, where each term list contains Term objects
specifying a design matrix structure
- data_iter_maker: Zero-argument callable returning iterator over dict-like data objects
- eval_env: EvalEnvironment for variable lookup and evaluation
- NA_action (str): Strategy for handling missing data ("drop", "raise", etc.)
Returns:
List of DesignMatrixBuilder objects, one for each input term list
"""
import patsy
from patsy import ModelDesc, Term, EvalEnvironment
from patsy.desc import INTERCEPT
import pandas as pd
# Prepare data iterator maker
data = pd.DataFrame({
'x': [1, 2, 3, 4, 5],
'y': [2, 4, 6, 8, 10],
'group': ['A', 'B', 'A', 'B', 'A']
})
def data_iter_maker():
yield data
# Create term lists manually (usually done by formula parsing)
# This is typically internal to Patsy, but shown for completeness
terms_y = [Term([INTERCEPT])] # Just intercept for outcome
terms_x = [Term([INTERCEPT]), Term(['x']), Term(['group'])] # Intercept + predictors
termlists = [terms_y, terms_x]
eval_env = EvalEnvironment.capture()
# Build design matrix builders
builders = patsy.design_matrix_builders(termlists, data_iter_maker, eval_env)
print(f"Number of builders: {len(builders)}")Constructs actual design matrices from pre-built design matrix builder objects.
def build_design_matrices(builders, data, NA_action="drop", return_type="matrix", dtype=float):
"""
Construct design matrices from DesignMatrixBuilder objects.
This is one of Patsy's fundamental functions, working together with
design_matrix_builders() to form the core formula interpretation API.
Parameters:
- builders: List of DesignMatrixBuilder objects (from design_matrix_builders)
- data: Dict-like object containing data for matrix construction
- NA_action (str): Strategy for handling missing data
- return_type (str): "matrix" for numpy arrays, "dataframe" for pandas DataFrames
- dtype: Data type for the resulting matrices (default: float)
Returns:
List of design matrices (DesignMatrix objects or DataFrames)
"""
import patsy
import pandas as pd
import numpy as np
# Using the builders from the previous example
# (In practice, you'd usually get builders from the high-level interface)
# New data for matrix construction
new_data = pd.DataFrame({
'x': [1.5, 2.5, 3.5],
'y': [3, 5, 7],
'group': ['A', 'B', 'A']
})
# Build matrices using the pre-constructed builders
matrices = patsy.build_design_matrices(builders, new_data)
print(f"Number of matrices: {len(matrices)}")
print(f"Outcome matrix shape: {matrices[0].shape}")
print(f"Predictor matrix shape: {matrices[1].shape}")
# With different return type
matrices_df = patsy.build_design_matrices(builders, new_data, return_type="dataframe")
print("DataFrame columns:", matrices_df[1].columns.tolist())
# With different data type
matrices_int = patsy.build_design_matrices(builders, new_data, dtype=np.int32)The matrix building functions work behind the scenes in high-level functions:
import patsy

# High-level interface (what users typically use)
# `data` is the DataFrame defined in the first example.
y, X = patsy.dmatrices("y ~ x + C(group)", data)

# Is roughly equivalent to this lower-level process:
# 1. Parse formula to create ModelDesc
# 2. Extract term lists from ModelDesc
# 3. Create data iterator
# 4. Call design_matrix_builders()
# 5. Call build_design_matrices()

import patsy
# Create builders for incremental processing
def large_data_iter():
    """Yield the dataset in ten chunks of 1000 rows each."""
    # Simulate large dataset in chunks
    for start in range(0, 10000, 1000):
        rows = range(start, start + 1000)
        yield {
            'x': list(rows),
            'y': [j * 2 for j in rows],
        }


# Parse formula to get model description
model_desc = patsy.ModelDesc.from_formula("y ~ x")
eval_env = patsy.EvalEnvironment.capture()

# Extract term lists (the left-hand side is skipped when it is empty)
lhs_termlist = [model_desc.lhs_termlist] if model_desc.lhs_termlist else []
rhs_termlist = [model_desc.rhs_termlist]
termlists = lhs_termlist + rhs_termlist

# Create builders
builders = patsy.design_matrix_builders(termlists, large_data_iter, eval_env)

# Use builders on new data
new_data = {'x': [5000, 5001, 5002], 'y': [10000, 10002, 10004]}
matrices = patsy.build_design_matrices(builders, new_data)

import patsy
from patsy.missing import NAAction
import numpy as np


# Custom NA action
class CustomNAAction(NAAction):
    def handle_NA(self, values, is_NAs, origins):
        """Drop every row that is flagged as missing in any factor.

        Parameters:
        - values: list of arrays, one per factor
        - is_NAs: list of boolean row masks, parallel to ``values``
          (presumably one 1-d mask per factor -- confirm against the
          patsy NAAction documentation)
        - origins: list of origin objects, parallel to ``values``
          (unused in this simplified example)

        Returns:
        A list of arrays, parallel to ``values``, with flagged rows removed.
        """
        # Custom logic for handling missing values
        # This is a simplified example
        if not values:
            return values
        # A row is kept only if no factor marks it as missing.
        keep = ~np.logical_or.reduce(is_NAs)
        # "..." lets the same indexing work for 1-d and 2-d value arrays.
        return [v[keep, ...] for v in values]


custom_na_action = CustomNAAction()

# Use with matrix builders
builders = patsy.design_matrix_builders(
    termlists,
    data_iter_maker,
    eval_env,
    NA_action=custom_na_action,
)
matrices = patsy.build_design_matrices(
    builders,
    data,
    NA_action=custom_na_action,
)

import patsy
import pandas as pd

# Original training data
train_data = pd.DataFrame({
    'x': [1, 2, 3, 4, 5],
    'y': [2, 4, 6, 8, 10],
    'group': ['A', 'B', 'A', 'B', 'A'],
})


# Create builders from training data
def train_iter():
    """Yield the training data as a single chunk."""
    yield train_data


model_desc = patsy.ModelDesc.from_formula("y ~ x + C(group)")
eval_env = patsy.EvalEnvironment.capture()
# Each entry of termlists is itself a list of terms; lhs_termlist and
# rhs_termlist already are such lists, so they must not be wrapped again.
termlists = [model_desc.lhs_termlist, model_desc.rhs_termlist]
builders = patsy.design_matrix_builders(termlists, train_iter, eval_env)

# Test data (different size, potentially different factor levels)
test_data = pd.DataFrame({
    'x': [1.5, 2.5, 3.5, 4.5],
    'y': [3, 5, 7, 9],
    'group': ['A', 'B', 'A', 'C'],  # Note: 'C' is a new level
})

# Build matrices for test data using same builders
try:
    test_matrices = patsy.build_design_matrices(builders, test_data)
    print("Test matrices built successfully")
except patsy.PatsyError as e:
    print(f"Error with new factor level: {e}")
    # Handle new factor levels appropriately

# Builders contain metadata about the design matrix structure
builder = builders[1]  # Predictor matrix builder

# Access design information
print("Column names:", builder.design_info.column_names)
print("Terms:", builder.design_info.terms)
# factor_infos is a mapping from factor objects to their FactorInfo, so
# iterate over its values to reach the FactorInfo objects.
factor_infos = list(builder.design_info.factor_infos.values())
print("Factor infos:", [fi.factor.code for fi in factor_infos])

# Check for stateful transforms
for factor_info in factor_infos:
    if hasattr(factor_info.factor, 'memorize_chunk'):
        print(f"Stateful factor: {factor_info.factor}")

# Built matrices contain rich metadata
matrix = matrices[1]  # Predictor matrix
if hasattr(matrix, 'design_info'):
    print("Matrix column names:", matrix.design_info.column_names)
    print("Matrix shape:", matrix.shape)
    # column_name_indexes maps each column name to its column index; it does
    # not describe terms, so the label says what is actually printed.
    print("Column name indexes:", matrix.design_info.column_name_indexes)

# Factor level mismatches
# `data_with_new_levels` and `data_with_nas` are placeholders for your own data.
try:
    matrices = patsy.build_design_matrices(builders, data_with_new_levels)
except patsy.PatsyError as e:
    print(f"Factor level error: {e}")
    # Handle appropriately (drop new levels, add to design, etc.)

# Missing data issues
try:
    matrices = patsy.build_design_matrices(builders, data_with_nas, NA_action="raise")
except patsy.PatsyError as e:
    print(f"Missing data error: {e}")
    # Switch to different NA_action or preprocess data

# Inspect intermediate results
print("Builder design infos:")
for i, builder in enumerate(builders):
print(f"Builder {i}: {builder.design_info.column_names}")
# Check data types and shapes
for i, matrix in enumerate(matrices):
print(f"Matrix {i}: shape={matrix.shape}, dtype={matrix.dtype}")
if hasattr(matrix, 'design_info'):
print(f" Columns: {matrix.design_info.column_names}")Install with Tessl CLI
npx tessl i tessl/pypi-patsy