A Python package for describing statistical models and for building design matrices.
—
Helper functions for generating test data, creating balanced designs, and programmatically constructing formulas. These utilities support common tasks in statistical modeling and experimental design.
Creates simple balanced factorial designs for testing and experimentation.
def balanced(**kwargs):
"""
Create balanced factorial designs for testing.
Given factor names and number of levels for each, generates a balanced factorial
design as a data dictionary. Useful for creating test data with all combinations
of factor levels.
Parameters:
- **kwargs: factor_name=num_levels pairs specifying factors and their level counts
- repeat (int): Number of replications of the complete design (default: 1)
Returns:
dict: Data dictionary with factor names as keys and level lists as values
"""

import patsy
import numpy as np
# Simple 2x3 factorial design
data = patsy.balanced(treatment=2, dose=3)
print(data)
# {'treatment': ['treatment1', 'treatment1', 'treatment1',
# 'treatment2', 'treatment2', 'treatment2'],
# 'dose': ['dose1', 'dose2', 'dose3', 'dose1', 'dose2', 'dose3']}
# Multiple factors
data = patsy.balanced(group=2, time=3, condition=2)
print(f"Total combinations: {len(data['group'])}") # 2*3*2 = 12 combinations
# With replication
data = patsy.balanced(treatment=2, dose=2, repeat=3)
print(f"Total observations: {len(data['treatment'])}") # 2*2*3 = 12 observations
# Use in design matrix construction
design = patsy.dmatrix("C(treatment) * C(dose)", data)
print(f"Design matrix shape: {design.shape}")
# Complete model with balanced design
y_data = [i + np.random.normal(0, 0.1) for i in range(len(data['treatment']))]
data['y'] = y_data
y, X = patsy.dmatrices("y ~ C(treatment) * C(dose)", data)

Creates simple categorical and numerical demo data for testing formulas and models.
def demo_data(*names, nlevels=2, min_rows=5):
"""
Create simple categorical/numerical demo data.
Variable names starting with 'a'-'m' become categorical with specified levels.
Names starting with 'p'-'z' become numerical (normal distribution).
Creates balanced design for categorical variables with at least min_rows observations.
Parameters:
- *names: Variable names to create
- nlevels (int): Number of levels for categorical variables (default: 2)
- min_rows (int): Minimum number of data rows to generate (default: 5)
Returns:
dict: Data dictionary with variable names as keys
Notes:
- Categorical variables: names starting with 'a' through 'm'
- Numerical variables: names starting with 'p' through 'z'
- Uses fixed random seed for reproducible numerical data
"""

import patsy
import numpy as np
# Mixed categorical and numerical variables
data = patsy.demo_data("group", "condition", "score", "time")
print("Variables created:")
for name, values in data.items():
print(f" {name}: {type(values[0]).__name__} - {len(values)} observations")
# Categorical variables (a-m)
cat_data = patsy.demo_data("factor_a", "factor_b", "group")
print("Categorical levels:")
for name, values in cat_data.items():
print(f" {name}: {set(values)}")
# Numerical variables (p-z)
num_data = patsy.demo_data("x", "y", "z", "score", "time")
print("Numerical data types:")
for name, values in num_data.items():
if isinstance(values, np.ndarray):
print(f" {name}: mean={np.mean(values):.2f}, std={np.std(values):.2f}")
# Custom parameters
data = patsy.demo_data("group", "x", "y", nlevels=4, min_rows=20)
print(f"Group levels: {set(data['group'])}")
print(f"Data size: {len(data['x'])} rows")
# Use with formula construction
y, X = patsy.dmatrices("y ~ C(group) + x", data)
print(f"Design matrix shape: {X.shape}")
# Reproducible data (same seed)
data1 = patsy.demo_data("x", "y")
data2 = patsy.demo_data("x", "y")
print("Reproducible:", np.array_equal(data1["x"], data2["x"]))

A factor class for programmatically constructing formulas without string parsing.
class LookupFactor:
"""
Simple factor class that looks up named entries in data.
Useful for programmatically constructing formulas and as an example
of the factor protocol. Provides more control than string-based formulas.
"""
def __init__(self, varname, force_categorical=False, contrast=None, levels=None):
"""
Create a lookup factor.
Parameters:
- varname (str): Variable name for data lookup
- force_categorical (bool): Treat as categorical regardless of data type
- contrast: Contrast coding scheme (requires force_categorical=True)
- levels: Explicit categorical levels (requires force_categorical=True)
"""

import patsy
from patsy import LookupFactor, ModelDesc, Term
import pandas as pd
# Sample data
data = pd.DataFrame({
'x': [1, 2, 3, 4, 5],
'group': ['A', 'B', 'A', 'B', 'A'],
'y': [2, 4, 6, 8, 10]
})
# Basic lookup factor
x_factor = LookupFactor("x")
group_factor = LookupFactor("group")
# Programmatically construct model description
# Equivalent to "y ~ x + group"
outcome_term = Term([LookupFactor("y")])
predictor_terms = [
Term([]), # Intercept
Term([LookupFactor("x")]),
Term([LookupFactor("group")])
]
model_desc = ModelDesc([outcome_term], predictor_terms)
# Build design matrices from programmatic model
y, X = patsy.dmatrices(model_desc, data)
print("Programmatic model shape:", X.shape)
# Force categorical treatment
categorical_factor = LookupFactor("x", force_categorical=True)
cat_term = Term([categorical_factor])
cat_model = ModelDesc([], [Term([]), cat_term])
design = patsy.dmatrix(cat_model, data)
print("Forced categorical columns:", design.design_info.column_names)
# With custom contrast
from patsy import Sum
contrast_factor = LookupFactor("group", force_categorical=True, contrast=Sum())
contrast_term = Term([contrast_factor])
contrast_model = ModelDesc([], [Term([]), contrast_term])
contrast_design = patsy.dmatrix(contrast_model, data)
print("Custom contrast columns:", contrast_design.design_info.column_names)
# With explicit levels
levels_factor = LookupFactor("group", force_categorical=True, levels=['B', 'A'])
levels_term = Term([levels_factor])
levels_model = ModelDesc([], [Term([]), levels_term])
levels_design = patsy.dmatrix(levels_model, data)
print("Custom levels columns:", levels_design.design_info.column_names)

import patsy
import numpy as np
from sklearn.linear_model import LinearRegression
# Create complex balanced design
data = patsy.balanced(treatment=3, dose=2, gender=2, repeat=5)
# Add outcome variable with realistic effects
np.random.seed(42)
y_values = []
for t, d, g in zip(data['treatment'], data['dose'], data['gender']):
# Simulate treatment and dose effects
effect = {'treatment1': 0, 'treatment2': 2, 'treatment3': 4}[t]
effect += {'dose1': 0, 'dose2': 1}[d]
effect += {'gender1': 0, 'gender2': 0.5}[g]
y_values.append(effect + np.random.normal(0, 0.5))
data['response'] = y_values
# Analyze with full factorial model
y, X = patsy.dmatrices("response ~ C(treatment) * C(dose) * C(gender)", data)
print(f"Full factorial design: {X.shape}")
# Fit model
model = LinearRegression(fit_intercept=False)
model.fit(X, y.ravel())
print(f"Model R²: {model.score(X, y.ravel()):.3f}")

import patsy
# Generate data for testing various transformations
data = patsy.demo_data("group", "x", "y", "z", nlevels=3, min_rows=30)
# Test spline transformations
spline_design = patsy.dmatrix("bs(x, df=4)", data)
print(f"B-spline design: {spline_design.shape}")
# Test interactions with categorical
interaction_design = patsy.dmatrix("C(group) * x", data)
print(f"Interaction design: {interaction_design.shape}")
# Test stateful transforms
standardized_design = patsy.dmatrix("standardize(x) + standardize(y)", data)
print(f"Standardized design: {standardized_design.shape}")
# Complete mixed-effects style model
complex_y, complex_X = patsy.dmatrices(
"z ~ C(group) + bs(x, df=3) + standardize(y)",
data
)
print(f"Complex model: {complex_X.shape}")

import patsy
from patsy import LookupFactor, ModelDesc, Term, INTERCEPT
# Function to build models programmatically
def build_model(outcome, predictors, interactions=None):
    """Build a patsy ModelDesc programmatically.

    Parameters:
    - outcome (str): Name of the response variable to look up in the data
    - predictors: Iterable of variable names, each added as a main-effect term
    - interactions: Optional iterable of (name1, name2) pairs; each pair is
      added as a two-factor interaction term (default: None, no interactions)

    Returns:
    ModelDesc: Left-hand side holds the outcome term; right-hand side holds
    the intercept, the main effects and any interaction terms, in that order
    """
    # Outcome term
    outcome_term = Term([LookupFactor(outcome)])
    # INTERCEPT is already a Term (the empty term, Term([])), so it is used
    # directly; wrapping it in Term([...]) would treat it as a factor.
    pred_terms = [INTERCEPT]
    # Main effects: one single-factor term per predictor
    pred_terms.extend(Term([LookupFactor(pred)]) for pred in predictors)
    # Interactions: one two-factor term per pair, skipped when None/empty
    for pred1, pred2 in interactions or ():
        pred_terms.append(Term([LookupFactor(pred1), LookupFactor(pred2)]))
    return ModelDesc([outcome_term], pred_terms)
# Use the function
data = patsy.demo_data("group", "condition", "x", "y", "response")
# Build model: response ~ group + condition + x + group:condition
model = build_model(
outcome="response",
predictors=["group", "condition", "x"],
interactions=[("group", "condition")]
)
y, X = patsy.dmatrices(model, data)
print(f"Programmatic model: {X.shape}")
print("Columns:", X.design_info.column_names)


import numpy as np
import patsy


def create_experiment_data(n_subjects, n_conditions, n_timepoints):
    """Create a simulated experimental data structure.

    Builds a balanced subject x condition x timepoint design with
    patsy.balanced and adds a simulated 'measurement' column.

    Parameters:
    - n_subjects (int): Number of levels of the 'subject' factor
    - n_conditions (int): Number of levels of the 'condition' factor
    - n_timepoints (int): Number of levels of the 'timepoint' factor

    Returns:
    dict: The balanced design with an added 'measurement' list, one value
    per design row

    Notes:
    - Reseeds NumPy's global random generator with 42 on every call, so the
      generated measurements are the same for the same arguments
    """
    # Use balanced design for experimental structure
    design = patsy.balanced(
        subject=n_subjects,
        condition=n_conditions,
        timepoint=n_timepoints,
    )
    # Add simulated measurement data
    np.random.seed(42)
    measurements = []
    for subj, cond, timepoint in zip(
        design['subject'], design['condition'], design['timepoint']
    ):
        # Level labels are the factor name followed by a 1-based index
        # (e.g. 'subject3'); the index drives each simulated effect.
        subject_effect = int(subj.replace('subject', '')) * 0.1
        # condition1 -> 0, condition2 -> 1, condition3 -> 2, ... Computed
        # from the index so any number of conditions works, not just three.
        condition_effect = int(cond.replace('condition', '')) - 1
        time_effect = int(timepoint.replace('timepoint', '')) * 0.2
        noise = np.random.normal(0, 0.3)
        measurements.append(
            subject_effect + condition_effect + time_effect + noise
        )
    design['measurement'] = measurements
    return design
# Use custom data generation
exp_data = create_experiment_data(10, 3, 4)
print(f"Experimental data: {len(exp_data['measurement'])} observations")
# Analyze with mixed-effects style formula
y, X = patsy.dmatrices("measurement ~ C(condition) + C(timepoint)", exp_data)
print(f"Analysis design: {X.shape}")

Install with Tessl CLI
npx tessl i tessl/pypi-patsy