tessl/pypi-patsy

A Python package for describing statistical models and for building design matrices.

—

Pending

Overview

Eval results

Files

Utility Functions

Name: tessl/pypi-patsy
Author: tessl

Helper functions for generating test data, creating balanced designs, and programmatically constructing formulas. These utilities support common tasks in statistical modeling and experimental design.

Capabilities

Balanced Factorial Design Generation

Creates simple balanced factorial designs for testing and experimentation.

def balanced(**kwargs):
    """
    Create balanced factorial designs for testing.

    Given factor names and number of levels for each, generates a balanced factorial
    design as a data dictionary. Useful for creating test data with all combinations
    of factor levels.

    Each level is labelled with the factor name followed by a 1-based index,
    e.g. a factor ``dose`` with 3 levels yields 'dose1', 'dose2', 'dose3'.

    Parameters:
    - **kwargs: factor_name=num_levels pairs specifying factors and their level counts
    - repeat (int): Number of replications of the complete design (default: 1).
      Passed as a keyword like the factors, so 'repeat' cannot be used as a
      factor name.

    Returns:
    dict: Data dictionary with factor names as keys and level lists as values.
          Every list has (product of all level counts) * repeat entries.
    """

Usage Examples

import patsy
import numpy as np

# Simple 2x3 factorial design
data = patsy.balanced(treatment=2, dose=3)
print(data)
# Factors are processed in sorted-name order, so 'dose' comes first and
# varies slowest while 'treatment' cycles fastest:
# {'dose': ['dose1', 'dose1', 'dose2', 'dose2', 'dose3', 'dose3'],
#  'treatment': ['treatment1', 'treatment2', 'treatment1',
#                'treatment2', 'treatment1', 'treatment2']}

# Multiple factors
data = patsy.balanced(group=2, time=3, condition=2)
print(f"Total combinations: {len(data['group'])}")  # 2*3*2 = 12 combinations

# With replication: the whole 2x2 design is repeated 3 times
data = patsy.balanced(treatment=2, dose=2, repeat=3)
print(f"Total observations: {len(data['treatment'])}")  # 2*2*3 = 12 observations

# Use in design matrix construction (data is the replicated 2x2 design above)
design = patsy.dmatrix("C(treatment) * C(dose)", data)
print(f"Design matrix shape: {design.shape}")

# Complete model with balanced design: the response is the row index plus
# a little Gaussian noise, one value per observation
y_data = [i + np.random.normal(0, 0.1) for i in range(len(data['treatment']))]
data['y'] = y_data
y, X = patsy.dmatrices("y ~ C(treatment) * C(dose)", data)

Demo Data Generation

Creates simple categorical and numerical demo data for testing formulas and models.

def demo_data(*names, nlevels=2, min_rows=5):
    """
    Create simple categorical/numerical demo data.

    The kind of each variable is decided by the first letter of its name:
    names starting with 'a'-'m' become categorical with the specified number
    of levels, names starting with 'p'-'z' become numerical (normal distribution).
    The categorical variables form a balanced design, repeated as needed so
    that the result has at least min_rows observations.

    Parameters:
    - *names: Variable names to create
    - nlevels (int): Number of levels for categorical variables (default: 2)
    - min_rows (int): Minimum number of data rows to generate (default: 5)

    Returns:
    dict: Data dictionary with variable names as keys
    
    Notes:
    - Categorical variables: names starting with 'a' through 'm'
    - Numerical variables: names starting with 'p' through 'z'  
    - Names starting with 'n' or 'o' fall outside both ranges
      (NOTE(review): presumably rejected with an error - confirm against patsy)
    - Uses fixed random seed for reproducible numerical data
    """

Usage Examples

import patsy
import numpy as np

# Mixed categorical and numerical variables
# "group"/"condition" start with g/c (a-m range) -> categorical;
# "score"/"time" start with s/t (p-z range) -> numerical
data = patsy.demo_data("group", "condition", "score", "time")
print("Variables created:")
for name, values in data.items():
    # Report the element type of each column and how many rows it has
    print(f"  {name}: {type(values[0]).__name__} - {len(values)} observations")

# Categorical variables (a-m)
# All three names start with 'f' or 'g', so every column is categorical
cat_data = patsy.demo_data("factor_a", "factor_b", "group")
print("Categorical levels:")
for name, values in cat_data.items():
    # set() collapses the column to its distinct level labels
    print(f"  {name}: {set(values)}")

# Numerical variables (p-z)  
# Every name here starts with a letter in p-z, so every column is numerical
num_data = patsy.demo_data("x", "y", "z", "score", "time")
print("Numerical data types:")
for name, values in num_data.items():
    # Only summarize columns that are NumPy arrays
    if isinstance(values, np.ndarray):
        print(f"  {name}: mean={np.mean(values):.2f}, std={np.std(values):.2f}")

# Custom parameters
# One categorical factor with 4 levels, at least 20 rows in total
data = patsy.demo_data("group", "x", "y", nlevels=4, min_rows=20)
print(f"Group levels: {set(data['group'])}")
print(f"Data size: {len(data['x'])} rows")

# Use with formula construction
# dmatrices returns the (outcome, predictors) pair for a two-sided formula
y, X = patsy.dmatrices("y ~ C(group) + x", data)
print(f"Design matrix shape: {X.shape}")

# Reproducible data (same seed)
# Two independent calls with the same names should yield identical numbers
data1 = patsy.demo_data("x", "y")
data2 = patsy.demo_data("x", "y")
print("Reproducible:", np.array_equal(data1["x"], data2["x"]))

Programmatic Factor Construction

A factor class for programmatically constructing formulas without string parsing.

class LookupFactor:
    """
    Simple factor class that looks up named entries in data.
    
    Useful for programmatically constructing formulas and as an example
    of the factor protocol. Provides more control than string-based formulas.

    Instances are placed inside Term objects, which in turn make up a
    ModelDesc that can be passed to dmatrix/dmatrices in place of a
    formula string.
    """
    def __init__(self, varname, force_categorical=False, contrast=None, levels=None):
        """
        Create a lookup factor.
        
        Parameters:
        - varname (str): Variable name for data lookup; the factor's values
          are taken from the data entry stored under this name
        - force_categorical (bool): Treat as categorical regardless of data type
          (default: False)
        - contrast: Contrast coding scheme (requires force_categorical=True);
          default None
        - levels: Explicit categorical levels (requires force_categorical=True);
          default None

        NOTE(review): supplying contrast or levels without
        force_categorical=True is presumably rejected with an error - confirm
        against the patsy implementation.
        """

Usage Examples

import patsy
from patsy import LookupFactor, ModelDesc, Term
import pandas as pd

# Sample data
data = pd.DataFrame({
    'x': [1, 2, 3, 4, 5],
    'group': ['A', 'B', 'A', 'B', 'A'],
    'y': [2, 4, 6, 8, 10]
})

# Basic lookup factor
# Each factor simply refers to a column of `data` by name
x_factor = LookupFactor("x")
group_factor = LookupFactor("group")

# Programmatically construct model description
# Equivalent to "y ~ x + group"
outcome_term = Term([LookupFactor("y")])
predictor_terms = [
    Term([]),  # Intercept
    Term([LookupFactor("x")]),
    Term([LookupFactor("group")])
]

# ModelDesc takes the left-hand-side terms first, then the right-hand-side terms
model_desc = ModelDesc([outcome_term], predictor_terms)

# Build design matrices from programmatic model
y, X = patsy.dmatrices(model_desc, data)
print("Programmatic model shape:", X.shape)

# Force categorical treatment
# 'x' holds integers, but force_categorical=True codes it as a categorical factor
categorical_factor = LookupFactor("x", force_categorical=True)
cat_term = Term([categorical_factor])
# Empty left-hand side: a predictors-only model, built with dmatrix
cat_model = ModelDesc([], [Term([]), cat_term])
design = patsy.dmatrix(cat_model, data)
print("Forced categorical columns:", design.design_info.column_names)

# With custom contrast
# Sum() selects sum coding for the categorical factor
from patsy import Sum
contrast_factor = LookupFactor("group", force_categorical=True, contrast=Sum())
contrast_term = Term([contrast_factor])
contrast_model = ModelDesc([], [Term([]), contrast_term])
contrast_design = patsy.dmatrix(contrast_model, data)
print("Custom contrast columns:", contrast_design.design_info.column_names)

# With explicit levels
# Listing 'B' before 'A' overrides the level order used for coding
levels_factor = LookupFactor("group", force_categorical=True, levels=['B', 'A'])
levels_term = Term([levels_factor])  
levels_model = ModelDesc([], [Term([]), levels_term])
levels_design = patsy.dmatrix(levels_model, data)
print("Custom levels columns:", levels_design.design_info.column_names)

Integration with Other Patsy Features

Balanced Designs with Complex Models

import patsy
import numpy as np
from sklearn.linear_model import LinearRegression

# Create complex balanced design
# 3 * 2 * 2 factor combinations, each replicated 5 times -> 60 rows
data = patsy.balanced(treatment=3, dose=2, gender=2, repeat=5)

# Add outcome variable with realistic effects
# Seed the global NumPy RNG so the simulated noise is reproducible
np.random.seed(42)
y_values = []
for t, d, g in zip(data['treatment'], data['dose'], data['gender']):
    # Simulate treatment and dose effects
    # Additive effects keyed by the level labels that balanced() generates
    effect = {'treatment1': 0, 'treatment2': 2, 'treatment3': 4}[t]
    effect += {'dose1': 0, 'dose2': 1}[d]  
    effect += {'gender1': 0, 'gender2': 0.5}[g]
    # Gaussian noise (sd 0.5) on top of the deterministic effect
    y_values.append(effect + np.random.normal(0, 0.5))

data['response'] = y_values

# Analyze with full factorial model
# '*' expands to all main effects plus every two- and three-way interaction
y, X = patsy.dmatrices("response ~ C(treatment) * C(dose) * C(gender)", data)
print(f"Full factorial design: {X.shape}")

# Fit model
# fit_intercept=False: presumably because the patsy design matrix already
# carries its own intercept column - verify via X.design_info.column_names
model = LinearRegression(fit_intercept=False)
# y is a single-column matrix; ravel() flattens it to the 1-D target
model.fit(X, y.ravel())
print(f"Model R²: {model.score(X, y.ravel()):.3f}")

Demo Data for Testing Transformations

import patsy

# Generate data for testing various transformations
# "group" is categorical (3 levels); "x", "y", "z" are numerical; >= 30 rows
data = patsy.demo_data("group", "x", "y", "z", nlevels=3, min_rows=30)

# Test spline transformations
# bs() builds a B-spline basis for x with 4 degrees of freedom
spline_design = patsy.dmatrix("bs(x, df=4)", data)
print(f"B-spline design: {spline_design.shape}")

# Test interactions with categorical
# '*' gives both main effects and the group-by-x interaction
interaction_design = patsy.dmatrix("C(group) * x", data)
print(f"Interaction design: {interaction_design.shape}")

# Test stateful transforms
standardized_design = patsy.dmatrix("standardize(x) + standardize(y)", data)
print(f"Standardized design: {standardized_design.shape}")

# Complete model combining a categorical factor, a spline basis and a
# standardized covariate (all fixed effects; the formula has no random-effects terms)
complex_y, complex_X = patsy.dmatrices(
    "z ~ C(group) + bs(x, df=3) + standardize(y)", 
    data
)
print(f"Complex model: {complex_X.shape}")

Programmatic Model Construction

import patsy
from patsy import LookupFactor, ModelDesc, Term, INTERCEPT

# Function to build models programmatically
def build_model(outcome, predictors, interactions=None):
    """Build a ModelDesc programmatically.

    Parameters:
    - outcome (str): Name of the response variable (left-hand side)
    - predictors (iterable of str): Names of the main-effect variables
    - interactions (iterable of iterables of str, optional): Each entry lists
      the variable names that are combined into one interaction term,
      e.g. [("group", "condition")]. Pairs work as before; longer
      combinations are accepted as well.

    Returns:
    ModelDesc: Description equivalent to the formula
    "outcome ~ 1 + <predictors> + <interactions>"
    """
    # INTERCEPT is already a Term (the empty term), so it goes into the term
    # list as-is. Wrapping it in Term([...]) would treat the intercept Term as
    # a *factor*, which is not a valid factor object.
    rhs_terms = [INTERCEPT]

    # One single-factor term per main effect
    rhs_terms.extend(Term([LookupFactor(name)]) for name in predictors)

    # One multi-factor term per requested interaction (none when omitted)
    for combo in interactions or ():
        rhs_terms.append(Term([LookupFactor(name) for name in combo]))

    lhs_terms = [Term([LookupFactor(outcome)])]
    return ModelDesc(lhs_terms, rhs_terms)

# Use the function
# "group"/"condition" are categorical (names in a-m); "x", "y", "response"
# are numerical (names in p-z)
data = patsy.demo_data("group", "condition", "x", "y", "response")

# Build model: response ~ group + condition + x + group:condition
model = build_model(
    outcome="response",
    predictors=["group", "condition", "x"],
    interactions=[("group", "condition")]
)

# A ModelDesc can be passed to dmatrices in place of a formula string
y, X = patsy.dmatrices(model, data)
print(f"Programmatic model: {X.shape}")
print("Columns:", X.design_info.column_names)

Advanced Utility Patterns

Custom Data Generation

import patsy
import numpy as np


def create_experiment_data(n_subjects, n_conditions, n_timepoints):
    """Create realistic experimental data structure.

    Builds a fully crossed subject x condition x timepoint design with
    patsy.balanced and attaches a simulated 'measurement' column.

    Parameters:
    - n_subjects (int): Number of subject levels
    - n_conditions (int): Number of condition levels (any positive count)
    - n_timepoints (int): Number of timepoint levels

    Returns:
    dict: The balanced design dictionary ('subject', 'condition',
    'timepoint') plus a 'measurement' list with one value per row

    Notes:
    - Reseeds the global NumPy RNG with 42 on every call, so results are
      reproducible but other code sharing np.random is affected.
    """
    # Use balanced design for experimental structure
    design = patsy.balanced(
        subject=n_subjects,
        condition=n_conditions,
        timepoint=n_timepoints
    )

    # Add realistic measurement data
    np.random.seed(42)
    measurements = []
    for subj, cond, time in zip(design['subject'], design['condition'], design['timepoint']):
        # balanced() labels levels as "<factor><1-based index>"; recover the
        # index from each label to derive the simulated effects
        subject_effect = int(subj.replace('subject', '')) * 0.1
        # condition1 -> 0, condition2 -> 1, ...: parsed from the label rather
        # than looked up in a fixed 3-entry table, so any n_conditions works
        condition_effect = int(cond.replace('condition', '')) - 1
        time_effect = int(time.replace('timepoint', '')) * 0.2

        measurement = subject_effect + condition_effect + time_effect + np.random.normal(0, 0.3)
        measurements.append(measurement)

    design['measurement'] = measurements
    return design

# Use custom data generation
exp_data = create_experiment_data(10, 3, 4)
print(f"Experimental data: {len(exp_data['measurement'])} observations")

# Analyze with a fixed-effects formula (condition and timepoint main effects;
# subject is not modelled, and the formula has no random-effects terms)
y, X = patsy.dmatrices("measurement ~ C(condition) + C(timepoint)", exp_data)
print(f"Analysis design: {X.shape}")

Install with Tessl CLI