tessl/pypi-patsy

A Python package for describing statistical models and for building design matrices.

—

Pending

Overview

Eval results

Files

Design Matrix Building

Name: tessl/pypi-patsy
Author: tessl

Lower-level functions for constructing design matrices from parsed terms, providing more control over the matrix building process than the high-level interface. These functions form the core of Patsy's formula interpretation machinery.

Capabilities

Design Matrix Builders Construction

Creates design matrix builder objects from term lists, which can then be used to construct matrices from data.

def design_matrix_builders(termlists, data_iter_maker, eval_env, NA_action="drop"):
    """
    Construct DesignMatrixBuilder objects from term lists.

    Together with build_design_matrices(), this is one of Patsy's two
    fundamental functions; between them they provide the core formula
    interpretation machinery.

    Parameters:
    - termlists: List of term lists. Each term list contains the Term objects
                that specify the structure of one design matrix.
    - data_iter_maker: Zero-argument callable returning an iterator over
                dict-like data objects (for example a generator function
                that yields one DataFrame or dict per chunk).
                NOTE(review): presumably a callable rather than a plain
                iterator so the data can be traversed more than once --
                confirm against the patsy documentation.
    - eval_env: EvalEnvironment used for variable lookup and evaluation
    - NA_action: Strategy for handling missing data. Usually a string such as
                "drop" or "raise"; the custom missing-data example in this
                document passes an NAAction instance instead.

    Returns:
    List of DesignMatrixBuilder objects, one for each input term list.
    NOTE(review): recent patsy releases describe these objects as DesignInfo
    instances -- confirm the name against the patsy version being documented.
    """

Usage Examples

import patsy
from patsy import ModelDesc, Term, EvalEnvironment, EvalFactor
from patsy.desc import INTERCEPT
import pandas as pd

# Sample data set: two numeric columns and one categorical column
data = pd.DataFrame({
    'x': [1, 2, 3, 4, 5],
    'y': [2, 4, 6, 8, 10],
    'group': ['A', 'B', 'A', 'B', 'A']
})

def data_iter_maker():
    """Yield the whole data set as a single chunk."""
    yield data

# Create term lists manually (usually done by formula parsing)
# This is typically internal to Patsy, but shown for completeness
# INTERCEPT is already a Term, so it goes into a term list as-is; every
# other Term wraps factor objects (here EvalFactor), not bare strings.
terms_y = [INTERCEPT]  # Just intercept for outcome
terms_x = [
    INTERCEPT,
    Term([EvalFactor('x')]),
    Term([EvalFactor('group')]),
]  # Intercept + predictors

# One term list per design matrix to be built
termlists = [terms_y, terms_x]
eval_env = EvalEnvironment.capture()

# Build design matrix builders
builders = patsy.design_matrix_builders(termlists, data_iter_maker, eval_env)
print(f"Number of builders: {len(builders)}")

Design Matrix Construction

Constructs actual design matrices from pre-built design matrix builder objects.

def build_design_matrices(builders, data, NA_action="drop", return_type="matrix", dtype=float):
    """
    Construct design matrices from DesignMatrixBuilder objects.

    Together with design_matrix_builders(), this is one of Patsy's two
    fundamental functions; between them they form the core formula
    interpretation API.

    Parameters:
    - builders: List of DesignMatrixBuilder objects (from design_matrix_builders)
    - data: Dict-like object containing the data for matrix construction;
            the examples in this document pass both pandas DataFrames and
            plain dicts of lists
    - NA_action: Strategy for handling missing data; a string such as "drop"
            or "raise" (the custom missing-data example in this document
            passes an NAAction instance instead)
    - return_type (str): "matrix" for numpy arrays, "dataframe" for pandas DataFrames
    - dtype: Data type for the resulting matrices (default: float).
            NOTE(review): presumably this must be a floating-point dtype --
            confirm against the patsy documentation.

    Returns:
    List of design matrices (DesignMatrix objects or DataFrames), one per
    entry in builders
    """

Usage Examples

import patsy
import pandas as pd
import numpy as np

# Using the builders from the previous example
# (In practice, you'd usually get builders from the high-level interface)

# New data for matrix construction
new_data = pd.DataFrame({
    'x': [1.5, 2.5, 3.5],
    'y': [3, 5, 7],
    'group': ['A', 'B', 'A']
})

# Build matrices using the pre-constructed builders
matrices = patsy.build_design_matrices(builders, new_data)
print(f"Number of matrices: {len(matrices)}")
print(f"Outcome matrix shape: {matrices[0].shape}")
print(f"Predictor matrix shape: {matrices[1].shape}")

# With different return type
matrices_df = patsy.build_design_matrices(builders, new_data, return_type="dataframe")
print("DataFrame columns:", matrices_df[1].columns.tolist())

# With different data type
# Design matrices are real-valued floating point, so choose a float dtype
# (e.g. single precision); an integer dtype is not a meaningful choice here.
matrices_float32 = patsy.build_design_matrices(builders, new_data, dtype=np.float32)

Integration with High-Level Interface

The high-level functions call these matrix-building functions behind the scenes:

import patsy

# High-level interface (what users typically use)
# `data` is the DataFrame built in the first usage example. The result is
# unpacked into two objects, conventionally the outcome (y) and predictor (X)
# design matrices.
y, X = patsy.dmatrices("y ~ x + C(group)", data)

# Is roughly equivalent to this lower-level process:
# 1. Parse formula to create ModelDesc
# 2. Extract term lists from ModelDesc
# 3. Create data iterator
# 4. Call design_matrix_builders()
# 5. Call build_design_matrices()

Advanced Usage Patterns

Incremental Processing with Builders

import patsy

# Create builders for incremental processing
def large_data_iter():
    """Yield a simulated large data set as ten dict chunks of 1000 rows.

    Each chunk maps 'x' to a run of consecutive integers and 'y' to twice
    those values; taken together the chunks cover 0 through 9999.
    """
    chunk_size = 1000
    for start in range(0, 10000, chunk_size):
        xs = list(range(start, start + chunk_size))
        yield {'x': xs, 'y': [2 * value for value in xs]}

# Turn the formula into a model description and capture the calling scope
model_desc = patsy.ModelDesc.from_formula("y ~ x")
eval_env = patsy.EvalEnvironment.capture()

# Collect one term list per design matrix; the left-hand side is left out
# when the model description has no outcome terms
rhs_termlist = [model_desc.rhs_termlist]
lhs_termlist = []
if model_desc.lhs_termlist:
    lhs_termlist = [model_desc.lhs_termlist]
termlists = [*lhs_termlist, *rhs_termlist]

# Create builders, feeding them the chunked data iterator maker
builders = patsy.design_matrix_builders(termlists, large_data_iter, eval_env)

# Apply the builders to a small batch of new data
new_data = dict(x=[5000, 5001, 5002], y=[10000, 10002, 10004])
matrices = patsy.build_design_matrices(builders, new_data)

Custom Missing Data Handling

import patsy
from patsy.missing import NAAction

# Custom NA action
class CustomNAAction(NAAction):
    """Example NA action that drops every row flagged as missing in any factor."""

    def handle_NA(self, values, is_NA, origins):
        """Return `values` with the rows that contain missing data removed.

        Per the patsy NAAction documentation, the arguments are parallel
        lists with one entry per factor:

        - values: list of arrays (1-d or 2-d) sharing the same number of rows
        - is_NA: list of boolean arrays marking the rows with missing data
        - origins: list of origin objects for error reporting (unused here)

        Returns a list of arrays in the same order as `values`.
        """
        # This is a simplified example of custom logic
        if len(values) == 0:
            return values
        # A row is kept only if no factor flags it as missing
        keep = None
        for mask in is_NA:
            keep = ~mask if keep is None else keep & ~mask
        # "..." lets the same row mask index both 1-d and 2-d value arrays
        return [value[keep, ...] for value in values]

# A single shared instance is handed to both stages below
custom_na_action = CustomNAAction()

# Use with matrix builders
builders = patsy.design_matrix_builders(
    termlists, data_iter_maker, eval_env, NA_action=custom_na_action
)

# ...and again when the matrices themselves are built
matrices = patsy.build_design_matrices(builders, data, NA_action=custom_na_action)

Reusing Builders for Different Data

import patsy
import pandas as pd

# Original training data
train_data = pd.DataFrame({
    'x': [1, 2, 3, 4, 5],
    'y': [2, 4, 6, 8, 10],
    'group': ['A', 'B', 'A', 'B', 'A']
})

# Create builders from training data
def train_iter():
    """Yield the training data as a single chunk."""
    yield train_data

model_desc = patsy.ModelDesc.from_formula("y ~ x + C(group)")
eval_env = patsy.EvalEnvironment.capture()
# lhs_termlist and rhs_termlist are already term lists, so they are placed
# in termlists directly (one entry per design matrix) without extra nesting.
termlists = [model_desc.lhs_termlist, model_desc.rhs_termlist]

builders = patsy.design_matrix_builders(termlists, train_iter, eval_env)

# Test data (different size, potentially different factor levels)
test_data = pd.DataFrame({
    'x': [1.5, 2.5, 3.5, 4.5],
    'y': [3, 5, 7, 9],
    'group': ['A', 'B', 'A', 'C']  # Note: 'C' is a new level
})

# Build matrices for test data using same builders
# Catch patsy's own error type (as the error-handling examples in this
# document do) so that unrelated failures are not silently reported as
# factor-level problems.
try:
    test_matrices = patsy.build_design_matrices(builders, test_data)
except patsy.PatsyError as e:
    print(f"Error with new factor level: {e}")
    # Handle new factor levels appropriately
else:
    print("Test matrices built successfully")
    # Handle new factor levels appropriately

Builder Objects and Metadata

DesignMatrixBuilder Properties

# Builders contain metadata about the design matrix structure
builder = builders[1]  # Predictor matrix builder

# Access design information
# NOTE(review): in recent patsy releases the builder presumably already is
# the DesignInfo object, making `.design_info` a legacy accessor -- confirm.
print("Column names:", builder.design_info.column_names)
print("Terms:", builder.design_info.terms)
# factor_infos is a dict keyed by factor object; the FactorInfo records
# (which carry the `.factor` attribute) are its values.
print("Factor infos:", [fi.factor.code for fi in builder.design_info.factor_infos.values()])

# Check for stateful transforms
# NOTE(review): built-in factor types may all expose memorize_chunk, in
# which case this check flags every factor -- verify before relying on it.
for factor_info in builder.design_info.factor_infos.values():
    if hasattr(factor_info.factor, 'memorize_chunk'):
        print(f"Stateful factor: {factor_info.factor}")

Matrix Metadata

# Built matrices contain rich metadata
matrix = matrices[1]  # Predictor matrix

# Guarded because only DesignMatrix results are expected to carry
# design_info (presumably DataFrame results may differ -- confirm).
if hasattr(matrix, 'design_info'):
    print("Matrix column names:", matrix.design_info.column_names)
    print("Matrix shape:", matrix.shape)
    # column_name_indexes maps each column name to its column index; it does
    # not describe terms, so the label says what is actually printed.
    print("Column name indexes:", matrix.design_info.column_name_indexes)

Error Handling and Debugging

Common Issues

# Factor level mismatches
# `data_with_new_levels` is a placeholder for data containing factor levels
# the builders were not created with; it is not defined in these examples.
try:
    matrices = patsy.build_design_matrices(builders, data_with_new_levels)
except patsy.PatsyError as e:
    print(f"Factor level error: {e}")
    # Handle appropriately (drop new levels, add to design, etc.)

# Missing data issues
# `data_with_nas` is likewise a placeholder for data containing missing
# values; NA_action="raise" asks for an error instead of dropping rows.
try:
    matrices = patsy.build_design_matrices(builders, data_with_nas, NA_action="raise")
except patsy.PatsyError as e:
    print(f"Missing data error: {e}")
    # Switch to different NA_action or preprocess data

Debugging Matrix Construction

# Show which columns each builder will produce
print("Builder design infos:")
for position, design_builder in enumerate(builders):
    column_names = design_builder.design_info.column_names
    print(f"Builder {position}: {column_names}")

# Report the shape and dtype of every built matrix, plus its column names
# when the matrix carries design metadata
for position, design_matrix in enumerate(matrices):
    shape, dtype = design_matrix.shape, design_matrix.dtype
    print(f"Matrix {position}: shape={shape}, dtype={dtype}")
    if not hasattr(design_matrix, 'design_info'):
        continue
    print(f"  Columns: {design_matrix.design_info.column_names}")

Install with Tessl CLI