A Python package for describing statistical models and for building design matrices.
Stateful Transforms
Transform functions that maintain state across data processing operations. These transforms remember characteristics of the training data and apply consistent transformations to new data, essential for preprocessing in statistical modeling.
Creates stateful transform callable objects from classes implementing the stateful transform protocol.
def stateful_transform(class_):
    """
    Wrap a class implementing the stateful transform protocol into a
    callable transform usable inside formulas.

    The protocol expected of ``class_``:

    * ``__init__()`` -- set up empty state
    * ``memorize_chunk(input_data)`` -- accumulate statistics while learning
    * ``memorize_finish()`` -- finalize the learned parameters
    * ``transform(input_data)`` -- apply the learned transformation to data

    Returns
    -------
    A callable transform object that can be used in formula strings.
    """


import patsy
import numpy as np


# Example: a user-defined stateful transform class
class CustomScale:
    def __init__(self):
        # Learned multiplier; filled in during the memorize phase
        self.scale_factor = None

    def memorize_chunk(self, input_data):
        # Accumulate statistics over training chunks (no-op in this sketch)
        pass

    def memorize_finish(self):
        # Finalize the learned parameters once all chunks have been seen
        pass

    def transform(self, input_data):
        # Apply the learned scaling consistently to any input
        return input_data * self.scale_factor

# Create the stateful transform
# Wrap the class into a formula-ready callable transform
custom_scale = patsy.stateful_transform(CustomScale)
# Use in formulas (conceptually)
# design = patsy.dmatrix("custom_scale(x)", data)

Subtracts the mean from the data, centering it around zero while preserving the scale.
def center(x):
    """
    Stateful transform: subtract the training mean from ``x``.

    Parameters
    ----------
    x : array-like
        Data to center.

    Returns
    -------
    Array of the same shape as ``x``, centered around zero.

    Notes
    -----
    * Multi-column input is centered column by column.
    * Equivalent to ``standardize(x, rescale=False)``.
    * State: remembers the mean of the training data.
    """


import patsy
import numpy as np
import pandas as pd

# Sample data
data = pd.DataFrame({
    'x': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
    'y': [2, 4, 6, 8, 10, 12, 14, 16, 18, 20]
})
# Center a variable in a formula
design = patsy.dmatrix("center(x)", data)
print(f"Original mean: {np.mean(data['x'])}")
# Column 0 is the intercept (all ones); column 1 holds center(x),
# so check the centered column only — the whole-matrix mean is NOT ~0.
print(f"Centered mean: {np.mean(design[:, 1])}")  # Should be close to 0
# Center multiple variables
design = patsy.dmatrix("center(x) + center(y)", data)
# Complete model with centering
y_matrix, X_matrix = patsy.dmatrices("y ~ center(x)", data)
# Centering preserves relationships but changes the intercept's interpretation
print("Design matrix mean by column:", np.mean(X_matrix, axis=0))

Centers data and scales it to unit variance (z-score standardization).
def standardize(x, center=True, rescale=True, ddof=0):
    """
    Stateful transform: z-score standardize ``x``.

    Parameters
    ----------
    x : array-like
        Data to standardize.
    center : bool, default True
        Subtract the mean.
    rescale : bool, default True
        Divide by the standard deviation.
    ddof : int, default 0
        Delta degrees of freedom for the standard deviation:
        0 gives the maximum-likelihood estimate (divides by n);
        1 gives the unbiased estimate (divides by n - 1).

    Returns
    -------
    Array of the same shape as ``x``, standardized column by column.

    Notes
    -----
    State: remembers the mean and standard deviation of the training data.
    """


import patsy
import numpy as np
import pandas as pd

# Sample data with very different scales
data = pd.DataFrame({
    'small': [0.1, 0.2, 0.3, 0.4, 0.5],
    'large': [100, 200, 300, 400, 500],
    'y': [1, 2, 3, 4, 5]
})
# Standardize variables to mean 0, std 1
design = patsy.dmatrix("standardize(small) + standardize(large)", data)
# Skip column 0 (the intercept, all ones) when checking the standardized columns;
# including it would make the reported means/stds misleading.
print("Standardized means:", np.mean(design[:, 1:], axis=0))  # Should be ~0
print("Standardized stds:", np.std(design[:, 1:], axis=0))  # Should be ~1
# Only center, without rescaling
design = patsy.dmatrix("standardize(small, rescale=False)", data)
# Only rescale, without centering
design = patsy.dmatrix("standardize(small, center=False)", data)
# Use the unbiased standard deviation (ddof=1)
design = patsy.dmatrix("standardize(small, ddof=1)", data)
# Complete model with standardization
y_matrix, X_matrix = patsy.dmatrices("y ~ standardize(small) + standardize(large)", data)

Alias for the standardize function, providing the same functionality.
def scale(x, ddof=0):
    """
    Alias for :func:`standardize`.

    Equivalent to ``standardize(x, center=True, rescale=True, ddof=ddof)``.

    Parameters
    ----------
    x : array-like
        Data to scale.
    ddof : int, default 0
        Delta degrees of freedom for the standard deviation computation.

    Returns
    -------
    Standardized array (mean 0, standard deviation 1).
    """


import patsy
import numpy as np  # needed below for np.allclose; was missing from this snippet
import pandas as pd

data = pd.DataFrame({
    'x': [10, 20, 30, 40, 50],
    'y': [1, 4, 9, 16, 25]
})
# scale() is equivalent to standardize()
design1 = patsy.dmatrix("scale(x)", data)
design2 = patsy.dmatrix("standardize(x)", data)
print("Designs are equal:", np.allclose(design1, design2))
# Complete model using scale
y_matrix, X_matrix = patsy.dmatrices("y ~ scale(x)", data)

Stateful transforms work in two phases:
Learning Phase (during initial matrix construction):
- memorize_chunk(): Process training data chunks
- memorize_finish(): Finalize parameter computation

Transform Phase (during application to new data):
- transform(): Apply learned parameters to new data

import patsy
import numpy as np
# Training data
train_data = {'x': [1, 2, 3, 4, 5]}
builder = patsy.dmatrix("standardize(x)", train_data)
# The standardize transform has learned the mean and std from training data
# Now it can be applied consistently to new data
test_data = {'x': [1.5, 2.5, 3.5]}
test_design = builder.transform(test_data) # Uses same mean/std from trainingStateful transforms work with Patsy's incremental processing for large datasets:
import patsy


def data_chunks():
    # Generator yielding data chunks
    for i in range(0, 10000, 1000):
        yield {'x': list(range(i, i + 1000))}

# Learn the transform state incrementally across all chunks.
# incr_dbuilder returns a DesignInfo describing the design (no .build() method).
builder = patsy.incr_dbuilder("standardize(x)", data_chunks)
# Apply to new data using the learned parameters
new_data = {'x': [5000, 5001, 5002]}
design = patsy.build_design_matrices([builder], new_data)[0]

# Chain transforms
# Note: redundant — standardize() already centers its input
design = patsy.dmatrix("center(standardize(x))", data)
# Apply different transforms to different variables
design = patsy.dmatrix("center(x1) + standardize(x2) + scale(x3)", data)


class RobustScale:
    """Custom stateful transform using the median and MAD instead of mean and std."""

    def __init__(self):
        self.median = None
        self.mad = None

    def memorize_chunk(self, input_data):
        # In practice you would accumulate statistics across chunks;
        # this sketch just records them from the first chunk seen.
        values = np.asarray(input_data)
        if self.median is None:
            self.median = np.median(values)
            self.mad = np.median(np.abs(values - self.median))

    def memorize_finish(self):
        # Nothing left to finalize in this simplified version
        pass

    def transform(self, input_data):
        values = np.asarray(input_data)
        # 1.4826 makes the MAD a consistent estimator of sigma for normal data
        return (values - self.median) / (1.4826 * self.mad)

# Create the transform
robust_scale = patsy.stateful_transform(RobustScale)

import patsy
from sklearn.linear_model import LinearRegression
# Create standardized design matrices
data = {'x': [1, 2, 3, 4, 5], 'y': [2, 4, 6, 8, 10]}
y, X = patsy.dmatrices("y ~ standardize(x)", data)
# Fit model
model = LinearRegression(fit_intercept=False)
model.fit(X, y.ravel())
# The transform state is preserved for new predictions
new_data = {'x': [1.5, 2.5, 3.5]}
X_new = patsy.dmatrix("standardize(x)", new_data,
return_type="matrix") # Uses same standardization parameters
predictions = model.predict(X_new)Install with Tessl CLI
npx tessl i tessl/pypi-patsy