A Python package for describing statistical models and for building design matrices.
Stateful Transforms
Transform functions that maintain state across data processing operations. These transforms remember characteristics of the training data and apply consistent transformations to new data, essential for preprocessing in statistical modeling.
Creates stateful transform callable objects from classes implementing the stateful transform protocol.
def stateful_transform(class_):
    """
    Wrap a class implementing the stateful transform protocol into a
    callable transform usable inside formulas.

    The protocol expected of ``class_``:

    * ``__init__()`` -- set up empty state
    * ``memorize_chunk(input_data)`` -- accumulate statistics while learning
    * ``memorize_finish()`` -- finalize the learned parameters
    * ``transform(input_data)`` -- apply the learned transformation to data

    Returns
    -------
    A callable transform object that can be used in formula strings.
    """


import patsy
import numpy as np


# Example: a user-defined stateful transform class
class CustomScale:
    def __init__(self):
        # Learned multiplier; filled in during the memorize phase
        self.scale_factor = None

    def memorize_chunk(self, input_data):
        # Accumulate statistics over training chunks (no-op in this sketch)
        pass

    def memorize_finish(self):
        # Finalize the learned parameters once all chunks have been seen
        pass

    def transform(self, input_data):
        # Apply the learned scaling consistently to any input
        return input_data * self.scale_factor

# Create the stateful transform
# Wrap the class into a formula-ready callable transform
custom_scale = patsy.stateful_transform(CustomScale)
# Use in formulas (conceptually)
# design = patsy.dmatrix("custom_scale(x)", data)

Subtracts the mean from the data, centering it around zero while preserving the scale.
def center(x):
    """
    Stateful transform: subtract the training mean from ``x``.

    Parameters
    ----------
    x : array-like
        Data to center.

    Returns
    -------
    Array of the same shape as ``x``, centered around zero.

    Notes
    -----
    * Multi-column input is centered column by column.
    * Equivalent to ``standardize(x, rescale=False)``.
    * State: remembers the mean of the training data.
    """


import patsy
import numpy as np
import pandas as pd

# Sample data
data = pd.DataFrame({
    'x': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
    'y': [2, 4, 6, 8, 10, 12, 14, 16, 18, 20]
})
# Center a variable in a formula
design = patsy.dmatrix("center(x)", data)
print(f"Original mean: {np.mean(data['x'])}")
# Column 0 is the intercept (all ones); column 1 holds center(x),
# so check the centered column only — the whole-matrix mean is NOT ~0.
print(f"Centered mean: {np.mean(design[:, 1])}")  # Should be close to 0
# Center multiple variables
design = patsy.dmatrix("center(x) + center(y)", data)
# Complete model with centering
y_matrix, X_matrix = patsy.dmatrices("y ~ center(x)", data)
# Centering preserves relationships but changes the intercept's interpretation
print("Design matrix mean by column:", np.mean(X_matrix, axis=0))

Centers data and scales it to unit variance (z-score standardization).
def standardize(x, center=True, rescale=True, ddof=0):
    """
    Stateful transform: z-score standardize ``x``.

    Parameters
    ----------
    x : array-like
        Data to standardize.
    center : bool, default True
        Subtract the mean.
    rescale : bool, default True
        Divide by the standard deviation.
    ddof : int, default 0
        Delta degrees of freedom for the standard deviation:
        0 gives the maximum-likelihood estimate (divides by n);
        1 gives the unbiased estimate (divides by n - 1).

    Returns
    -------
    Array of the same shape as ``x``, standardized column by column.

    Notes
    -----
    State: remembers the mean and standard deviation of the training data.
    """


import patsy
import numpy as np
import pandas as pd

# Sample data with very different scales
data = pd.DataFrame({
    'small': [0.1, 0.2, 0.3, 0.4, 0.5],
    'large': [100, 200, 300, 400, 500],
    'y': [1, 2, 3, 4, 5]
})
# Standardize variables to mean 0, std 1
design = patsy.dmatrix("standardize(small) + standardize(large)", data)
# Skip column 0 (the intercept, all ones) when checking the standardized columns;
# including it would make the reported means/stds misleading.
print("Standardized means:", np.mean(design[:, 1:], axis=0))  # Should be ~0
print("Standardized stds:", np.std(design[:, 1:], axis=0))  # Should be ~1
# Only center, without rescaling
design = patsy.dmatrix("standardize(small, rescale=False)", data)
# Only rescale, without centering
design = patsy.dmatrix("standardize(small, center=False)", data)
# Use the unbiased standard deviation (ddof=1)
design = patsy.dmatrix("standardize(small, ddof=1)", data)
# Complete model with standardization
y_matrix, X_matrix = patsy.dmatrices("y ~ standardize(small) + standardize(large)", data)

Alias for the standardize function, providing the same functionality.
def scale(x, ddof=0):
    """
    Alias for :func:`standardize`.

    Equivalent to ``standardize(x, center=True, rescale=True, ddof=ddof)``.

    Parameters
    ----------
    x : array-like
        Data to scale.
    ddof : int, default 0
        Delta degrees of freedom for the standard deviation computation.

    Returns
    -------
    Standardized array (mean 0, standard deviation 1).
    """


import patsy
import numpy as np  # needed below for np.allclose; was missing from this snippet
import pandas as pd

data = pd.DataFrame({
    'x': [10, 20, 30, 40, 50],
    'y': [1, 4, 9, 16, 25]
})
# scale() is equivalent to standardize()
design1 = patsy.dmatrix("scale(x)", data)
design2 = patsy.dmatrix("standardize(x)", data)
print("Designs are equal:", np.allclose(design1, design2))
# Complete model using scale
y_matrix, X_matrix = patsy.dmatrices("y ~ scale(x)", data)

Stateful transforms work in two phases:
Learning Phase (during initial matrix construction):
- memorize_chunk(): Process training data chunks
- memorize_finish(): Finalize parameter computation

Transform Phase (during application to new data):
- transform(): Apply learned parameters to new data

import patsy
import numpy as np
# Training data
train_data = {'x': [1, 2, 3, 4, 5]}
builder = patsy.dmatrix("standardize(x)", train_data)
# The standardize transform has learned the mean and std from training data
# Now it can be applied consistently to new data
test_data = {'x': [1.5, 2.5, 3.5]}
test_design = builder.transform(test_data) # Uses same mean/std from trainingStateful transforms work with Patsy's incremental processing for large datasets:
import patsy


def data_chunks():
    # Generator yielding data chunks
    for i in range(0, 10000, 1000):
        yield {'x': list(range(i, i + 1000))}

# Learn the transform state incrementally across all chunks.
# incr_dbuilder returns a DesignInfo describing the design (no .build() method).
builder = patsy.incr_dbuilder("standardize(x)", data_chunks)
# Apply to new data using the learned parameters
new_data = {'x': [5000, 5001, 5002]}
design = patsy.build_design_matrices([builder], new_data)[0]

# Chain transforms
# Note: redundant — standardize() already centers its input
design = patsy.dmatrix("center(standardize(x))", data)
# Apply different transforms to different variables
design = patsy.dmatrix("center(x1) + standardize(x2) + scale(x3)", data)


class RobustScale:
    """Custom stateful transform using the median and MAD instead of mean and std."""

    def __init__(self):
        self.median = None
        self.mad = None

    def memorize_chunk(self, input_data):
        # In practice you would accumulate statistics across chunks;
        # this sketch just records them from the first chunk seen.
        values = np.asarray(input_data)
        if self.median is None:
            self.median = np.median(values)
            self.mad = np.median(np.abs(values - self.median))

    def memorize_finish(self):
        # Nothing left to finalize in this simplified version
        pass

    def transform(self, input_data):
        values = np.asarray(input_data)
        # 1.4826 makes the MAD a consistent estimator of sigma for normal data
        return (values - self.median) / (1.4826 * self.mad)

# Create the transform
robust_scale = patsy.stateful_transform(RobustScale)

import patsy
from sklearn.linear_model import LinearRegression
# Create standardized design matrices
data = {'x': [1, 2, 3, 4, 5], 'y': [2, 4, 6, 8, 10]}
y, X = patsy.dmatrices("y ~ standardize(x)", data)
# Fit model
model = LinearRegression(fit_intercept=False)
model.fit(X, y.ravel())
# The transform state is preserved for new predictions
new_data = {'x': [1.5, 2.5, 3.5]}
X_new = patsy.dmatrix("standardize(x)", new_data,
return_type="matrix") # Uses same standardization parameters
predictions = model.predict(X_new)Install with Tessl CLI
npx tessl i tessl/pypi-patsy