A Python package for describing statistical models and for building design matrices.
npx @tessl/cli install tessl/pypi-patsy@1.0.0

A Python package for describing statistical models (especially linear models or models with linear components) and building design matrices. Patsy brings the convenience of R-style 'formulas' to Python, allowing users to specify statistical models using intuitive string-based syntax like "y ~ x + I(x**2)". The library provides comprehensive functionality for transforming data into design matrices suitable for statistical analysis, handling categorical variables, interactions, transformations, and various statistical functions including splines.
pip install patsy

import patsy

Most common pattern for high-level functions:

from patsy import dmatrix, dmatrices, C

import patsy
import pandas as pd
import numpy as np
# Create some sample data
data = pd.DataFrame({
'y': [1, 2, 3, 4, 5, 6],
'x1': [1, 2, 3, 4, 5, 6],
'x2': [0.5, 1.5, 2.5, 3.5, 4.5, 5.5],
'group': ['A', 'A', 'B', 'B', 'C', 'C']
})
# Build a single design matrix (predictors only)
design_matrix = patsy.dmatrix("x1 + x2 + C(group)", data)
print(design_matrix)
# Build both outcome and predictor matrices
y, X = patsy.dmatrices("y ~ x1 + x2 + C(group)", data)
print("Outcome:", y)
print("Predictors:", X)
# Using interactions and transformations
design_matrix = patsy.dmatrix("x1 + I(x1**2) + x1:x2", data)
print(design_matrix)

Patsy is built around several key architectural components:
This design enables flexible model specification while providing efficient matrix construction for statistical computing.
The main entry points for creating design matrices from formula strings. These functions handle the complete workflow from formula parsing to matrix construction.
def dmatrix(formula_like, data={}, eval_env=0, NA_action="drop", return_type="matrix"): ...
def dmatrices(formula_like, data={}, eval_env=0, NA_action="drop", return_type="matrix"): ...
def incr_dbuilder(formula_like, data_iter_maker, eval_env=0, NA_action="drop"): ...
def incr_dbuilders(formula_like, data_iter_maker, eval_env=0, NA_action="drop"): ...

Functions and classes for handling categorical data, including automatic detection, manual specification, and conversion utilities.
def C(data, contrast=None, levels=None): ...
def guess_categorical(data): ...
def categorical_to_int(data, levels=None, pandas_index=False): ...
class CategoricalSniffer: ...

Classes implementing different contrast coding schemes for categorical variables, essential for statistical modeling.
class ContrastMatrix: ...
class Treatment: ...
class Sum: ...
class Helmert: ...
class Poly: ...
class Diff: ...

B-splines and cubic regression splines for modeling non-linear relationships, compatible with R and MGCV implementations.
def bs(x, df=None, knots=None, degree=3, include_intercept=False, lower_bound=None, upper_bound=None): ...
def cr(x, df=10, constraints=None): ...
def cc(x, df=10, constraints=None): ...
def te(*args, **kwargs): ...

Transform functions that maintain state across data processing, useful for centering, standardization, and custom transformations.
def stateful_transform(class_): ...
def center(x): ...
def standardize(x): ...
def scale(x, ddof=0): ...

Lower-level functions for constructing design matrices from parsed terms, providing more control over the matrix building process.
def design_matrix_builders(termlists, data_iter_maker, eval_env=None, NA_action="drop"): ...
def build_design_matrices(builders, data, NA_action=None, return_type="matrix"): ...

Special functions available in formula namespaces for escaping arithmetic operations and handling variable names with special characters.
def I(x): ...
def Q(name): ...

Helper functions for generating test data, creating balanced designs, and other common tasks.
def balanced(*factors, levels=None): ...
def demo_data(formula, num_rows=100, seed=None): ...
class LookupFactor: ...

class PatsyError(Exception):
"""Main exception class for Patsy-specific errors."""
def __init__(self, message, origin=None): ...
def set_origin(self, origin): ...
class ModelDesc:
"""Describes the overall structure of a statistical model."""
@classmethod
def from_formula(cls, formula_string, default_env=0): ...
class Term:
"""Represents a term in a statistical model."""
def __init__(self, factors, origin=None): ...
class DesignInfo:
"""Information about the structure of a design matrix."""
def __init__(self, column_names, factor_infos=None, term_name_slices=None,
term_names=None, terms=None, builder=None): ...
class DesignMatrix(numpy.ndarray):
"""numpy array subclass with design matrix metadata."""
@property
def design_info(self): ...
class LinearConstraint:
"""Class for representing linear constraints on design matrices."""
def __init__(self, constraint_matrix, constants=None): ...
class NAAction:
"""Defines strategy for handling missing data."""
def __init__(self, on_NA="drop", NA_types=["None", "NaN"]): ...
def is_numerical_NA(self, array): ...
def is_categorical_NA(self, array): ...
class EvalEnvironment:
"""Captures the environment for evaluating formulas."""
def __init__(self, namespaces, flags=0): ...
@classmethod
def capture(cls, depth=0, reference=None): ...
def eval(self, code, inner_namespace={}): ...
def namespace(self, name): ...
class EvalFactor:
"""Factor that evaluates arbitrary Python code in a given environment."""
def __init__(self, code, origin=None): ...
def eval(self, state, env): ...
def name(self): ...
class Origin:
"""Tracks the origin of objects in strings for error reporting."""
def __init__(self, code, start, end): ...
@classmethod
def combine(cls, origin_objs): ...
def caretize(self, indent=0): ...

INTERCEPT: Term # Special constant representing the intercept term