A Python package for describing statistical models and for building design matrices.
—
Functions and classes for handling categorical data in statistical models. Patsy provides automatic detection of categorical variables and flexible manual specification with custom contrast coding schemes.
Explicitly marks data as categorical and specifies how it should be interpreted in formulas.
def C(data, contrast=None, levels=None):
"""
Marks data as categorical and specifies interpretation options.
Parameters:
- data: Array-like data to be treated as categorical
- contrast (contrast object or None): Contrast coding scheme to use (Treatment, Sum, Helmert, etc.)
- levels (sequence or None): Explicit ordering of category levels
Returns:
Categorical factor object for use in formulas
"""

import patsy
import pandas as pd
data = pd.DataFrame({
'treatment': ['control', 'drug_a', 'drug_b', 'control', 'drug_a'],
'outcome': [1.2, 2.3, 3.1, 1.8, 2.9]
})
# Basic categorical specification
design = patsy.dmatrix("C(treatment)", data)
# With custom level ordering
design = patsy.dmatrix("C(treatment, levels=['control', 'drug_a', 'drug_b'])", data)
# With custom contrast coding
from patsy import Sum
design = patsy.dmatrix("C(treatment, Sum)", data)
# Combining with other terms
y, X = patsy.dmatrices("outcome ~ C(treatment) + I(treatment=='control')", data)

Determines whether data should be automatically treated as categorical based on its type and content.
def guess_categorical(data):
"""
Determine if data should be treated as categorical.
Parameters:
- data: Array-like data to examine
Returns:
bool: True if data appears categorical, False otherwise
"""

import patsy
import numpy as np
# String data is usually categorical
text_data = ['A', 'B', 'A', 'C', 'B']
print(patsy.guess_categorical(text_data)) # True
# Numeric data is not guessed to be categorical, even with few unique values;
# wrap it in C(...) in a formula to treat it as categorical
numeric_groups = [1, 2, 1, 3, 2, 1, 3]
print(patsy.guess_categorical(numeric_groups)) # False
# Continuous numeric data is not categorical
continuous = np.random.normal(0, 1, 100)
print(patsy.guess_categorical(continuous)) # False

Converts categorical data to integer codes for internal processing.
def categorical_to_int(data, levels=None, pandas_index=False):
"""
Convert categorical data to integer representation.
Parameters:
- data: Categorical data to convert
- levels (sequence or None): Explicit level ordering
- pandas_index (bool): Whether to return pandas index information
Returns:
Integer array with category codes, with missing values as -1
"""

import patsy
# Convert string categories to integers
categories = ['A', 'B', 'A', 'C', 'B']
int_codes = patsy.categorical_to_int(categories)
print(int_codes) # [0, 1, 0, 2, 1] or similar
# With explicit level ordering
int_codes = patsy.categorical_to_int(categories, levels=['C', 'B', 'A'])
print(int_codes) # Different ordering

A class that can detect and handle categorical variables automatically during formula evaluation.
class CategoricalSniffer:
"""
Automatically detects and handles categorical variables during formula processing.
"""
def __init__(self, NA_action, origin=None):
"""
Initialize categorical detection.
Parameters:
- NA_action: Strategy for handling missing data
- origin: Origin information for error reporting
"""

import patsy
from patsy.missing import NAAction
# Create a categorical sniffer
na_action = NAAction()
sniffer = patsy.CategoricalSniffer(na_action)
# The sniffer is typically used internally by patsy,
# but can be used manually for custom processing

Patsy recognizes several types of categorical data:
import pandas as pd
import patsy
# Pandas categorical data
cat_data = pd.Categorical(['A', 'B', 'A', 'C'], categories=['A', 'B', 'C'])
design = patsy.dmatrix("cat_data", {'cat_data': cat_data})

# String data is automatically treated as categorical
text_groups = ['control', 'treatment', 'control', 'treatment']
design = patsy.dmatrix("C(text_groups)", {'text_groups': text_groups})

# Numeric data can be explicitly marked categorical
numeric_groups = [1, 2, 1, 3, 2]
design = patsy.dmatrix("C(numeric_groups)", {'numeric_groups': numeric_groups})

Categorical variables work seamlessly with Patsy's contrast coding system:
import patsy
from patsy import Treatment, Sum, Helmert
data = {'group': ['A', 'B', 'C', 'A', 'B', 'C']}
# Default treatment contrasts
design1 = patsy.dmatrix("C(group)", data)
# Sum-to-zero contrasts
design2 = patsy.dmatrix("C(group, Sum)", data)
# Helmert contrasts
design3 = patsy.dmatrix("C(group, Helmert)", data)

Categorical functions respect Patsy's missing data handling:
Install with Tessl CLI
npx tessl i tessl/pypi-patsy