CtrlK
Blog | Docs | Log in | Get started
Tessl Logo

tessl/pypi-patsy

A Python package for describing statistical models and for building design matrices.

Pending
Overview
Eval results
Files

docs/builtins.md

Built-in Functions

Patsy makes two special functions available in every formula namespace: I(), which escapes arithmetic operations so they are not interpreted as formula operators, and Q(), which refers to variables whose names are not valid Python identifiers. Both are automatically imported into the formula evaluation environment.

Capabilities

Identity Function

Escapes arithmetic operations from formula parsing, allowing complex expressions to be treated as single terms.

def I(x):
    """
    Identity function: return the input unchanged.

    I() exists to 'hide' arithmetic from Patsy's formula parser. The parser
    ignores anything inside function call syntax, so operators written inside
    I(...) are evaluated as ordinary arithmetic and the whole expression is
    treated as a single predictor.

    Parameters:
    - x: Any expression or value

    Returns:
    The input value unchanged
    """

Usage Examples

# Demonstrates how I() turns an arithmetic expression into a single model term.
import patsy
import pandas as pd
import numpy as np

# Small numeric data set used by the I() examples
data = pd.DataFrame({
    'x1': [1, 2, 3, 4, 5],
    'x2': [2, 4, 6, 8, 10],
    'y': [3, 6, 9, 12, 15]
})

# Without I(): "+" is a formula operator, so x1 and x2 become separate terms
design1 = patsy.dmatrix("x1 + x2", data)
print(f"Without I(): {design1.shape[1]} columns")  # 3 columns: intercept, x1, x2

# With I(): "+" is plain addition, so the sum is a single term
design2 = patsy.dmatrix("I(x1 + x2)", data)
print(f"With I(): {design2.shape[1]} columns")     # 2 columns: intercept, sum

# More complex expressions
design3 = patsy.dmatrix("I(x1**2) + I(x2**3)", data)  # Polynomial terms
design4 = patsy.dmatrix("I(x1 * x2)", data)           # Elementwise product as a single term
design5 = patsy.dmatrix("I((x1 + x2) / 2)", data)     # Arithmetic mean

# Complete model with I() functions: dmatrices returns the outcome and
# predictor matrices for a two-sided ("y ~ ...") formula
y, X = patsy.dmatrices("y ~ I(x1**2) + I(x1 * x2)", data)
print("Column names:", X.design_info.column_names)

Variable Name Quoting

Allows reference to variable names that don't conform to Python identifier rules.

def Q(name):
    """
    Look up a variable by a quoted name.

    Takes a string containing a variable name and returns the value of that
    variable from the evaluation environment. This makes it possible to use
    names that are not valid Python identifiers: column names containing
    special characters or spaces, or names that are reserved words.

    Parameters:
    - name (str): String containing the variable name to look up

    Returns:
    The value of the named variable from the evaluation environment

    Raises:
    NameError: If no variable with the given name is found
    """

Usage Examples

# Demonstrates Q() for column names that cannot be written directly in a formula.
import patsy
import pandas as pd

# Data with problematic column names
data = pd.DataFrame({
    'weight.in.kg': [70, 80, 90, 75, 85],
    'height in cm': [170, 180, 185, 175, 182],
    'age-years': [25, 30, 35, 28, 32],
    'class': [1, 2, 1, 2, 1],  # 'class' is a Python reserved word
    'y': [1, 2, 3, 4, 5]
})

# These would fail without Q(), because the names are parsed as Python code:
# design = patsy.dmatrix("weight.in.kg", data)  # Error: "." is attribute access
# design = patsy.dmatrix("height in cm", data)  # Error: 'in' is a Python operator/keyword
# design = patsy.dmatrix("class", data)         # Error: 'class' is a reserved word

# Use Q() to handle problematic names:
design1 = patsy.dmatrix('Q("weight.in.kg")', data)
design2 = patsy.dmatrix('Q("height in cm")', data)
design3 = patsy.dmatrix('Q("age-years")', data)  # unquoted, "-" would be read as subtraction
design4 = patsy.dmatrix('Q("class")', data)

# Multiple problematic variables
design_multi = patsy.dmatrix('Q("weight.in.kg") + Q("height in cm")', data)

# Complete model with quoted variables
y, X = patsy.dmatrices('y ~ Q("weight.in.kg") + Q("class")', data)
print("Column names:", X.design_info.column_names)

# Q() can be used in complex expressions; I() keeps "/" as plain division
design_complex = patsy.dmatrix('I(Q("weight.in.kg") / Q("height in cm"))', data)  # BMI-like ratio

Advanced Usage Patterns

Combining I() and Q()

# Demonstrates nesting Q() inside I(): Q() resolves the awkward names and
# I() keeps the surrounding arithmetic out of the formula parser.
import patsy
import pandas as pd
import numpy as np

# Data with both problematic names and need for complex expressions
data = pd.DataFrame({
    'var.1': [1, 2, 3, 4, 5],
    'var.2': [2, 4, 6, 8, 10],
    'weight in kg': [70, 75, 80, 85, 90],
    'height in m': [1.7, 1.8, 1.75, 1.85, 1.82],
    'y': [20, 25, 22, 28, 26]
})

# Combine Q() and I() for complex expressions with problematic names
bmi_design = patsy.dmatrix('I(Q("weight in kg") / Q("height in m")**2)', data)  # weight / height^2
interaction_design = patsy.dmatrix('I(Q("var.1") * Q("var.2"))', data)  # elementwise product
polynomial_design = patsy.dmatrix('I(Q("var.1")**2) + I(Q("var.1")**3)', data)  # square and cube terms

# Complete model: two plain predictors plus the derived BMI term
y, X = patsy.dmatrices(
    'y ~ Q("var.1") + Q("var.2") + I(Q("weight in kg") / Q("height in m")**2)',
    data
)

Formula String Quoting Considerations

# Different ways to handle quotes in formulas with Q()
import patsy
import pandas as pd

# Self-contained data holding every column the formulas below refer to
data = pd.DataFrame({
    'weight.in.kg': [70, 80, 90, 75, 85],
    'height in cm': [170, 180, 185, 175, 182],
    'y': [1, 2, 3, 4, 5]
})

# Option 1: Single quotes around formula, double quotes in Q()
formula1 = 'y ~ Q("weight.in.kg")'

# Option 2: Double quotes around formula, single quotes in Q()
formula2 = "y ~ Q('weight.in.kg')"

# Option 3: Double quotes with escaped inner quotes
formula3 = "y ~ Q(\"weight.in.kg\")"

# Option 4: Triple quotes for complex formulas
formula4 = '''y ~ Q("weight.in.kg") + Q("height in cm")'''

# formula1-3 describe the same model (formula1 and formula3 are the same
# string); formula4 adds a second predictor. The formulas are two-sided
# ("y ~ ..."), so they must be built with dmatrices(), which returns a
# (y, X) pair; dmatrix() only accepts formulas without an outcome.
designs = [patsy.dmatrices(f, data) for f in [formula1, formula2, formula3, formula4]]

Working with Pandas Column Names

# Demonstrates Q() on realistic survey-style column names.
import patsy
import pandas as pd

# Real-world example with messy column names: punctuation, spaces,
# a leading digit, and a hyphen
survey_data = pd.DataFrame({
    'Q1. How satisfied are you?': [5, 4, 3, 5, 4],
    'Income ($)': [50000, 60000, 45000, 70000, 55000],
    '2023_score': [85, 90, 75, 95, 80],
    'group-id': ['A', 'B', 'A', 'C', 'B'],
    'outcome': [1, 2, 1, 3, 2]
})

# Use Q() for all problematic column names; the string-valued group column
# is additionally wrapped in C() to mark it as categorical
design = patsy.dmatrix('''
    Q("Q1. How satisfied are you?") + 
    Q("Income ($)") + 
    Q("2023_score") + 
    C(Q("group-id"))
''', survey_data)

print("Successfully created design matrix with problematic column names")
print("Column names:", design.design_info.column_names)

Dynamic Variable Selection with Q()

import patsy
import pandas as pd  # required for pd.DataFrame below

# Programmatically build formulas with Q()
data = pd.DataFrame({
    'var-1': [1, 2, 3], 'var-2': [4, 5, 6], 'var-3': [7, 8, 9],
    'outcome': [10, 11, 12]
})

# List of problematic variable names
predictors = ['var-1', 'var-2', 'var-3']

# Build formula dynamically: wrap each name in Q("...") and join with " + ".
# This simple template assumes the names contain no double quotes.
quoted_predictors = [f'Q("{var}")' for var in predictors]
formula = 'outcome ~ ' + ' + '.join(quoted_predictors)
print(f"Dynamic formula: {formula}")

# Two-sided formula, so dmatrices() returns the outcome and predictor matrices
y, X = patsy.dmatrices(formula, data)

Integration with Other Patsy Functions

I() with Transformations

import patsy
import pandas as pd

# Numeric data with the x1 and x2 columns used by the formulas below
data = pd.DataFrame({
    'x1': [1, 2, 3, 4, 5],
    'x2': [2, 4, 6, 8, 10],
    'y': [3, 6, 9, 12, 15]
})

# Combine I() with stateful transforms
design = patsy.dmatrix("standardize(I(x1 + x2))", data)  # Standardize the sum

# I() with splines
design = patsy.dmatrix("bs(I(x1 * x2), df=4)", data)  # Spline of the elementwise product

Q() with Categorical Variables

import patsy
import pandas as pd
import numpy as np

# Categorical variables with problematic names
data_cat = pd.DataFrame({
    'treatment-group': ['control', 'drug_a', 'drug_b'] * 10,  # 30 rows, 3 levels
    'patient.id': range(30),
    'response': np.random.normal(0, 1, 30)  # unseeded: values differ on every run
})

# Use Q() with C() for categorical specification
design = patsy.dmatrix('C(Q("treatment-group"))', data_cat)
y, X = patsy.dmatrices('response ~ C(Q("treatment-group"))', data_cat)

Error Handling

Common Q() Errors

import patsy
import pandas as pd

# Data with one awkwardly named column to look up
data = pd.DataFrame({
    'weight.in.kg': [70, 80, 90, 75, 85],
    'y': [1, 2, 3, 4, 5]
})

# Q() itself raises NameError for an unknown name. When the lookup happens
# while a formula is being evaluated, Patsy reports the failure as a
# patsy.PatsyError (with the NameError as its cause), so catch both.

# Variable doesn't exist
try:
    design = patsy.dmatrix('Q("nonexistent_var")', data)
except (NameError, patsy.PatsyError) as e:
    print(f"Variable not found: {e}")

# Typo in variable name
try:
    design = patsy.dmatrix('Q("weight.in.kgg")', data)  # Extra 'g'
except (NameError, patsy.PatsyError) as e:
    print(f"Typo in variable name: {e}")

Debugging Formula Issues

import patsy
import pandas as pd
from patsy.builtins import Q  # Q is defined in patsy.builtins, not exported as patsy.Q

data = pd.DataFrame({
    'weight.in.kg': [70, 80, 90, 75, 85],
    'y': [1, 2, 3, 4, 5]
})

# Check what variables are available
print("Available columns:", data.columns.tolist())

# Test Q() function directly. Outside a formula, Q() searches the calling
# environment rather than the columns of `data`, so the lookup fails.
try:
    test_value = Q("weight.in.kg")  # Won't work outside formula context
except NameError:
    print("Q() needs proper evaluation environment")

# Use in formula context, where the columns of `data` are visible to Q()
design = patsy.dmatrix('Q("weight.in.kg")', data)  # Works correctly

Install with Tessl CLI

npx tessl i tessl/pypi-patsy

docs

builtins.md

categorical.md

contrasts.md

high-level.md

index.md

matrix-building.md

splines.md

transforms.md

utilities.md

tile.json