A Python package for describing statistical models and for building design matrices.
—
B-splines and cubic regression splines for modeling non-linear relationships in statistical models. Patsy provides implementations compatible with R and MGCV, allowing flexible smooth terms in formulas.
Generates B-spline basis functions for non-linear curve fitting, providing smooth approximation of arbitrary functions.
def bs(x, df=None, knots=None, degree=3, include_intercept=False, lower_bound=None, upper_bound=None):
"""
Generate B-spline basis for x, allowing non-linear fits.
Parameters:
- x: Array-like data to create spline basis for
- df (int or None): Number of degrees of freedom (columns in output)
- knots (array-like or None): Interior knot locations (default: equally spaced quantiles)
- degree (int): Degree of the spline (default: 3 for cubic)
- include_intercept (bool): Whether basis spans intercept term (default: False)
- lower_bound (float or None): Lower boundary for spline
- upper_bound (float or None): Upper boundary for spline
Returns:
2D array with basis functions as columns
Note: Must specify at least one of df and knots
"""

import patsy
import numpy as np
import pandas as pd
# Sample data with non-linear relationship
x = np.linspace(0, 10, 100)
y = 2 * np.sin(x) + np.random.normal(0, 0.1, 100)
data = pd.DataFrame({'x': x, 'y': y})
# Basic B-spline with 4 degrees of freedom
design = patsy.dmatrix("bs(x, df=4)", data)
print(f"B-spline basis shape: {design.shape}")
# B-spline with custom knots
knots = [2, 4, 6, 8]
# `knots` is looked up in the calling environment; dmatrix() has no `extra_env` argument
design = patsy.dmatrix("bs(x, knots=knots)", data)
# Higher degree spline
design = patsy.dmatrix("bs(x, df=6, degree=5)", data)
# Include intercept in basis
design = patsy.dmatrix("bs(x, df=4, include_intercept=True)", data)
# Complete model with B-splines
y_matrix, X_matrix = patsy.dmatrices("y ~ bs(x, df=5)", data)

Natural cubic splines with optional constraints, compatible with MGCV's cubic regression splines.
def cr(x, df=None, knots=None, lower_bound=None, upper_bound=None, constraints=None):
"""
Generate natural cubic spline basis for x with optional constraints.
Parameters:
- x: Array-like data to create spline basis for
- df (int or None): Number of degrees of freedom
- knots (array-like or None): Interior knot locations
- lower_bound (float or None): Lower boundary for spline
- upper_bound (float or None): Upper boundary for spline
- constraints (str or None): Constraint type ('center' for centering constraint)
Returns:
2D array with natural cubic spline basis functions
"""

import patsy
import numpy as np
# Basic cubic regression spline
x = np.linspace(-2, 2, 50)
y = x**3 + 0.5 * x + np.random.normal(0, 0.2, 50)
data = {'x': x, 'y': y}
# Natural cubic spline with 5 degrees of freedom
design = patsy.dmatrix("cr(x, df=5)", data)
# With centering constraint
design = patsy.dmatrix("cr(x, df=5, constraints='center')", data)
# Complete model
y_matrix, X_matrix = patsy.dmatrices("y ~ cr(x, df=6)", data)

Cubic splines with cyclic boundary conditions, useful for periodic data.
def cc(x, df=None, knots=None, lower_bound=None, upper_bound=None, constraints=None):
"""
Generate cyclic cubic spline basis for x with optional constraints.
Parameters:
- x: Array-like data to create spline basis for
- df (int or None): Number of degrees of freedom
- knots (array-like or None): Interior knot locations
- lower_bound (float or None): Lower boundary for cyclic period
- upper_bound (float or None): Upper boundary for cyclic period
- constraints (str or None): Constraint type ('center' for centering constraint)
Returns:
2D array with cyclic cubic spline basis functions
"""

import patsy
import numpy as np
# Cyclic data (e.g., seasonal patterns, angles)
t = np.linspace(0, 2*np.pi, 100)
y = np.sin(2*t) + 0.5*np.cos(3*t) + np.random.normal(0, 0.1, 100)
data = {'t': t, 'y': y}
# Cyclic cubic spline
design = patsy.dmatrix("cc(t, df=8)", data)
# With explicit boundaries for the cyclic period
design = patsy.dmatrix("cc(t, df=8, lower_bound=0, upper_bound=6.28)", data)
# Complete model for seasonal data
y_matrix, X_matrix = patsy.dmatrices("y ~ cc(t, df=10)", data)

Multi-dimensional smooth terms as tensor products of univariate smooths, for modeling interactions between smooth functions.
def te(*args, constraints=None):
"""
Generate tensor product smooth of several covariates.
Parameters:
- *args: Multiple smooth terms (s1, s2, ..., sn) as marginal univariate smooths
- constraints (str or None): Constraint type for the tensor product
Returns:
2D array with tensor product basis functions
Note: Marginal smooths must transform data into basis function arrays.
The resulting basis dimension is the product of marginal basis dimensions.
"""

import patsy
import numpy as np
# Two-dimensional smooth surface
x1 = np.random.uniform(-2, 2, 100)
x2 = np.random.uniform(-2, 2, 100)
y = x1**2 + x2**2 + x1*x2 + np.random.normal(0, 0.5, 100)
data = {'x1': x1, 'x2': x2, 'y': y}
# Tensor product of cubic regression splines
# Note: This requires careful setup of the marginal smooths
design = patsy.dmatrix("te(cr(x1, df=5), cr(x2, df=5))", data)
# Three-dimensional tensor product
x3 = np.random.uniform(-1, 1, 100)
data['x3'] = x3
design = patsy.dmatrix("te(cr(x1, df=4), cr(x2, df=4), cr(x3, df=3))", data)
# Complete model with tensor product smooth
y_matrix, X_matrix = patsy.dmatrices("y ~ te(cr(x1, df=5), cr(x2, df=5))", data)

| Spline Type | Best For | Characteristics |
|---|---|---|
| B-splines (bs) | General smooth curves | Flexible, local support, compatible with R |
| Cubic regression (cr) | Natural smooth curves | Natural boundary conditions, MGCV compatible |
| Cyclic cubic (cc) | Periodic/seasonal data | Cyclic boundary conditions |
| Tensor products (te) | Multi-dimensional smooths | Interaction of smooth terms |

import patsy
import numpy as np
from sklearn.linear_model import LinearRegression
# Generate sample data
np.random.seed(42)
x = np.linspace(0, 10, 100)
y = 2*np.sin(x) + 0.5*x + np.random.normal(0, 0.3, 100)
data = {'x': x, 'y': y}
# Create spline design matrix
y_matrix, X_matrix = patsy.dmatrices("y ~ bs(x, df=6)", data)
# Fit with scikit-learn
model = LinearRegression(fit_intercept=False) # Intercept already in design matrix
model.fit(X_matrix, y_matrix.ravel())
# Predict on new data
x_new = np.linspace(0, 10, 50)
data_new = {'x': x_new}
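# Reuse the knots memorized from the training data rather than re-deriving them from x_new
X_new = patsy.build_design_matrices([X_matrix.design_info], data_new)[0]
y_pred = model.predict(X_new)

# Mixed models with splines and linear terms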
y, X = patsy.dmatrices("y ~ x1 + bs(x2, df=4) + C(group)", data)
# Multiple spline terms
y, X = patsy.dmatrices("y ~ bs(x1, df=3) + bs(x2, df=5)", data)
# Spline interactions
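y, X = patsy.dmatrices("y ~ bs(x1, df=3) * bs(x2, df=3)", data)

Splines handle boundaries differently: for `bs` and `cr`, `lower_bound` and `upper_bound` delimit the range of the spline, while for `cc` they delimit the cyclic period.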
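All spline functions are stateful transforms, meaning: the knots and bounds determined from the original data are memorized and reused when the same design is later built for new data (for example with `patsy.build_design_matrices`).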
Install with Tessl CLI
npx tessl i tessl/pypi-patsy