A scikit-learn-compatible module for estimating prediction intervals using conformal prediction methods.
Utility functions for data splitting, cross-validation strategies, and bootstrap sampling methods specifically designed for conformal prediction workflows. These tools support the specialized data handling requirements of conformal prediction methods.
Specialized data splitting functions for conformal prediction that require separate training, conformalization, and test sets.
def train_conformalize_test_split(X, y, train_size, conformalize_size, test_size, random_state=None, shuffle=True):
    """
    Split arrays into train, conformalization, and test subsets.

    Parameters
    ----------
    X : ArrayLike of shape (n_samples, n_features)
        Input features.
    y : ArrayLike of shape (n_samples,)
        Target values.
    train_size : float or int
        Size of the training set (fraction or absolute number of samples).
    conformalize_size : float or int
        Size of the conformalization set (fraction or absolute number).
    test_size : float or int
        Size of the test set (fraction or absolute number).
    random_state : int, optional
        Random seed for reproducibility.
    shuffle : bool, default=True
        Whether to shuffle the data before splitting.

    Returns
    -------
    Tuple[NDArray, NDArray, NDArray, NDArray, NDArray, NDArray]
        X_train, X_conformalize, X_test, y_train, y_conformalize, y_test.
    """


# Cross-validation and bootstrap sampling strategies designed for conformal
# prediction and ensemble methods.
class Subsample:
    """
    Bootstrap sampling method for conformal prediction.

    Parameters
    ----------
    n_resamplings : int, default=30
        Number of bootstrap resamples.
    n_samples : int, optional
        Number of samples per resample (default None uses the input size).
    replace : bool, default=True
        Whether to sample with replacement.
    random_state : int, optional
        Random seed.
    """

    def __init__(self, n_resamplings=30, n_samples=None, replace=True, random_state=None): ...

    def split(self, X, *args, **kwargs):
        """
        Generate bootstrap sample indices.

        Parameters
        ----------
        X : ArrayLike
            Input data used to determine the sample size.

        Yields
        ------
        Tuple[NDArray, NDArray]
            (train_indices, test_indices) for each resampling.
        """

    def get_n_splits(self, *args, **kwargs):
        """
        Get the number of splits.

        Returns
        -------
        int
            Number of resampling splits.
        """
class BlockBootstrap:
    """
    Block bootstrap sampling for time series data.

    Parameters
    ----------
    n_resamplings : int, default=30
        Number of bootstrap resamples.
    length : int, optional
        Block length (default None: computed automatically).
    n_blocks : int, optional
        Number of blocks (default None: computed automatically).
    overlapping : bool, default=False
        Whether blocks may overlap.
    random_state : int, optional
        Random seed.
    """

    def __init__(self, n_resamplings=30, length=None, n_blocks=None, overlapping=False, random_state=None): ...

    def split(self, X, *args, **kwargs):
        """
        Generate block bootstrap sample indices for time series.

        Parameters
        ----------
        X : ArrayLike
            Time series data.

        Yields
        ------
        Tuple[NDArray, NDArray]
            (train_indices, test_indices) for each resampling.
        """

    def get_n_splits(self, *args, **kwargs):
        """
        Get the number of splits.

        Returns
        -------
        int
            Number of resampling splits.
        """
from mapie.utils import train_conformalize_test_split
import numpy as np

# Generate sample data
X = np.random.randn(1000, 5)
y = np.random.randn(1000)

# Split into train (60%), conformalize (20%), test (20%)
X_train, X_conf, X_test, y_train, y_conf, y_test = train_conformalize_test_split(
    X, y,
    train_size=0.6,
    conformalize_size=0.2,
    test_size=0.2,
    random_state=42
)
print(f"Train set size: {X_train.shape[0]}")
print(f"Conformalization set size: {X_conf.shape[0]}")
print(f"Test set size: {X_test.shape[0]}")

# The same split expressed with absolute sample counts.
X_train, X_conf, X_test, y_train, y_conf, y_test = train_conformalize_test_split(
    X, y,
    train_size=600,
    conformalize_size=200,
    test_size=200,
    random_state=42
)

from mapie.subsample import Subsample
from mapie.regression import JackknifeAfterBootstrapRegressor
from sklearn.ensemble import RandomForestRegressor

# Create bootstrap sampler
bootstrap = Subsample(
    n_resamplings=50,   # Number of bootstrap samples
    n_samples=None,     # Use full dataset size
    replace=True,       # Bootstrap with replacement
    random_state=42
)

# Use with Jackknife-After-Bootstrap
jab_reg = JackknifeAfterBootstrapRegressor(
    estimator=RandomForestRegressor(n_estimators=50),
    resampling=bootstrap,  # Custom bootstrap strategy
    confidence_level=0.9
)

# Fit and predict
jab_reg.fit_conformalize(X_train, y_train)
y_pred, y_intervals = jab_reg.predict_interval(X_test)

# Examine bootstrap splits
splits = list(bootstrap.split(X_train))
print(f"Number of bootstrap samples: {len(splits)}")
print(f"First bootstrap - train size: {len(splits[0][0])}, test size: {len(splits[0][1])}")

from mapie.subsample import BlockBootstrap
import pandas as pd

# Time series data
dates = pd.date_range('2020-01-01', periods=365, freq='D')
ts_data = np.random.randn(365, 3)  # 365 days, 3 features
ts_target = np.random.randn(365)

# Block bootstrap for temporal data
block_bootstrap = BlockBootstrap(
    n_resamplings=30,
    length=30,          # 30-day blocks
    n_blocks=None,      # Auto-compute number of blocks
    overlapping=False,  # Non-overlapping blocks
    random_state=42
)

# Use with time series regressor
from mapie.regression import TimeSeriesRegressor
ts_reg = TimeSeriesRegressor(
    estimator=RandomForestRegressor(),
    method="enbpi",
    cv=block_bootstrap  # Use block bootstrap for CV
)

# Generate bootstrap samples
splits = list(block_bootstrap.split(ts_data))
print(f"Block bootstrap samples: {len(splits)}")

# Examine block structure
train_idx, test_idx = splits[0]
print(f"First block - train indices range: {train_idx.min()}-{train_idx.max()}")
print(f"First block - test indices range: {test_idx.min()}-{test_idx.max()}")

# Stratified bootstrap for imbalanced data
from sklearn.utils import resample
from sklearn.model_selection import StratifiedShuffleSplit


class StratifiedSubsample:
    """Custom stratified bootstrap sampler (preserves class proportions in each resample)."""

    def __init__(self, n_resamplings=30, random_state=None):
        self.n_resamplings = n_resamplings
        self.random_state = random_state

    def split(self, X, y):
        """Yield (bootstrap_indices, out_of_bag_indices) for each stratified resample."""
        np.random.seed(self.random_state)
        for i in range(self.n_resamplings):
            # Stratified resample; vary the seed per resample so draws differ.
            # Test `is not None` rather than truthiness: the original check
            # silently dropped random_state=0, making seed 0 non-reproducible.
            X_boot, y_boot, indices = resample(
                X, y, range(len(X)),
                stratify=y,
                random_state=self.random_state + i if self.random_state is not None else None
            )
            # Out-of-bag indices: samples never drawn in this resample.
            oob_indices = np.setdiff1d(range(len(X)), indices)
            yield indices, oob_indices

    def get_n_splits(self, X=None, y=None, groups=None):
        """Return the number of resampling splits."""
        return self.n_resamplings


# Usage
stratified_sampler = StratifiedSubsample(n_resamplings=25, random_state=42)

from sklearn.model_selection import TimeSeriesSplit, GroupKFold
from mapie.regression import CrossConformalRegressor

# Cross-validation that respects temporal ordering.
temporal_cv = TimeSeriesSplit(n_splits=5, gap=10)
cross_reg = CrossConformalRegressor(
    estimator=RandomForestRegressor(), cv=temporal_cv, method="plus"
)

# Cross-validation that keeps whole groups together.
grouped_cv = GroupKFold(n_splits=5)
groups = np.array([0, 0, 1, 1, 2, 2, 3, 3, 4, 4] * 100)  # Group labels
cross_reg = CrossConformalRegressor(
    estimator=RandomForestRegressor(), cv=grouped_cv
)
# Fit with groups
cross_reg.fit_conformalize(X_train, y_train, groups=groups[:len(X_train)])


def monte_carlo_conformal(base_estimator, X_train, y_train, X_test, n_trials=100):
    """
    Monte Carlo approach to conformal prediction.

    Repeatedly split the data and compute prediction intervals to assess
    their stability across random splits.

    Parameters
    ----------
    base_estimator : estimator
        Regressor cloned and refit on each trial.
    X_train, y_train : ArrayLike
        Pooled data re-split into fit/conformalization sets on each trial.
    X_test : ArrayLike
        Points at which intervals are computed.
    n_trials : int, default=100
        Number of random splits.

    Returns
    -------
    dict
        'mean_intervals', 'std_intervals', and 'all_intervals' aggregated
        over the trials.
    """
    # Hoisted out of the loop: importing once is enough.
    from mapie.regression import SplitConformalRegressor

    intervals_collection = []
    for trial in range(n_trials):
        # Random split for each trial.  train_conformalize_test_split returns
        # SIX arrays (X and y for train, conformalize, and test), so the
        # original 4-name unpacking raised ValueError; the empty test split
        # is discarded here.
        X_tr, X_cal, _X_unused, y_tr, y_cal, _y_unused = train_conformalize_test_split(
            X_train, y_train,
            train_size=0.7,
            conformalize_size=0.3,
            test_size=0.0,  # No test split needed
            random_state=trial
        )
        # Fit conformal predictor
        mapie_reg = SplitConformalRegressor(
            estimator=clone(base_estimator),
            prefit=False
        )
        mapie_reg.fit(X_tr, y_tr)
        mapie_reg.conformalize(X_cal, y_cal)
        # Predict intervals
        _, intervals = mapie_reg.predict_interval(X_test)
        intervals_collection.append(intervals)

    # Aggregate results across trials.
    intervals_array = np.array(intervals_collection)
    return {
        'mean_intervals': np.mean(intervals_array, axis=0),
        'std_intervals': np.std(intervals_array, axis=0),
        'all_intervals': intervals_array
    }


# Usage
from sklearn.ensemble import RandomForestRegressor
from sklearn.base import clone

mc_results = monte_carlo_conformal(
    RandomForestRegressor(n_estimators=50),
    X_train, y_train, X_test,
    n_trials=50
)
class WeightedSubsample:
    """Bootstrap sampler that draws with per-sample weights (for imbalanced data)."""

    def __init__(self, n_resamplings=30, random_state=None):
        self.n_resamplings = n_resamplings
        self.random_state = random_state

    def split(self, X, y, sample_weight=None):
        """
        Yield (bootstrap_indices, out_of_bag_indices) for each weighted resample.

        When sample_weight is None, inverse-class-frequency ('balanced')
        weights are computed from y.
        """
        n_samples = len(X)
        # Local RNG instead of np.random.seed: identical draws for a given
        # seed, but the global NumPy random state is left untouched.
        rng = np.random.RandomState(self.random_state)
        if sample_weight is None:
            # Inverse class frequency weighting
            from sklearn.utils.class_weight import compute_sample_weight
            sample_weight = compute_sample_weight('balanced', y)
        # np.random.choice requires probabilities that sum to 1.
        sample_weight = sample_weight / np.sum(sample_weight)
        for _ in range(self.n_resamplings):
            # Weighted sampling with replacement.
            indices = rng.choice(
                n_samples,
                size=n_samples,
                replace=True,
                p=sample_weight
            )
            # Out-of-bag indices: samples never drawn in this resample.
            oob_indices = np.setdiff1d(range(n_samples), np.unique(indices))
            yield indices, oob_indices

    def get_n_splits(self, X=None, y=None, groups=None):
        """Return the number of resampling splits."""
        return self.n_resamplings


# Usage for imbalanced datasets
weighted_sampler = WeightedSubsample(n_resamplings=30, random_state=42)

# Use with Jackknife-After-Bootstrap
jab_reg = JackknifeAfterBootstrapRegressor(
    estimator=RandomForestRegressor(),
    resampling=weighted_sampler,
)
def optimal_split_sizes(n_total, method="split_conformal"):
    """
    Recommend split proportions for a conformal prediction method.

    Parameters
    ----------
    n_total : int
        Total number of samples.
    method : str, default="split_conformal"
        Conformal prediction method ("split_conformal", "cross_conformal",
        or anything else for a balanced default).

    Returns
    -------
    dict
        Recommended proportions with keys "train", "conformalize", "test".
    """
    if method == "split_conformal":
        # Split conformal: larger training set, moderate conformalization.
        # Aim for roughly 500 training / 200 conformalization samples,
        # clamped to sensible fractions.
        train = max(0.5, min(0.7, 500 / n_total))
        conformalize = max(0.2, min(0.3, 200 / n_total))
        # Whatever is left goes to testing, floored at 10%.  (The original
        # expression `max(0.1, 0.3)` was a constant 0.3.)  Rounding trims
        # float noise so e.g. 1.0 - 0.5 - 0.2 yields exactly 0.3.
        test = max(0.1, round(1.0 - train - conformalize, 10))
        return {"train": train, "conformalize": conformalize, "test": test}
    elif method == "cross_conformal":
        # Cross conformal: conformalization is handled by the CV folds,
        # so more data can go to training.
        return {"train": 0.8, "conformalize": 0.0, "test": 0.2}
    else:
        # Default balanced split.
        return {"train": 0.6, "conformalize": 0.2, "test": 0.2}


# Usage
n_samples = 1000
splits = optimal_split_sizes(n_samples, method="split_conformal")
print(f"Recommended splits for {n_samples} samples: {splits}")


def small_dataset_strategy(X, y, min_conformalize_size=50):
    """
    Handle small datasets with an adaptive splitting strategy.

    Parameters
    ----------
    X, y : ArrayLike
        Full dataset.
    min_conformalize_size : int, default=50
        Minimum number of conformalization samples for mid-sized datasets.

    Returns
    -------
    A CrossConformalRegressor for n < 200, otherwise the 6-tuple produced
    by train_conformalize_test_split.  NOTE(review): the branches return
    different types, so callers must dispatch on dataset size -- confirm
    this is intentional.
    """
    n_samples = len(X)
    if n_samples < 200:
        # Too few samples for a dedicated conformalization set: fall back
        # to cross-validation.
        from mapie.regression import CrossConformalRegressor
        print("Using cross-validation for small dataset")
        return CrossConformalRegressor(cv=5)
    elif n_samples < 500:
        # Minimal test set; favour train/conformalize.
        conf_size = max(min_conformalize_size, int(0.3 * n_samples))
        train_size = n_samples - conf_size - 50  # Keep 50 for test
        return train_conformalize_test_split(
            X, y,
            train_size=train_size,
            conformalize_size=conf_size,
            test_size=50
        )
    else:
        # Standard fractional split for larger datasets.
        return train_conformalize_test_split(
            X, y,
            train_size=0.6,
            conformalize_size=0.2,
            test_size=0.2
        )


# Install with Tessl CLI:
# npx tessl i tessl/pypi-mapie