0
# Utilities and Sampling
1
2
Utility functions for data splitting, cross-validation strategies, and bootstrap sampling methods specifically designed for conformal prediction workflows. These tools support the specialized data handling requirements of conformal prediction methods.
3
4
## Capabilities
5
6
### Data Splitting Utilities
7
8
Specialized data splitting functions for conformal prediction that require separate training, conformalization, and test sets.
9
10
```python { .api }
11
def train_conformalize_test_split(X, y, train_size, conformalize_size, test_size, random_state=None, shuffle=True):
    """
    Split arrays into train, conformalization, and test subsets.

    Parameters:
    - X: ArrayLike, input features (shape: n_samples x n_features)
    - y: ArrayLike, target values (shape: n_samples,)
    - train_size: Union[float, int], size of training set (fraction or absolute number)
    - conformalize_size: Union[float, int], size of conformalization set (fraction or absolute number)
    - test_size: Union[float, int], size of test set (fraction or absolute number)
    - random_state: Optional[int], random seed for reproducibility
    - shuffle: bool, whether to shuffle data before splitting (default: True)

    Returns:
    Tuple[NDArray, NDArray, NDArray, NDArray, NDArray, NDArray]:
        X_train, X_conformalize, X_test, y_train, y_conformalize, y_test

    NOTE(review): the usage examples pass sizes that add up to 1.0 (fractions)
    or to n_samples (absolute numbers); confirm whether this is required.
    """
28
```
29
30
### Bootstrap Sampling Methods
31
32
Cross-validation and bootstrap sampling strategies designed for conformal prediction and ensemble methods.
33
34
```python { .api }
35
class Subsample:
    """
    Bootstrap sampling method for conformal prediction.

    Parameters:
    - n_resamplings: int, number of bootstrap resamples (default: 30)
    - n_samples: Optional[int], number of samples per resample (default: None, uses input size)
    - replace: bool, whether to sample with replacement (default: True)
    - random_state: Optional[int], random seed (default: None)
    """
    def __init__(self, n_resamplings=30, n_samples=None, replace=True, random_state=None): ...

    def split(self, X, *args, **kwargs):
        """
        Generate bootstrap sample indices.

        Parameters:
        - X: ArrayLike, input data for determining sample size
        - *args, **kwargs: accepted in the signature; their use is not described here

        Yields:
        Generator[Tuple[NDArray, NDArray], None, None]: (train_indices, test_indices)
        """

    def get_n_splits(self, *args, **kwargs):
        """
        Get number of splits.

        Returns:
        int: number of resampling splits
        """
65
66
class BlockBootstrap:
    """
    Block bootstrap sampling for time series data.

    Parameters:
    - n_resamplings: int, number of bootstrap resamples (default: 30)
    - length: Optional[int], block length (default: None, computed automatically)
    - n_blocks: Optional[int], number of blocks (default: None, computed automatically)
    - overlapping: bool, whether blocks can overlap (default: False)
    - random_state: Optional[int], random seed (default: None)
    """
    def __init__(self, n_resamplings=30, length=None, n_blocks=None, overlapping=False, random_state=None): ...

    def split(self, X, *args, **kwargs):
        """
        Generate block bootstrap sample indices for time series.

        Parameters:
        - X: ArrayLike, time series data
        - *args, **kwargs: accepted in the signature; their use is not described here

        Yields:
        Generator[Tuple[NDArray, NDArray], None, None]: (train_indices, test_indices)
        """

    def get_n_splits(self, *args, **kwargs):
        """
        Get number of splits.

        Returns:
        int: number of resampling splits
        """
97
```
98
99
## Usage Examples
100
101
### Three-Way Data Splitting
102
103
```python
104
from mapie.utils import train_conformalize_test_split
import numpy as np

# Generate sample data: 1000 samples with 5 features each
X = np.random.randn(1000, 5)
y = np.random.randn(1000)

# Split into train (60%), conformalize (20%), test (20%) using fractions
X_train, X_conf, X_test, y_train, y_conf, y_test = train_conformalize_test_split(
    X, y,
    train_size=0.6,
    conformalize_size=0.2,
    test_size=0.2,
    random_state=42
)

print(f"Train set size: {X_train.shape[0]}")
print(f"Conformalization set size: {X_conf.shape[0]}")
print(f"Test set size: {X_test.shape[0]}")

# Same split expressed with absolute sample counts (600 + 200 + 200 = 1000);
# this rebinds the six variables assigned above
X_train, X_conf, X_test, y_train, y_conf, y_test = train_conformalize_test_split(
    X, y,
    train_size=600,
    conformalize_size=200,
    test_size=200,
    random_state=42
)
132
```
133
134
### Bootstrap Sampling for Jackknife-After-Bootstrap
135
136
```python
137
from mapie.subsample import Subsample
from mapie.regression import JackknifeAfterBootstrapRegressor
from sklearn.ensemble import RandomForestRegressor

# NOTE: X_train, y_train and X_test are not defined in this snippet; it
# assumes the three-way split example has already been run.

# Create bootstrap sampler
bootstrap = Subsample(
    n_resamplings=50,  # Number of bootstrap samples
    n_samples=None,  # Use full dataset size
    replace=True,  # Bootstrap with replacement
    random_state=42
)

# Use with Jackknife-After-Bootstrap
jab_reg = JackknifeAfterBootstrapRegressor(
    estimator=RandomForestRegressor(n_estimators=50),
    resampling=bootstrap,  # Custom bootstrap strategy
    confidence_level=0.9
)

# Fit and predict
jab_reg.fit_conformalize(X_train, y_train)
y_pred, y_intervals = jab_reg.predict_interval(X_test)

# Examine bootstrap splits: each element is a (train_indices, test_indices) pair
splits = list(bootstrap.split(X_train))
print(f"Number of bootstrap samples: {len(splits)}")
print(f"First bootstrap - train size: {len(splits[0][0])}, test size: {len(splits[0][1])}")
164
```
165
166
### Block Bootstrap for Time Series
167
168
```python
169
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor

from mapie.regression import TimeSeriesRegressor
from mapie.subsample import BlockBootstrap

# Time series data
dates = pd.date_range('2020-01-01', periods=365, freq='D')  # daily index, shown for context only
ts_data = np.random.randn(365, 3)  # 365 days, 3 features
ts_target = np.random.randn(365)

# Block bootstrap for temporal data
block_bootstrap = BlockBootstrap(
    n_resamplings=30,
    length=30,  # 30-day blocks
    n_blocks=None,  # Auto-compute number of blocks
    overlapping=False,  # Non-overlapping blocks
    random_state=42
)

# Use with time series regressor (constructed here; fitting is not shown)
ts_reg = TimeSeriesRegressor(
    estimator=RandomForestRegressor(),
    method="enbpi",
    cv=block_bootstrap  # Use block bootstrap for CV
)

# Generate bootstrap samples
splits = list(block_bootstrap.split(ts_data))
print(f"Block bootstrap samples: {len(splits)}")

# Examine block structure of the first resample
train_idx, test_idx = splits[0]
print(f"First block - train indices range: {train_idx.min()}-{train_idx.max()}")
print(f"First block - test indices range: {test_idx.min()}-{test_idx.max()}")
203
```
204
205
### Custom Sampling Strategies
206
207
```python
208
# Stratified bootstrap for imbalanced data
209
from sklearn.utils import resample
210
from sklearn.model_selection import StratifiedShuffleSplit
211
212
class StratifiedSubsample:
    """Custom stratified bootstrap sampler.

    Each resample draws ``len(X)`` indices with replacement while stratifying
    on ``y``; the indices never drawn form the out-of-bag (test) set.

    Parameters:
    - n_resamplings: int, number of bootstrap resamples (default: 30)
    - random_state: Optional[int], base seed; resample ``i`` is seeded with
      ``random_state + i`` (default: None, unseeded)
    """

    def __init__(self, n_resamplings=30, random_state=None):
        self.n_resamplings = n_resamplings
        self.random_state = random_state

    def _seed_for(self, i):
        """Return the seed of the i-th resample, or None when unseeded."""
        # Compare against None explicitly: 0 is a valid seed and must not be
        # treated as "no seed".
        if self.random_state is None:
            return None
        return self.random_state + i

    def split(self, X, y, groups=None):
        """Generate stratified bootstrap samples.

        Parameters:
        - X: ArrayLike, only its length is used
        - y: ArrayLike, labels to stratify on
        - groups: ignored; accepted so the signature matches ``get_n_splits``

        Yields:
        Tuple[NDArray, NDArray]: (bootstrap_indices, out_of_bag_indices)
        """
        all_indices = np.arange(len(X))

        for i in range(self.n_resamplings):
            # Only the positions are needed, so resample the index array
            # itself instead of building bootstrap copies of X and y.
            indices = resample(
                all_indices,
                stratify=y,
                random_state=self._seed_for(i),
            )

            # Out-of-bag indices
            oob_indices = np.setdiff1d(all_indices, indices)

            yield indices, oob_indices

    def get_n_splits(self, X=None, y=None, groups=None):
        """Return the number of resamplings; all arguments are ignored."""
        return self.n_resamplings
238
239
# Usage
240
stratified_sampler = StratifiedSubsample(n_resamplings=25, random_state=42)
241
```
242
243
## Advanced Sampling Techniques
244
245
### Cross-Validation Integration
246
247
```python
248
from sklearn.model_selection import TimeSeriesSplit, GroupKFold
from mapie.regression import CrossConformalRegressor

# NOTE: np, RandomForestRegressor, X_train and y_train are not defined in
# this snippet; it assumes the earlier examples have already been run.

# Time series cross-validation
ts_cv = TimeSeriesSplit(n_splits=5, gap=10)

cross_reg = CrossConformalRegressor(
    estimator=RandomForestRegressor(),
    cv=ts_cv,  # Time-aware cross-validation
    method="plus"
)

# Group-based cross-validation
group_cv = GroupKFold(n_splits=5)
groups = np.array([0, 0, 1, 1, 2, 2, 3, 3, 4, 4] * 100)  # Group labels (1000 entries)

# Rebinds cross_reg: the time-series regressor above is replaced
cross_reg = CrossConformalRegressor(
    estimator=RandomForestRegressor(),
    cv=group_cv
)

# Fit with groups, truncated so there is one label per training sample
cross_reg.fit_conformalize(X_train, y_train, groups=groups[:len(X_train)])
271
```
272
273
### Monte Carlo Sampling
274
275
```python
276
def monte_carlo_conformal(base_estimator, X_train, y_train, X_test, n_trials=100):
    """
    Monte Carlo approach to conformal prediction.

    Repeatedly split data and compute prediction intervals to assess stability.

    Parameters:
    - base_estimator: estimator that is cloned and refitted for every trial
    - X_train: ArrayLike, features to split into fit / conformalization parts
    - y_train: ArrayLike, targets matching X_train
    - X_test: ArrayLike, points for which intervals are predicted
    - n_trials: int, number of random splits (default: 100)

    Returns:
    dict with keys 'mean_intervals', 'std_intervals' (mean / standard deviation
    over trials) and 'all_intervals' (the intervals of every trial stacked
    along a new first axis)

    Raises:
    ValueError: if n_trials is smaller than 1
    """
    if n_trials < 1:
        # Averaging over zero trials would only produce NaNs.
        raise ValueError("n_trials must be at least 1")

    # Imported here so the function does not depend on names imported after
    # its definition, and only once instead of once per trial.
    from sklearn.base import clone
    from sklearn.model_selection import train_test_split
    from mapie.regression import SplitConformalRegressor

    intervals_collection = []

    for trial in range(n_trials):
        # Random 70/30 fit/conformalization split for each trial. A two-way
        # splitter is used because no test subset is needed here and
        # train_conformalize_test_split returns six arrays, not four.
        X_tr, X_cal, y_tr, y_cal = train_test_split(
            X_train, y_train,
            train_size=0.7,
            random_state=trial
        )

        # Fit conformal predictor
        mapie_reg = SplitConformalRegressor(
            estimator=clone(base_estimator),
            prefit=False
        )
        mapie_reg.fit(X_tr, y_tr)
        mapie_reg.conformalize(X_cal, y_cal)

        # Predict intervals
        _, intervals = mapie_reg.predict_interval(X_test)
        intervals_collection.append(intervals)

    # Aggregate results across trials (axis 0 indexes the trial)
    intervals_array = np.array(intervals_collection)
    mean_intervals = np.mean(intervals_array, axis=0)
    std_intervals = np.std(intervals_array, axis=0)

    return {
        'mean_intervals': mean_intervals,
        'std_intervals': std_intervals,
        'all_intervals': intervals_array
    }
318
319
# Usage
320
from sklearn.ensemble import RandomForestRegressor
321
from sklearn.base import clone
322
323
mc_results = monte_carlo_conformal(
324
RandomForestRegressor(n_estimators=50),
325
X_train, y_train, X_test,
326
n_trials=50
327
)
328
```
329
330
### Weighted Bootstrap
331
332
```python
333
class WeightedSubsample:
    """Bootstrap with sample weights for imbalanced data.

    Parameters:
    - n_resamplings: int, number of bootstrap resamples (default: 30)
    - random_state: Optional[int], seed of the private random generator
      (default: None, unseeded)
    """

    def __init__(self, n_resamplings=30, random_state=None):
        self.n_resamplings = n_resamplings
        self.random_state = random_state

    def split(self, X, y, sample_weight=None):
        """Generate weighted bootstrap samples.

        Parameters:
        - X: ArrayLike, only its length is used
        - y: ArrayLike, labels; used to derive 'balanced' weights when
          sample_weight is not given
        - sample_weight: Optional[ArrayLike], one non-negative weight per sample

        Yields:
        Tuple[NDArray, NDArray]: (bootstrap_indices, out_of_bag_indices)

        Raises:
        ValueError: if the weights do not have one entry per sample or do not
        sum to a positive number
        """
        n_samples = len(X)
        # A private generator gives the same draws as seeding the global one
        # with the same seed, without disturbing other users of np.random.
        rng = np.random.RandomState(self.random_state)

        # Compute weights if not provided
        if sample_weight is None:
            # Inverse class frequency weighting
            from sklearn.utils.class_weight import compute_sample_weight
            sample_weight = compute_sample_weight('balanced', y)

        weights = np.asarray(sample_weight, dtype=float)
        if weights.shape != (n_samples,):
            raise ValueError("sample_weight must contain one weight per sample")

        total = weights.sum()
        if not total > 0:  # also rejects NaN
            raise ValueError("sample_weight must sum to a positive number")

        # Normalize weights into sampling probabilities
        probabilities = weights / total
        all_indices = np.arange(n_samples)

        for _ in range(self.n_resamplings):
            # Weighted sampling
            indices = rng.choice(
                n_samples,
                size=n_samples,
                replace=True,
                p=probabilities
            )

            # Out-of-bag indices
            oob_indices = np.setdiff1d(all_indices, indices)

            yield indices, oob_indices

    def get_n_splits(self, X=None, y=None, groups=None):
        """Return the number of resamplings; all arguments are ignored."""
        return self.n_resamplings
370
371
# Usage for imbalanced datasets
372
weighted_sampler = WeightedSubsample(n_resamplings=30, random_state=42)
373
374
# Use with Jackknife-After-Bootstrap
375
jab_reg = JackknifeAfterBootstrapRegressor(
376
estimator=RandomForestRegressor(),
377
resampling=weighted_sampler
378
)
379
```
380
381
## Best Practices
382
383
### Choosing Sample Sizes
384
385
```python
386
def optimal_split_sizes(n_total, method="split_conformal"):
    """
    Recommend optimal split sizes based on conformal prediction method.

    Parameters:
    - n_total: int, total number of samples
    - method: str, conformal prediction method

    Returns:
    dict: recommended split proportions under the keys "train",
    "conformalize" and "test"; the three values add up to 1

    Raises:
    ValueError: if method is "split_conformal" and n_total is not positive
    """
    if method == "split_conformal":
        if n_total <= 0:
            raise ValueError("n_total must be a positive number of samples")

        # Split conformal: larger training set, moderate conformalization
        train = max(0.5, min(0.7, 500 / n_total))
        conformalize = max(0.2, min(0.3, 200 / n_total))

        # Always keep at least 10% for testing: on small datasets the two
        # shares above can reach 1.0, so shrink them proportionally.
        max_fit_share = 0.9
        fit_share = train + conformalize
        if fit_share > max_fit_share:
            scale = max_fit_share / fit_share
            train *= scale
            conformalize *= scale

        train = round(train, 4)
        conformalize = round(conformalize, 4)
        return {
            "train": train,
            "conformalize": conformalize,
            # The test share is whatever remains, so the proportions sum to 1.
            "test": round(1.0 - train - conformalize, 4)
        }
    elif method == "cross_conformal":
        # Cross conformal: can use more data since CV utilized
        return {
            "train": 0.8,
            "conformalize": 0.0,  # Handled by CV
            "test": 0.2
        }
    else:
        # Default balanced split
        return {"train": 0.6, "conformalize": 0.2, "test": 0.2}
414
415
# Usage
416
n_samples = 1000
417
splits = optimal_split_sizes(n_samples, method="split_conformal")
418
print(f"Recommended splits for {n_samples} samples: {splits}")
419
```
420
421
### Handling Small Datasets
422
423
```python
424
def small_dataset_strategy(X, y, min_conformalize_size=50):
    """
    Pick a splitting strategy that adapts to the number of samples in X.

    Parameters:
    - X: ArrayLike, input features
    - y: ArrayLike, target values
    - min_conformalize_size: int, lower bound on the conformalization set
      size for mid-sized datasets (default: 50)

    Returns:
    A CrossConformalRegressor(cv=5) when there are fewer than 200 samples,
    otherwise the result of train_conformalize_test_split.
    """
    total = len(X)

    # Fewer than 200 samples: rely on cross-validation instead of a split
    if total < 200:
        from mapie.regression import CrossConformalRegressor
        print("Using cross-validation for small dataset")
        return CrossConformalRegressor(cv=5)

    # 500 samples or more: standard proportional split
    if total >= 500:
        return train_conformalize_test_split(
            X, y,
            train_size=0.6,
            conformalize_size=0.2,
            test_size=0.2
        )

    # 200-499 samples: fixed 50-sample test set, the rest goes to
    # train / conformalize
    n_conformalize = max(min_conformalize_size, int(0.3 * total))
    n_train = total - n_conformalize - 50
    return train_conformalize_test_split(
        X, y,
        train_size=n_train,
        conformalize_size=n_conformalize,
        test_size=50
    )
456
```