Tessl Tile for pypi/mapie@1.0.0

or run

npx @tessl/cli init

Version

Tile

Overview

Evals

Files

docs

calibration.md classification.md index.md metrics.md regression.md risk-control.md utils.md

utils.mddocs/

0
# Utilities and Sampling
1

2
Utility functions for data splitting, cross-validation strategies, and bootstrap sampling methods specifically designed for conformal prediction workflows. These tools support the specialized data handling requirements of conformal prediction methods.
3

4
## Capabilities
5

6
### Data Splitting Utilities
7

8
Specialized data splitting functions for conformal prediction that require separate training, conformalization, and test sets.
9

10
```python { .api }
11
def train_conformalize_test_split(X, y, train_size, conformalize_size, test_size, random_state=None, shuffle=True):
12
    """
13
    Split arrays into train, conformalization, and test subsets.
14

15
    Parameters:
16
    - X: ArrayLike, input features (shape: n_samples x n_features)
17
    - y: ArrayLike, target values (shape: n_samples,)
18
    - train_size: Union[float, int], size of training set (fraction or absolute number)
19
    - conformalize_size: Union[float, int], size of conformalization set
20
    - test_size: Union[float, int], size of test set
21
    - random_state: Optional[int], random seed for reproducibility
22
    - shuffle: bool, whether to shuffle data before splitting (default: True)
23

24
    Returns:
25
    Tuple[NDArray, NDArray, NDArray, NDArray, NDArray, NDArray]:
26
        X_train, X_conformalize, X_test, y_train, y_conformalize, y_test
27
    """
28
```
29

30
### Bootstrap Sampling Methods
31

32
Cross-validation and bootstrap sampling strategies designed for conformal prediction and ensemble methods.
33

34
```python { .api }
35
class Subsample:
36
    """
37
    Bootstrap sampling method for conformal prediction.
38

39
    Parameters:
40
    - n_resamplings: int, number of bootstrap resamples (default: 30)
41
    - n_samples: Optional[int], number of samples per resample (default: None, uses input size)
42
    - replace: bool, whether to sample with replacement (default: True)
43
    - random_state: Optional[int], random seed
44
    """
45
    def __init__(self, n_resamplings=30, n_samples=None, replace=True, random_state=None): ...
46

47
    def split(self, X, *args, **kwargs):
48
        """
49
        Generate bootstrap sample indices.
50

51
        Parameters:
52
        - X: ArrayLike, input data for determining sample size
53

54
        Yields:
55
        Generator[Tuple[NDArray, NDArray], None, None]: (train_indices, test_indices)
56
        """
57

58
    def get_n_splits(self, *args, **kwargs):
59
        """
60
        Get number of splits.
61

62
        Returns:
63
        int: number of resampling splits
64
        """
65

66
class BlockBootstrap:
67
    """
68
    Block bootstrap sampling for time series data.
69

70
    Parameters:
71
    - n_resamplings: int, number of bootstrap resamples (default: 30)
72
    - length: Optional[int], block length (default: None, computed automatically)
73
    - n_blocks: Optional[int], number of blocks (default: None, computed automatically)
74
    - overlapping: bool, whether blocks can overlap (default: False)
75
    - random_state: Optional[int], random seed
76
    """
77
    def __init__(self, n_resamplings=30, length=None, n_blocks=None, overlapping=False, random_state=None): ...
78

79
    def split(self, X, *args, **kwargs):
80
        """
81
        Generate block bootstrap sample indices for time series.
82

83
        Parameters:
84
        - X: ArrayLike, time series data
85

86
        Yields:
87
        Generator[Tuple[NDArray, NDArray], None, None]: (train_indices, test_indices)
88
        """
89

90
    def get_n_splits(self, *args, **kwargs):
91
        """
92
        Get number of splits.
93

94
        Returns:
95
        int: number of resampling splits
96
        """
97
```
98

99
## Usage Examples
100

101
### Three-Way Data Splitting
102

103
```python
104
from mapie.utils import train_conformalize_test_split
105
import numpy as np
106

107
# Generate sample data
108
X = np.random.randn(1000, 5)
109
y = np.random.randn(1000)
110

111
# Split into train (60%), conformalize (20%), test (20%)
112
X_train, X_conf, X_test, y_train, y_conf, y_test = train_conformalize_test_split(
113
    X, y,
114
    train_size=0.6,
115
    conformalize_size=0.2,
116
    test_size=0.2,
117
    random_state=42
118
)
119

120
print(f"Train set size: {X_train.shape[0]}")
121
print(f"Conformalization set size: {X_conf.shape[0]}")
122
print(f"Test set size: {X_test.shape[0]}")
123

124
# Use with absolute numbers
125
X_train, X_conf, X_test, y_train, y_conf, y_test = train_conformalize_test_split(
126
    X, y,
127
    train_size=600,
128
    conformalize_size=200,
129
    test_size=200,
130
    random_state=42
131
)
132
```
133

134
### Bootstrap Sampling for Jackknife-After-Bootstrap
135

136
```python
137
from mapie.subsample import Subsample
138
from mapie.regression import JackknifeAfterBootstrapRegressor
139
from sklearn.ensemble import RandomForestRegressor
140

141
# Create bootstrap sampler
142
bootstrap = Subsample(
143
    n_resamplings=50,  # Number of bootstrap samples
144
    n_samples=None,    # Use full dataset size
145
    replace=True,      # Bootstrap with replacement
146
    random_state=42
147
)
148

149
# Use with Jackknife-After-Bootstrap
150
jab_reg = JackknifeAfterBootstrapRegressor(
151
    estimator=RandomForestRegressor(n_estimators=50),
152
    resampling=bootstrap,  # Custom bootstrap strategy
153
    confidence_level=0.9
154
)
155

156
# Fit and predict
157
jab_reg.fit_conformalize(X_train, y_train)
158
y_pred, y_intervals = jab_reg.predict_interval(X_test)
159

160
# Examine bootstrap splits
161
splits = list(bootstrap.split(X_train))
162
print(f"Number of bootstrap samples: {len(splits)}")
163
print(f"First bootstrap - train size: {len(splits[0][0])}, test size: {len(splits[0][1])}")
164
```
165

166
### Block Bootstrap for Time Series
167

168
```python
169
from mapie.subsample import BlockBootstrap
170
import pandas as pd
171

172
# Time series data
173
dates = pd.date_range('2020-01-01', periods=365, freq='D')
174
ts_data = np.random.randn(365, 3)  # 365 days, 3 features
175
ts_target = np.random.randn(365)
176

177
# Block bootstrap for temporal data
178
block_bootstrap = BlockBootstrap(
179
    n_resamplings=30,
180
    length=30,         # 30-day blocks
181
    n_blocks=None,     # Auto-compute number of blocks
182
    overlapping=False, # Non-overlapping blocks
183
    random_state=42
184
)
185

186
# Use with time series regressor
187
from mapie.regression import TimeSeriesRegressor
188

189
ts_reg = TimeSeriesRegressor(
190
    estimator=RandomForestRegressor(),
191
    method="enbpi",
192
    cv=block_bootstrap  # Use block bootstrap for CV
193
)
194

195
# Generate bootstrap samples
196
splits = list(block_bootstrap.split(ts_data))
197
print(f"Block bootstrap samples: {len(splits)}")
198

199
# Examine block structure
200
train_idx, test_idx = splits[0]
201
print(f"First block - train indices range: {train_idx.min()}-{train_idx.max()}")
202
print(f"First block - test indices range: {test_idx.min()}-{test_idx.max()}")
203
```
204

205
### Custom Sampling Strategies
206

207
```python
208
# Stratified bootstrap for imbalanced data
209
from sklearn.utils import resample
210
from sklearn.model_selection import StratifiedShuffleSplit
211

212
class StratifiedSubsample:
    """Custom stratified bootstrap sampler.

    Each resample draws, with replacement, as many indices from every class
    as that class has members, so class proportions of ``y`` are preserved.

    Parameters:
    - n_resamplings: int, number of bootstrap resamples (default: 30)
    - random_state: Optional[int], random seed; any integer (including 0)
      gives reproducible splits, None gives fresh randomness on every call
    """

    def __init__(self, n_resamplings=30, random_state=None):
        self.n_resamplings = n_resamplings
        self.random_state = random_state

    def split(self, X, y):
        """Generate stratified bootstrap samples.

        Parameters:
        - X: ArrayLike, input data (only its length is used)
        - y: ArrayLike, 1-D class labels used for stratification

        Yields:
        Tuple[NDArray, NDArray]: (bootstrap_indices, out_of_bag_indices)

        Raises:
        ValueError: if X is empty or X and y have different lengths
        """
        labels = np.asarray(y)
        n_samples = len(X)
        if n_samples == 0:
            raise ValueError("Cannot resample an empty dataset.")
        if len(labels) != n_samples:
            raise ValueError(
                f"X and y must have the same length, got {n_samples} and {len(labels)}."
            )

        # A private generator keeps the global NumPy RNG untouched and makes
        # every integer seed, including 0, a real seed.
        rng = np.random.default_rng(self.random_state)

        all_indices = np.arange(n_samples)
        # Row positions of each class, computed once for all resamples.
        class_members = [all_indices[labels == label] for label in np.unique(labels)]

        for _ in range(self.n_resamplings):
            # Stratified resample: bootstrap inside each class separately.
            indices = np.concatenate(
                [rng.choice(members, size=members.size, replace=True)
                 for members in class_members]
            )
            # Shuffle so the sample is not grouped by class.
            rng.shuffle(indices)

            # Out-of-bag indices: rows never drawn in this resample.
            oob_indices = np.setdiff1d(all_indices, indices)

            yield indices, oob_indices

    def get_n_splits(self, X=None, y=None, groups=None):
        """Return the number of resampling splits (arguments are ignored)."""
        return self.n_resamplings
238

239
# Usage
240
stratified_sampler = StratifiedSubsample(n_resamplings=25, random_state=42)
241
```
242

243
## Advanced Sampling Techniques
244

245
### Cross-Validation Integration
246

247
```python
248
from sklearn.model_selection import TimeSeriesSplit, GroupKFold
249
from mapie.regression import CrossConformalRegressor
250

251
# Time series cross-validation
252
ts_cv = TimeSeriesSplit(n_splits=5, gap=10)
253

254
cross_reg = CrossConformalRegressor(
255
    estimator=RandomForestRegressor(),
256
    cv=ts_cv,  # Time-aware cross-validation
257
    method="plus"
258
)
259

260
# Group-based cross-validation
261
group_cv = GroupKFold(n_splits=5)
262
groups = np.array([0, 0, 1, 1, 2, 2, 3, 3, 4, 4] * 100)  # Group labels
263

264
cross_reg = CrossConformalRegressor(
265
    estimator=RandomForestRegressor(),
266
    cv=group_cv
267
)
268

269
# Fit with groups
270
cross_reg.fit_conformalize(X_train, y_train, groups=groups[:len(X_train)])
271
```
272

273
### Monte Carlo Sampling
274

275
```python
276
def monte_carlo_conformal(base_estimator, X_train, y_train, X_test, n_trials=100):
    """
    Monte Carlo approach to conformal prediction.

    Repeatedly split data and compute prediction intervals to assess stability.

    Parameters:
    - base_estimator: scikit-learn regressor, cloned afresh for every trial
    - X_train: ArrayLike, features to split into train / conformalization parts
    - y_train: ArrayLike, targets matching X_train
    - X_test: ArrayLike, points for which intervals are predicted
    - n_trials: int, number of random splits to evaluate (default: 100)

    Returns:
    dict with keys 'mean_intervals', 'std_intervals' (mean and standard
    deviation over trials) and 'all_intervals' (the stacked per-trial results)

    Raises:
    ValueError: if n_trials is smaller than 1
    """
    # Imported once here rather than on every loop iteration.
    from sklearn.base import clone
    from sklearn.model_selection import train_test_split
    from mapie.regression import SplitConformalRegressor

    if n_trials < 1:
        raise ValueError("n_trials must be at least 1.")

    intervals_collection = []

    for trial in range(n_trials):
        # Random 70/30 train/conformalization split for each trial. Only two
        # parts are needed, so a plain two-way split yields exactly the four
        # arrays unpacked here.
        X_tr, X_cal, y_tr, y_cal = train_test_split(
            X_train, y_train,
            train_size=0.7,
            test_size=0.3,
            random_state=trial,
        )

        # Fit conformal predictor on a fresh copy of the estimator
        mapie_reg = SplitConformalRegressor(
            estimator=clone(base_estimator),
            prefit=False
        )
        mapie_reg.fit(X_tr, y_tr)
        mapie_reg.conformalize(X_cal, y_cal)

        # Predict intervals
        _, intervals = mapie_reg.predict_interval(X_test)
        intervals_collection.append(intervals)

    # Aggregate results across trials (axis 0 indexes the trial)
    intervals_array = np.array(intervals_collection)
    mean_intervals = np.mean(intervals_array, axis=0)
    std_intervals = np.std(intervals_array, axis=0)

    return {
        'mean_intervals': mean_intervals,
        'std_intervals': std_intervals,
        'all_intervals': intervals_array
    }
318

319
# Usage
320
from sklearn.ensemble import RandomForestRegressor
321
from sklearn.base import clone
322

323
mc_results = monte_carlo_conformal(
324
    RandomForestRegressor(n_estimators=50),
325
    X_train, y_train, X_test,
326
    n_trials=50
327
)
328
```
329

330
### Weighted Bootstrap
331

332
```python
333
class WeightedSubsample:
    """Bootstrap with sample weights for imbalanced data.

    Parameters:
    - n_resamplings: int, number of bootstrap resamples (default: 30)
    - random_state: Optional[int], random seed for reproducibility
    """

    def __init__(self, n_resamplings=30, random_state=None):
        self.n_resamplings = n_resamplings
        self.random_state = random_state

    def split(self, X, y, sample_weight=None):
        """Generate weighted bootstrap samples.

        Parameters:
        - X: ArrayLike, input data (only its length is used)
        - y: ArrayLike, labels; used to derive weights when none are given
        - sample_weight: Optional[ArrayLike], per-sample sampling weights;
          defaults to scikit-learn's 'balanced' class weighting of y

        Yields:
        Tuple[NDArray, NDArray]: (bootstrap_indices, out_of_bag_indices)

        Raises:
        ValueError: if the weights do not sum to a positive number
        """
        n_samples = len(X)

        # A private RandomState produces the same draws as seeding the global
        # NumPy RNG with this seed, without changing global state that other
        # code may depend on.
        rng = np.random.RandomState(self.random_state)

        # Compute weights if not provided
        if sample_weight is None:
            # Inverse class frequency weighting
            from sklearn.utils.class_weight import compute_sample_weight
            sample_weight = compute_sample_weight('balanced', y)

        # Normalize weights into sampling probabilities
        probabilities = np.asarray(sample_weight, dtype=float)
        total_weight = probabilities.sum()
        if not total_weight > 0:
            raise ValueError("sample_weight must sum to a positive number.")
        probabilities = probabilities / total_weight

        all_indices = np.arange(n_samples)

        for _ in range(self.n_resamplings):
            # Weighted sampling with replacement
            indices = rng.choice(
                n_samples,
                size=n_samples,
                replace=True,
                p=probabilities
            )

            # Out-of-bag indices: rows never drawn in this resample
            oob_indices = np.setdiff1d(all_indices, indices)

            yield indices, oob_indices

    def get_n_splits(self, X=None, y=None, groups=None):
        """Return the number of resampling splits (arguments are ignored)."""
        return self.n_resamplings
370

371
# Usage for imbalanced datasets
372
weighted_sampler = WeightedSubsample(n_resamplings=30, random_state=42)
373

374
# Use with Jackknife-After-Bootstrap
375
jab_reg = JackknifeAfterBootstrapRegressor(
376
    estimator=RandomForestRegressor(),
377
    resampling=weighted_sampler
378
)
379
```
380

381
## Best Practices
382

383
### Choosing Sample Sizes
384

385
```python
386
def optimal_split_sizes(n_total, method="split_conformal"):
    """
    Recommend optimal split sizes based on conformal prediction method.

    The returned proportions always sum to 1.

    Parameters:
    - n_total: int, total number of samples (must be positive)
    - method: str, conformal prediction method

    Returns:
    dict: recommended split proportions under the keys
        "train", "conformalize" and "test"

    Raises:
    ValueError: if n_total is not positive
    """
    if n_total <= 0:
        raise ValueError("n_total must be a positive number of samples.")

    if method == "split_conformal":
        # Split conformal: aim for about 500 training and 200 conformalization
        # samples, each clamped to a sensible share of the data.
        train = max(0.5, min(0.7, 500 / n_total))
        conformalize = max(0.2, min(0.3, 200 / n_total))

        # The test set takes what is left, but never less than 10%. When the
        # two targets above leave less than that, the training share is reduced
        # so that the three proportions still add up to 1.
        test = max(0.1, 1.0 - train - conformalize)
        train = 1.0 - conformalize - test

        # Rounding removes floating-point noise such as 0.30000000000000004.
        return {
            "train": round(train, 10),
            "conformalize": round(conformalize, 10),
            "test": round(test, 10)
        }

    if method == "cross_conformal":
        # Cross conformal: can use more data since CV utilized
        return {
            "train": 0.8,
            "conformalize": 0.0,  # Handled by CV
            "test": 0.2
        }

    # Default balanced split
    return {"train": 0.6, "conformalize": 0.2, "test": 0.2}
414

415
# Usage
416
n_samples = 1000
417
splits = optimal_split_sizes(n_samples, method="split_conformal")
418
print(f"Recommended splits for {n_samples} samples: {splits}")
419
```
420

421
### Handling Small Datasets
422

423
```python
424
def small_dataset_strategy(X, y, min_conformalize_size=50):
    """
    Pick a splitting strategy that adapts to the amount of available data.

    Parameters:
    - X: ArrayLike, input features
    - y: ArrayLike, target values
    - min_conformalize_size: int, smallest conformalization set used for
      medium-sized datasets (default: 50)

    Returns:
    - fewer than 200 samples: a CrossConformalRegressor configured with cv=5
    - 200 to 499 samples: the result of train_conformalize_test_split with
      absolute sizes and a fixed 50-sample test set
    - 500 samples or more: the result of a 60/20/20 train_conformalize_test_split
    """
    total = len(X)

    # Larger datasets: standard proportional split.
    if total >= 500:
        return train_conformalize_test_split(
            X, y,
            train_size=0.6,
            conformalize_size=0.2,
            test_size=0.2
        )

    # Medium datasets: keep the test set minimal and spend the rest on
    # training and conformalization.
    if total >= 200:
        held_out = 50
        conf_size = max(min_conformalize_size, int(0.3 * total))
        return train_conformalize_test_split(
            X, y,
            train_size=total - conf_size - held_out,
            conformalize_size=conf_size,
            test_size=held_out
        )

    # Small datasets: no dedicated split, rely on cross-validation.
    from mapie.regression import CrossConformalRegressor
    print("Using cross-validation for small dataset")
    return CrossConformalRegressor(cv=5)
456
```

Version

Tile

Files

utils.md docs/

utils.mddocs/