# Utilities and Sampling

Utility functions for data splitting, cross-validation strategies, and bootstrap sampling that support the specialized data-handling requirements of conformal prediction workflows.

## Capabilities

### Data Splitting Utilities

Specialized data splitting functions for conformal prediction methods, which require separate training, conformalization, and test sets. The three sizes must jointly account for the whole dataset: fractions that sum to 1, or absolute counts that sum to the number of samples (as in the usage examples below).

```python { .api }
def train_conformalize_test_split(X, y, train_size, conformalize_size, test_size, random_state=None, shuffle=True):
    """
    Split arrays into train, conformalization, and test subsets.

    Parameters:
    - X: ArrayLike, input features (shape: n_samples x n_features)
    - y: ArrayLike, target values (shape: n_samples,)
    - train_size: Union[float, int], size of training set (fraction or absolute number)
    - conformalize_size: Union[float, int], size of conformalization set
    - test_size: Union[float, int], size of test set
    - random_state: Optional[int], random seed for reproducibility
    - shuffle: bool, whether to shuffle data before splitting (default: True)

    Returns:
    Tuple[NDArray, NDArray, NDArray, NDArray, NDArray, NDArray]:
        X_train, X_conformalize, X_test, y_train, y_conformalize, y_test
    """
```

### Bootstrap Sampling Methods

Cross-validation and bootstrap sampling strategies designed for conformal prediction and ensemble methods. Both classes share the same generator protocol; a minimal iteration sketch follows the API block below.

```python { .api }
class Subsample:
    """
    Bootstrap sampling method for conformal prediction.

    Parameters:
    - n_resamplings: int, number of bootstrap resamples (default: 30)
    - n_samples: Optional[int], number of samples per resample (default: None, uses input size)
    - replace: bool, whether to sample with replacement (default: True)
    - random_state: Optional[int], random seed
    """
    def __init__(self, n_resamplings=30, n_samples=None, replace=True, random_state=None): ...

    def split(self, X, *args, **kwargs):
        """
        Generate bootstrap sample indices.

        Parameters:
        - X: ArrayLike, input data for determining sample size

        Yields:
        Generator[Tuple[NDArray, NDArray], None, None]: (train_indices, test_indices)
        """

    def get_n_splits(self, *args, **kwargs):
        """
        Get number of splits.

        Returns:
        int: number of resampling splits
        """

class BlockBootstrap:
    """
    Block bootstrap sampling for time series data.

    Parameters:
    - n_resamplings: int, number of bootstrap resamples (default: 30)
    - length: Optional[int], block length (default: None, computed automatically)
    - n_blocks: Optional[int], number of blocks (default: None, computed automatically)
    - overlapping: bool, whether blocks can overlap (default: False)
    - random_state: Optional[int], random seed
    """
    def __init__(self, n_resamplings=30, length=None, n_blocks=None, overlapping=False, random_state=None): ...

    def split(self, X, *args, **kwargs):
        """
        Generate block bootstrap sample indices for time series.

        Parameters:
        - X: ArrayLike, time series data

        Yields:
        Generator[Tuple[NDArray, NDArray], None, None]: (train_indices, test_indices)
        """

    def get_n_splits(self, *args, **kwargs):
        """
        Get number of splits.

        Returns:
        int: number of resampling splits
        """
```
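
As a quick orientation, here is a minimal sketch of the generator protocol both classes expose (assuming a small synthetic array; the out-of-bag sizes vary per resampling):

```python
import numpy as np
from mapie.subsample import Subsample

X = np.arange(100).reshape(-1, 1)
sampler = Subsample(n_resamplings=3, random_state=0)

print(sampler.get_n_splits())  # 3
for in_bag, out_of_bag in sampler.split(X):
    # In-bag indices are drawn with replacement; the out-of-bag set
    # contains the samples never drawn in that resampling.
    print(len(in_bag), len(out_of_bag))
```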

## Usage Examples

### Three-Way Data Splitting

```python
from mapie.utils import train_conformalize_test_split
import numpy as np

# Generate sample data
X = np.random.randn(1000, 5)
y = np.random.randn(1000)

# Split into train (60%), conformalize (20%), test (20%)
X_train, X_conf, X_test, y_train, y_conf, y_test = train_conformalize_test_split(
    X, y,
    train_size=0.6,
    conformalize_size=0.2,
    test_size=0.2,
    random_state=42
)

print(f"Train set size: {X_train.shape[0]}")
print(f"Conformalization set size: {X_conf.shape[0]}")
print(f"Test set size: {X_test.shape[0]}")

# The same split with absolute sample counts
X_train, X_conf, X_test, y_train, y_conf, y_test = train_conformalize_test_split(
    X, y,
    train_size=600,
    conformalize_size=200,
    test_size=200,
    random_state=42
)
```

### Bootstrap Sampling for Jackknife-After-Bootstrap

```python
from mapie.subsample import Subsample
from mapie.regression import JackknifeAfterBootstrapRegressor
from sklearn.ensemble import RandomForestRegressor

# Create bootstrap sampler
bootstrap = Subsample(
    n_resamplings=50,  # Number of bootstrap samples
    n_samples=None,    # Use full dataset size
    replace=True,      # Bootstrap with replacement
    random_state=42
)

# Use with Jackknife-After-Bootstrap
jab_reg = JackknifeAfterBootstrapRegressor(
    estimator=RandomForestRegressor(n_estimators=50),
    resampling=bootstrap,  # Custom bootstrap strategy
    confidence_level=0.9
)

# Fit and predict
jab_reg.fit_conformalize(X_train, y_train)
y_pred, y_intervals = jab_reg.predict_interval(X_test)

# Examine bootstrap splits
splits = list(bootstrap.split(X_train))
print(f"Number of bootstrap samples: {len(splits)}")
print(f"First bootstrap - train size: {len(splits[0][0])}, test size: {len(splits[0][1])}")
```

### Block Bootstrap for Time Series

```python
from mapie.subsample import BlockBootstrap
import numpy as np
import pandas as pd

# Time series data
dates = pd.date_range('2020-01-01', periods=365, freq='D')
ts_data = np.random.randn(365, 3)  # 365 days, 3 features
ts_target = np.random.randn(365)

# Block bootstrap for temporal data
block_bootstrap = BlockBootstrap(
    n_resamplings=30,
    length=30,          # 30-day blocks
    n_blocks=None,      # Auto-compute number of blocks
    overlapping=False,  # Non-overlapping blocks
    random_state=42
)

# Use with a time series regressor
from mapie.regression import TimeSeriesRegressor
from sklearn.ensemble import RandomForestRegressor

ts_reg = TimeSeriesRegressor(
    estimator=RandomForestRegressor(),
    method="enbpi",
    cv=block_bootstrap  # Use block bootstrap for CV
)

# Generate bootstrap samples
splits = list(block_bootstrap.split(ts_data))
print(f"Block bootstrap samples: {len(splits)}")

# Examine block structure
train_idx, test_idx = splits[0]
print(f"First block - train indices range: {train_idx.min()}-{train_idx.max()}")
print(f"First block - test indices range: {test_idx.min()}-{test_idx.max()}")
```

### Custom Sampling Strategies

```python
# Stratified bootstrap for imbalanced data
import numpy as np
from sklearn.utils import resample

class StratifiedSubsample:
    """Custom stratified bootstrap sampler."""

    def __init__(self, n_resamplings=30, random_state=None):
        self.n_resamplings = n_resamplings
        self.random_state = random_state

    def split(self, X, y):
        """Generate stratified bootstrap samples."""
        for i in range(self.n_resamplings):
            # Stratified resample; offset the seed so each resampling differs
            seed = self.random_state + i if self.random_state is not None else None
            X_boot, y_boot, indices = resample(
                X, y, np.arange(len(X)),
                stratify=y,
                random_state=seed
            )

            # Out-of-bag indices: samples never drawn in this resampling
            oob_indices = np.setdiff1d(np.arange(len(X)), indices)

            yield indices, oob_indices

    def get_n_splits(self, X=None, y=None, groups=None):
        return self.n_resamplings

# Usage
stratified_sampler = StratifiedSubsample(n_resamplings=25, random_state=42)
```

## Advanced Sampling Techniques

### Cross-Validation Integration

```python
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import TimeSeriesSplit, GroupKFold
from mapie.regression import CrossConformalRegressor

# Time series cross-validation
ts_cv = TimeSeriesSplit(n_splits=5, gap=10)

cross_reg = CrossConformalRegressor(
    estimator=RandomForestRegressor(),
    cv=ts_cv,  # Time-aware cross-validation
    method="plus"
)

# Group-based cross-validation
group_cv = GroupKFold(n_splits=5)
groups = np.array([0, 0, 1, 1, 2, 2, 3, 3, 4, 4] * 100)  # Group labels

cross_reg = CrossConformalRegressor(
    estimator=RandomForestRegressor(),
    cv=group_cv
)

# Fit with groups
cross_reg.fit_conformalize(X_train, y_train, groups=groups[:len(X_train)])
```

### Monte Carlo Sampling

```python
import numpy as np
from sklearn.base import clone
from sklearn.model_selection import train_test_split
from mapie.regression import SplitConformalRegressor

def monte_carlo_conformal(base_estimator, X_train, y_train, X_test, n_trials=100):
    """
    Monte Carlo approach to conformal prediction.

    Repeatedly split data and compute prediction intervals to assess stability.
    """
    intervals_collection = []

    for trial in range(n_trials):
        # Random 70/30 train/conformalize split for each trial. Only two
        # subsets are needed here, so sklearn's train_test_split is used
        # rather than the three-way splitter.
        X_tr, X_cal, y_tr, y_cal = train_test_split(
            X_train, y_train,
            train_size=0.7,
            random_state=trial
        )

        # Fit conformal predictor
        mapie_reg = SplitConformalRegressor(
            estimator=clone(base_estimator),
            prefit=False
        )
        mapie_reg.fit(X_tr, y_tr)
        mapie_reg.conformalize(X_cal, y_cal)

        # Predict intervals
        _, intervals = mapie_reg.predict_interval(X_test)
        intervals_collection.append(intervals)

    # Aggregate results
    intervals_array = np.array(intervals_collection)
    mean_intervals = np.mean(intervals_array, axis=0)
    std_intervals = np.std(intervals_array, axis=0)

    return {
        'mean_intervals': mean_intervals,
        'std_intervals': std_intervals,
        'all_intervals': intervals_array
    }

# Usage
from sklearn.ensemble import RandomForestRegressor

mc_results = monte_carlo_conformal(
    RandomForestRegressor(n_estimators=50),
    X_train, y_train, X_test,
    n_trials=50
)
```

### Weighted Bootstrap

```python
import numpy as np
from sklearn.utils.class_weight import compute_sample_weight

class WeightedSubsample:
    """Bootstrap with sample weights for imbalanced data."""

    def __init__(self, n_resamplings=30, random_state=None):
        self.n_resamplings = n_resamplings
        self.random_state = random_state

    def split(self, X, y, sample_weight=None):
        """Generate weighted bootstrap samples."""
        n_samples = len(X)
        rng = np.random.default_rng(self.random_state)

        # Compute weights if not provided (inverse class frequency weighting)
        if sample_weight is None:
            sample_weight = compute_sample_weight('balanced', y)

        # Normalize weights to a probability distribution
        sample_weight = sample_weight / np.sum(sample_weight)

        for i in range(self.n_resamplings):
            # Weighted sampling with replacement
            indices = rng.choice(
                n_samples,
                size=n_samples,
                replace=True,
                p=sample_weight
            )

            # Out-of-bag indices
            oob_indices = np.setdiff1d(np.arange(n_samples), np.unique(indices))

            yield indices, oob_indices

    def get_n_splits(self, X=None, y=None, groups=None):
        return self.n_resamplings

# Usage for imbalanced datasets
weighted_sampler = WeightedSubsample(n_resamplings=30, random_state=42)

# Use with Jackknife-After-Bootstrap
from mapie.regression import JackknifeAfterBootstrapRegressor
from sklearn.ensemble import RandomForestRegressor

jab_reg = JackknifeAfterBootstrapRegressor(
    estimator=RandomForestRegressor(),
    resampling=weighted_sampler
)
```

## Best Practices

### Choosing Sample Sizes

```python
def optimal_split_sizes(n_total, method="split_conformal"):
    """
    Recommend split sizes based on the conformal prediction method.

    Parameters:
    - n_total: int, total number of samples
    - method: str, conformal prediction method

    Returns:
    dict: recommended split proportions
    """
    if method == "split_conformal":
        # Split conformal: larger training set, moderate conformalization
        # set, and the remaining fraction for testing
        train = max(0.5, min(0.7, 500 / n_total))
        conformalize = max(0.2, min(0.3, 200 / n_total))
        return {
            "train": train,
            "conformalize": conformalize,
            "test": round(1.0 - train - conformalize, 4)
        }
    elif method == "cross_conformal":
        # Cross conformal: conformalization is handled by CV,
        # so more data can go to training
        return {
            "train": 0.8,
            "conformalize": 0.0,  # Handled by CV
            "test": 0.2
        }
    else:
        # Default balanced split
        return {"train": 0.6, "conformalize": 0.2, "test": 0.2}

# Usage
n_samples = 1000
splits = optimal_split_sizes(n_samples, method="split_conformal")
print(f"Recommended splits for {n_samples} samples: {splits}")
```

### Handling Small Datasets

```python
from mapie.utils import train_conformalize_test_split

def small_dataset_strategy(X, y, min_conformalize_size=50):
    """
    Handle small datasets with an adaptive splitting strategy.
    """
    n_samples = len(X)

    if n_samples < 200:
        # Use cross-validation for small datasets
        from mapie.regression import CrossConformalRegressor
        print("Using cross-validation for small dataset")
        return CrossConformalRegressor(cv=5)

    elif n_samples < 500:
        # Minimal test set; focus on train/conformalize
        conf_size = max(min_conformalize_size, int(0.3 * n_samples))
        train_size = n_samples - conf_size - 50  # Keep 50 samples for test

        return train_conformalize_test_split(
            X, y,
            train_size=train_size,
            conformalize_size=conf_size,
            test_size=50
        )

    else:
        # Standard split for larger datasets
        return train_conformalize_test_split(
            X, y,
            train_size=0.6,
            conformalize_size=0.2,
            test_size=0.2
        )
```
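
A short usage sketch (assuming synthetic data; note that the return type differs by branch, so callers must check whether they received a regressor or the six split arrays):

```python
import numpy as np

X_small = np.random.randn(300, 4)
y_small = np.random.randn(300)

result = small_dataset_strategy(X_small, y_small)
# For 200 <= n < 500 this returns the six split arrays:
# conf_size = max(50, 90) = 90, train_size = 300 - 90 - 50 = 160
X_tr, X_conf, X_te, y_tr, y_conf, y_te = result
print(X_tr.shape, X_conf.shape, X_te.shape)  # (160, 4) (90, 4) (50, 4)
```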