or run

npx @tessl/cli init
Log in

Version

Tile

Overview

Evals

Files

Files

docs

classification.md clustering.md datasets.md evaluation.md feature-engineering.md file-io.md index.md math-utils.md pattern-mining.md plotting.md preprocessing.md regression.md text-processing.md utilities.md

docs/evaluation.md

0

# Model Evaluation

1

2

Comprehensive model evaluation tools including statistical tests, bootstrap methods, and cross-validation utilities for assessing and comparing machine learning models.

3

4

## Capabilities

5

6

### Statistical Testing

7

8

Statistical tests for comparing classifier performance and assessing significance of differences.

9

10

```python { .api }

11

def mcnemar(ary, corrected=True, exact=False):

12

"""

13

McNemar test for comparing two classifiers on the same dataset.

14

15

Parameters:

16

- ary: array-like, 2x2 contingency table or confusion matrix

17

- corrected: bool, apply continuity correction

18

- exact: bool, use exact binomial test

19

20

Returns:

21

- chi2: float, chi-squared statistic

22

- p_value: float, p-value of the test

23

"""

24

25

def mcnemar_table(y_target, y_model1, y_model2):

26

"""

27

Create McNemar table for two classifiers.

28

29

Parameters:

30

- y_target: array-like, true class labels

31

- y_model1: array-like, predictions from first classifier

32

- y_model2: array-like, predictions from second classifier

33

34

Returns:

35

- tb: array, 2x2 McNemar table

36

"""

37

38

def mcnemar_tables(y_target, *y_model_predictions):

39

"""

40

Create multiple McNemar tables for pairwise comparisons.

41

42

Parameters:

43

- y_target: array-like, true class labels

44

- y_model_predictions: arrays, predictions from multiple classifiers

45

46

Returns:

47

- tb: dict, pairwise McNemar tables

48

"""

49

50

def cochrans_q(X, alpha=0.05):

51

"""

52

Cochran's Q test for comparing multiple classifiers.

53

54

Parameters:

55

- X: array-like, binary classifier results matrix

56

- alpha: float, significance level

57

58

Returns:

59

- q: float, Cochran's Q statistic

60

- p_value: float, p-value of the test

61

"""

62

63

def paired_ttest_resampled(estimator1, estimator2, X, y, num_rounds=30,

64

test_size=0.3, scoring=None, random_seed=None):

65

"""

66

Resampled paired t-test for classifier comparison.

67

68

Parameters:

69

- estimator1, estimator2: sklearn-compatible estimators

70

- X: array-like, feature matrix

71

- y: array-like, target labels

72

- num_rounds: int, number of resampling rounds

73

- test_size: float, test set proportion

74

- scoring: str or callable, scoring metric

75

- random_seed: int, random seed

76

77

Returns:

78

- t: float, t-statistic

79

- p_value: float, p-value

80

- scores_diff: array, score differences

81

"""

82

83

def paired_ttest_kfold_cv(estimator1, estimator2, X, y, cv=10,

84

scoring=None, shuffle=True, random_seed=None):

85

"""

86

Paired t-test with k-fold cross-validation.

87

88

Parameters:

89

- estimator1, estimator2: sklearn-compatible estimators

90

- X: array-like, feature matrix

91

- y: array-like, target labels

92

- cv: int, number of cross-validation folds

93

- scoring: str or callable, scoring metric

94

- shuffle: bool, shuffle data before splitting

95

- random_seed: int, random seed

96

97

Returns:

98

- t: float, t-statistic

99

- p_value: float, p-value

100

- scores_diff: array, score differences

101

"""

102

103

def paired_ttest_5x2cv(estimator1, estimator2, X, y, scoring=None, random_seed=None):

104

"""

105

5x2cv paired t-test for classifier comparison.

106

107

Parameters:

108

- estimator1, estimator2: sklearn-compatible estimators

109

- X: array-like, feature matrix

110

- y: array-like, target labels

111

- scoring: str or callable, scoring metric

112

- random_seed: int, random seed

113

114

Returns:

115

- t: float, t-statistic

116

- p_value: float, p-value

117

"""

118

119

def proportion_difference(x, n, alpha=0.05):

120

"""

121

Test for difference in proportions with confidence interval.

122

123

Parameters:

124

- x: int, number of successes in sample

125

- n: int, sample size

126

- alpha: float, significance level

127

128

Returns:

129

- prop: float, sample proportion

130

- ci_lower: float, lower confidence interval bound

131

- ci_upper: float, upper confidence interval bound

132

"""

133

```

134

135

### Bootstrap Methods

136

137

Bootstrap resampling methods for model evaluation and confidence interval estimation.

138

139

```python { .api }

140

def bootstrap(x, func, n_splits=200, confidence_interval=0.95,

141

random_seed=None, ddof=1):

142

"""

143

Bootstrap confidence intervals for any statistic.

144

145

Parameters:

146

- x: array-like, input data

147

- func: callable, function to apply to bootstrap samples

148

- n_splits: int, number of bootstrap samples

149

- confidence_interval: float, confidence interval level

150

- random_seed: int, random seed

151

- ddof: int, delta degrees of freedom used when computing the bootstrap standard error

152

153

Returns:

154

- original: float, original statistic

155

- bias: float, bootstrap bias

156

- std_err: float, bootstrap standard error

157

- ci_bounds: tuple, confidence interval bounds

158

"""

159

160

def bootstrap_point632_score(estimator, X, y, n_splits=200, method='.632+',

161

scoring=None, predict_proba=False, pos_label=1,

162

random_seed=None):

163

"""

164

Bootstrap .632 and .632+ error estimation.

165

166

Parameters:

167

- estimator: sklearn-compatible estimator

168

- X: array-like, feature matrix

169

- y: array-like, target labels

170

- n_splits: int, number of bootstrap samples

171

- method: str, '.632' or '.632+'

172

- scoring: str or callable, scoring metric

173

- predict_proba: bool, use predicted probabilities

174

- pos_label: int, positive class label for binary classification

175

- random_seed: int, random seed

176

177

Returns:

178

- scores: dict, bootstrap error estimates

179

"""

180

181

class BootstrapOutOfBag:

182

def __init__(self, n_splits=200, random_state=None):

183

"""

184

Bootstrap Out-of-Bag cross-validation.

185

186

Parameters:

187

- n_splits: int, number of bootstrap samples

188

- random_state: int, random state

189

"""

190

191

def split(self, X, y=None, groups=None):

192

"""Generate bootstrap train/test splits"""

193

194

def get_n_splits(self, X=None, y=None, groups=None):

195

"""Get number of splits"""

196

```

197

198

### Cross-Validation Utilities

199

200

Advanced cross-validation strategies for specific data types and evaluation scenarios.

201

202

```python { .api }

203

class RandomHoldoutSplit:

204

def __init__(self, valid_size=0.5, n_splits=1, stratify=False, random_state=None):

205

"""

206

Random holdout validation split.

207

208

Parameters:

209

- valid_size: float, validation set proportion

210

- n_splits: int, number of splits to generate

211

- stratify: bool, stratified sampling

212

- random_state: int, random state

213

"""

214

215

def split(self, X, y=None, groups=None):

216

"""Generate train/validation splits"""

217

218

class PredefinedHoldoutSplit:

219

def __init__(self, test_fold):

220

"""

221

Predefined holdout split using test fold indices.

222

223

Parameters:

224

- test_fold: array-like, test set indices

225

"""

226

227

def split(self, X, y=None, groups=None):

228

"""Generate predefined train/test split"""

229

230

class GroupTimeSeriesSplit:

231

def __init__(self, n_splits=5, test_size=None):

232

"""

233

Time series cross-validation for grouped data.

234

235

Parameters:

236

- n_splits: int, number of splits

237

- test_size: int, test set size

238

"""

239

240

def split(self, X, y=None, groups=None):

241

"""Generate time series splits"""

242

243

def get_n_splits(self, X=None, y=None, groups=None):

244

"""Get number of splits"""

245

```

246

247

### Feature Importance and Permutation Testing

248

249

Methods for assessing feature importance and performing permutation-based statistical tests.

250

251

```python { .api }

252

def feature_importance_permutation(X, y, predict_method, metric, num_rounds=1,

253

seed=None):

254

"""

255

Permutation-based feature importance calculation.

256

257

Parameters:

258

- X: array-like, feature matrix

259

- y: array-like, target labels

260

- predict_method: callable, prediction method

261

- metric: callable, evaluation metric

262

- num_rounds: int, number of permutation rounds

263

- seed: int, random seed

264

265

Returns:

266

- importances: array, feature importance scores

267

"""

268

269

def permutation_test(x, y, func, method='exact', num_rounds=1000, seed=None):

270

"""

271

Permutation test for statistical significance.

272

273

Parameters:

274

- x: array-like, first sample

275

- y: array-like, second sample

276

- func: callable, test statistic function

277

- method: str, 'exact' or 'approximate'

278

- num_rounds: int, number of permutation rounds

279

- seed: int, random seed

280

281

Returns:

282

- original_stat: float, original test statistic

283

- p_value: float, permutation p-value

284

- null_dist: array, null distribution of test statistics

285

"""

286

```

287

288

### Bias-Variance Decomposition

289

290

Decompose prediction error into bias and variance components.

291

292

```python { .api }

293

def bias_variance_decomp(estimator, X_train, y_train, X_test, y_test,

294

loss='0-1_loss', num_rounds=200, random_seed=None):

295

"""

296

Bias-variance decomposition for model evaluation.

297

298

Parameters:

299

- estimator: sklearn-compatible estimator

300

- X_train: array-like, training features

301

- y_train: array-like, training labels

302

- X_test: array-like, test features

303

- y_test: array-like, test labels

304

- loss: str, loss function ('0-1_loss' or 'mse')

305

- num_rounds: int, number of bootstrap rounds

306

- random_seed: int, random seed

307

308

Returns:

309

- avg_expected_loss: float, average expected loss

310

- avg_bias: float, average bias

311

- avg_var: float, average variance

312

- all_pred: array, all predictions from bootstrap samples

313

"""

314

```

315

316

### Additional Metrics and Utilities

317

318

Additional evaluation metrics and utility functions.

319

320

```python { .api }

321

def accuracy_score(y_target, y_predicted, normalize=True):

322

"""

323

Calculate accuracy score.

324

325

Parameters:

326

- y_target: array-like, true labels

327

- y_predicted: array-like, predicted labels

328

- normalize: bool, if True return the fraction of correctly classified samples; if False return the count

329

330

Returns:

331

- accuracy: float or int, accuracy score

332

"""

333

334

def lift_score(y_target, y_probas, binary=True):

335

"""

336

Calculate lift score for binary classification.

337

338

Parameters:

339

- y_target: array-like, true binary labels

340

- y_probas: array-like, predicted probabilities

341

- binary: bool, binary classification

342

343

Returns:

344

- lift: float, lift score

345

"""

346

347

def confusion_matrix(y_target, y_predicted, binary=False):

348

"""

349

Create confusion matrix.

350

351

Parameters:

352

- y_target: array-like, true labels

353

- y_predicted: array-like, predicted labels

354

- binary: bool, binary classification

355

356

Returns:

357

- cm: array, confusion matrix

358

"""

359

360

def create_counterfactual(df, x1, y1, x2, y2, treatment_feature, outcome_feature):

361

"""

362

Generate counterfactual examples for causal analysis.

363

364

Parameters:

365

- df: DataFrame, input data

366

- x1, y1: int, coordinates for treatment group

367

- x2, y2: int, coordinates for control group

368

- treatment_feature: str, treatment column name

369

- outcome_feature: str, outcome column name

370

371

Returns:

372

- counterfactual_df: DataFrame, counterfactual examples

373

"""

374

375

def ftest(ary):

376

"""

377

F-test for comparing the performance of multiple classifiers.

378

379

Parameters:

380

- ary: array-like, classifier performance scores

381

382

Returns:

383

- f_stat: float, F-statistic

384

- p_value: float, p-value

385

"""

386

387

def combined_ftest_5x2cv(estimator1, estimator2, X, y, random_seed=None):

388

"""

389

Combined F-test using 5x2 cross-validation.

390

391

Parameters:

392

- estimator1, estimator2: sklearn-compatible estimators

393

- X: array-like, feature matrix

394

- y: array-like, target labels

395

- random_seed: int, random seed

396

397

Returns:

398

- f: float, F-statistic

399

- p_value: float, p-value

400

"""

401

402

def scoring(y_target, y_predicted, metric='accuracy', pos_label=1, average='binary'):

403

"""

404

Flexible scoring function supporting multiple metrics.

405

406

Parameters:

407

- y_target: array-like, true labels

408

- y_predicted: array-like, predicted labels

409

- metric: str, evaluation metric

410

- pos_label: int, positive class label

411

- average: str, averaging method for multi-class

412

413

Returns:

414

- score: float, computed score

415

"""

416

```

417

418

## Usage Examples

419

420

### McNemar Test Example

421

422

```python

423

from mlxtend.evaluate import mcnemar, mcnemar_table

424

from sklearn.ensemble import RandomForestClassifier

425

from sklearn.svm import SVC

426

from sklearn.datasets import make_classification

427

from sklearn.model_selection import train_test_split

428

429

# Create dataset

430

X, y = make_classification(n_samples=1000, n_features=20, random_state=42)

431

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

432

433

# Train two classifiers

434

clf1 = RandomForestClassifier(random_state=42)

435

clf2 = SVC(random_state=42)

436

437

clf1.fit(X_train, y_train)

438

clf2.fit(X_train, y_train)

439

440

# Get predictions

441

y_pred1 = clf1.predict(X_test)

442

y_pred2 = clf2.predict(X_test)

443

444

# Create McNemar table and perform test

445

tb = mcnemar_table(y_test, y_pred1, y_pred2)

446

chi2, p_value = mcnemar(tb, corrected=True)

447

448

print(f"McNemar's chi-squared: {chi2:.4f}")

449

print(f"P-value: {p_value:.4f}")

450

```

451

452

### Bootstrap Evaluation Example

453

454

```python

455

from mlxtend.evaluate import bootstrap_point632_score

456

from sklearn.ensemble import RandomForestClassifier

457

from sklearn.datasets import make_classification

458

459

# Create dataset

460

X, y = make_classification(n_samples=1000, n_features=20, random_state=42)

461

462

# Train classifier

463

clf = RandomForestClassifier(random_state=42)

464

465

# Perform bootstrap .632+ evaluation

466

scores = bootstrap_point632_score(clf, X, y, method='.632+',

467

scoring='accuracy', n_splits=200)

468

469

print(f"Bootstrap .632+ accuracy: {scores['.632+']:.4f}")

470

print(f"Training accuracy: {scores['train']:.4f}")

471

print(f"Test accuracy: {scores['test']:.4f}")

472

```

473

474

### Bias-Variance Decomposition Example

475

476

```python

477

from mlxtend.evaluate import bias_variance_decomp

478

from sklearn.tree import DecisionTreeClassifier

479

from sklearn.datasets import make_classification

480

from sklearn.model_selection import train_test_split

481

482

# Create dataset

483

X, y = make_classification(n_samples=1000, n_features=20, random_state=42)

484

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

485

486

# Analyze bias-variance tradeoff

487

clf = DecisionTreeClassifier(max_depth=5, random_state=42)

488

avg_expected_loss, avg_bias, avg_var, all_pred = bias_variance_decomp(

489

clf, X_train, y_train, X_test, y_test,

490

loss='0-1_loss', num_rounds=200, random_seed=42

491

)

492

493

print(f"Average Expected Loss: {avg_expected_loss:.4f}")

494

print(f"Average Bias: {avg_bias:.4f}")

495

print(f"Average Variance: {avg_var:.4f}")

496

```