0
# Model Evaluation
1
2
Comprehensive model evaluation tools including statistical tests, bootstrap methods, and cross-validation utilities for assessing and comparing machine learning models.
3
4
## Capabilities
5
6
### Statistical Testing
7
8
Statistical tests for comparing classifier performance and assessing significance of differences.
9
10
```python { .api }
11
def mcnemar(ary, corrected=True, exact=False):
12
"""
13
McNemar test for comparing two classifiers on the same dataset.
14
15
Parameters:
16
- ary: array-like, 2x2 contingency table or confusion matrix
17
- corrected: bool, apply continuity correction
18
- exact: bool, use exact binomial test
19
20
Returns:
21
- chi2: float, chi-squared statistic
22
- p_value: float, p-value of the test
23
"""
24
25
def mcnemar_table(y_target, y_model1, y_model2):
26
"""
27
Create McNemar table for two classifiers.
28
29
Parameters:
30
- y_target: array-like, true class labels
31
- y_model1: array-like, predictions from first classifier
32
- y_model2: array-like, predictions from second classifier
33
34
Returns:
35
- tb: array, 2x2 McNemar table
36
"""
37
38
def mcnemar_tables(y_target, *y_model_predictions):
39
"""
40
Create multiple McNemar tables for pairwise comparisons.
41
42
Parameters:
43
- y_target: array-like, true class labels
44
- y_model_predictions: arrays, predictions from multiple classifiers
45
46
Returns:
47
- tb: dict, pairwise McNemar tables
48
"""
49
50
def cochrans_q(y_target, *y_model_predictions):
    """
    Cochran's Q test for comparing two or more classifiers.

    Parameters:
    - y_target: array-like, true class labels
    - y_model_predictions: arrays, predicted class labels from each classifier (two or more)

    Returns:
    - q: float, Cochran's Q statistic
    - p_value: float, p-value of the test
    """
62
63
def paired_ttest_resampled(estimator1, estimator2, X, y, num_rounds=30,
                           test_size=0.3, scoring=None, random_seed=None):
    """
    Resampled paired t-test for classifier comparison.

    Parameters:
    - estimator1, estimator2: sklearn-compatible estimators
    - X: array-like, feature matrix
    - y: array-like, target labels
    - num_rounds: int, number of resampling rounds
    - test_size: float, test set proportion
    - scoring: str or callable, scoring metric
    - random_seed: int, random seed

    Returns:
    - t: float, t-statistic
    - p_value: float, two-tailed p-value
    """
82
83
def paired_ttest_kfold_cv(estimator1, estimator2, X, y, cv=10,
                          scoring=None, shuffle=False, random_seed=None):
    """
    Paired t-test with k-fold cross-validation.

    Parameters:
    - estimator1, estimator2: sklearn-compatible estimators
    - X: array-like, feature matrix
    - y: array-like, target labels
    - cv: int, number of cross-validation folds
    - scoring: str or callable, scoring metric
    - shuffle: bool, shuffle data before splitting
    - random_seed: int, random seed

    Returns:
    - t: float, t-statistic
    - p_value: float, two-tailed p-value
    """
102
103
def paired_ttest_5x2cv(estimator1, estimator2, X, y, scoring=None, random_seed=None):
104
"""
105
5x2cv paired t-test for classifier comparison.
106
107
Parameters:
108
- estimator1, estimator2: sklearn-compatible estimators
109
- X: array-like, feature matrix
110
- y: array-like, target labels
111
- scoring: str or callable, scoring metric
112
- random_seed: int, random seed
113
114
Returns:
115
- t: float, t-statistic
116
- p_value: float, p-value
117
"""
118
119
def proportion_difference(proportion_1, proportion_2, n_1, n_2=None):
    """
    Z-test for the difference between two proportions.

    Parameters:
    - proportion_1: float, first proportion
    - proportion_2: float, second proportion
    - n_1: int, sample size of the first test sample
    - n_2: int or None, sample size of the second test sample (n_1 is used if None)

    Returns:
    - z: float, z-score test statistic
    - p_value: float, p-value of the test
    """
133
```
134
135
### Bootstrap Methods
136
137
Bootstrap resampling methods for model evaluation and confidence interval estimation.
138
139
```python { .api }
140
def bootstrap(x, func, num_rounds=1000, ci=0.95, ddof=1, seed=None):
    """
    Bootstrap confidence intervals for any statistic.

    Parameters:
    - x: array-like, input data
    - func: callable, function computing the statistic on a bootstrap sample
    - num_rounds: int, number of bootstrap samples
    - ci: float, confidence interval level in the range (0, 1)
    - ddof: int, delta degrees of freedom used for the standard error
    - seed: int, random seed

    Returns:
    - original: float, statistic computed on the original data
    - std_err: float, bootstrap standard error
    - ci_bounds: tuple, (lower, upper) confidence interval bounds
    """
159
160
def bootstrap_point632_score(estimator, X, y, n_splits=200, method='.632',
                             scoring_func=None, predict_proba=False,
                             random_seed=None, clone_estimator=True,
                             **fit_params):
    """
    Bootstrap out-of-bag, .632 and .632+ performance estimation.

    Parameters:
    - estimator: sklearn-compatible estimator
    - X: array-like, feature matrix
    - y: array-like, target labels
    - n_splits: int, number of bootstrap samples
    - method: str, '.632', '.632+' or 'oob'
    - scoring_func: callable, score function with signature scoring_func(y, y_pred);
      if None, accuracy is used for classifiers and mean squared error for regressors
    - predict_proba: bool, pass predicted probabilities instead of labels to scoring_func
    - random_seed: int, random seed
    - clone_estimator: bool, fit a clone of the estimator instead of the estimator itself
    - fit_params: additional keyword arguments forwarded to the estimator's fit method

    Returns:
    - scores: array, one score per bootstrap round (average them for the point estimate)
    """
180
181
class BootstrapOutOfBag:
    def __init__(self, n_splits=200, random_seed=None):
        """
        Bootstrap Out-of-Bag cross-validation.

        Parameters:
        - n_splits: int, number of bootstrap samples
        - random_seed: int, random seed
        """

    def split(self, X, y=None, groups=None):
        """Generate bootstrap train/test splits"""

    def get_n_splits(self, X=None, y=None, groups=None):
        """Get number of splits"""
196
```
197
198
### Cross-Validation Utilities
199
200
Advanced cross-validation strategies for specific data types and evaluation scenarios.
201
202
```python { .api }
203
class RandomHoldoutSplit:
    def __init__(self, valid_size=0.5, random_seed=None, stratify=False):
        """
        Random holdout validation split (a single train/validation split).

        Parameters:
        - valid_size: float, validation set proportion
        - random_seed: int, random seed
        - stratify: bool, stratified sampling
        """

    def split(self, X, y=None, groups=None):
        """Generate train/validation splits"""
217
218
class PredefinedHoldoutSplit:
    def __init__(self, valid_indices):
        """
        Predefined holdout split using user-specified validation indices.

        Parameters:
        - valid_indices: array-like, indices of the examples to use for validation
        """

    def split(self, X, y=None, groups=None):
        """Generate predefined train/validation split"""
229
230
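class GroupTimeSeriesSplit:
    def __init__(self, test_size, train_size=None, n_splits=None,
                 gap_size=0, shift_size=1, window_type='rolling'):
        """
        Time series cross-validation for grouped data.

        Parameters:
        - test_size: int, test set size (number of groups)
        - train_size: int or None, training set size (number of groups)
        - n_splits: int or None, number of splits
        - gap_size: int, gap between the training and test sets
        - shift_size: int, step by which the window moves for the next split
        - window_type: str, 'rolling' or 'expanding'
        """

    def split(self, X, y=None, groups=None):
        """Generate time series splits"""

    def get_n_splits(self, X=None, y=None, groups=None):
        """Get number of splits"""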
245
```
246
247
### Feature Importance and Permutation Testing
248
249
Methods for assessing feature importance and performing permutation-based statistical tests.
250
251
```python { .api }
252
def feature_importance_permutation(X, y, predict_method, metric, num_rounds=1,
                                   feature_groups=None, seed=None):
    """
    Permutation-based feature importance calculation.

    Parameters:
    - X: array-like, feature matrix
    - y: array-like, target labels
    - predict_method: callable, prediction method (e.g. the predict method of a fitted model)
    - metric: str or callable, 'r2', 'accuracy', or a function metric(y_true, y_pred)
    - num_rounds: int, number of permutation rounds
    - feature_groups: list or None, optional groups of feature indices to permute together
    - seed: int, random seed

    Returns:
    - mean_importance_vals: array, importance of each feature averaged over the rounds
    - all_importance_vals: array, importance of each feature in each round
    """
268
269
def permutation_test(x, y, func='x_mean != y_mean', method='exact',
                     num_rounds=1000, seed=None, paired=False):
    """
    Permutation test for statistical significance.

    Parameters:
    - x: array-like, first sample
    - y: array-like, second sample
    - func: str or callable, test statistic; 'x_mean != y_mean' (two-sided),
      'x_mean > y_mean', 'x_mean < y_mean', or a custom function of (x, y)
    - method: str, 'exact' or 'approximate'
    - num_rounds: int, number of permutation rounds (used when method='approximate')
    - seed: int, random seed
    - paired: bool, treat x and y as paired samples

    Returns:
    - p_value: float, permutation p-value
    """
286
```
287
288
### Bias-Variance Decomposition
289
290
Decompose prediction error into bias and variance components.
291
292
```python { .api }
293
def bias_variance_decomp(estimator, X_train, y_train, X_test, y_test,
                         loss='0-1_loss', num_rounds=200, random_seed=None,
                         **fit_params):
    """
    Bias-variance decomposition for model evaluation.

    Parameters:
    - estimator: sklearn-compatible estimator
    - X_train: array-like, training features
    - y_train: array-like, training labels
    - X_test: array-like, test features
    - y_test: array-like, test labels
    - loss: str, loss function ('0-1_loss' or 'mse')
    - num_rounds: int, number of bootstrap rounds
    - random_seed: int, random seed
    - fit_params: additional keyword arguments forwarded to the estimator's fit method

    Returns:
    - avg_expected_loss: float, average expected loss
    - avg_bias: float, average bias
    - avg_var: float, average variance
    """
314
```
315
316
### Additional Metrics and Utilities
317
318
Additional evaluation metrics and utility functions.
319
320
```python { .api }
321
def accuracy_score(y_target, y_predicted, normalize=True):
322
"""
323
Calculate accuracy score.
324
325
Parameters:
326
- y_target: array-like, true labels
327
- y_predicted: array-like, predicted labels
328
- normalize: bool, return fraction or count
329
330
Returns:
331
- accuracy: float or int, accuracy score
332
"""
333
334
def lift_score(y_target, y_predicted, binary=True, positive_label=1):
    """
    Calculate lift score for binary classification.

    Parameters:
    - y_target: array-like, true class labels
    - y_predicted: array-like, predicted class labels
    - binary: bool, map multi-class labels to binary (positive_label vs. rest)
    - positive_label: int, class label of the positive class

    Returns:
    - lift: float, lift score
    """
346
347
def confusion_matrix(y_target, y_predicted, binary=False):
348
"""
349
Create confusion matrix.
350
351
Parameters:
352
- y_target: array-like, true labels
353
- y_predicted: array-like, predicted labels
354
- binary: bool, binary classification
355
356
Returns:
357
- cm: array, confusion matrix
358
"""
359
360
def create_counterfactual(x_reference, y_desired, model, X_dataset,
                          y_desired_proba=None, lammbda=0.1, random_seed=None):
    """
    Generate a counterfactual example for explaining a model prediction.

    Parameters:
    - x_reference: array-like, the data instance to be explained
    - y_desired: int, desired class label for the counterfactual
    - model: fitted estimator used for prediction
    - X_dataset: array-like, (training) dataset the counterfactual search is based on
    - y_desired_proba: float or None, desired class probability for y_desired
    - lammbda: float, weight of the prediction term in the search objective
    - random_seed: int, random seed

    Returns:
    - x_counterfact: array, the counterfactual example
    """
374
375
def ftest(y_target, *y_model_predictions):
    """
    F-test for comparing the accuracies of two or more classifiers.

    Parameters:
    - y_target: array-like, true class labels
    - y_model_predictions: arrays, predicted class labels from each classifier (two or more)

    Returns:
    - f_stat: float, F-statistic
    - p_value: float, p-value
    """
386
387
def combined_ftest_5x2cv(estimator1, estimator2, X, y, scoring=None, random_seed=None):
    """
    Combined F-test using 5x2 cross-validation.

    Parameters:
    - estimator1, estimator2: sklearn-compatible estimators
    - X: array-like, feature matrix
    - y: array-like, target labels
    - scoring: str or callable, scoring metric
    - random_seed: int, random seed

    Returns:
    - f: float, F-statistic
    - p_value: float, p-value
    """
401
402
def scoring(y_target, y_predicted, metric='error', positive_label=1, unique_labels='auto'):
    """
    Flexible scoring function supporting multiple metrics.

    Parameters:
    - y_target: array-like, true labels
    - y_predicted: array-like, predicted labels
    - metric: str, evaluation metric (e.g. 'accuracy', 'error', 'precision', 'recall', 'f1')
    - positive_label: int, class label of the positive class
    - unique_labels: str or array-like, 'auto' to infer the class labels or an explicit list

    Returns:
    - score: float, computed score
    """
416
```
417
418
## Usage Examples
419
420
### McNemar Test Example
421
422
```python
423
from mlxtend.evaluate import mcnemar, mcnemar_table
424
from sklearn.ensemble import RandomForestClassifier
425
from sklearn.svm import SVC
426
from sklearn.datasets import make_classification
427
from sklearn.model_selection import train_test_split
428
429
# Create dataset
430
X, y = make_classification(n_samples=1000, n_features=20, random_state=42)
431
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
432
433
# Train two classifiers
434
clf1 = RandomForestClassifier(random_state=42)
435
clf2 = SVC(random_state=42)
436
437
clf1.fit(X_train, y_train)
438
clf2.fit(X_train, y_train)
439
440
# Get predictions
441
y_pred1 = clf1.predict(X_test)
442
y_pred2 = clf2.predict(X_test)
443
444
# Create McNemar table and perform test
445
tb = mcnemar_table(y_test, y_pred1, y_pred2)
446
chi2, p_value = mcnemar(tb, corrected=True)
447
448
print(f"McNemar's chi-squared: {chi2:.4f}")
449
print(f"P-value: {p_value:.4f}")
450
```
451
452
### Bootstrap Evaluation Example
453
454
```python
455
import numpy as np
from mlxtend.evaluate import bootstrap_point632_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification

# Create dataset
X, y = make_classification(n_samples=1000, n_features=20, random_state=42)

# Define classifier (it is fitted internally on every bootstrap sample)
clf = RandomForestClassifier(random_state=42)

# Perform bootstrap .632+ evaluation; classifiers are scored by accuracy by default
scores = bootstrap_point632_score(clf, X, y, method='.632+', n_splits=200)

# `scores` holds one value per bootstrap round
print(f"Bootstrap .632+ accuracy: {np.mean(scores):.4f}")
lower, upper = np.percentile(scores, [2.5, 97.5])
print(f"95% confidence interval: [{lower:.4f}, {upper:.4f}]")
472
```
473
474
### Bias-Variance Decomposition Example
475
476
```python
477
from mlxtend.evaluate import bias_variance_decomp
from sklearn.tree import DecisionTreeClassifier
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split

# Create dataset
X, y = make_classification(n_samples=1000, n_features=20, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Analyze bias-variance tradeoff
clf = DecisionTreeClassifier(max_depth=5, random_state=42)
avg_expected_loss, avg_bias, avg_var = bias_variance_decomp(
    clf, X_train, y_train, X_test, y_test,
    loss='0-1_loss', num_rounds=200, random_seed=42
)

print(f"Average Expected Loss: {avg_expected_loss:.4f}")
print(f"Average Bias: {avg_bias:.4f}")
print(f"Average Variance: {avg_var:.4f}")
496
```