Tessl Tile for pypi/mlxtend@0.23.0

or run

npx @tessl/cli init

Version

Tile

Overview

Evals

Files

docs

classification.md clustering.md datasets.md evaluation.md feature-engineering.md file-io.md index.md math-utils.md pattern-mining.md plotting.md preprocessing.md regression.md text-processing.md utilities.md

docs/evaluation.md

0
# Model Evaluation
1

2
Comprehensive model evaluation tools including statistical tests, bootstrap methods, and cross-validation utilities for assessing and comparing machine learning models.
3

4
## Capabilities
5

6
### Statistical Testing
7

8
Statistical tests for comparing classifier performance and assessing significance of differences.
9

10
```python { .api }
11
def mcnemar(ary, corrected=True, exact=False):
12
    """
13
    McNemar test for comparing two classifiers on the same dataset.
14
    
15
    Parameters:
16
    - ary: array-like, 2x2 contingency table or confusion matrix
17
    - corrected: bool, apply continuity correction
18
    - exact: bool, use exact binomial test
19
    
20
    Returns:
21
    - chi2: float, chi-squared statistic
22
    - p_value: float, p-value of the test
23
    """
24

25
def mcnemar_table(y_target, y_model1, y_model2):
26
    """
27
    Create McNemar table for two classifiers.
28
    
29
    Parameters:
30
    - y_target: array-like, true class labels
31
    - y_model1: array-like, predictions from first classifier
32
    - y_model2: array-like, predictions from second classifier
33
    
34
    Returns:
35
    - tb: array, 2x2 McNemar table
36
    """
37

38
def mcnemar_tables(y_target, *y_model_predictions):
39
    """
40
    Create multiple McNemar tables for pairwise comparisons.
41
    
42
    Parameters:
43
    - y_target: array-like, true class labels
44
    - y_model_predictions: arrays, predictions from multiple classifiers
45
    
46
    Returns:
47
    - tb: dict, pairwise McNemar tables
48
    """
49

50
def cochrans_q(X, alpha=0.05):
51
    """
52
    Cochran's Q test for comparing multiple classifiers.
53
    
54
    Parameters:
55
    - X: array-like, binary classifier results matrix
56
    - alpha: float, significance level
57
    
58
    Returns:
59
    - q: float, Cochran's Q statistic
60
    - p_value: float, p-value of the test
61
    """
62

63
def paired_ttest_resampled(estimator1, estimator2, X, y, num_rounds=30, 
64
                          test_size=0.3, scoring=None, random_seed=None):
65
    """
66
    Resampled paired t-test for classifier comparison.
67
    
68
    Parameters:
69
    - estimator1, estimator2: sklearn-compatible estimators
70
    - X: array-like, feature matrix
71
    - y: array-like, target labels
72
    - num_rounds: int, number of resampling rounds
73
    - test_size: float, test set proportion
74
    - scoring: str or callable, scoring metric
75
    - random_seed: int, random seed
76
    
77
    Returns:
78
    - t: float, t-statistic
79
    - p_value: float, p-value
80
    - scores_diff: array, score differences
81
    """
82

83
def paired_ttest_kfold_cv(estimator1, estimator2, X, y, cv=10, 
84
                         scoring=None, shuffle=True, random_seed=None):
85
    """
86
    Paired t-test with k-fold cross-validation.
87
    
88
    Parameters:
89
    - estimator1, estimator2: sklearn-compatible estimators
90
    - X: array-like, feature matrix
91
    - y: array-like, target labels
92
    - cv: int, number of cross-validation folds
93
    - scoring: str or callable, scoring metric
94
    - shuffle: bool, shuffle data before splitting
95
    - random_seed: int, random seed
96
    
97
    Returns:
98
    - t: float, t-statistic
99
    - p_value: float, p-value
100
    - scores_diff: array, score differences
101
    """
102

103
def paired_ttest_5x2cv(estimator1, estimator2, X, y, scoring=None, random_seed=None):
104
    """
105
    5x2cv paired t-test for classifier comparison.
106
    
107
    Parameters:
108
    - estimator1, estimator2: sklearn-compatible estimators
109
    - X: array-like, feature matrix
110
    - y: array-like, target labels
111
    - scoring: str or callable, scoring metric
112
    - random_seed: int, random seed
113
    
114
    Returns:
115
    - t: float, t-statistic
116
    - p_value: float, p-value
117
    """
118

119
def proportion_difference(x, n, alpha=0.05):
120
    """
121
    Test for difference in proportions with confidence interval.
122
    
123
    Parameters:
124
    - x: int, number of successes in sample
125
    - n: int, sample size
126
    - alpha: float, significance level
127
    
128
    Returns:
129
    - prop: float, sample proportion
130
    - ci_lower: float, lower confidence interval bound
131
    - ci_upper: float, upper confidence interval bound
132
    """
133
```
134

135
### Bootstrap Methods
136

137
Bootstrap resampling methods for model evaluation and confidence interval estimation.
138

139
```python { .api }
140
def bootstrap(x, func, n_splits=200, confidence_interval=0.95, 
141
              random_seed=None, ddof=1):
142
    """
143
    Bootstrap confidence intervals for any statistic.
144
    
145
    Parameters:
146
    - x: array-like, input data
147
    - func: callable, function to apply to bootstrap samples
148
    - n_splits: int, number of bootstrap samples
149
    - confidence_interval: float, confidence interval level
150
    - random_seed: int, random seed
151
    - ddof: int, degrees of freedom for variance calculation
152
    
153
    Returns:
154
    - original: float, original statistic
155
    - bias: float, bootstrap bias
156
    - std_err: float, bootstrap standard error
157
    - ci_bounds: tuple, confidence interval bounds
158
    """
159

160
def bootstrap_point632_score(estimator, X, y, n_splits=200, method='.632+',
161
                           scoring=None, predict_proba=False, pos_label=1,
162
                           random_seed=None):
163
    """
164
    Bootstrap .632 and .632+ error estimation.
165
    
166
    Parameters:
167
    - estimator: sklearn-compatible estimator
168
    - X: array-like, feature matrix
169
    - y: array-like, target labels
170
    - n_splits: int, number of bootstrap samples
171
    - method: str, '.632' or '.632+'
172
    - scoring: str or callable, scoring metric
173
    - predict_proba: bool, use predicted probabilities
174
    - pos_label: int, positive class label for binary classification
175
    - random_seed: int, random seed
176
    
177
    Returns:
178
    - scores: dict, bootstrap error estimates
179
    """
180

181
class BootstrapOutOfBag:
182
    def __init__(self, n_splits=200, random_state=None):
183
        """
184
        Bootstrap Out-of-Bag cross-validation.
185
        
186
        Parameters:
187
        - n_splits: int, number of bootstrap samples
188
        - random_state: int, random state
189
        """
190
    
191
    def split(self, X, y=None, groups=None):
192
        """Generate bootstrap train/test splits"""
193
        
194
    def get_n_splits(self, X=None, y=None, groups=None):
195
        """Get number of splits"""
196
```
197

198
### Cross-Validation Utilities
199

200
Advanced cross-validation strategies for specific data types and evaluation scenarios.
201

202
```python { .api }
203
class RandomHoldoutSplit:
204
    def __init__(self, valid_size=0.5, n_splits=1, stratify=False, random_state=None):
205
        """
206
        Random holdout validation split.
207
        
208
        Parameters:
209
        - valid_size: float, validation set proportion
210
        - n_splits: int, number of splits to generate
211
        - stratify: bool, stratified sampling
212
        - random_state: int, random state
213
        """
214
    
215
    def split(self, X, y=None, groups=None):
216
        """Generate train/validation splits"""
217

218
class PredefinedHoldoutSplit:
219
    def __init__(self, test_fold):
220
        """
221
        Predefined holdout split using test fold indices.
222
        
223
        Parameters:
224
        - test_fold: array-like, test set indices
225
        """
226
    
227
    def split(self, X, y=None, groups=None):
228
        """Generate predefined train/test split"""
229

230
class GroupTimeSeriesSplit:
231
    def __init__(self, n_splits=5, test_size=None):
232
        """
233
        Time series cross-validation for grouped data.
234
        
235
        Parameters:
236
        - n_splits: int, number of splits
237
        - test_size: int, test set size
238
        """
239
    
240
    def split(self, X, y=None, groups=None):
241
        """Generate time series splits"""
242
        
243
    def get_n_splits(self, X=None, y=None, groups=None):
244
        """Get number of splits"""
245
```
246

247
### Feature Importance and Permutation Testing
248

249
Methods for assessing feature importance and performing permutation-based statistical tests.
250

251
```python { .api }
252
def feature_importance_permutation(X, y, predict_method, metric, num_rounds=1,
253
                                 seed=None):
254
    """
255
    Permutation-based feature importance calculation.
256
    
257
    Parameters:
258
    - X: array-like, feature matrix
259
    - y: array-like, target labels
260
    - predict_method: callable, prediction method
261
    - metric: callable, evaluation metric
262
    - num_rounds: int, number of permutation rounds
263
    - seed: int, random seed
264
    
265
    Returns:
266
    - importances: array, feature importance scores
267
    """
268

269
def permutation_test(x, y, func, method='exact', num_rounds=1000, seed=None):
270
    """
271
    Permutation test for statistical significance.
272
    
273
    Parameters:
274
    - x: array-like, first sample
275
    - y: array-like, second sample
276
    - func: callable, test statistic function
277
    - method: str, 'exact' or 'approximate'
278
    - num_rounds: int, number of permutation rounds
279
    - seed: int, random seed
280
    
281
    Returns:
282
    - original_stat: float, original test statistic
283
    - p_value: float, permutation p-value
284
    - null_dist: array, null distribution of test statistics
285
    """
286
```
287

288
### Bias-Variance Decomposition
289

290
Decompose prediction error into bias and variance components.
291

292
```python { .api }
293
def bias_variance_decomp(estimator, X_train, y_train, X_test, y_test,
294
                        loss='0-1_loss', num_rounds=200, random_seed=None):
295
    """
296
    Bias-variance decomposition for model evaluation.
297
    
298
    Parameters:
299
    - estimator: sklearn-compatible estimator
300
    - X_train: array-like, training features
301
    - y_train: array-like, training labels
302
    - X_test: array-like, test features
303
    - y_test: array-like, test labels
304
    - loss: str, loss function ('0-1_loss' or 'mse')
305
    - num_rounds: int, number of bootstrap rounds
306
    - random_seed: int, random seed
307
    
308
    Returns:
309
    - avg_expected_loss: float, average expected loss
310
    - avg_bias: float, average bias
311
    - avg_var: float, average variance
312
    - all_pred: array, all predictions from bootstrap samples
313
    """
314
```
315

316
### Additional Metrics and Utilities
317

318
Additional evaluation metrics and utility functions.
319

320
```python { .api }
321
def accuracy_score(y_target, y_predicted, normalize=True):
322
    """
323
    Calculate accuracy score.
324
    
325
    Parameters:
326
    - y_target: array-like, true labels
327
    - y_predicted: array-like, predicted labels
328
    - normalize: bool, return fraction or count
329
    
330
    Returns:
331
    - accuracy: float or int, accuracy score
332
    """
333

334
def lift_score(y_target, y_probas, binary=True):
335
    """
336
    Calculate lift score for binary classification.
337
    
338
    Parameters:
339
    - y_target: array-like, true binary labels
340
    - y_probas: array-like, predicted probabilities
341
    - binary: bool, binary classification
342
    
343
    Returns:
344
    - lift: float, lift score
345
    """
346

347
def confusion_matrix(y_target, y_predicted, binary=False):
348
    """
349
    Create confusion matrix.
350
    
351
    Parameters:
352
    - y_target: array-like, true labels
353
    - y_predicted: array-like, predicted labels
354
    - binary: bool, binary classification
355
    
356
    Returns:
357
    - cm: array, confusion matrix
358
    """
359

360
def create_counterfactual(df, x1, y1, x2, y2, treatment_feature, outcome_feature):
361
    """
362
    Generate counterfactual examples for causal analysis.
363
    
364
    Parameters:
365
    - df: DataFrame, input data
366
    - x1, y1: int, coordinates for treatment group
367
    - x2, y2: int, coordinates for control group
368
    - treatment_feature: str, treatment column name
369
    - outcome_feature: str, outcome column name
370
    
371
    Returns:
372
    - counterfactual_df: DataFrame, counterfactual examples
373
    """
374

375
def ftest(ary):
376
    """
377
    F-test for comparing multiple classifier variances.
378
    
379
    Parameters:
380
    - ary: array-like, classifier performance scores
381
    
382
    Returns:
383
    - f_stat: float, F-statistic
384
    - p_value: float, p-value
385
    """
386

387
def combined_ftest_5x2cv(estimator1, estimator2, X, y, random_seed=None):
388
    """
389
    Combined F-test using 5x2 cross-validation.
390
    
391
    Parameters:
392
    - estimator1, estimator2: sklearn-compatible estimators
393
    - X: array-like, feature matrix
394
    - y: array-like, target labels
395
    - random_seed: int, random seed
396
    
397
    Returns:
398
    - f: float, F-statistic
399
    - p_value: float, p-value
400
    """
401

402
def scoring(y_target, y_predicted, metric='accuracy', pos_label=1, average='binary'):
403
    """
404
    Flexible scoring function supporting multiple metrics.
405
    
406
    Parameters:
407
    - y_target: array-like, true labels
408
    - y_predicted: array-like, predicted labels
409
    - metric: str, evaluation metric
410
    - pos_label: int, positive class label
411
    - average: str, averaging method for multi-class
412
    
413
    Returns:
414
    - score: float, computed score
415
    """
416
```
417

418
## Usage Examples
419

420
### McNemar Test Example
421

422
```python
423
from mlxtend.evaluate import mcnemar, mcnemar_table
424
from sklearn.ensemble import RandomForestClassifier
425
from sklearn.svm import SVC
426
from sklearn.datasets import make_classification
427
from sklearn.model_selection import train_test_split
428

429
# Create dataset
430
X, y = make_classification(n_samples=1000, n_features=20, random_state=42)
431
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
432

433
# Train two classifiers
434
clf1 = RandomForestClassifier(random_state=42)
435
clf2 = SVC(random_state=42)
436

437
clf1.fit(X_train, y_train)
438
clf2.fit(X_train, y_train)
439

440
# Get predictions
441
y_pred1 = clf1.predict(X_test)
442
y_pred2 = clf2.predict(X_test)
443

444
# Create McNemar table and perform test
445
tb = mcnemar_table(y_test, y_pred1, y_pred2)
446
chi2, p_value = mcnemar(tb, corrected=True)
447

448
print(f"McNemar's chi-squared: {chi2:.4f}")
449
print(f"P-value: {p_value:.4f}")
450
```
451

452
### Bootstrap Evaluation Example
453

454
```python
455
from mlxtend.evaluate import bootstrap_point632_score
456
from sklearn.ensemble import RandomForestClassifier
457
from sklearn.datasets import make_classification
458

459
# Create dataset
460
X, y = make_classification(n_samples=1000, n_features=20, random_state=42)
461

462
# Train classifier
463
clf = RandomForestClassifier(random_state=42)
464

465
# Perform bootstrap .632+ evaluation
466
scores = bootstrap_point632_score(clf, X, y, method='.632+', 
467
                                 scoring='accuracy', n_splits=200)
468

469
print(f"Bootstrap .632+ accuracy: {scores['.632+']:.4f}")
470
print(f"Training accuracy: {scores['train']:.4f}")
471
print(f"Test accuracy: {scores['test']:.4f}")
472
```
473

474
### Bias-Variance Decomposition Example
475

476
```python
477
from mlxtend.evaluate import bias_variance_decomp
478
from sklearn.tree import DecisionTreeClassifier
479
from sklearn.datasets import make_classification
480
from sklearn.model_selection import train_test_split
481

482
# Create dataset
483
X, y = make_classification(n_samples=1000, n_features=20, random_state=42)
484
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
485

486
# Analyze bias-variance tradeoff
487
clf = DecisionTreeClassifier(max_depth=5, random_state=42)
488
avg_expected_loss, avg_bias, avg_var, all_pred = bias_variance_decomp(
489
    clf, X_train, y_train, X_test, y_test, 
490
    loss='0-1_loss', num_rounds=200, random_seed=42
491
)
492

493
print(f"Average Expected Loss: {avg_expected_loss:.4f}")
494
print(f"Average Bias: {avg_bias:.4f}")
495
print(f"Average Variance: {avg_var:.4f}")
496
```

Version

Tile

Files

docs/evaluation.md

docs/evaluation.md