"""Machine Learning Library Extensions: essential tools for day-to-day data science tasks.

Comprehensive model-evaluation tools including statistical tests, bootstrap methods,
and cross-validation utilities for assessing and comparing machine learning models.

This module covers statistical tests for comparing classifier performance and for
assessing the significance of observed differences.
"""
def mcnemar(ary, corrected=True, exact=False):
    """
    McNemar test for comparing two classifiers on the same dataset.

    Parameters:
    - ary: array-like, 2x2 contingency table or confusion matrix
    - corrected: bool, apply continuity correction
    - exact: bool, use exact binomial test

    Returns:
    - chi2: float, chi-squared statistic
    - p_value: float, p-value of the test
    """
def mcnemar_table(y_target, y_model1, y_model2):
    """
    Create McNemar table for two classifiers.

    Parameters:
    - y_target: array-like, true class labels
    - y_model1: array-like, predictions from first classifier
    - y_model2: array-like, predictions from second classifier

    Returns:
    - tb: array, 2x2 McNemar table
    """
def mcnemar_tables(y_target, *y_model_predictions):
    """
    Create multiple McNemar tables for pairwise comparisons.

    Parameters:
    - y_target: array-like, true class labels
    - y_model_predictions: arrays, predictions from multiple classifiers

    Returns:
    - tb: dict, pairwise McNemar tables
    """
def cochrans_q(X, alpha=0.05):
    """
    Cochran's Q test for comparing multiple classifiers.

    Parameters:
    - X: array-like, binary classifier results matrix
    - alpha: float, significance level

    Returns:
    - q: float, Cochran's Q statistic
    - p_value: float, p-value of the test
    """
def paired_ttest_resampled(estimator1, estimator2, X, y, num_rounds=30,
                           test_size=0.3, scoring=None, random_seed=None):
    """
    Resampled paired t-test for classifier comparison.

    Parameters:
    - estimator1, estimator2: sklearn-compatible estimators
    - X: array-like, feature matrix
    - y: array-like, target labels
    - num_rounds: int, number of resampling rounds
    - test_size: float, test set proportion
    - scoring: str or callable, scoring metric
    - random_seed: int, random seed

    Returns:
    - t: float, t-statistic
    - p_value: float, p-value
    - scores_diff: array, score differences
    """
def paired_ttest_kfold_cv(estimator1, estimator2, X, y, cv=10,
                          scoring=None, shuffle=True, random_seed=None):
    """
    Paired t-test with k-fold cross-validation.

    Parameters:
    - estimator1, estimator2: sklearn-compatible estimators
    - X: array-like, feature matrix
    - y: array-like, target labels
    - cv: int, number of cross-validation folds
    - scoring: str or callable, scoring metric
    - shuffle: bool, shuffle data before splitting
    - random_seed: int, random seed

    Returns:
    - t: float, t-statistic
    - p_value: float, p-value
    - scores_diff: array, score differences
    """
def paired_ttest_5x2cv(estimator1, estimator2, X, y, scoring=None, random_seed=None):
    """
    5x2cv paired t-test for classifier comparison.

    Parameters:
    - estimator1, estimator2: sklearn-compatible estimators
    - X: array-like, feature matrix
    - y: array-like, target labels
    - scoring: str or callable, scoring metric
    - random_seed: int, random seed

    Returns:
    - t: float, t-statistic
    - p_value: float, p-value
    """
def proportion_difference(x, n, alpha=0.05):
    """
    Test for difference in proportions with confidence interval.

    Parameters:
    - x: int, number of successes in sample
    - n: int, sample size
    - alpha: float, significance level

    Returns:
    - prop: float, sample proportion
    - ci_lower: float, lower confidence interval bound
    - ci_upper: float, upper confidence interval bound
    """


# Bootstrap resampling methods for model evaluation and confidence interval estimation.
def bootstrap(x, func, n_splits=200, confidence_interval=0.95,
              random_seed=None, ddof=1):
    """
    Bootstrap confidence intervals for any statistic.

    Parameters:
    - x: array-like, input data
    - func: callable, function to apply to bootstrap samples
    - n_splits: int, number of bootstrap samples
    - confidence_interval: float, confidence interval level
    - random_seed: int, random seed
    - ddof: int, degrees of freedom for variance calculation

    Returns:
    - original: float, original statistic
    - bias: float, bootstrap bias
    - std_err: float, bootstrap standard error
    - ci_bounds: tuple, confidence interval bounds
    """
def bootstrap_point632_score(estimator, X, y, n_splits=200, method='.632+',
                             scoring=None, predict_proba=False, pos_label=1,
                             random_seed=None):
    """
    Bootstrap .632 and .632+ error estimation.

    Parameters:
    - estimator: sklearn-compatible estimator
    - X: array-like, feature matrix
    - y: array-like, target labels
    - n_splits: int, number of bootstrap samples
    - method: str, '.632' or '.632+'
    - scoring: str or callable, scoring metric
    - predict_proba: bool, use predicted probabilities
    - pos_label: int, positive class label for binary classification
    - random_seed: int, random seed

    Returns:
    - scores: dict, bootstrap error estimates
    """
class BootstrapOutOfBag:
    """Bootstrap Out-of-Bag cross-validation splitter."""

    def __init__(self, n_splits=200, random_state=None):
        """
        Bootstrap Out-of-Bag cross-validation.

        Parameters:
        - n_splits: int, number of bootstrap samples
        - random_state: int, random state
        """

    def split(self, X, y=None, groups=None):
        """Generate bootstrap train/test splits."""

    def get_n_splits(self, X=None, y=None, groups=None):
        """Get number of splits."""


# Advanced cross-validation strategies for specific data types and evaluation scenarios.
class RandomHoldoutSplit:
    """Random holdout validation splitter."""

    def __init__(self, valid_size=0.5, n_splits=1, stratify=False, random_state=None):
        """
        Random holdout validation split.

        Parameters:
        - valid_size: float, validation set proportion
        - n_splits: int, number of splits to generate
        - stratify: bool, stratified sampling
        - random_state: int, random state
        """

    def split(self, X, y=None, groups=None):
        """Generate train/validation splits."""
class PredefinedHoldoutSplit:
    """Holdout splitter driven by a predefined set of test indices."""

    def __init__(self, test_fold):
        """
        Predefined holdout split using test fold indices.

        Parameters:
        - test_fold: array-like, test set indices
        """

    def split(self, X, y=None, groups=None):
        """Generate predefined train/test split."""
class GroupTimeSeriesSplit:
    """Time-series cross-validation splitter for grouped data."""

    def __init__(self, n_splits=5, test_size=None):
        """
        Time series cross-validation for grouped data.

        Parameters:
        - n_splits: int, number of splits
        - test_size: int, test set size
        """

    def split(self, X, y=None, groups=None):
        """Generate time series splits."""

    def get_n_splits(self, X=None, y=None, groups=None):
        """Get number of splits."""


# Methods for assessing feature importance and performing permutation-based statistical tests.
def feature_importance_permutation(X, y, predict_method, metric, num_rounds=1,
                                   seed=None):
    """
    Permutation-based feature importance calculation.

    Parameters:
    - X: array-like, feature matrix
    - y: array-like, target labels
    - predict_method: callable, prediction method
    - metric: callable, evaluation metric
    - num_rounds: int, number of permutation rounds
    - seed: int, random seed

    Returns:
    - importances: array, feature importance scores
    """
def permutation_test(x, y, func, method='exact', num_rounds=1000, seed=None):
    """
    Permutation test for statistical significance.

    Parameters:
    - x: array-like, first sample
    - y: array-like, second sample
    - func: callable, test statistic function
    - method: str, 'exact' or 'approximate'
    - num_rounds: int, number of permutation rounds
    - seed: int, random seed

    Returns:
    - original_stat: float, original test statistic
    - p_value: float, permutation p-value
    - null_dist: array, null distribution of test statistics
    """


# Decompose prediction error into bias and variance components.
def bias_variance_decomp(estimator, X_train, y_train, X_test, y_test,
                         loss='0-1_loss', num_rounds=200, random_seed=None):
    """
    Bias-variance decomposition for model evaluation.

    Parameters:
    - estimator: sklearn-compatible estimator
    - X_train: array-like, training features
    - y_train: array-like, training labels
    - X_test: array-like, test features
    - y_test: array-like, test labels
    - loss: str, loss function ('0-1_loss' or 'mse')
    - num_rounds: int, number of bootstrap rounds
    - random_seed: int, random seed

    Returns:
    - avg_expected_loss: float, average expected loss
    - avg_bias: float, average bias
    - avg_var: float, average variance
    - all_pred: array, all predictions from bootstrap samples
    """


# Additional evaluation metrics and utility functions.
def accuracy_score(y_target, y_predicted, normalize=True):
    """
    Calculate accuracy score.

    Parameters:
    - y_target: array-like, true labels
    - y_predicted: array-like, predicted labels
    - normalize: bool, return fraction or count

    Returns:
    - accuracy: float or int, accuracy score
    """
def lift_score(y_target, y_probas, binary=True):
    """
    Calculate lift score for binary classification.

    Parameters:
    - y_target: array-like, true binary labels
    - y_probas: array-like, predicted probabilities
    - binary: bool, binary classification

    Returns:
    - lift: float, lift score
    """
def confusion_matrix(y_target, y_predicted, binary=False):
    """
    Create confusion matrix.

    Parameters:
    - y_target: array-like, true labels
    - y_predicted: array-like, predicted labels
    - binary: bool, binary classification

    Returns:
    - cm: array, confusion matrix
    """
def create_counterfactual(df, x1, y1, x2, y2, treatment_feature, outcome_feature):
    """
    Generate counterfactual examples for causal analysis.

    Parameters:
    - df: DataFrame, input data
    - x1, y1: int, coordinates for treatment group
    - x2, y2: int, coordinates for control group
    - treatment_feature: str, treatment column name
    - outcome_feature: str, outcome column name

    Returns:
    - counterfactual_df: DataFrame, counterfactual examples
    """
def ftest(ary):
    """
    F-test for comparing multiple classifier variances.

    Parameters:
    - ary: array-like, classifier performance scores

    Returns:
    - f_stat: float, F-statistic
    - p_value: float, p-value
    """
def combined_ftest_5x2cv(estimator1, estimator2, X, y, random_seed=None):
    """
    Combined F-test using 5x2 cross-validation.

    Parameters:
    - estimator1, estimator2: sklearn-compatible estimators
    - X: array-like, feature matrix
    - y: array-like, target labels
    - random_seed: int, random seed

    Returns:
    - f: float, F-statistic
    - p_value: float, p-value
    """
def scoring(y_target, y_predicted, metric='accuracy', pos_label=1, average='binary'):
    """
    Flexible scoring function supporting multiple metrics.

    Parameters:
    - y_target: array-like, true labels
    - y_predicted: array-like, predicted labels
    - metric: str, evaluation metric
    - pos_label: int, positive class label
    - average: str, averaging method for multi-class

    Returns:
    - score: float, computed score
    """
"""from mlxtend.evaluate import mcnemar, mcnemar_table
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
# Create dataset
X, y = make_classification(n_samples=1000, n_features=20, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Train two classifiers
clf1 = RandomForestClassifier(random_state=42)
clf2 = SVC(random_state=42)
clf1.fit(X_train, y_train)
clf2.fit(X_train, y_train)
# Get predictions
y_pred1 = clf1.predict(X_test)
y_pred2 = clf2.predict(X_test)
# Create McNemar table and perform test
tb = mcnemar_table(y_test, y_pred1, y_pred2)
chi2, p_value = mcnemar(tb, corrected=True)
print(f"McNemar's chi-squared: {chi2:.4f}")
print(f"P-value: {p_value:.4f}")from mlxtend.evaluate import bootstrap_point632_score
# --- Example: Bootstrap .632+ error estimation ---
from mlxtend.evaluate import bootstrap_point632_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification

# Create a synthetic dataset
X, y = make_classification(n_samples=1000, n_features=20, random_state=42)

# Estimate generalization accuracy with the bootstrap .632+ method
clf = RandomForestClassifier(random_state=42)
scores = bootstrap_point632_score(clf, X, y, method='.632+',
                                  scoring='accuracy', n_splits=200)
print(f"Bootstrap .632+ accuracy: {scores['.632+']:.4f}")
print(f"Training accuracy: {scores['train']:.4f}")
print(f"Test accuracy: {scores['test']:.4f}")
from sklearn.tree import DecisionTreeClassifier
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
# Create dataset
X, y = make_classification(n_samples=1000, n_features=20, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Analyze bias-variance tradeoff
clf = DecisionTreeClassifier(max_depth=5, random_state=42)
avg_expected_loss, avg_bias, avg_var, all_pred = bias_variance_decomp(
clf, X_train, y_train, X_test, y_test,
loss='0-1_loss', num_rounds=200, random_seed=42
)
print(f"Average Expected Loss: {avg_expected_loss:.4f}")
print(f"Average Bias: {avg_bias:.4f}")
print(f"Average Variance: {avg_var:.4f}")Install with Tessl CLI
#   npx tessl i tessl/pypi-mlxtend