CtrlK
BlogDocsLog inGet started
Tessl Logo

tessl/pypi-mlxtend

Machine Learning Library Extensions providing essential tools for day-to-day data science tasks

Pending
Overview
Eval results
Files

docs/evaluation.md

Model Evaluation

Comprehensive model evaluation tools including statistical tests, bootstrap methods, and cross-validation utilities for assessing and comparing machine learning models.

Capabilities

Statistical Testing

Statistical tests for comparing classifier performance and assessing significance of differences.

def mcnemar(ary, corrected=True, exact=False):
    """
    McNemar's test for paired nominal data: compares the predictions of two
    classifiers evaluated on the same test set.

    Parameters:
    - ary: array-like, 2x2 contingency table of the two models'
      correct/incorrect predictions (e.g., the output of mcnemar_table)
    - corrected: bool, apply the continuity correction to the chi-squared
      statistic (recommended default)
    - exact: bool, compute the p-value from the exact binomial distribution
      instead of the chi-squared approximation (preferable when the
      discordant cell counts are small)

    Returns:
    - chi2: float, chi-squared statistic
      (NOTE(review): upstream mlxtend returns None here when exact=True --
      confirm which contract this package implements)
    - p_value: float, p-value of the test
    """

def mcnemar_table(y_target, y_model1, y_model2):
    """
    Build the 2x2 contingency table consumed by McNemar's test, summarizing
    where two classifiers agree and disagree on the same samples.

    Parameters:
    - y_target: array-like, true class labels
    - y_model1: array-like, predictions from the first classifier
    - y_model2: array-like, predictions from the second classifier

    Returns:
    - tb: array, 2x2 McNemar table; presumably laid out as
      [[both correct, model 1 only correct],
       [model 2 only correct, both wrong]] -- confirm against upstream docs
    """

def mcnemar_tables(y_target, *y_model_predictions):
    """
    Build 2x2 McNemar tables for every pairwise combination of two or more
    classifiers' predictions on the same samples.

    Parameters:
    - y_target: array-like, true class labels
    - y_model_predictions: arrays, one prediction array per classifier
      (at least two required for a pairwise comparison)

    Returns:
    - tb: dict, mapping a pair label (presumably of the form
      'model_0 vs model_1' -- confirm key format) to its 2x2 McNemar table
    """

def cochrans_q(X, alpha=0.05):
    """
    Cochran's Q test: an omnibus test for whether three or more classifiers
    have equal error rates on the same dataset (a generalization of
    McNemar's test to more than two models).

    Parameters:
    - X: array-like, matrix of binary (correct/incorrect) classifier results
    - alpha: float, significance level

    Returns:
    - q: float, Cochran's Q statistic
    - p_value: float, p-value of the test

    NOTE(review): upstream mlxtend's interface is
    cochrans_q(y_target, *y_model_predictions) -> (q, p_value), taking the
    true labels plus one prediction array per model and no alpha -- confirm
    which interface this package implements.
    """

def paired_ttest_resampled(estimator1, estimator2, X, y, num_rounds=30, 
                          test_size=0.3, scoring=None, random_seed=None):
    """
    Resampled paired t-test for comparing two classifiers: repeatedly
    re-splits the data into random train/test partitions, scores both
    estimators on each split, and runs a paired t-test on the per-round
    score differences. Known caveat: this procedure tends to have an
    elevated Type I error rate (Dietterich, 1998); prefer
    paired_ttest_5x2cv when that matters.

    Parameters:
    - estimator1, estimator2: sklearn-compatible estimators to compare
    - X: array-like, feature matrix
    - y: array-like, target labels
    - num_rounds: int, number of random resampling rounds
    - test_size: float, proportion of samples held out per round
    - scoring: str or callable, scoring metric
    - random_seed: int, seed controlling the random splits

    Returns:
    - t: float, t-statistic
    - p_value: float, p-value
    - scores_diff: array, per-round score differences
      (NOTE(review): upstream mlxtend returns only (t, p_value) -- confirm)
    """

def paired_ttest_kfold_cv(estimator1, estimator2, X, y, cv=10, 
                         scoring=None, shuffle=True, random_seed=None):
    """
    Paired t-test over k-fold cross-validation: scores both estimators on
    the same cv folds and tests whether the mean per-fold score difference
    is zero. Note that overlapping training sets across folds violate the
    t-test's independence assumption, so p-values are approximate.

    Parameters:
    - estimator1, estimator2: sklearn-compatible estimators to compare
    - X: array-like, feature matrix
    - y: array-like, target labels
    - cv: int, number of cross-validation folds
    - scoring: str or callable, scoring metric
    - shuffle: bool, shuffle the data before splitting into folds
    - random_seed: int, seed used when shuffle=True

    Returns:
    - t: float, t-statistic
    - p_value: float, p-value
    - scores_diff: array, per-fold score differences
      (NOTE(review): upstream mlxtend returns only (t, p_value) -- confirm)
    """

def paired_ttest_5x2cv(estimator1, estimator2, X, y, scoring=None, random_seed=None):
    """
    5x2cv paired t-test (Dietterich, 1998) for comparing two classifiers:
    performs 5 repetitions of 2-fold cross-validation and computes the
    t-statistic from the fold-wise score differences. Generally preferred
    over the resampled and k-fold paired t-tests because of its better
    Type I error behavior.

    Parameters:
    - estimator1, estimator2: sklearn-compatible estimators to compare
    - X: array-like, feature matrix
    - y: array-like, target labels
    - scoring: str or callable, scoring metric
    - random_seed: int, seed controlling the five 2-fold splits

    Returns:
    - t: float, t-statistic
    - p_value: float, p-value
    """

def proportion_difference(x, n, alpha=0.05):
    """
    Statistical test for a difference in proportions, reporting the sample
    proportion together with its confidence interval.

    Parameters:
    - x: int, number of successes in the sample
    - n: int, sample size
    - alpha: float, significance level for the confidence interval

    Returns:
    - prop: float, sample proportion (presumably x / n -- confirm)
    - ci_lower: float, lower confidence interval bound
    - ci_upper: float, upper confidence interval bound

    NOTE(review): upstream mlxtend's proportion_difference compares TWO
    proportions -- proportion_difference(proportion_1, proportion_2, n_1,
    n_2=None) -> (z, p_value) -- which differs substantially from this
    spec; confirm which interface this package implements.
    """

Bootstrap Methods

Bootstrap resampling methods for model evaluation and confidence interval estimation.

def bootstrap(x, func, n_splits=200, confidence_interval=0.95, 
              random_seed=None, ddof=1):
    """
    Nonparametric bootstrap: resamples x with replacement, applies func to
    each resample, and derives bias, standard error, and a confidence
    interval for the statistic.

    Parameters:
    - x: array-like, input data to resample
    - func: callable, statistic computed on each bootstrap sample
    - n_splits: int, number of bootstrap resamples
    - confidence_interval: float, confidence level (e.g., 0.95)
    - random_seed: int, seed controlling the resampling
    - ddof: int, delta degrees of freedom for variance-type statistics

    Returns:
    - original: float, the statistic computed on the original data
    - bias: float, bootstrap estimate of the statistic's bias
    - std_err: float, bootstrap standard error
    - ci_bounds: tuple, (lower, upper) confidence interval bounds

    NOTE(review): upstream mlxtend's bootstrap has the signature
    bootstrap(x, func, num_rounds=1000, ci=0.95, ddof=1, seed=None) and
    returns (original, standard_error, (lower, upper)) with no bias term --
    confirm which contract this package implements.
    """

def bootstrap_point632_score(estimator, X, y, n_splits=200, method='.632+',
                           scoring=None, predict_proba=False, pos_label=1,
                           random_seed=None):
    """
    Bootstrap .632 / .632+ estimation of generalization performance,
    combining in-bag (training) and out-of-bag error to correct the
    optimistic bias of resubstitution estimates.

    Parameters:
    - estimator: sklearn-compatible estimator (fit per bootstrap round)
    - X: array-like, feature matrix
    - y: array-like, target labels
    - n_splits: int, number of bootstrap rounds
    - method: str, '.632' or '.632+' (the '+' variant additionally corrects
      for overfitting via the no-information error rate)
    - scoring: str or callable, scoring metric
    - predict_proba: bool, score on predicted probabilities instead of labels
    - pos_label: int, positive class label for binary classification
    - random_seed: int, seed controlling the bootstrap sampling

    Returns:
    - scores: dict, bootstrap error estimates

    NOTE(review): upstream mlxtend returns an ARRAY of per-round scores
    (shape (n_splits,)) rather than a dict, and names the metric parameter
    scoring_func -- confirm which contract this package implements before
    relying on dict-style access such as scores['.632+'].
    """

class BootstrapOutOfBag:
    def __init__(self, n_splits=200, random_state=None):
        """
        Bootstrap Out-of-Bag cross-validation splitter.

        Each split draws a bootstrap sample (with replacement) as the
        training set; the samples not drawn (out-of-bag) form the test set.
        The split/get_n_splits method pair follows the scikit-learn
        CV-splitter convention, so instances can presumably be passed as
        `cv` to sklearn utilities -- confirm against upstream docs.

        Parameters:
        - n_splits: int, number of bootstrap rounds (splits) to generate
        - random_state: int, seed controlling the bootstrap sampling
        """
    
    def split(self, X, y=None, groups=None):
        """Yield (train_indices, test_indices) pairs, one per bootstrap
        round; y and groups are accepted for splitter-API compatibility."""
        
    def get_n_splits(self, X=None, y=None, groups=None):
        """Return the number of splits (n_splits); arguments are accepted
        for splitter-API compatibility."""

Cross-Validation Utilities

Advanced cross-validation strategies for specific data types and evaluation scenarios.

class RandomHoldoutSplit:
    def __init__(self, valid_size=0.5, n_splits=1, stratify=False, random_state=None):
        """
        Random holdout validation splitter: partitions the data into a
        training portion and a randomly drawn validation portion, following
        the scikit-learn CV-splitter convention.

        Parameters:
        - valid_size: float, proportion of samples assigned to the
          validation set (0 < valid_size < 1)
        - n_splits: int, number of independent random splits to generate
        - stratify: bool, preserve the class distribution in both portions
        - random_state: int, seed controlling the random assignment
        """
    
    def split(self, X, y=None, groups=None):
        """Yield (train_indices, valid_indices) pairs, one per split; groups
        is accepted for splitter-API compatibility."""

class PredefinedHoldoutSplit:
    def __init__(self, test_fold):
        """
        Holdout splitter with a user-defined test set: the caller supplies
        the exact sample indices to hold out, and all remaining samples form
        the training set. Useful for reproducing a fixed validation split.

        Parameters:
        - test_fold: array-like, indices (into X) of the held-out samples
        """
    
    def split(self, X, y=None, groups=None):
        """Yield the single (train_indices, test_indices) pair implied by
        test_fold; y and groups are accepted for splitter-API compatibility."""

class GroupTimeSeriesSplit:
    def __init__(self, n_splits=5, test_size=None):
        """
        Time-series cross-validation for grouped observations: splits
        respect temporal order (training data precedes test data) and keep
        all samples of a group in the same side of each split.

        Parameters:
        - n_splits: int, number of splits
        - test_size: int, test set size

        NOTE(review): upstream mlxtend's GroupTimeSeriesSplit requires
        test_size and also accepts train_size, gap_size, shift_size and
        window_type -- confirm which interface this package implements.
        """
    
    def split(self, X, y=None, groups=None):
        """Yield (train_indices, test_indices) pairs in temporal order;
        groups identifies which samples belong together."""
        
    def get_n_splits(self, X=None, y=None, groups=None):
        """Return the number of splits; arguments are accepted for
        splitter-API compatibility."""

Feature Importance and Permutation Testing

Methods for assessing feature importance and performing permutation-based statistical tests.

def feature_importance_permutation(X, y, predict_method, metric, num_rounds=1,
                                 seed=None):
    """
    Permutation feature importance: measures each feature's importance as
    the drop in the metric when that feature's column is randomly shuffled,
    breaking its relationship with the target.

    Parameters:
    - X: array-like, feature matrix
    - y: array-like, target labels
    - predict_method: callable, prediction function of an already-fit model
      (e.g., model.predict)
    - metric: callable (or metric name), evaluation metric to degrade
    - num_rounds: int, number of shuffles per feature (averaged)
    - seed: int, seed controlling the permutations

    Returns:
    - importances: array, per-feature importance scores
      (NOTE(review): upstream mlxtend returns a pair
      (mean_importance_vals, all_importance_vals) -- confirm)
    """

def permutation_test(x, y, func, method='exact', num_rounds=1000, seed=None):
    """
    Permutation (randomization) test for the significance of a statistic
    comparing two samples: the null distribution is built by recomputing
    the statistic over label permutations.

    Parameters:
    - x: array-like, first sample
    - y: array-like, second sample
    - func: callable, test statistic computed on the two samples
    - method: str, 'exact' (all permutations; feasible only for small
      samples) or 'approximate' (num_rounds random permutations)
    - num_rounds: int, number of permutations when method='approximate'
    - seed: int, seed controlling the random permutations

    Returns:
    - original_stat: float, statistic on the unpermuted data
    - p_value: float, permutation p-value
    - null_dist: array, null distribution of the statistic
      (NOTE(review): upstream mlxtend returns only the p-value -- confirm)
    """

Bias-Variance Decomposition

Decompose prediction error into bias and variance components.

def bias_variance_decomp(estimator, X_train, y_train, X_test, y_test,
                        loss='0-1_loss', num_rounds=200, random_seed=None):
    """
    Bias-variance decomposition of an estimator's expected loss, estimated
    by refitting the model on bootstrap samples of the training set and
    evaluating on a fixed test set.

    Parameters:
    - estimator: sklearn-compatible estimator (refit each round)
    - X_train: array-like, training features
    - y_train: array-like, training labels
    - X_test: array-like, test features
    - y_test: array-like, test labels
    - loss: str, '0-1_loss' (classification) or 'mse' (regression)
    - num_rounds: int, number of bootstrap rounds
    - random_seed: int, seed controlling the bootstrap sampling

    Returns:
    - avg_expected_loss: float, average expected loss over the test set
    - avg_bias: float, average (squared, for 'mse') bias component
    - avg_var: float, average variance component
    - all_pred: array, predictions from every bootstrap round

    NOTE(review): upstream mlxtend returns THREE values
    (avg_expected_loss, avg_bias, avg_var) with no all_pred -- confirm the
    return arity before unpacking four values as in the example below.
    """

Additional Metrics and Utilities

Additional evaluation metrics and utility functions.

def accuracy_score(y_target, y_predicted, normalize=True):
    """
    Compute classification accuracy: the fraction (or count) of predictions
    that match the true labels.

    Parameters:
    - y_target: array-like, true labels
    - y_predicted: array-like, predicted labels (same length as y_target)
    - normalize: bool, if True return the fraction of correct predictions;
      if False return the raw count

    Returns:
    - accuracy: float (fraction) when normalize=True, int (count) otherwise
    """

def lift_score(y_target, y_probas, binary=True):
    """
    Lift score for binary classification: the ratio of the model's positive
    precision to the baseline positive rate, i.e. how much better the model
    identifies positives than random selection (lift > 1 means better).

    Parameters:
    - y_target: array-like, true binary labels
    - y_probas: array-like, predicted probabilities
      (NOTE(review): upstream mlxtend's lift_score takes predicted class
      LABELS (y_predicted) plus a positive_label parameter, not
      probabilities -- confirm which input this package expects)
    - binary: bool, treat the task as binary classification

    Returns:
    - lift: float, lift score
    """

def confusion_matrix(y_target, y_predicted, binary=False):
    """
    Build a confusion matrix of true labels versus predicted labels.

    Parameters:
    - y_target: array-like, true labels
    - y_predicted: array-like, predicted labels (same length as y_target)
    - binary: bool, if True collapse a multiclass problem into a 2x2
      positive-vs-rest matrix (presumably against a positive label --
      confirm against upstream docs)

    Returns:
    - cm: array, confusion matrix (rows: true labels, columns: predictions
      -- presumably; confirm orientation)
    """

def create_counterfactual(df, x1, y1, x2, y2, treatment_feature, outcome_feature):
    """
    Generate counterfactual examples for causal analysis.

    Parameters:
    - df: DataFrame, input data
    - x1, y1: int, coordinates for treatment group
    - x2, y2: int, coordinates for control group
    - treatment_feature: str, name of the treatment column in df
    - outcome_feature: str, name of the outcome column in df

    Returns:
    - counterfactual_df: DataFrame, counterfactual examples

    NOTE(review): upstream mlxtend's create_counterfactual implements
    Wachter et al.-style counterfactuals with the signature
    create_counterfactual(x_reference, x_counterfact, model, X_dataset,
    y_desired_proba=None, lammbda=0.1, ...) -- this spec differs
    substantially; confirm which interface this package implements, and
    what the x1/y1/x2/y2 "coordinates" actually denote.
    """

def ftest(ary):
    """
    F-test for comparing multiple classifier variances.

    Parameters:
    - ary: array-like, classifier performance scores

    Returns:
    - f_stat: float, F-statistic
    - p_value: float, p-value

    NOTE(review): upstream mlxtend's ftest compares the ACCURACIES of
    multiple classifiers on the same dataset with the signature
    ftest(y_target, *y_model_predictions) (an F-test counterpart to
    Cochran's Q), not variances -- confirm which test and interface this
    package implements.
    """

def combined_ftest_5x2cv(estimator1, estimator2, X, y, random_seed=None):
    """
    Combined 5x2cv F-test (Alpaydin, 1999) for comparing two classifiers:
    a more robust variant of the 5x2cv paired t-test that combines the
    statistics from all 5 repetitions of 2-fold cross-validation into a
    single F-statistic.

    Parameters:
    - estimator1, estimator2: sklearn-compatible estimators to compare
    - X: array-like, feature matrix
    - y: array-like, target labels
    - random_seed: int, seed controlling the five 2-fold splits

    Returns:
    - f: float, F-statistic
    - p_value: float, p-value
    """

def scoring(y_target, y_predicted, metric='accuracy', pos_label=1, average='binary'):
    """
    Flexible scoring function supporting multiple evaluation metrics behind
    one interface.

    Parameters:
    - y_target: array-like, true labels
    - y_predicted: array-like, predicted labels (same length as y_target)
    - metric: str, name of the evaluation metric to compute
      (NOTE(review): upstream mlxtend's scoring defaults to 'error' and
      takes positive_label/unique_labels instead of pos_label/average --
      confirm the supported metric names and defaults)
    - pos_label: int, positive class label for binary metrics
    - average: str, averaging strategy for multiclass metrics

    Returns:
    - score: float, the computed score
    """

Usage Examples

McNemar Test Example

from mlxtend.evaluate import mcnemar, mcnemar_table
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split

# Build a synthetic binary problem and hold out 30% of it for testing.
X, y = make_classification(n_samples=1000, n_features=20, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Fit the two models to compare on identical training data.
forest = RandomForestClassifier(random_state=42)
svm = SVC(random_state=42)
for model in (forest, svm):
    model.fit(X_train, y_train)

# Predict on the shared held-out set.
y_pred1 = forest.predict(X_test)
y_pred2 = svm.predict(X_test)

# Summarize (dis)agreements in a 2x2 table, then run McNemar's test on it.
tb = mcnemar_table(y_test, y_pred1, y_pred2)
chi2, p_value = mcnemar(tb, corrected=True)

print(f"McNemar's chi-squared: {chi2:.4f}")
print(f"P-value: {p_value:.4f}")

Bootstrap Evaluation Example

from mlxtend.evaluate import bootstrap_point632_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification

# Create dataset
X, y = make_classification(n_samples=1000, n_features=20, random_state=42)

# Train classifier
clf = RandomForestClassifier(random_state=42)

# Perform bootstrap .632+ evaluation
# NOTE(review): upstream mlxtend's bootstrap_point632_score returns an
# ARRAY of per-round scores (and names the metric parameter scoring_func),
# so the dict-style indexing below ('.632+', 'train', 'test') would raise
# TypeError against the released library -- confirm which contract this
# package implements; against upstream you would report scores.mean().
scores = bootstrap_point632_score(clf, X, y, method='.632+', 
                                 scoring='accuracy', n_splits=200)

print(f"Bootstrap .632+ accuracy: {scores['.632+']:.4f}")
print(f"Training accuracy: {scores['train']:.4f}")
print(f"Test accuracy: {scores['test']:.4f}")

Bias-Variance Decomposition Example

from mlxtend.evaluate import bias_variance_decomp
from sklearn.tree import DecisionTreeClassifier
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split

# Create dataset
X, y = make_classification(n_samples=1000, n_features=20, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Analyze bias-variance tradeoff
clf = DecisionTreeClassifier(max_depth=5, random_state=42)
# NOTE(review): upstream mlxtend's bias_variance_decomp returns THREE
# values (avg_expected_loss, avg_bias, avg_var); the 4-tuple unpacking
# below would raise ValueError against the released library -- confirm
# which contract this package implements.
avg_expected_loss, avg_bias, avg_var, all_pred = bias_variance_decomp(
    clf, X_train, y_train, X_test, y_test, 
    loss='0-1_loss', num_rounds=200, random_seed=42
)

print(f"Average Expected Loss: {avg_expected_loss:.4f}")
print(f"Average Bias: {avg_bias:.4f}")
print(f"Average Variance: {avg_var:.4f}")

Install with Tessl CLI

npx tessl i tessl/pypi-mlxtend

docs

classification.md

clustering.md

datasets.md

evaluation.md

feature-engineering.md

file-io.md

index.md

math-utils.md

pattern-mining.md

plotting.md

preprocessing.md

regression.md

text-processing.md

utilities.md

tile.json