A scikit-learn-compatible module for estimating prediction intervals using conformal prediction methods.
Comprehensive evaluation metrics for assessing the quality of conformal prediction intervals and sets, including coverage, width, and calibration metrics. These metrics help evaluate the performance and reliability of uncertainty quantification methods.
Metrics for evaluating prediction intervals in regression tasks, focusing on coverage guarantees, interval width efficiency, and distributional properties.
def regression_coverage_score(y_true, y_intervals):
"""
Compute coverage score for regression prediction intervals.
Parameters:
- y_true: ArrayLike, true target values
- y_intervals: ArrayLike, prediction intervals (shape: n_samples x 2 x n_alpha)
Returns:
NDArray: coverage scores for each confidence level
"""
def regression_mean_width_score(y_intervals):
"""
Compute mean width of prediction intervals.
Parameters:
- y_intervals: ArrayLike, prediction intervals (shape: n_samples x 2 x n_alpha)
Returns:
NDArray: mean interval widths for each confidence level
"""
def regression_ssc(y_true, y_intervals):
"""
Size-stratified coverage score for regression.
Parameters:
- y_true: ArrayLike, true target values
- y_intervals: ArrayLike, prediction intervals
Returns:
NDArray: size-stratified coverage scores
"""
def regression_ssc_score(y_true, y_intervals, num_bins=10):
"""
Size-stratified coverage score with binning.
Parameters:
- y_true: ArrayLike, true target values
- y_intervals: ArrayLike, prediction intervals
- num_bins: int, number of bins for stratification (default: 10)
Returns:
NDArray: binned size-stratified coverage scores
"""
def hsic(x, y, kernel="gaussian"):
"""
Hilbert-Schmidt Independence Criterion (HSIC), a kernel-based measure of statistical dependence between two variables.
Parameters:
- x: ArrayLike, first variable
- y: ArrayLike, second variable
- kernel: str, kernel type ("gaussian", "linear") (default: "gaussian")
Returns:
float: HSIC statistic
"""
def coverage_width_based(y_true, y_intervals, eta=1.0):
"""
Coverage-width-based metric balancing coverage and efficiency.
Parameters:
- y_true: ArrayLike, true target values
- y_intervals: ArrayLike, prediction intervals
- eta: float, weight parameter for width penalty (default: 1.0)
Returns:
NDArray: coverage-width-based scores
"""
def regression_mwi_score(y_true, y_intervals, alpha):
"""
Mean Winkler Interval (MWI) score: interval width plus a penalty, scaled by 2/alpha, for each true value falling outside its interval.
Parameters:
- y_true: ArrayLike, true target values
- y_intervals: ArrayLike, prediction intervals
- alpha: float, significance level used to build the intervals
Returns:
float: mean Winkler interval score
"""Metrics for evaluating prediction sets in classification tasks, measuring set coverage, size efficiency, and distributional properties.
def classification_coverage_score(y_true, y_pred_set):
"""
Compute coverage score for classification prediction sets.
Parameters:
- y_true: ArrayLike, true class labels
- y_pred_set: ArrayLike, prediction sets (binary matrix: n_samples x n_classes)
Returns:
NDArray: coverage scores
"""
def classification_mean_width_score(y_pred_set):
"""
Compute mean size of prediction sets.
Parameters:
- y_pred_set: ArrayLike, prediction sets (binary matrix)
Returns:
float: mean prediction set size
"""
def classification_ssc(y_true, y_pred_set):
"""
Size-stratified coverage for classification.
Parameters:
- y_true: ArrayLike, true class labels
- y_pred_set: ArrayLike, prediction sets
Returns:
NDArray: size-stratified coverage scores
"""
def classification_ssc_score(y_true, y_pred_set, num_bins=10):
"""
Size-stratified coverage score with binning for classification.
Parameters:
- y_true: ArrayLike, true class labels
- y_pred_set: ArrayLike, prediction sets
- num_bins: int, number of bins for stratification (default: 10)
Returns:
NDArray: binned size-stratified coverage scores
"""Metrics for evaluating probability calibration quality, testing whether predicted probabilities accurately reflect true confidence levels.
def expected_calibration_error(y_true, y_scores, num_bins=50, split_strategy=None):
"""
Expected Calibration Error (ECE) for probability predictions.
Parameters:
- y_true: ArrayLike, true binary labels (0/1)
- y_scores: ArrayLike, predicted probabilities
- num_bins: int, number of bins for reliability diagram (default: 50)
- split_strategy: Optional[str], binning strategy ("uniform", "quantile")
Returns:
float: expected calibration error
"""
def top_label_ece(y_true, y_scores, num_bins=50, split_strategy=None):
"""
Top-label Expected Calibration Error for multi-class problems.
Parameters:
- y_true: ArrayLike, true class labels
- y_scores: ArrayLike, predicted class probabilities (n_samples x n_classes)
- num_bins: int, number of bins (default: 50)
- split_strategy: Optional[str], binning strategy
Returns:
float: top-label expected calibration error
"""
def kolmogorov_smirnov_statistic(y_true, y_score):
"""
Kolmogorov-Smirnov test statistic for calibration assessment.
Parameters:
- y_true: ArrayLike, true binary labels
- y_score: ArrayLike, predicted probabilities
Returns:
float: KS test statistic
"""
def kolmogorov_smirnov_p_value(y_true, y_score):
"""
P-value for Kolmogorov-Smirnov calibration test.
Parameters:
- y_true: ArrayLike, true binary labels
- y_score: ArrayLike, predicted probabilities
Returns:
float: KS test p-value
"""
def kuiper_statistic(y_true, y_score):
"""
Kuiper test statistic for calibration, a rotation-invariant variant of the KS test that sums the maximal deviations above and below.
Parameters:
- y_true: ArrayLike, true binary labels
- y_score: ArrayLike, predicted probabilities
Returns:
float: Kuiper test statistic
"""
def kuiper_p_value(y_true, y_score):
"""
P-value for Kuiper calibration test.
Parameters:
- y_true: ArrayLike, true binary labels
- y_score: ArrayLike, predicted probabilities
Returns:
float: Kuiper test p-value
"""
def spiegelhalter_statistic(y_true, y_score):
"""
Spiegelhalter test statistic for calibration assessment.
Parameters:
- y_true: ArrayLike, true binary labels
- y_score: ArrayLike, predicted probabilities
Returns:
float: Spiegelhalter test statistic
"""
def spiegelhalter_p_value(y_true, y_score):
"""
P-value for Spiegelhalter calibration test.
Parameters:
- y_true: ArrayLike, true binary labels
- y_score: ArrayLike, predicted probabilities
Returns:
float: Spiegelhalter test p-value
"""from mapie.metrics.regression import (
regression_coverage_score,
regression_mean_width_score,
regression_ssc_score
)
import numpy as np
# Assume we have predictions from MAPIE regressor
# y_pred: point predictions
# y_intervals: prediction intervals (shape: n_samples x 2 x n_alpha)
# y_test: true values
# Coverage evaluation
coverage_scores = regression_coverage_score(y_test, y_intervals)
print(f"Coverage scores: {coverage_scores}")
# Width evaluation
mean_widths = regression_mean_width_score(y_intervals)
print(f"Mean interval widths: {mean_widths}")
# Size-stratified coverage
ssc_scores = regression_ssc_score(y_test, y_intervals, num_bins=10)
print(f"Size-stratified coverage: {ssc_scores}")
# Coverage-width trade-off
from mapie.metrics.regression import coverage_width_based
cwb_scores = coverage_width_based(y_test, y_intervals, eta=0.5)
print(f"Coverage-width-based scores: {cwb_scores}")from mapie.metrics.classification import (
classification_coverage_score,
classification_mean_width_score,
classification_ssc_score
)
# Assume we have prediction sets from MAPIE classifier
# y_pred_sets: binary matrix (n_samples x n_classes)
# y_test: true class labels
# Coverage evaluation
coverage = classification_coverage_score(y_test, y_pred_sets)
print(f"Empirical coverage: {coverage:.3f}")
# Set size evaluation
mean_set_size = classification_mean_width_score(y_pred_sets)
print(f"Mean prediction set size: {mean_set_size:.2f}")
# Size-stratified coverage
ssc_scores = classification_ssc_score(y_test, y_pred_sets, num_bins=5)
print(f"Size-stratified coverage by bin: {ssc_scores}")from mapie.metrics.calibration import (
expected_calibration_error,
top_label_ece,
kolmogorov_smirnov_statistic,
spiegelhalter_p_value
)
# Binary classification calibration
# Assume a fitted `classifier`, test data X_test / y_test, and a positive_class label
y_proba_binary = classifier.predict_proba(X_test)[:, 1]
y_binary = (y_test == positive_class).astype(int)
# Expected Calibration Error
ece = expected_calibration_error(y_binary, y_proba_binary, num_bins=10)
print(f"Expected Calibration Error: {ece:.4f}")
# Kolmogorov-Smirnov test
ks_stat = kolmogorov_smirnov_statistic(y_binary, y_proba_binary)
print(f"KS statistic: {ks_stat:.4f}")
# Multi-class calibration
y_proba_multi = classifier.predict_proba(X_test)
top_ece = top_label_ece(y_test, y_proba_multi)
print(f"Top-label ECE: {top_ece:.4f}")
# Statistical significance test
spieg_pval = spiegelhalter_p_value(y_binary, y_proba_binary)
print(f"Spiegelhalter p-value: {spieg_pval:.4f}")def evaluate_regression_intervals(y_true, y_pred, y_intervals, confidence_levels):
"""Comprehensive evaluation of regression prediction intervals."""
results = {}
for i, alpha in enumerate(confidence_levels):
level_intervals = y_intervals[:, :, i] if y_intervals.ndim == 3 else y_intervals
# Coverage
coverage = regression_coverage_score(y_true, level_intervals)
# Width
width = regression_mean_width_score(level_intervals)
# Efficiency (width relative to empirical quantiles)
residuals = np.abs(y_true - y_pred)
empirical_quantile = np.quantile(residuals, alpha)
efficiency = width / (2 * empirical_quantile) if empirical_quantile > 0 else np.inf
results[f"confidence_{alpha}"] = {
"coverage": coverage,
"mean_width": width,
"efficiency": efficiency
}
return results
# Usage
results = evaluate_regression_intervals(
    y_test, y_pred, y_intervals,
    confidence_levels=[0.8, 0.9, 0.95]
)

def analyze_prediction_sets(y_true, y_pred_sets, class_names=None):
"""Analyze prediction set characteristics."""
n_samples, n_classes = y_pred_sets.shape
# Set sizes
set_sizes = np.sum(y_pred_sets, axis=1)
# Coverage
coverage = classification_coverage_score(y_true, y_pred_sets)
# Size distribution
size_counts = np.bincount(set_sizes.astype(int), minlength=n_classes+1)
size_dist = size_counts / n_samples
# Per-class inclusion rates
inclusion_rates = np.mean(y_pred_sets, axis=0)
results = {
"overall_coverage": coverage,
"mean_set_size": np.mean(set_sizes),
"set_size_distribution": size_dist,
"inclusion_rates": dict(zip(class_names or range(n_classes), inclusion_rates))
}
return results
# Usage
analysis = analyze_prediction_sets(y_test, y_pred_sets, class_names=['A', 'B', 'C'])

import matplotlib.pyplot as plt
from mapie.metrics.calibration import expected_calibration_error
def plot_reliability_diagram(y_true, y_proba, n_bins=10):
    """Plot reliability diagram for calibration assessment."""
    from sklearn.calibration import calibration_curve
    # Compute calibration curve
    fraction_of_positives, mean_predicted_value = calibration_curve(
        y_true, y_proba, n_bins=n_bins
    )
    # Plot
    plt.figure(figsize=(8, 6))
    plt.plot([0, 1], [0, 1], 'k--', label='Perfect calibration')
    plt.plot(mean_predicted_value, fraction_of_positives, 's-',
             label=f'Model (ECE = {expected_calibration_error(y_true, y_proba):.3f})')
    plt.xlabel('Mean Predicted Probability')
    plt.ylabel('Fraction of Positives')
    plt.title('Reliability Diagram')
    plt.legend()
    plt.grid(True, alpha=0.3)
    plt.show()
# Usage
plot_reliability_diagram(y_binary, y_proba_binary, n_bins=10)

from mapie.metrics.regression import hsic
def test_interval_independence(residuals, interval_widths, kernel="gaussian"):
    """Test independence between residuals and interval widths using HSIC."""
    # Compute HSIC statistic on the observed pairing
    hsic_stat = hsic(residuals, interval_widths, kernel=kernel)
    # Bootstrap p-value approximation: permute one variable to break dependence
    n_bootstrap = 1000
    bootstrap_stats = []
    for _ in range(n_bootstrap):
        shuffled_widths = np.random.permutation(interval_widths)
        bootstrap_stats.append(hsic(residuals, shuffled_widths, kernel=kernel))
    # P-value: fraction of permuted statistics at least as extreme
    p_value = np.mean(np.array(bootstrap_stats) >= hsic_stat)
    return {
        "hsic_statistic": hsic_stat,
        "p_value": p_value,
        "is_independent": p_value > 0.05,
    }
# Usage
residuals = np.abs(y_test - y_pred)
widths = y_intervals[:, 1, 0] - y_intervals[:, 0, 0]  # widths at the first alpha level
independence_test = test_interval_independence(residuals, widths)

Install with Tessl CLI
npx tessl i tessl/pypi-mapie