Mathematical functions for verification, evaluation and optimization of forecasts, predictions or models
Statistical significance testing for forecast comparisons and confidence interval estimation. Provides methods to determine whether differences between forecast systems are statistically significant.
The Diebold-Mariano test evaluates whether two competing forecasts have significantly different predictive accuracy.
def diebold_mariano(
    fcst_1: XarrayLike,
    fcst_2: XarrayLike,
    obs: XarrayLike,
    *,
    loss_function: Callable = mse,
    reduce_dims: Optional[FlexibleDimensionTypes] = None,
    preserve_dims: Optional[FlexibleDimensionTypes] = None,
    confidence_level: float = 0.95,
    variance_adjustment: bool = True,
) -> xr.Dataset:
    """
    Perform the Diebold-Mariano test for forecast comparison.

    Args:
        fcst_1: First forecast system
        fcst_2: Second forecast system (comparison baseline)
        obs: Observation data
        loss_function: Loss function for forecast evaluation (default: MSE)
        reduce_dims: Dimensions to reduce
        preserve_dims: Dimensions to preserve
        confidence_level: Confidence level for the test, in (0, 1)
        variance_adjustment: Apply a variance adjustment for small samples

    Returns:
        Dataset containing test results:
            - dm_statistic: Diebold-Mariano test statistic
            - p_value: Two-tailed p-value
            - confidence_interval: Confidence interval for the loss difference
            - mean_difference: Mean loss difference (fcst_1 - fcst_2)
            - significant: Boolean indicating statistical significance

    Statistical Framework:
        H₀: E[L(fcst_1, obs)] = E[L(fcst_2, obs)] (equal predictive accuracy)
        H₁: E[L(fcst_1, obs)] ≠ E[L(fcst_2, obs)] (different predictive accuracy)

    Test Statistic:
        DM = d̄ / √(Var(d̄))

        where:
            - d̄ is the mean difference in the loss series
            - Var(d̄) is the estimated variance of d̄ (with autocorrelation correction)

    Notes:
        - Null hypothesis: the forecasts have equal predictive accuracy
        - Negative DM statistic: fcst_1 is better than fcst_2
        - Positive DM statistic: fcst_2 is better than fcst_1
        - Accounts for temporal dependence in forecast errors
        - Robust to non-normality in the loss differences
    """
from scores.stats.statistical_tests import diebold_mariano
from scores.continuous import mse, mae
from scores.sample_data import continuous_observations
import numpy as np
import xarray as xr
# Create two competing forecast systems
np.random.seed(42)
observations = continuous_observations()
forecast_1 = observations + np.random.normal(0, 1, observations.shape) # System 1
forecast_2 = observations + np.random.normal(0, 1.2, observations.shape) # System 2 (slightly worse)
# Perform Diebold-Mariano test using MSE
dm_results = diebold_mariano(forecast_1, forecast_2, observations)
print("Diebold-Mariano Test Results:")
print(f"DM Statistic: {dm_results.dm_statistic.values:.4f}")
print(f"P-value: {dm_results.p_value.values:.4f}")
print(f"Mean MSE difference: {dm_results.mean_difference.values:.4f}")
print(f"Statistically significant: {dm_results.significant.values}")
# Interpret results
if dm_results.significant.values:
    if dm_results.dm_statistic.values < 0:
        print("Forecast 1 is significantly better than Forecast 2")
    else:
        print("Forecast 2 is significantly better than Forecast 1")
else:
    print("No significant difference between forecasts")

# Test using different loss functions
# MAE-based comparison
dm_mae = diebold_mariano(
    forecast_1, forecast_2, observations,
    loss_function=mae,
)
# Custom loss function (quantile loss at 0.9)
from scores.continuous import quantile_score
def quantile_90_loss(fcst, obs):
    return quantile_score(fcst, obs, alpha=0.9)

dm_q90 = diebold_mariano(
    forecast_1, forecast_2, observations,
    loss_function=quantile_90_loss,
)
print("\nComparison using different loss functions:")
print(f"MSE-based p-value: {dm_results.p_value.values:.4f}")
print(f"MAE-based p-value: {dm_mae.p_value.values:.4f}")
print(f"Q90-based p-value: {dm_q90.p_value.values:.4f}")# Examine confidence intervals for the difference
confidence_level = 0.95
dm_ci = diebold_mariano(
    forecast_1, forecast_2, observations,
    confidence_level=confidence_level,
)
ci_lower = dm_ci.confidence_interval.isel(bound=0)
ci_upper = dm_ci.confidence_interval.isel(bound=1)
print(f"\n{confidence_level*100:.0f}% Confidence Interval for MSE difference:")
print(f"[{ci_lower.values:.4f}, {ci_upper.values:.4f}]")
if ci_lower.values > 0:
    print("Forecast 2 is consistently better (CI above zero)")
elif ci_upper.values < 0:
    print("Forecast 1 is consistently better (CI below zero)")
else:
    print("Confidence interval includes zero (no clear winner)")
# Compare forecasts over spatial dimensions
forecast_1_3d = xr.DataArray(
    np.random.normal(10, 2, (100, 10, 15)),  # time, lat, lon
    dims=["time", "lat", "lon"],
)
forecast_2_3d = xr.DataArray(
    np.random.normal(10.5, 2.2, (100, 10, 15)),
    dims=["time", "lat", "lon"],
)
observations_3d = xr.DataArray(
    np.random.normal(10, 2, (100, 10, 15)),
    dims=["time", "lat", "lon"],
)
# Test at each spatial point (reduce time dimension)
dm_spatial = diebold_mariano(
    forecast_1_3d, forecast_2_3d, observations_3d,
    reduce_dims="time",
)
# Count significant differences by location
n_significant = dm_spatial.significant.sum()
total_points = dm_spatial.significant.size
significance_fraction = n_significant / total_points
print(f"\nSpatial analysis:")
print(f"Significant differences at {n_significant.values}/{total_points} grid points")
print(f"Significance fraction: {significance_fraction.values:.3f}")
# Overall test (reduce all dimensions)
dm_overall = diebold_mariano(
    forecast_1_3d, forecast_2_3d, observations_3d,
    reduce_dims=["time", "lat", "lon"],
)
print(f"Overall DM statistic: {dm_overall.dm_statistic.values:.4f}")
print(f"Overall p-value: {dm_overall.p_value.values:.4f}")# Analyze temporal patterns in forecast differences
import pandas as pd
# Create time-indexed forecasts
dates = pd.date_range("2023-01-01", periods=365, freq="D")
forecast_1_ts = xr.DataArray(
    10 + 3*np.sin(2*np.pi*np.arange(365)/365) + np.random.normal(0, 1, 365),
    dims=["time"],
    coords={"time": dates},
)
forecast_2_ts = xr.DataArray(
    10.2 + 3*np.sin(2*np.pi*np.arange(365)/365) + np.random.normal(0, 1.1, 365),
    dims=["time"],
    coords={"time": dates},
)
observations_ts = xr.DataArray(
    10 + 3*np.sin(2*np.pi*np.arange(365)/365) + np.random.normal(0, 0.8, 365),
    dims=["time"],
    coords={"time": dates},
)
# Full year comparison
dm_annual = diebold_mariano(forecast_1_ts, forecast_2_ts, observations_ts)
# Seasonal comparisons
seasons = {
    'Winter': observations_ts.time.dt.month.isin([12, 1, 2]),
    'Spring': observations_ts.time.dt.month.isin([3, 4, 5]),
    'Summer': observations_ts.time.dt.month.isin([6, 7, 8]),
    'Autumn': observations_ts.time.dt.month.isin([9, 10, 11]),
}
print("\nSeasonal forecast comparison:")
print(f"Annual p-value: {dm_annual.p_value.values:.4f}")
for season_name, season_mask in seasons.items():
    seasonal_dm = diebold_mariano(
        forecast_1_ts.where(season_mask, drop=True),
        forecast_2_ts.where(season_mask, drop=True),
        observations_ts.where(season_mask, drop=True),
    )
    print(f"{season_name:6s} p-value: {seasonal_dm.p_value.values:.4f}")

# Compare multiple forecast systems
forecasts = {
    'System_A': observations + np.random.normal(0, 0.8, observations.shape),
    'System_B': observations + np.random.normal(0.1, 1.0, observations.shape),
    'System_C': observations + np.random.normal(-0.05, 1.2, observations.shape),
    'Persistence': observations.shift(time=1, fill_value=float(observations.mean())),
}
# Pairwise comparisons
import itertools
from collections import defaultdict
comparison_results = defaultdict(dict)
system_names = list(forecasts.keys())
print("\nPairwise Diebold-Mariano Test Results:")
print("=" * 50)
for sys1, sys2 in itertools.combinations(system_names, 2):
    dm_result = diebold_mariano(
        forecasts[sys1], forecasts[sys2], observations
    )
    comparison_results[sys1][sys2] = dm_result
    significance_star = "*" if dm_result.significant.values else ""
    better_system = sys1 if dm_result.dm_statistic.values < 0 else sys2
    print(f"{sys1:12s} vs {sys2:12s}: "
          f"DM={dm_result.dm_statistic.values:7.3f}, "
          f"p={dm_result.p_value.values:.4f}{significance_star:1s}, "
          f"Better: {better_system}")
# Summary statistics
print(f"\nForecast System Performance Summary:")
for system_name in system_names:
    system_mse = mse(forecasts[system_name], observations)
    print(f"{system_name:12s}: MSE = {system_mse.values:.4f}")

# Analyze practical significance alongside statistical significance
dm_detailed = diebold_mariano(
    forecast_1, forecast_2, observations,
    confidence_level=0.95,
)
# Calculate effect size measures
mse_1 = mse(forecast_1, observations)
mse_2 = mse(forecast_2, observations)
relative_improvement = (mse_2 - mse_1) / mse_2 * 100
absolute_difference = abs(dm_detailed.mean_difference.values)
print("\nEffect Size Analysis:")
print(f"MSE System 1: {mse_1.values:.4f}")
print(f"MSE System 2: {mse_2.values:.4f}")
print(f"Absolute difference: {absolute_difference:.4f}")
print(f"Relative improvement: {relative_improvement.values:.2f}%")
print(f"Statistical significance: {dm_detailed.significant.values}")
# Practical significance threshold (example: 5% improvement)
practical_threshold = 0.05
practically_significant = abs(relative_improvement.values) > practical_threshold * 100
print(f"Practically significant (>{practical_threshold*100:.0f}% improvement): {practically_significant}")
if dm_detailed.significant.values and practically_significant:
    print("✓ Both statistically and practically significant")
elif dm_detailed.significant.values:
    print("⚠ Statistically significant but small practical effect")
elif practically_significant:
    print("⚠ Large practical effect but not statistically significant")
else:
    print("✗ Neither statistically nor practically significant")

# Test with different configurations
# Without variance adjustment (for comparison)
dm_no_adj = diebold_mariano(
    forecast_1, forecast_2, observations,
    variance_adjustment=False,
)
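For context, a widely used small-sample correction is the Harvey-Leybourne-Newbold (1997) adjustment, which rescales the DM statistic by a factor depending on the sample size n and forecast horizon h (the adjusted statistic is then compared against a Student's t distribution with n-1 degrees of freedom). Whether variance_adjustment applies exactly this correction is an assumption on our part; the sketch below only illustrates the idea, and the helper name is our own.

import numpy as np

def hln_adjusted_dm_sketch(dm_stat, n, h=1):
    """Illustrative Harvey-Leybourne-Newbold small-sample correction."""
    k = np.sqrt((n + 1 - 2*h + h*(h - 1)/n) / n)  # HLN correction factor
    return k * dm_stat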
# Different confidence levels
confidence_levels = [0.90, 0.95, 0.99]
print("\nSensitivity to Confidence Level:")
for conf_level in confidence_levels:
    dm_conf = diebold_mariano(
        forecast_1, forecast_2, observations,
        confidence_level=conf_level,
    )
    print(f"Confidence {conf_level*100:.0f}%: "
          f"Significant = {dm_conf.significant.values}")
print(f"\nVariance Adjustment Impact:")
print(f"With adjustment: DM = {dm_results.dm_statistic.values:.4f}, p = {dm_results.p_value.values:.4f}")
print(f"Without adjustment: DM = {dm_no_adj.dm_statistic.values:.4f}, p = {dm_no_adj.p_value.values:.4f}")Install with Tessl CLI
npx tessl i tessl/pypi-scores