# Statistical Tests

Statistical significance testing for forecast comparisons and confidence interval estimation. Provides methods to determine whether differences between forecast systems are statistically significant.

## Capabilities

### Diebold-Mariano Test

The Diebold-Mariano test evaluates whether two competing forecasts have significantly different predictive accuracy.

```python { .api }
def diebold_mariano(
    fcst_1: XarrayLike,
    fcst_2: XarrayLike,
    obs: XarrayLike,
    *,
    loss_function: Callable = mse,
    reduce_dims: Optional[FlexibleDimensionTypes] = None,
    preserve_dims: Optional[FlexibleDimensionTypes] = None,
    confidence_level: float = 0.95,
    variance_adjustment: bool = True,
) -> xr.Dataset:
    """
    Perform Diebold-Mariano test for forecast comparison.

    Args:
        fcst_1: First forecast system
        fcst_2: Second forecast system (comparison baseline)
        obs: Observation data
        loss_function: Loss function for forecast evaluation (default: MSE)
        reduce_dims: Dimensions to reduce
        preserve_dims: Dimensions to preserve
        confidence_level: Confidence level for test (0-1)
        variance_adjustment: Apply variance adjustment for small samples

    Returns:
        Dataset containing test results:
        - dm_statistic: Diebold-Mariano test statistic
        - p_value: Two-tailed p-value
        - confidence_interval: Confidence interval for difference
        - mean_difference: Mean loss difference (fcst_1 - fcst_2)
        - significant: Boolean indicating statistical significance

    Statistical Framework:
        H₀: E[L(fcst_1, obs)] = E[L(fcst_2, obs)] (equal predictive accuracy)
        H₁: E[L(fcst_1, obs)] ≠ E[L(fcst_2, obs)] (different predictive accuracy)

    Test Statistic:
        DM = d̄ / √(Var(d̄))

    Where:
        - d̄ = mean difference in loss functions
        - Var(d̄) = estimated variance of d̄ (with autocorrelation correction)

    Notes:
        - Null hypothesis: forecasts have equal predictive accuracy
        - Negative DM statistic: fcst_1 better than fcst_2
        - Positive DM statistic: fcst_2 better than fcst_1
        - Accounts for temporal dependence in forecast errors
        - Robust to non-normality in loss differences
    """
```

## Usage Patterns

### Basic Forecast Comparison

```python
from scores.stats.statistical_tests import diebold_mariano
from scores.continuous import mse, mae
from scores.sample_data import continuous_forecast, continuous_observations
import numpy as np
import xarray as xr

# Create two competing forecast systems
np.random.seed(42)
observations = continuous_observations()
forecast_1 = observations + np.random.normal(0, 1, observations.shape)  # System 1
forecast_2 = observations + np.random.normal(0, 1.2, observations.shape)  # System 2 (slightly worse)

# Perform Diebold-Mariano test using MSE
dm_results = diebold_mariano(forecast_1, forecast_2, observations)

print("Diebold-Mariano Test Results:")
print(f"DM Statistic: {dm_results.dm_statistic.values:.4f}")
print(f"P-value: {dm_results.p_value.values:.4f}")
print(f"Mean MSE difference: {dm_results.mean_difference.values:.4f}")
print(f"Statistically significant: {dm_results.significant.values}")

# Interpret results
if dm_results.significant.values:
    if dm_results.dm_statistic.values < 0:
        print("Forecast 1 is significantly better than Forecast 2")
    else:
        print("Forecast 2 is significantly better than Forecast 1")
else:
    print("No significant difference between forecasts")
```

### Alternative Loss Functions

```python
# Test using different loss functions

# MAE-based comparison
dm_mae = diebold_mariano(
    forecast_1, forecast_2, observations,
    loss_function=mae
)

# Custom loss function (quantile loss at 0.9)
from scores.continuous import quantile_score

def quantile_90_loss(fcst, obs):
    return quantile_score(fcst, obs, alpha=0.9)

dm_q90 = diebold_mariano(
    forecast_1, forecast_2, observations,
    loss_function=quantile_90_loss
)

print("\nComparison using different loss functions:")
print(f"MSE-based p-value: {dm_results.p_value.values:.4f}")
print(f"MAE-based p-value: {dm_mae.p_value.values:.4f}")
print(f"Q90-based p-value: {dm_q90.p_value.values:.4f}")
```

### Confidence Intervals

```python
# Examine confidence intervals for the difference
confidence_level = 0.95
dm_ci = diebold_mariano(
    forecast_1, forecast_2, observations,
    confidence_level=confidence_level
)

ci_lower = dm_ci.confidence_interval.isel(bound=0)
ci_upper = dm_ci.confidence_interval.isel(bound=1)

print(f"\n{confidence_level*100:.0f}% Confidence Interval for MSE difference:")
print(f"[{ci_lower.values:.4f}, {ci_upper.values:.4f}]")

if ci_lower.values > 0:
    print("Forecast 2 is consistently better (CI above zero)")
elif ci_upper.values < 0:
    print("Forecast 1 is consistently better (CI below zero)")
else:
    print("Confidence interval includes zero (no clear winner)")
```

### Multi-dimensional Forecast Comparison

```python
# Compare forecasts over spatial dimensions
forecast_1_3d = xr.DataArray(
    np.random.normal(10, 2, (100, 10, 15)),  # time, lat, lon
    dims=["time", "lat", "lon"]
)
forecast_2_3d = xr.DataArray(
    np.random.normal(10.5, 2.2, (100, 10, 15)),
    dims=["time", "lat", "lon"]
)
observations_3d = xr.DataArray(
    np.random.normal(10, 2, (100, 10, 15)),
    dims=["time", "lat", "lon"]
)

# Test at each spatial point (reduce time dimension)
dm_spatial = diebold_mariano(
    forecast_1_3d, forecast_2_3d, observations_3d,
    reduce_dims="time"
)

# Count significant differences by location
n_significant = dm_spatial.significant.sum()
total_points = dm_spatial.significant.size
significance_fraction = n_significant / total_points

print(f"\nSpatial analysis:")
print(f"Significant differences at {n_significant.values}/{total_points} grid points")
print(f"Significance fraction: {significance_fraction.values:.3f}")

# Overall test (reduce all dimensions)
dm_overall = diebold_mariano(
    forecast_1_3d, forecast_2_3d, observations_3d,
    reduce_dims=["time", "lat", "lon"]
)

print(f"Overall DM statistic: {dm_overall.dm_statistic.values:.4f}")
print(f"Overall p-value: {dm_overall.p_value.values:.4f}")
```

### Time Series Analysis

```python
# Analyze temporal patterns in forecast differences
import pandas as pd

# Create time-indexed forecasts
dates = pd.date_range("2023-01-01", periods=365, freq="D")
forecast_1_ts = xr.DataArray(
    10 + 3*np.sin(2*np.pi*np.arange(365)/365) + np.random.normal(0, 1, 365),
    dims=["time"],
    coords={"time": dates}
)
forecast_2_ts = xr.DataArray(
    10.2 + 3*np.sin(2*np.pi*np.arange(365)/365) + np.random.normal(0, 1.1, 365),
    dims=["time"],
    coords={"time": dates}
)
observations_ts = xr.DataArray(
    10 + 3*np.sin(2*np.pi*np.arange(365)/365) + np.random.normal(0, 0.8, 365),
    dims=["time"],
    coords={"time": dates}
)

# Full year comparison
dm_annual = diebold_mariano(forecast_1_ts, forecast_2_ts, observations_ts)

# Seasonal comparisons
seasons = {
    'Winter': observations_ts.time.dt.month.isin([12, 1, 2]),
    'Spring': observations_ts.time.dt.month.isin([3, 4, 5]),
    'Summer': observations_ts.time.dt.month.isin([6, 7, 8]),
    'Autumn': observations_ts.time.dt.month.isin([9, 10, 11])
}

print("\nSeasonal forecast comparison:")
print(f"Annual p-value: {dm_annual.p_value.values:.4f}")

for season_name, season_mask in seasons.items():
    seasonal_dm = diebold_mariano(
        forecast_1_ts.where(season_mask),
        forecast_2_ts.where(season_mask),
        observations_ts.where(season_mask)
    )
    print(f"{season_name:6s} p-value: {seasonal_dm.p_value.values:.4f}")
```

### Multiple Comparison Workflow

```python
# Compare multiple forecast systems
forecasts = {
    'System_A': observations + np.random.normal(0, 0.8, observations.shape),
    'System_B': observations + np.random.normal(0.1, 1.0, observations.shape),
    'System_C': observations + np.random.normal(-0.05, 1.2, observations.shape),
    'Persistence': observations.shift(time=1, fill_value=observations.mean())
}

# Pairwise comparisons
import itertools
from collections import defaultdict

comparison_results = defaultdict(dict)
system_names = list(forecasts.keys())

print("\nPairwise Diebold-Mariano Test Results:")
print("=" * 50)

for sys1, sys2 in itertools.combinations(system_names, 2):
    dm_result = diebold_mariano(
        forecasts[sys1], forecasts[sys2], observations
    )

    comparison_results[sys1][sys2] = dm_result

    significance_star = "*" if dm_result.significant.values else ""
    better_system = sys1 if dm_result.dm_statistic.values < 0 else sys2

    print(f"{sys1:12s} vs {sys2:12s}: "
          f"DM={dm_result.dm_statistic.values:7.3f}, "
          f"p={dm_result.p_value.values:.4f}{significance_star:1s}, "
          f"Better: {better_system}")

# Summary statistics
print(f"\nForecast System Performance Summary:")
for system_name in system_names:
    system_mse = mse(forecasts[system_name], observations)
    print(f"{system_name:12s}: MSE = {system_mse.values:.4f}")
```

### Effect Size Analysis

```python
# Analyze practical significance alongside statistical significance
dm_detailed = diebold_mariano(
    forecast_1, forecast_2, observations,
    confidence_level=0.95
)

# Calculate effect size measures
mse_1 = mse(forecast_1, observations)
mse_2 = mse(forecast_2, observations)

relative_improvement = (mse_2 - mse_1) / mse_2 * 100
absolute_difference = abs(dm_detailed.mean_difference.values)

print("\nEffect Size Analysis:")
print(f"MSE System 1: {mse_1.values:.4f}")
print(f"MSE System 2: {mse_2.values:.4f}")
print(f"Absolute difference: {absolute_difference:.4f}")
print(f"Relative improvement: {relative_improvement.values:.2f}%")
print(f"Statistical significance: {dm_detailed.significant.values}")

# Practical significance threshold (example: 5% improvement)
practical_threshold = 0.05
practically_significant = abs(relative_improvement.values) > practical_threshold * 100

print(f"Practically significant (>{practical_threshold*100:.0f}% improvement): {practically_significant}")

if dm_detailed.significant.values and practically_significant:
    print("✓ Both statistically and practically significant")
elif dm_detailed.significant.values:
    print("⚠ Statistically significant but small practical effect")
elif practically_significant:
    print("⚠ Large practical effect but not statistically significant")
else:
    print("✗ Neither statistically nor practically significant")
```

### Advanced Test Configuration

```python
# Test with different configurations

# Without variance adjustment (for comparison)
dm_no_adj = diebold_mariano(
    forecast_1, forecast_2, observations,
    variance_adjustment=False
)

# Different confidence levels
confidence_levels = [0.90, 0.95, 0.99]
print("\nSensitivity to Confidence Level:")

for conf_level in confidence_levels:
    dm_conf = diebold_mariano(
        forecast_1, forecast_2, observations,
        confidence_level=conf_level
    )
    print(f"Confidence {conf_level*100:.0f}%: "
          f"Significant = {dm_conf.significant.values}")

print(f"\nVariance Adjustment Impact:")
print(f"With adjustment: DM = {dm_results.dm_statistic.values:.4f}, p = {dm_results.p_value.values:.4f}")
print(f"Without adjustment: DM = {dm_no_adj.dm_statistic.values:.4f}, p = {dm_no_adj.p_value.values:.4f}")
```