or run

npx @tessl/cli init
Log in

Version

Tile

Overview

Evals

Files

Files

docs

categorical-scores.mdcontinuous-scores.mdemerging-scores.mdindex.mdpandas-integration.mdplot-data.mdprobability-scores.mdprocessing-tools.mdsample-data.mdspatial-scores.mdstatistical-tests.md

# Statistical Tests

Statistical significance testing for forecast comparisons and confidence interval estimation. Provides methods to determine whether differences between forecast systems are statistically significant.

## Capabilities

### Diebold-Mariano Test

The Diebold-Mariano test evaluates whether two competing forecasts have significantly different predictive accuracy.

```python { .api }
def diebold_mariano(
    fcst_1: XarrayLike,
    fcst_2: XarrayLike,
    obs: XarrayLike,
    *,
    loss_function: Callable = mse,
    reduce_dims: Optional[FlexibleDimensionTypes] = None,
    preserve_dims: Optional[FlexibleDimensionTypes] = None,
    confidence_level: float = 0.95,
    variance_adjustment: bool = True,
) -> xr.Dataset:
    """
    Perform Diebold-Mariano test for forecast comparison.

    Args:
        fcst_1: First forecast system
        fcst_2: Second forecast system (comparison baseline)
        obs: Observation data
        loss_function: Loss function for forecast evaluation (default: MSE)
        reduce_dims: Dimensions to reduce
        preserve_dims: Dimensions to preserve
        confidence_level: Confidence level for test (0-1)
        variance_adjustment: Apply variance adjustment for small samples

    Returns:
        Dataset containing test results:
        - dm_statistic: Diebold-Mariano test statistic
        - p_value: Two-tailed p-value
        - confidence_interval: Confidence interval for difference
        - mean_difference: Mean loss difference (fcst_1 - fcst_2)
        - significant: Boolean indicating statistical significance

    Statistical Framework:
        H₀: E[L(fcst_1, obs)] = E[L(fcst_2, obs)] (equal predictive accuracy)
        H₁: E[L(fcst_1, obs)] ≠ E[L(fcst_2, obs)] (different predictive accuracy)

    Test Statistic:
        DM = d̄ / √(Var(d̄))

    Where:
        - d̄ = mean difference in loss functions
        - Var(d̄) = estimated variance of d̄ (with autocorrelation correction)

    Notes:
        - Null hypothesis: forecasts have equal predictive accuracy
        - Negative DM statistic: fcst_1 better than fcst_2
        - Positive DM statistic: fcst_2 better than fcst_1
        - Accounts for temporal dependence in forecast errors
        - Robust to non-normality in loss differences
    """
```

## Usage Patterns

### Basic Forecast Comparison

```python
from scores.stats.statistical_tests import diebold_mariano
from scores.continuous import mse, mae
from scores.sample_data import continuous_forecast, continuous_observations
import numpy as np
import xarray as xr

# Create two competing forecast systems
np.random.seed(42)
observations = continuous_observations()
forecast_1 = observations + np.random.normal(0, 1, observations.shape)  # System 1
forecast_2 = observations + np.random.normal(0, 1.2, observations.shape)  # System 2 (slightly worse)

# Perform Diebold-Mariano test using MSE
dm_results = diebold_mariano(forecast_1, forecast_2, observations)

print("Diebold-Mariano Test Results:")
print(f"DM Statistic: {dm_results.dm_statistic.values:.4f}")
print(f"P-value: {dm_results.p_value.values:.4f}")
print(f"Mean MSE difference: {dm_results.mean_difference.values:.4f}")
print(f"Statistically significant: {dm_results.significant.values}")

# Interpret results
if dm_results.significant.values:
    if dm_results.dm_statistic.values < 0:
        print("Forecast 1 is significantly better than Forecast 2")
    else:
        print("Forecast 2 is significantly better than Forecast 1")
else:
    print("No significant difference between forecasts")
```

### Alternative Loss Functions

```python
# Test using different loss functions

# MAE-based comparison
dm_mae = diebold_mariano(
    forecast_1, forecast_2, observations,
    loss_function=mae
)

# Custom loss function (quantile loss at 0.9)
from scores.continuous import quantile_score

def quantile_90_loss(fcst, obs):
    return quantile_score(fcst, obs, alpha=0.9)

dm_q90 = diebold_mariano(
    forecast_1, forecast_2, observations,
    loss_function=quantile_90_loss
)

print("\nComparison using different loss functions:")
print(f"MSE-based p-value: {dm_results.p_value.values:.4f}")
print(f"MAE-based p-value: {dm_mae.p_value.values:.4f}")
print(f"Q90-based p-value: {dm_q90.p_value.values:.4f}")
```

### Confidence Intervals

```python
# Examine confidence intervals for the difference
confidence_level = 0.95
dm_ci = diebold_mariano(
    forecast_1, forecast_2, observations,
    confidence_level=confidence_level
)

ci_lower = dm_ci.confidence_interval.isel(bound=0)
ci_upper = dm_ci.confidence_interval.isel(bound=1)

print(f"\n{confidence_level*100:.0f}% Confidence Interval for MSE difference:")
print(f"[{ci_lower.values:.4f}, {ci_upper.values:.4f}]")

if ci_lower.values > 0:
    print("Forecast 2 is consistently better (CI above zero)")
elif ci_upper.values < 0:
    print("Forecast 1 is consistently better (CI below zero)")
else:
    print("Confidence interval includes zero (no clear winner)")
```

### Multi-dimensional Forecast Comparison

```python
# Compare forecasts over spatial dimensions
forecast_1_3d = xr.DataArray(
    np.random.normal(10, 2, (100, 10, 15)),  # time, lat, lon
    dims=["time", "lat", "lon"]
)
forecast_2_3d = xr.DataArray(
    np.random.normal(10.5, 2.2, (100, 10, 15)),
    dims=["time", "lat", "lon"]
)
observations_3d = xr.DataArray(
    np.random.normal(10, 2, (100, 10, 15)),
    dims=["time", "lat", "lon"]
)

# Test at each spatial point (reduce time dimension)
dm_spatial = diebold_mariano(
    forecast_1_3d, forecast_2_3d, observations_3d,
    reduce_dims="time"
)

# Count significant differences by location
n_significant = dm_spatial.significant.sum()
total_points = dm_spatial.significant.size
significance_fraction = n_significant / total_points

print(f"\nSpatial analysis:")
print(f"Significant differences at {n_significant.values}/{total_points} grid points")
print(f"Significance fraction: {significance_fraction.values:.3f}")

# Overall test (reduce all dimensions)
dm_overall = diebold_mariano(
    forecast_1_3d, forecast_2_3d, observations_3d,
    reduce_dims=["time", "lat", "lon"]
)

print(f"Overall DM statistic: {dm_overall.dm_statistic.values:.4f}")
print(f"Overall p-value: {dm_overall.p_value.values:.4f}")
```

### Time Series Analysis

```python
# Analyze temporal patterns in forecast differences
import pandas as pd

# Create time-indexed forecasts
dates = pd.date_range("2023-01-01", periods=365, freq="D")
forecast_1_ts = xr.DataArray(
    10 + 3*np.sin(2*np.pi*np.arange(365)/365) + np.random.normal(0, 1, 365),
    dims=["time"],
    coords={"time": dates}
)
forecast_2_ts = xr.DataArray(
    10.2 + 3*np.sin(2*np.pi*np.arange(365)/365) + np.random.normal(0, 1.1, 365),
    dims=["time"],
    coords={"time": dates}
)
observations_ts = xr.DataArray(
    10 + 3*np.sin(2*np.pi*np.arange(365)/365) + np.random.normal(0, 0.8, 365),
    dims=["time"],
    coords={"time": dates}
)

# Full year comparison
dm_annual = diebold_mariano(forecast_1_ts, forecast_2_ts, observations_ts)

# Seasonal comparisons
seasons = {
    'Winter': observations_ts.time.dt.month.isin([12, 1, 2]),
    'Spring': observations_ts.time.dt.month.isin([3, 4, 5]),
    'Summer': observations_ts.time.dt.month.isin([6, 7, 8]),
    'Autumn': observations_ts.time.dt.month.isin([9, 10, 11])
}

print("\nSeasonal forecast comparison:")
print(f"Annual p-value: {dm_annual.p_value.values:.4f}")

for season_name, season_mask in seasons.items():
    seasonal_dm = diebold_mariano(
        forecast_1_ts.where(season_mask),
        forecast_2_ts.where(season_mask),
        observations_ts.where(season_mask)
    )
    print(f"{season_name:6s} p-value: {seasonal_dm.p_value.values:.4f}")
```

### Multiple Comparison Workflow

```python
# Compare multiple forecast systems
forecasts = {
    'System_A': observations + np.random.normal(0, 0.8, observations.shape),
    'System_B': observations + np.random.normal(0.1, 1.0, observations.shape),
    'System_C': observations + np.random.normal(-0.05, 1.2, observations.shape),
    'Persistence': observations.shift(time=1, fill_value=observations.mean())
}

# Pairwise comparisons
import itertools
from collections import defaultdict

comparison_results = defaultdict(dict)
system_names = list(forecasts.keys())

print("\nPairwise Diebold-Mariano Test Results:")
print("=" * 50)

for sys1, sys2 in itertools.combinations(system_names, 2):
    dm_result = diebold_mariano(
        forecasts[sys1], forecasts[sys2], observations
    )

    comparison_results[sys1][sys2] = dm_result

    significance_star = "*" if dm_result.significant.values else ""
    better_system = sys1 if dm_result.dm_statistic.values < 0 else sys2

    print(f"{sys1:12s} vs {sys2:12s}: "
          f"DM={dm_result.dm_statistic.values:7.3f}, "
          f"p={dm_result.p_value.values:.4f}{significance_star:1s}, "
          f"Better: {better_system}")

# Summary statistics
print(f"\nForecast System Performance Summary:")
for system_name in system_names:
    system_mse = mse(forecasts[system_name], observations)
    print(f"{system_name:12s}: MSE = {system_mse.values:.4f}")
```

### Effect Size Analysis

```python
# Analyze practical significance alongside statistical significance
dm_detailed = diebold_mariano(
    forecast_1, forecast_2, observations,
    confidence_level=0.95
)

# Calculate effect size measures
mse_1 = mse(forecast_1, observations)
mse_2 = mse(forecast_2, observations)

relative_improvement = (mse_2 - mse_1) / mse_2 * 100
absolute_difference = abs(dm_detailed.mean_difference.values)

print("\nEffect Size Analysis:")
print(f"MSE System 1: {mse_1.values:.4f}")
print(f"MSE System 2: {mse_2.values:.4f}")
print(f"Absolute difference: {absolute_difference:.4f}")
print(f"Relative improvement: {relative_improvement.values:.2f}%")
print(f"Statistical significance: {dm_detailed.significant.values}")

# Practical significance threshold (example: 5% improvement)
practical_threshold = 0.05
practically_significant = abs(relative_improvement.values) > practical_threshold * 100

print(f"Practically significant (>{practical_threshold*100:.0f}% improvement): {practically_significant}")

if dm_detailed.significant.values and practically_significant:
    print("✓ Both statistically and practically significant")
elif dm_detailed.significant.values:
    print("⚠ Statistically significant but small practical effect")
elif practically_significant:
    print("⚠ Large practical effect but not statistically significant")
else:
    print("✗ Neither statistically nor practically significant")
```

### Advanced Test Configuration

```python
# Test with different configurations

# Without variance adjustment (for comparison)
dm_no_adj = diebold_mariano(
    forecast_1, forecast_2, observations,
    variance_adjustment=False
)

# Different confidence levels
confidence_levels = [0.90, 0.95, 0.99]
print("\nSensitivity to Confidence Level:")

for conf_level in confidence_levels:
    dm_conf = diebold_mariano(
        forecast_1, forecast_2, observations,
        confidence_level=conf_level
    )
    print(f"Confidence {conf_level*100:.0f}%: "
          f"Significant = {dm_conf.significant.values}")

print(f"\nVariance Adjustment Impact:")
print(f"With adjustment: DM = {dm_results.dm_statistic.values:.4f}, p = {dm_results.p_value.values:.4f}")
print(f"Without adjustment: DM = {dm_no_adj.dm_statistic.values:.4f}, p = {dm_no_adj.p_value.values:.4f}")
```