or run

npx @tessl/cli init
Log in

Version

Tile

Overview

Evals

Files

Files

docs

datasets.md evaluation.md index.md indexing.md java.md retrieval.md text-processing.md transformers.md utilities.md

docs/evaluation.md

0

# Evaluation Framework

1

2

PyTerrier's evaluation framework provides comprehensive tools for conducting information retrieval experiments, including statistical significance testing, parameter tuning, and cross-validation. The framework is designed for rigorous experimental evaluation of retrieval systems.

3

4

## Capabilities

5

6

### Experiment Class

7

8

Core experimental framework for evaluating multiple retrieval systems with statistical significance testing.

9

10

```python { .api }

11

class Experiment:

12

"""

13

Comprehensive evaluation framework for comparing retrieval systems.

14

15

Parameters:

16

- retr_systems: List of transformer systems to evaluate

17

- topics: DataFrame with queries ('qid', 'query' columns)

18

- qrels: DataFrame with relevance judgments ('qid', 'docno', 'label' columns)

19

- eval_metrics: List of evaluation metrics to compute

20

- names: Optional list of system names for results

21

- perquery: Whether to compute per-query results (default: False)

22

- dataframe: Whether to return results as DataFrame (default: True)

23

- batch_size: Batch size for processing queries (default: None)

24

- filter_by_qrels: Filter results to only qrels topics (default: True)

25

- filter_by_topics: Filter qrels to only topic qids (default: True)

26

- baseline: Index of baseline system for significance testing

27

- test: Statistical test to use ('ttest', 'wilcoxon', 'fisher')

28

- correction: Multiple testing correction ('bonferroni', 'holm', 'fdr')

29

- highlight: Highlighting mode for significant results ('bold', 'color')

30

- round: Number of decimal places for results (default: 4)

31

- verbose: Enable verbose output (default: False)

32

- save_dir: Directory to save detailed results

33

"""

34

def __init__(self, retr_systems: List['Transformer'], topics: pd.DataFrame,

35

qrels: pd.DataFrame, eval_metrics: List[str],

36

names: List[str] = None, perquery: bool = False,

37

dataframe: bool = True, batch_size: int = None,

38

filter_by_qrels: bool = True, filter_by_topics: bool = True,

39

baseline: int = None, test: str = None, correction: str = None,

40

highlight: str = None, round: int = 4, verbose: bool = False,

41

save_dir: str = None, **kwargs) -> pd.DataFrame: ...

42

```

43

44

**Usage Examples:**

45

46

```python

47

# Basic experiment

48

systems = [bm25_retriever, pl2_retriever, dfr_retriever]

49

topics = dataset.get_topics()

50

qrels = dataset.get_qrels()

51

52

results = pt.Experiment(

53

systems,

54

topics,

55

qrels,

56

['map', 'ndcg', 'P_10'],

57

names=['BM25', 'PL2', 'DFR']

58

)

59

print(results)

60

61

# Experiment with significance testing

62

results = pt.Experiment(

63

systems,

64

topics,

65

qrels,

66

['map', 'ndcg'],

67

names=['BM25', 'PL2', 'DFR'],

68

baseline=0, # Use first system as baseline

69

test='ttest', # Paired t-test

70

correction='bonferroni' # Multiple testing correction

71

)

72

73

# Per-query results for detailed analysis

74

perquery_results = pt.Experiment(

75

systems,

76

topics,

77

qrels,

78

['map', 'ndcg'],

79

perquery=True, # Return per-query scores

80

save_dir='/path/to/results' # Save detailed results

81

)

82

```

83

84

### Parameter Tuning Classes

85

86

Advanced parameter optimization classes for systematic hyperparameter tuning.

87

88

```python { .api }

89

class GridSearch:

90

"""

91

Grid search optimization for finding best parameter combinations.

92

93

Parameters:

94

- pipeline: Transformer pipeline to optimize

95

- params: Dictionary mapping parameter names to value lists

96

- topics: Topics DataFrame for evaluation

97

- qrels: Qrels DataFrame for evaluation

98

- metric: Single metric to optimize (e.g., 'map', 'ndcg')

99

- verbose: Enable verbose output

100

- jobs: Number of parallel jobs for evaluation

101

"""

102

def __init__(self, pipeline: 'Transformer', params: Dict[str, List[Any]],

103

topics: pd.DataFrame, qrels: pd.DataFrame, metric: str,

104

verbose: bool = False, jobs: int = 1, **kwargs) -> 'Transformer': ...

105

106

class GridScan:

107

"""

108

Parameter grid scanning for exploring parameter space.

109

110

Parameters:

111

- pipeline: Transformer pipeline to scan

112

- params: Dictionary mapping parameter names to value lists

113

- topics: Topics DataFrame for evaluation

114

- qrels: Qrels DataFrame for evaluation

115

- metrics: List of metrics to compute for each parameter combination

116

- verbose: Enable verbose output

117

- jobs: Number of parallel jobs for evaluation

118

"""

119

def __init__(self, pipeline: 'Transformer', params: Dict[str, List[Any]],

120

topics: pd.DataFrame, qrels: pd.DataFrame, metrics: List[str],

121

verbose: bool = False, jobs: int = 1, **kwargs) -> pd.DataFrame: ...

122

123

class KFoldGridSearch:

124

"""

125

K-fold cross-validation grid search for robust parameter optimization.

126

127

Parameters:

128

- pipeline: Transformer pipeline to optimize

129

- params: Dictionary mapping parameter names to value lists

130

- topics_list: List of topic DataFrames for each fold

131

- qrels: Qrels DataFrame for evaluation

132

- metric: Single metric to optimize

133

- verbose: Enable verbose output

134

- jobs: Number of parallel jobs for evaluation

135

"""

136

def __init__(self, pipeline: 'Transformer', params: Dict[str, List[Any]],

137

topics_list: List[pd.DataFrame], qrels: pd.DataFrame,

138

metric: str, verbose: bool = False, jobs: int = 1, **kwargs) -> 'Transformer': ...

139

```

140

141

**Usage Examples:**

142

143

```python

144

# Grid search for optimal BM25 parameters

145

bm25_pipeline = pt.terrier.Retriever(index_ref, wmodel='BM25')

146

param_grid = {

147

'k1': [0.9, 1.2, 1.5, 1.8],

148

'b': [0.3, 0.5, 0.7, 0.9]

149

}

150

151

best_bm25 = pt.GridSearch(

152

bm25_pipeline,

153

param_grid,

154

topics,

155

qrels,

156

'map',

157

verbose=True

158

)

159

160

# Grid scan to explore parameter space

161

scan_results = pt.GridScan(

162

bm25_pipeline,

163

param_grid,

164

topics,

165

qrels,

166

['map', 'ndcg', 'P_10'],

167

verbose=True

168

)

169

print(scan_results)

170

171

# K-fold cross-validation for robust optimization

172

from sklearn.model_selection import KFold

173

kf = KFold(n_splits=5, shuffle=True, random_state=42)

174

topics_folds = [topics.iloc[train_idx] for train_idx, _ in kf.split(topics)]

175

176

best_params = pt.KFoldGridSearch(

177

bm25_pipeline,

178

param_grid,

179

topics_folds,

180

qrels,

181

'map'

182

)

183

```

184

185

### Single Evaluation Function

186

187

`Evaluate` computes evaluation metrics for the results of a single retrieval system.

188

189

```python { .api }

190

class Evaluate:

191

"""

192

Single result evaluation for computing metrics on retrieval results.

193

194

Parameters:

195

- res: Results DataFrame with 'qid', 'docno', 'score' columns

196

- qrels: Qrels DataFrame with 'qid', 'docno', 'label' columns

197

- metrics: List of evaluation metrics to compute

198

- perquery: Whether to return per-query results (default: False)

199

"""

200

def __init__(self, res: pd.DataFrame, qrels: pd.DataFrame,

201

metrics: List[str], perquery: bool = False) -> pd.DataFrame: ...

202

```

203

204

**Usage Example:**

205

206

```python

207

# Evaluate single system results

208

results = retriever.transform(topics)

209

evaluation = pt.Evaluate(results, qrels, ['map', 'ndcg', 'P_10'])

210

print(evaluation)

211

212

# Per-query evaluation

213

perquery_eval = pt.Evaluate(results, qrels, ['map', 'ndcg'], perquery=True)

214

```

215

216

### Utility Transformers

217

218

Helper transformers for result processing and normalization in evaluation pipelines.

219

220

```python { .api }

221

class PerQueryMaxMinScoreTransformer(Transformer):

222

"""

223

Per-query min-max score normalization transformer.

224

225

Normalizes document scores within each query to [0,1] range using

226

min-max normalization per query.

227

"""

228

def __init__(self): ...

229

```

230

231

**Usage Example:**

232

233

```python

234

# Normalize scores before fusion

235

normalized_pipeline = (

236

retriever >>

237

pt.pipelines.PerQueryMaxMinScoreTransformer() >>

238

reranker

239

)

240

```

241

242

## Evaluation Metrics

243

244

PyTerrier supports comprehensive evaluation metrics through integration with the `ir-measures` library:

245

246

### Precision, Recall, and F-Measure Metrics

247

- `P_5`, `P_10`, `P_20`: Precision at rank cutoffs

248

- `R_5`, `R_10`, `R_20`: Recall at rank cutoffs

249

- `F_5`, `F_10`, `F_20`: F1-score at rank cutoffs

250

251

### Ranking-Based Metrics

252

- `map`: Mean Average Precision

253

- `ndcg`: Normalized Discounted Cumulative Gain

254

- `ndcg_cut_5`, `ndcg_cut_10`, `ndcg_cut_20`: NDCG at rank cutoffs

255

- `mrr`: Mean Reciprocal Rank

256

257

### Advanced Metrics

258

- `bpref`: Binary preference measure

259

- `rbp`: Rank-biased precision

260

- `err`: Expected reciprocal rank

261

- `infap`: Inferred Average Precision

262

263

## Advanced Evaluation Patterns

264

265

### Multi-Dataset Evaluation

266

267

```python

268

# Evaluate across multiple datasets

269

datasets = ['vaswani', 'antique', 'trec-robust-2004']

270

systems = [bm25, pl2, dfr]

271

all_results = []

272

273

for dataset_name in datasets:

274

dataset = pt.get_dataset(dataset_name)

275

topics = dataset.get_topics()

276

qrels = dataset.get_qrels()

277

278

results = pt.Experiment(

279

systems,

280

topics,

281

qrels,

282

['map', 'ndcg'],

283

names=['BM25', 'PL2', 'DFR']

284

)

285

results['dataset'] = dataset_name

286

all_results.append(results)

287

288

combined = pd.concat(all_results)

289

```

290

291

### Statistical Significance Analysis

292

293

```python

294

# Comprehensive significance testing

295

results = pt.Experiment(

296

[bm25, improved_system],

297

topics,

298

qrels,

299

['map', 'ndcg', 'P_10'],

300

names=['Baseline', 'Improved'],

301

baseline=0, # First system as baseline

302

test='ttest', # Paired t-test

303

correction='bonferroni', # Multiple testing correction

304

highlight='bold', # Bold significant improvements

305

perquery=True, # Enable per-query analysis

306

save_dir='/path/to/results' # Save detailed results

307

)

308

```

309

310

### Learning Curve Analysis

311

312

```python

313

# Evaluate with different training set sizes

314

training_sizes = [0.1, 0.2, 0.5, 0.8, 1.0]

315

learning_results = []

316

317

for size in training_sizes:

318

# Sample training data

319

train_sample = training_data.sample(frac=size, random_state=42)

320

321

# Train model

322

trained_model = ltr_model.fit(train_sample)

323

324

# Evaluate

325

pipeline = retriever >> trained_model

326

result = pt.Experiment([pipeline], test_topics, test_qrels, ['map'])

327

result['training_size'] = size

328

learning_results.append(result)

329

330

learning_curve = pd.concat(learning_results)

331

```

332

333

### Cross-Validation Evaluation

334

335

```python

336

from sklearn.model_selection import KFold

337

338

# K-fold cross-validation for robust evaluation

339

kf = KFold(n_splits=5, shuffle=True, random_state=42)

340

cv_results = []

341

342

for fold, (train_idx, test_idx) in enumerate(kf.split(topics)):

343

train_topics = topics.iloc[train_idx]

344

test_topics = topics.iloc[test_idx]

345

346

# Train on fold

347

trained_system = estimator.fit(train_topics)

348

349

# Evaluate on fold

350

result = pt.Experiment([trained_system], test_topics, qrels, ['map'])

351

result['fold'] = fold

352

cv_results.append(result)

353

354

cv_summary = pd.concat(cv_results).groupby('name').agg({

355

'map': ['mean', 'std']

356

}).round(4)

357

```

358

359

## Types

360

361

```python { .api }

362

from typing import Dict, List, Any, Union, Optional

363

import pandas as pd

364

365

# Evaluation-specific types

366

SystemList = List['Transformer'] # List of systems to evaluate

367

MetricList = List[str] # List of evaluation metrics

368

ParameterGrid = Dict[str, List[Any]] # Parameter search space

369

StatisticalTest = str # Statistical test name ('ttest', 'wilcoxon', 'fisher')

370

CorrectionMethod = str # Multiple testing correction method

371

HighlightMode = str # Result highlighting mode ('bold', 'color')

372

BaselineIndex = int # Index of baseline system

373

TopicsFold = List[pd.DataFrame] # List of topic DataFrames for cross-validation

374

```