or run

npx @tessl/cli init
Log in

Version

Tile

Overview

Evals

Files

Files

docs

configuration-utilities.md, evaluation-metrics.md, index.md, request-response-api.md

docs/evaluation-metrics.md

0

# Evaluation Metrics

1

2

This section covers the implementation of information retrieval metrics for ranking quality assessment, including Precision@K, Recall@K, Mean Reciprocal Rank (MRR), Discounted Cumulative Gain (DCG), and Expected Reciprocal Rank (ERR).

3

4

## Base Interface

5

6

### EvaluationMetric

7

8

Base interface that all evaluation metrics must implement.

9

10

```java { .api }

11

public interface EvaluationMetric extends ToXContentObject, NamedWriteable {

12

// Core evaluation method

13

EvalQueryQuality evaluate(String taskId, SearchHit[] hits, List<RatedDocument> ratedDocs);

14

15

// Result combination (default: average)

16

default double combine(Collection<EvalQueryQuality> partialResults);

17

18

// Optional search window size constraint

19

default OptionalInt forcedSearchSize();

20

21

// Utility methods for joining results with ratings

22

static List<RatedSearchHit> joinHitsWithRatings(SearchHit[] hits, List<RatedDocument> ratedDocs);

23

static List<DocumentKey> filterUnratedDocuments(List<RatedSearchHit> ratedHits);

24

}

25

```

26

27

## Precision Metrics

28

29

### PrecisionAtK

30

31

Measures the fraction of retrieved documents that are relevant at rank K.

32

33

```java { .api }

34

public class PrecisionAtK implements EvaluationMetric {

35

public static final String NAME = "precision";

36

37

// Constructors

38

public PrecisionAtK();

39

public PrecisionAtK(int relevantRatingThreshold, boolean ignoreUnlabeled, int k);

40

41

// Configuration access

42

public int getK();

43

public int getRelevantRatingThreshold();

44

public boolean getIgnoreUnlabeled();

45

46

// Evaluation implementation

47

public EvalQueryQuality evaluate(String taskId, SearchHit[] hits, List<RatedDocument> ratedDocs);

48

49

// Serialization

50

public XContentBuilder toXContent(XContentBuilder builder, Params params) throws IOException;

51

public static PrecisionAtK fromXContent(XContentParser parser) throws IOException;

52

}

53

```

54

55

**Configuration Parameters:**

56

- `k` (default: 10) - Search window size, only considers top K results

57

- `relevant_rating_threshold` (default: 1) - Minimum rating to consider document relevant

58

- `ignore_unlabeled` (default: false) - Whether to ignore documents without ratings

59

60

**Usage:**

61

```java

62

// Default precision@10 with rating threshold 1

63

PrecisionAtK precision = new PrecisionAtK();

64

65

// Custom precision@5 with higher relevance threshold

66

PrecisionAtK customPrecision = new PrecisionAtK(2, false, 5);

67

68

// Use in evaluation spec

69

RankEvalSpec spec = new RankEvalSpec(ratedRequests, precision);

70

```

71

72

**Calculation:**

73

```

74

Precision@K = (Number of relevant documents retrieved in top K) / (Number of documents retrieved in top K)
Note: the denominator is the number of retrieved documents considered (at most K), so fewer than K hits — or ignored unlabeled documents — shrink the denominator rather than counting against precision.

75

```

76

77

## Recall Metrics

78

79

### RecallAtK

80

81

Measures the fraction of relevant documents that are retrieved at rank K.

82

83

```java { .api }

84

public class RecallAtK implements EvaluationMetric {

85

public static final String NAME = "recall";

86

87

// Constructors

88

public RecallAtK();

89

public RecallAtK(int relevantRatingThreshold, int k);

90

91

// Configuration access

92

public int getK();

93

public int getRelevantRatingThreshold();

94

95

// Evaluation implementation

96

public EvalQueryQuality evaluate(String taskId, SearchHit[] hits, List<RatedDocument> ratedDocs);

97

98

// Serialization

99

public XContentBuilder toXContent(XContentBuilder builder, Params params) throws IOException;

100

public static RecallAtK fromXContent(XContentParser parser) throws IOException;

101

}

102

```

103

104

**Configuration Parameters:**

105

- `k` (default: 10) - Search window size

106

- `relevant_rating_threshold` (default: 1) - Minimum rating to consider document relevant

107

108

**Usage:**

109

```java

110

// Default recall@10

111

RecallAtK recall = new RecallAtK();

112

113

// Custom recall@20 with higher threshold

114

RecallAtK customRecall = new RecallAtK(2, 20);

115

```

116

117

**Calculation:**

118

```

119

Recall@K = (Number of relevant documents in top K) / (Total number of relevant documents)

120

```

121

122

## Ranking Position Metrics

123

124

### MeanReciprocalRank

125

126

Measures the average reciprocal rank of the first relevant document.

127

128

```java { .api }

129

public class MeanReciprocalRank implements EvaluationMetric {

130

public static final String NAME = "mean_reciprocal_rank";

131

132

// Constructors

133

public MeanReciprocalRank();

134

public MeanReciprocalRank(int relevantRatingThreshold, int k);

135

136

// Configuration access

137

public int getK();

138

public int getRelevantRatingThreshold();

139

140

// Evaluation implementation

141

public EvalQueryQuality evaluate(String taskId, SearchHit[] hits, List<RatedDocument> ratedDocs);

142

143

// Serialization

144

public XContentBuilder toXContent(XContentBuilder builder, Params params) throws IOException;

145

public static MeanReciprocalRank fromXContent(XContentParser parser) throws IOException;

146

}

147

```

148

149

**Configuration Parameters:**

150

- `k` (default: 10) - Search window size

151

- `relevant_rating_threshold` (default: 1) - Minimum rating to consider document relevant

152

153

**Usage:**

154

```java

155

// Default MRR@10

156

MeanReciprocalRank mrr = new MeanReciprocalRank();

157

158

// Custom MRR@5 with threshold 2

159

MeanReciprocalRank customMrr = new MeanReciprocalRank(2, 5);

160

```

161

162

**Calculation:**

163

```

164

Reciprocal Rank (per query) = 1 / (rank of first relevant document)
MRR = mean of the reciprocal ranks across all evaluated queries

165

If no relevant documents found: MRR = 0

166

```

167

168

## Discounted Metrics

169

170

### DiscountedCumulativeGain

171

172

Measures the cumulative gain of documents with position-based discounting.

173

174

```java { .api }

175

public class DiscountedCumulativeGain implements EvaluationMetric {

176

public static final String NAME = "dcg";

177

178

// Constructors

179

public DiscountedCumulativeGain();

180

public DiscountedCumulativeGain(boolean normalize, Integer unknownDocRating, int k);

181

182

// Configuration access

183

public int getK();

184

boolean getNormalize(); // package-private

185

186

// Evaluation implementation

187

public EvalQueryQuality evaluate(String taskId, SearchHit[] hits, List<RatedDocument> ratedDocs);

188

189

// Serialization

190

public XContentBuilder toXContent(XContentBuilder builder, Params params) throws IOException;

191

public static DiscountedCumulativeGain fromXContent(XContentParser parser) throws IOException;

192

}

193

```

194

195

**Configuration Parameters:**

196

- `normalize` (default: false) - Whether to compute normalized DCG (nDCG)

197

- `unknownDocRating` (default: null) - Rating to assign to documents not present in the rated documents list

198

- `k` (default: 10) - Search window size

199

200

**Usage:**

201

```java

202

// Standard DCG@10

203

DiscountedCumulativeGain dcg = new DiscountedCumulativeGain();

204

205

// Normalized DCG@20 with unknown docs treated as rating 0

206

DiscountedCumulativeGain ndcg = new DiscountedCumulativeGain(true, 0, 20);

207

```

208

209

**Calculation:**

210

```

211

DCG@K = sum(i=1 to K) [ (2^rating_i - 1) / log2(i + 1) ]

212

nDCG@K = DCG@K / IDCG@K (where IDCG is ideal DCG)

213

```

214

215

### ExpectedReciprocalRank

216

217

Measures the expected reciprocal rank based on a cascade model.

218

219

```java { .api }

220

public class ExpectedReciprocalRank implements EvaluationMetric {

221

public static final String NAME = "expected_reciprocal_rank";

222

223

// Constructors

224

public ExpectedReciprocalRank();

225

public ExpectedReciprocalRank(int maxRelevance);

226

public ExpectedReciprocalRank(int maxRelevance, Integer unknownDocRating, int k);

227

228

// Configuration access

229

int getK(); // package-private

230

int getMaxRelevance(); // package-private

231

232

// Evaluation implementation

233

public EvalQueryQuality evaluate(String taskId, SearchHit[] hits, List<RatedDocument> ratedDocs);

234

235

// Serialization

236

public XContentBuilder toXContent(XContentBuilder builder, Params params) throws IOException;

237

public static ExpectedReciprocalRank fromXContent(XContentParser parser) throws IOException;

238

}

239

```

240

241

**Configuration Parameters:**

242

- `maxRelevance` (default: 3) - Maximum relevance grade used for probability calculation

243

- `unknownDocRating` (default: null) - Rating to assign to documents not present in the rated documents list

244

- `k` (default: 10) - Search window size

245

246

**Usage:**

247

```java

248

// Default ERR@10 with max relevance 3

249

ExpectedReciprocalRank err = new ExpectedReciprocalRank();

250

251

// Custom ERR with max relevance 5

252

ExpectedReciprocalRank customErr = new ExpectedReciprocalRank(5);

253

254

// Custom ERR@5 with max relevance 4 and unknown docs treated as rating 0

255

ExpectedReciprocalRank advancedErr = new ExpectedReciprocalRank(4, 0, 5);

256

```

257

258

**Calculation:**

259

```

260

ERR = sum(i=1 to K) [ (1/i) * P(user stops at position i) ]

261

where P(stop at i) = R_i * prod(j=1 to i-1) [ (1 - R_j) ]

262

and R_i = (2^rating_i - 1) / (2^max_relevance)

263

```

264

265

## Usage Examples

266

267

### Single Metric Evaluation

268

269

```java

270

// Create precision@10 metric

271

PrecisionAtK precision = new PrecisionAtK(1, false, 10);

272

273

// Use in evaluation

274

RankEvalSpec spec = new RankEvalSpec(ratedRequests, precision);

275

RankEvalRequest request = new RankEvalRequest(spec, indices);

276

RankEvalResponse response = client.execute(RankEvalAction.INSTANCE, request).get();

277

278

System.out.println("Average Precision@10: " + response.getMetricScore());

279

```

280

281

### Multiple Metrics Comparison

282

283

```java

284

// Define different metric configurations

285

List<EvaluationMetric> metrics = Arrays.asList(

286

new PrecisionAtK(1, false, 5),

287

new PrecisionAtK(1, false, 10),

288

new RecallAtK(1, 10),

289

new MeanReciprocalRank(1, 10),

290

new DiscountedCumulativeGain(true, null, 10) // nDCG@10

291

);

292

293

// Evaluate with each metric

294

Map<String, Double> results = new HashMap<>();

295

for (EvaluationMetric metric : metrics) {

296

RankEvalSpec spec = new RankEvalSpec(ratedRequests, metric);

297

RankEvalRequest request = new RankEvalRequest(spec, indices);

298

RankEvalResponse response = client.execute(RankEvalAction.INSTANCE, request).get();

299

300

results.put(metric.getWriteableName(), response.getMetricScore());

301

}

302

303

// Display comparison

304

results.forEach((name, score) ->

305

System.out.println(name + ": " + String.format("%.4f", score))

306

);

307

```

308

309

### Custom Metric Configuration

310

311

```java

312

// High precision requirements: only consider highly relevant docs (rating >= 3)

313

PrecisionAtK strictPrecision = new PrecisionAtK(3, true, 5);

314

315

// Comprehensive recall: large window, low threshold

316

RecallAtK comprehensiveRecall = new RecallAtK(1, 100);

317

318

// Quality-focused ranking: nDCG with moderate window

319

DiscountedCumulativeGain qualityRanking = new DiscountedCumulativeGain(true, null, 20);

320

```

321

322

### Template-Based Multi-Metric Evaluation

323

324

```java

325

// Create template for consistent query structure

326

Script queryTemplate = new Script(

327

ScriptType.INLINE,

328

"mustache",

329

"{\"query\": {\"multi_match\": {\"query\": \"{{search_term}}\", \"fields\": [\"title^2\", \"content\"]}}}",

330

Collections.emptyMap()

331

);

332

333

ScriptWithId templateScript = new ScriptWithId("product_search", queryTemplate);

334

335

// Define different parameter sets for A/B testing

336

List<Map<String, Object>> paramSets = Arrays.asList(

337

Map.of("search_term", "laptop computer"),

338

Map.of("search_term", "gaming laptop"),

339

Map.of("search_term", "business laptop")

340

);

341

342

// Create rated requests using template

343

List<RatedRequest> templateRequests = new ArrayList<>();

344

for (int i = 0; i < paramSets.size(); i++) {

345

RatedRequest request = new RatedRequest(

346

"query_" + i,

347

"product_search",

348

paramSets.get(i),

349

ratedDocuments.get(i)

350

);

351

templateRequests.add(request);

352

}

353

354

// Evaluate with multiple metrics using templates

355

List<EvaluationMetric> metrics = Arrays.asList(

356

new PrecisionAtK(2, false, 10),

357

new DiscountedCumulativeGain(true, null, 10),

358

new ExpectedReciprocalRank(4, null, 10)

359

);

360

361

for (EvaluationMetric metric : metrics) {

362

RankEvalSpec spec = new RankEvalSpec(

363

templateRequests,

364

metric,

365

Arrays.asList(templateScript)

366

);

367

368

RankEvalRequest request = new RankEvalRequest(spec, new String[]{"products"});

369

RankEvalResponse response = client.execute(RankEvalAction.INSTANCE, request).get();

370

371

System.out.println(metric.getWriteableName() + " with templates: " + response.getMetricScore());

372

}

373

```

374

375

## Metric Detail Classes

376

377

Each metric provides detailed breakdown information through nested Detail classes:

378

379

```java { .api }

380

// Base detail interface

381

public abstract class MetricDetail implements NamedWriteable, ToXContentFragment {

382

// Implemented by each metric's Detail nested class

383

}

384

385

// Example: PrecisionAtK.Detail

386

public static class Detail extends MetricDetail {

387

public int getRelevantRetrieved();

388

public int getRetrieved();

389

// Additional metric-specific details

390

}

391

```

392

393

**Access detailed results:**

394

```java

395

Map<String, EvalQueryQuality> queryResults = response.getPartialResults();

396

for (EvalQueryQuality quality : queryResults.values()) {

397

MetricDetail details = quality.getMetricDetails();

398

if (details instanceof PrecisionAtK.Detail) {

399

PrecisionAtK.Detail precisionDetails = (PrecisionAtK.Detail) details;

400

int relevant = precisionDetails.getRelevantRetrieved();

401

int total = precisionDetails.getRetrieved();

402

System.out.println("Found " + relevant + " relevant out of " + total + " retrieved");

403

}

404

}

405

```