Elasticsearch plugin providing comprehensive ranking evaluation capabilities for search quality assessment.
—
This section covers the implementation of information retrieval metrics for ranking quality assessment, including Precision@K, Recall@K, Mean Reciprocal Rank (MRR), Discounted Cumulative Gain (DCG), and Expected Reciprocal Rank (ERR).
Base interface that all evaluation metrics must implement.
public interface EvaluationMetric extends ToXContentObject, NamedWriteable {
// Core evaluation method
EvalQueryQuality evaluate(String taskId, SearchHit[] hits, List<RatedDocument> ratedDocs);
// Result combination (default: average)
default double combine(Collection<EvalQueryQuality> partialResults);
// Optional search window size constraint
default OptionalInt forcedSearchSize();
// Utility methods for joining results with ratings
static List<RatedSearchHit> joinHitsWithRatings(SearchHit[] hits, List<RatedDocument> ratedDocs);
static List<DocumentKey> filterUnratedDocuments(List<RatedSearchHit> ratedHits);
}

Measures the fraction of retrieved documents that are relevant at rank K.
public class PrecisionAtK implements EvaluationMetric {
public static final String NAME = "precision";
// Constructors
public PrecisionAtK();
public PrecisionAtK(int relevantRatingThreshold, boolean ignoreUnlabeled, int k);
// Configuration access
public int getK();
public int getRelevantRatingThreshold();
public boolean getIgnoreUnlabeled();
// Evaluation implementation
public EvalQueryQuality evaluate(String taskId, SearchHit[] hits, List<RatedDocument> ratedDocs);
// Serialization
public XContentBuilder toXContent(XContentBuilder builder, Params params) throws IOException;
public static PrecisionAtK fromXContent(XContentParser parser) throws IOException;
}

Configuration Parameters:
k (default: 10) - Search window size; only considers the top K results
relevant_rating_threshold (default: 1) - Minimum rating to consider a document relevant
ignore_unlabeled (default: false) - Whether to ignore documents without ratings

Usage:
// Default precision@10 with rating threshold 1
PrecisionAtK precision = new PrecisionAtK();
// Custom precision@5 with higher relevance threshold
PrecisionAtK customPrecision = new PrecisionAtK(2, false, 5);
// Use in evaluation spec
RankEvalSpec spec = new RankEvalSpec(ratedRequests, precision);

Calculation:
Precision@K = (Number of relevant documents in top K) / K

Measures the fraction of relevant documents that are retrieved at rank K.
public class RecallAtK implements EvaluationMetric {
public static final String NAME = "recall";
// Constructors
public RecallAtK();
public RecallAtK(int relevantRatingThreshold, int k);
// Configuration access
public int getK();
public int getRelevantRatingThreshold();
// Evaluation implementation
public EvalQueryQuality evaluate(String taskId, SearchHit[] hits, List<RatedDocument> ratedDocs);
// Serialization
public XContentBuilder toXContent(XContentBuilder builder, Params params) throws IOException;
public static RecallAtK fromXContent(XContentParser parser) throws IOException;
}

Configuration Parameters:
k (default: 10) - Search window size
relevant_rating_threshold (default: 1) - Minimum rating to consider a document relevant

Usage:
// Default recall@10
RecallAtK recall = new RecallAtK();
// Custom recall@20 with higher threshold
RecallAtK customRecall = new RecallAtK(2, 20);

Calculation:
Recall@K = (Number of relevant documents in top K) / (Total number of relevant documents)

Measures the average reciprocal rank of the first relevant document.
public class MeanReciprocalRank implements EvaluationMetric {
public static final String NAME = "mean_reciprocal_rank";
// Constructors
public MeanReciprocalRank();
public MeanReciprocalRank(int relevantRatingThreshold, int k);
// Configuration access
public int getK();
public int getRelevantRatingThreshold();
// Evaluation implementation
public EvalQueryQuality evaluate(String taskId, SearchHit[] hits, List<RatedDocument> ratedDocs);
// Serialization
public XContentBuilder toXContent(XContentBuilder builder, Params params) throws IOException;
public static MeanReciprocalRank fromXContent(XContentParser parser) throws IOException;
}

Configuration Parameters:
k (default: 10) - Search window size
relevant_rating_threshold (default: 1) - Minimum rating to consider a document relevant

Usage:
// Default MRR@10
MeanReciprocalRank mrr = new MeanReciprocalRank();
// Custom MRR@5 with threshold 2
MeanReciprocalRank customMrr = new MeanReciprocalRank(2, 5);

Calculation:
MRR = 1 / (rank of first relevant document)
If no relevant documents are found: MRR = 0

Measures the cumulative gain of documents with position-based discounting.
public class DiscountedCumulativeGain implements EvaluationMetric {
public static final String NAME = "dcg";
// Constructors
public DiscountedCumulativeGain();
public DiscountedCumulativeGain(boolean normalize, Integer unknownDocRating, int k);
// Configuration access
public int getK();
boolean getNormalize(); // package-private
// Evaluation implementation
public EvalQueryQuality evaluate(String taskId, SearchHit[] hits, List<RatedDocument> ratedDocs);
// Serialization
public XContentBuilder toXContent(XContentBuilder builder, Params params) throws IOException;
public static DiscountedCumulativeGain fromXContent(XContentParser parser) throws IOException;
}

Configuration Parameters:
normalize (default: false) - Whether to compute normalized DCG (nDCG)
unknownDocRating (default: null) - Rating to assign to documents not present in the rated documents list
k (default: 10) - Search window size

Usage:
// Standard DCG@10
DiscountedCumulativeGain dcg = new DiscountedCumulativeGain();
// Normalized DCG@20 with unknown docs treated as rating 0
DiscountedCumulativeGain ndcg = new DiscountedCumulativeGain(true, 0, 20);

Calculation:
DCG@K = sum(i=1 to K) [ (2^rating_i - 1) / log2(i + 1) ]
nDCG@K = DCG@K / IDCG@K (where IDCG is the ideal DCG, i.e. the DCG of the best possible ordering of the rated documents)

Measures the expected reciprocal rank based on a cascade model.
public class ExpectedReciprocalRank implements EvaluationMetric {
public static final String NAME = "expected_reciprocal_rank";
// Constructors
public ExpectedReciprocalRank();
public ExpectedReciprocalRank(int maxRelevance);
public ExpectedReciprocalRank(int maxRelevance, Integer unknownDocRating, int k);
// Configuration access
int getK(); // package-private
int getMaxRelevance(); // package-private
// Evaluation implementation
public EvalQueryQuality evaluate(String taskId, SearchHit[] hits, List<RatedDocument> ratedDocs);
// Serialization
public XContentBuilder toXContent(XContentBuilder builder, Params params) throws IOException;
public static ExpectedReciprocalRank fromXContent(XContentParser parser) throws IOException;
}

Configuration Parameters:
maxRelevance (default: 3) - Maximum relevance grade used for the probability calculation
unknownDocRating (default: null) - Rating to assign to documents not present in the rated documents list
k (default: 10) - Search window size

Usage:
// Default ERR@10 with max relevance 3
ExpectedReciprocalRank err = new ExpectedReciprocalRank();
// Custom ERR with max relevance 5
ExpectedReciprocalRank customErr = new ExpectedReciprocalRank(5);
// Custom ERR@5 with max relevance 4 and unknown docs treated as rating 0
ExpectedReciprocalRank advancedErr = new ExpectedReciprocalRank(4, 0, 5);

Calculation:
ERR = sum(i=1 to K) [ (1/i) * P(user stops at position i) ]
where P(stop at i) = R_i * prod(j=1 to i-1) [ (1 - R_j) ]
and R_i = (2^rating_i - 1) / (2^max_relevance)

// Create precision@10 metric
PrecisionAtK precision = new PrecisionAtK(1, false, 10);
// Use in evaluation
RankEvalSpec spec = new RankEvalSpec(ratedRequests, precision);
RankEvalRequest request = new RankEvalRequest(spec, indices);
RankEvalResponse response = client.execute(RankEvalAction.INSTANCE, request).get();
System.out.println("Average Precision@10: " + response.getMetricScore());

// Define different metric configurations
List<EvaluationMetric> metrics = Arrays.asList(
new PrecisionAtK(1, false, 5),
new PrecisionAtK(1, false, 10),
new RecallAtK(1, 10),
new MeanReciprocalRank(1, 10),
new DiscountedCumulativeGain(true, null, 10) // nDCG@10
);
// Evaluate with each metric
Map<String, Double> results = new HashMap<>();
for (EvaluationMetric metric : metrics) {
RankEvalSpec spec = new RankEvalSpec(ratedRequests, metric);
RankEvalRequest request = new RankEvalRequest(spec, indices);
RankEvalResponse response = client.execute(RankEvalAction.INSTANCE, request).get();
results.put(metric.getWriteableName(), response.getMetricScore());
}
// Display comparison
results.forEach((name, score) ->
System.out.println(name + ": " + String.format("%.4f", score))
);

// High precision requirements: only consider highly relevant docs (rating >= 3)
PrecisionAtK strictPrecision = new PrecisionAtK(3, true, 5);
// Comprehensive recall: large window, low threshold
RecallAtK comprehensiveRecall = new RecallAtK(1, 100);
// Quality-focused ranking: nDCG with moderate window
DiscountedCumulativeGain qualityRanking = new DiscountedCumulativeGain(true, null, 20);

// Create a template for consistent query structure
Script queryTemplate = new Script(
ScriptType.INLINE,
"mustache",
"{\"query\": {\"multi_match\": {\"query\": \"{{search_term}}\", \"fields\": [\"title^2\", \"content\"]}}}",
Collections.emptyMap()
);
ScriptWithId templateScript = new ScriptWithId("product_search", queryTemplate);
// Define different parameter sets for A/B testing
List<Map<String, Object>> paramSets = Arrays.asList(
Map.of("search_term", "laptop computer"),
Map.of("search_term", "gaming laptop"),
Map.of("search_term", "business laptop")
);
// Create rated requests using template
List<RatedRequest> templateRequests = new ArrayList<>();
for (int i = 0; i < paramSets.size(); i++) {
RatedRequest request = new RatedRequest(
"query_" + i,
"product_search",
paramSets.get(i),
ratedDocuments.get(i)
);
templateRequests.add(request);
}
// Evaluate with multiple metrics using templates
List<EvaluationMetric> metrics = Arrays.asList(
new PrecisionAtK(2, false, 10),
new DiscountedCumulativeGain(true, null, 10),
new ExpectedReciprocalRank(4, null, 10)
);
for (EvaluationMetric metric : metrics) {
RankEvalSpec spec = new RankEvalSpec(
templateRequests,
metric,
Arrays.asList(templateScript)
);
RankEvalRequest request = new RankEvalRequest(spec, new String[]{"products"});
RankEvalResponse response = client.execute(RankEvalAction.INSTANCE, request).get();
System.out.println(metric.getWriteableName() + " with templates: " + response.getMetricScore());
}

Each metric provides detailed breakdown information through nested Detail classes:
// Base detail interface
public abstract class MetricDetail implements NamedWriteable, ToXContentFragment {
// Implemented by each metric's Detail nested class
}
// Example: PrecisionAtK.Detail
public static class Detail extends MetricDetail {
public int getRelevantRetrieved();
public int getRetrieved();
// Additional metric-specific details
}

Access detailed results:
Map<String, EvalQueryQuality> queryResults = response.getPartialResults();
for (EvalQueryQuality quality : queryResults.values()) {
MetricDetail details = quality.getMetricDetails();
if (details instanceof PrecisionAtK.Detail) {
PrecisionAtK.Detail precisionDetails = (PrecisionAtK.Detail) details;
int relevant = precisionDetails.getRelevantRetrieved();
int total = precisionDetails.getRetrieved();
System.out.println("Found " + relevant + " relevant out of " + total + " retrieved");
}
}

Install with Tessl CLI:
npx tessl i tessl/maven-org-elasticsearch-plugin--rank-eval-client