# Evaluation

Metrics and evaluation components for assessing pipeline performance, answer quality, and retrieval effectiveness. Haystack provides evaluators for LLM-based quality checks (context relevance, faithfulness, custom LLM judges), answer comparison (exact match, semantic similarity), and classical information-retrieval metrics (MAP, MRR, NDCG, recall).
## Capabilities

### Context Relevance Evaluation

Evaluate how relevant retrieved contexts are to given questions using LLM-based assessment.
```python { .api }
class ContextRelevanceEvaluator:
    def __init__(
        self,
        api_key: Secret,
        model: str = "gpt-3.5-turbo",
        instructions: Optional[str] = None,
        inputs: Optional[List[str]] = None,
        outputs: Optional[List[str]] = None,
        examples: Optional[List[Dict[str, str]]] = None,
        api: Literal["openai", "azure"] = "openai",
        azure_endpoint: Optional[str] = None,
        azure_deployment: Optional[str] = None,
        api_version: Optional[str] = None
    ) -> None:
        """
        Initialize context relevance evaluator.

        Args:
            api_key: API key for the LLM service
            model: Model name to use for evaluation
            instructions: Custom evaluation instructions
            inputs: Input field names
            outputs: Output field names
            examples: Few-shot examples for evaluation
            api: API service to use (openai or azure)
            azure_endpoint: Azure OpenAI endpoint
            azure_deployment: Azure OpenAI deployment name
            api_version: Azure OpenAI API version
        """

    def run(
        self,
        questions: List[str],
        contexts: List[List[str]]
    ) -> Dict[str, List[float]]:
        """
        Evaluate context relevance for question-context pairs.

        Args:
            questions: List of questions
            contexts: List of context lists, one per question

        Returns:
            Dictionary with 'individual_scores' containing relevance scores (0-1)
        """
```

### Faithfulness Evaluation

Assess whether generated answers are faithful to the provided context and don't contain hallucinations.
```python { .api }
class FaithfulnessEvaluator:
    def __init__(
        self,
        api_key: Secret,
        model: str = "gpt-3.5-turbo",
        instructions: Optional[str] = None,
        inputs: Optional[List[str]] = None,
        outputs: Optional[List[str]] = None,
        examples: Optional[List[Dict[str, str]]] = None,
        api: Literal["openai", "azure"] = "openai",
        azure_endpoint: Optional[str] = None,
        azure_deployment: Optional[str] = None,
        api_version: Optional[str] = None
    ) -> None:
        """Initialize faithfulness evaluator."""

    def run(
        self,
        questions: List[str],
        contexts: List[List[str]],
        responses: List[str]
    ) -> Dict[str, List[float]]:
        """
        Evaluate faithfulness of responses to contexts.

        Args:
            questions: List of questions
            contexts: List of context lists, one per question
            responses: List of generated responses

        Returns:
            Dictionary with 'individual_scores' containing faithfulness scores (0-1)
        """
```

### Answer Exact Match Evaluation

Compare generated answers with reference answers using exact string matching.
```python { .api }
class AnswerExactMatchEvaluator:
    def __init__(
        self,
        ignore_case: bool = False,
        ignore_punctuation: bool = False,
        ignore_whitespace: bool = False,
        regex_pattern: Optional[str] = None
    ) -> None:
        """
        Initialize exact match evaluator.

        Args:
            ignore_case: Whether to ignore case differences
            ignore_punctuation: Whether to ignore punctuation differences
            ignore_whitespace: Whether to ignore whitespace differences
            regex_pattern: Optional regex pattern for custom matching
        """

    def run(
        self,
        expected_answers: List[List[str]],
        predicted_answers: List[str]
    ) -> Dict[str, List[int]]:
        """
        Evaluate exact match between predicted and expected answers.

        Args:
            expected_answers: List of expected answer lists
            predicted_answers: List of predicted answers

        Returns:
            Dictionary with 'individual_scores' containing match scores (0 or 1)
        """
```
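
A minimal usage sketch following the signature above; the accepted-answer lists, predictions, and printed scores here are illustrative:

```python
from haystack.components.evaluators import AnswerExactMatchEvaluator

# Case-insensitive matching against one or more accepted references per question
exact_match = AnswerExactMatchEvaluator(ignore_case=True)

result = exact_match.run(
    expected_answers=[["Paris"], ["Blue", "Light blue"]],  # one list of accepted answers per question
    predicted_answers=["paris", "Green"]
)

print(result["individual_scores"])  # e.g. [1, 0]
```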

### Document Retrieval Evaluation

Evaluate retrieval performance using standard information retrieval metrics: Mean Average Precision (MAP), Mean Reciprocal Rank (MRR), Normalized Discounted Cumulative Gain (NDCG), and recall.
```python { .api }
class DocumentMAPEvaluator:
    def __init__(
        self,
        mode: Literal["individual", "average"] = "individual"
    ) -> None:
        """
        Initialize Mean Average Precision evaluator.

        Args:
            mode: Whether to return individual scores or average
        """

    def run(
        self,
        ground_truth_documents: List[List[str]],
        retrieved_documents: List[List[str]]
    ) -> Dict[str, Union[List[float], float]]:
        """
        Calculate Mean Average Precision for retrieval results.

        Args:
            ground_truth_documents: List of relevant document ID lists
            retrieved_documents: List of retrieved document ID lists

        Returns:
            Dictionary with MAP scores
        """


class DocumentMRREvaluator:
    def __init__(
        self,
        mode: Literal["individual", "average"] = "individual"
    ) -> None:
        """Initialize Mean Reciprocal Rank evaluator."""

    def run(
        self,
        ground_truth_documents: List[List[str]],
        retrieved_documents: List[List[str]]
    ) -> Dict[str, Union[List[float], float]]:
        """Calculate Mean Reciprocal Rank for retrieval results."""


class DocumentNDCGEvaluator:
    def __init__(
        self,
        mode: Literal["individual", "average"] = "individual",
        normalize: bool = True,
        k: Optional[int] = None
    ) -> None:
        """
        Initialize Normalized Discounted Cumulative Gain evaluator.

        Args:
            mode: Whether to return individual scores or average
            normalize: Whether to normalize NDCG scores
            k: Cut-off rank for NDCG@k calculation
        """

    def run(
        self,
        ground_truth_documents: List[List[str]],
        retrieved_documents: List[List[str]],
        relevance_scores: Optional[List[List[float]]] = None
    ) -> Dict[str, Union[List[float], float]]:
        """
        Calculate NDCG for retrieval results.

        Args:
            ground_truth_documents: List of relevant document ID lists
            retrieved_documents: List of retrieved document ID lists
            relevance_scores: Optional relevance scores for documents

        Returns:
            Dictionary with NDCG scores
        """


class DocumentRecallEvaluator:
    def __init__(
        self,
        mode: Literal["individual", "average"] = "individual"
    ) -> None:
        """Initialize document recall evaluator."""

    def run(
        self,
        ground_truth_documents: List[List[str]],
        retrieved_documents: List[List[str]]
    ) -> Dict[str, Union[List[float], float]]:
        """Calculate recall for retrieval results."""
```

### Semantic Answer Similarity Evaluation

Evaluate semantic similarity between generated and reference answers using sentence-transformer embeddings.
```python { .api }
class SASEvaluator:
    def __init__(
        self,
        model: str = "sentence-transformers/all-MiniLM-L6-v2",
        device: Optional[str] = None,
        token: Optional[Secret] = None,
        similarity_threshold: float = 0.8
    ) -> None:
        """
        Initialize Semantic Answer Similarity evaluator.

        Args:
            model: Sentence transformer model for embeddings
            device: Device to run the model on
            token: HuggingFace token for private models
            similarity_threshold: Threshold for binary classification
        """

    def run(
        self,
        predicted_answers: List[str],
        ground_truth_answers: List[List[str]]
    ) -> Dict[str, List[float]]:
        """
        Calculate semantic similarity between answers.

        Args:
            predicted_answers: List of predicted answers
            ground_truth_answers: List of reference answer lists

        Returns:
            Dictionary with similarity scores
        """
```
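
A minimal usage sketch following the signature above; the example answers and the `individual_scores` key are illustrative, and the sentence-transformers model is downloaded on first use:

```python
from haystack.components.evaluators import SASEvaluator

# Embedding-based similarity; scores close to 1 indicate semantically equivalent answers
sas_evaluator = SASEvaluator(
    model="sentence-transformers/all-MiniLM-L6-v2",
    similarity_threshold=0.8
)

result = sas_evaluator.run(
    predicted_answers=["Python is a high-level programming language."],
    ground_truth_answers=[["Python is a programming language."]]
)

print(result["individual_scores"])  # e.g. [0.91]
```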

### LLM-Based Custom Evaluation

Create custom evaluation metrics using language models.
```python { .api }
class LLMEvaluator:
    def __init__(
        self,
        instructions: str,
        inputs: List[str],
        outputs: List[str],
        examples: Optional[List[Dict[str, str]]] = None,
        api_key: Optional[Secret] = None,
        model: str = "gpt-3.5-turbo",
        api: Literal["openai", "azure"] = "openai",
        azure_endpoint: Optional[str] = None,
        azure_deployment: Optional[str] = None,
        api_version: Optional[str] = None,
        raise_on_failure: bool = True
    ) -> None:
        """
        Initialize custom LLM evaluator.

        Args:
            instructions: Evaluation instructions for the LLM
            inputs: List of input field names
            outputs: List of output field names
            examples: Few-shot examples for the evaluator
            api_key: API key for the LLM service
            model: Model name to use
            api: API service to use
            azure_endpoint: Azure OpenAI endpoint
            azure_deployment: Azure deployment name
            api_version: Azure API version
            raise_on_failure: Whether to raise on evaluation failures
        """

    def run(self, **inputs) -> Dict[str, Any]:
        """
        Run custom LLM evaluation.

        Args:
            **inputs: Input values for evaluation

        Returns:
            Dictionary with evaluation results
        """
```

### Evaluation Run Results

Aggregate and manage evaluation results across multiple metrics.
```python { .api }
class EvaluationRunResult:
    def __init__(
        self,
        run_name: str,
        inputs: Dict[str, List[Any]],
        results: Dict[str, Dict[str, Any]]
    ) -> None:
        """
        Initialize evaluation run result.

        Args:
            run_name: Name of the evaluation run
            inputs: Input data used for evaluation
            results: Evaluation results keyed by metric name, each holding an
                evaluator's run() output (e.g. 'individual_scores')
        """

    def score_report(self) -> Dict[str, float]:
        """
        Generate aggregate score report.

        Returns:
            Dictionary with average scores by metric
        """

    def comparative_individual_scores_report(
        self,
        other_result: "EvaluationRunResult"
    ) -> Dict[str, Dict[str, List[float]]]:
        """
        Compare individual scores with another evaluation result.

        Args:
            other_result: Another evaluation result to compare with

        Returns:
            Comparative score report
        """

    def to_pandas(self) -> "DataFrame":
        """Convert results to pandas DataFrame."""

    def to_csv(self, csv_path: str) -> None:
        """Export results to CSV file."""
```

## Usage Examples

### Basic Context Relevance Evaluation
```python
from haystack.components.evaluators import ContextRelevanceEvaluator
from haystack.utils import Secret

# Initialize evaluator
context_evaluator = ContextRelevanceEvaluator(
    api_key=Secret.from_env_var("OPENAI_API_KEY"),
    model="gpt-3.5-turbo"
)

# Prepare evaluation data
questions = [
    "What is Python?",
    "How does machine learning work?"
]

contexts = [
    ["Python is a programming language.", "JavaScript is also popular."],
    ["ML uses algorithms to find patterns.", "Python has many libraries."]
]

# Run evaluation
result = context_evaluator.run(
    questions=questions,
    contexts=contexts
)

# Print results
for i, score in enumerate(result["individual_scores"]):
    print(f"Question {i+1} context relevance: {score:.3f}")

# Calculate average
avg_relevance = sum(result["individual_scores"]) / len(result["individual_scores"])
print(f"Average context relevance: {avg_relevance:.3f}")
```

### Faithfulness Evaluation Pipeline
```python
from haystack import Pipeline
from haystack.components.evaluators import FaithfulnessEvaluator
from haystack.utils import Secret

# Create evaluation pipeline
eval_pipeline = Pipeline()

# Add faithfulness evaluator
faithfulness_evaluator = FaithfulnessEvaluator(
    api_key=Secret.from_env_var("OPENAI_API_KEY"),
    model="gpt-4"
)

eval_pipeline.add_component("faithfulness", faithfulness_evaluator)

# Evaluation data
questions = ["What programming language should I learn?"]
contexts = [["Python is beginner-friendly and versatile."]]
responses = ["I recommend learning Python because it's easy to learn and widely used."]

# Run evaluation
result = eval_pipeline.run({
    "faithfulness": {
        "questions": questions,
        "contexts": contexts,
        "responses": responses
    }
})

faithfulness_score = result["faithfulness"]["individual_scores"][0]
print(f"Faithfulness score: {faithfulness_score:.3f}")
```

### Retrieval Performance Evaluation
```python
from haystack.components.evaluators import DocumentMAPEvaluator, DocumentRecallEvaluator

# Initialize retrieval evaluators
map_evaluator = DocumentMAPEvaluator(mode="individual")
recall_evaluator = DocumentRecallEvaluator(mode="individual")

# Ground truth: relevant documents for each query
ground_truth = [
    ["doc_1", "doc_3", "doc_5"],  # Query 1 relevant docs
    ["doc_2", "doc_4"],           # Query 2 relevant docs
    ["doc_1", "doc_2", "doc_6"]   # Query 3 relevant docs
]

# Retrieved documents from system
retrieved = [
    ["doc_1", "doc_2", "doc_3"],  # Query 1 retrieved docs
    ["doc_2", "doc_3", "doc_4"],  # Query 2 retrieved docs
    ["doc_1", "doc_7", "doc_2"]   # Query 3 retrieved docs
]

# Calculate MAP
map_result = map_evaluator.run(
    ground_truth_documents=ground_truth,
    retrieved_documents=retrieved
)

# Calculate Recall
recall_result = recall_evaluator.run(
    ground_truth_documents=ground_truth,
    retrieved_documents=retrieved
)

# Print results
for i, (map_score, recall_score) in enumerate(zip(
    map_result["individual_scores"],
    recall_result["individual_scores"]
)):
    print(f"Query {i+1} - MAP: {map_score:.3f}, Recall: {recall_score:.3f}")
```
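
`DocumentMRREvaluator` takes the same input shape; as a minimal sketch, it can be run on the `ground_truth` and `retrieved` lists defined above:

```python
from haystack.components.evaluators import DocumentMRREvaluator

# Reciprocal rank rewards placing the first relevant document near the top of the ranking
mrr_evaluator = DocumentMRREvaluator(mode="individual")

mrr_result = mrr_evaluator.run(
    ground_truth_documents=ground_truth,
    retrieved_documents=retrieved
)

for i, mrr_score in enumerate(mrr_result["individual_scores"]):
    print(f"Query {i+1} - MRR: {mrr_score:.3f}")
```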

### Comprehensive RAG Evaluation
```python
from haystack.evaluation import EvaluationRunResult
from haystack.components.evaluators import (
    ContextRelevanceEvaluator,
    FaithfulnessEvaluator,
    AnswerExactMatchEvaluator,
    SASEvaluator
)
from haystack.utils import Secret

# Initialize all evaluators
evaluators = {
    "context_relevance": ContextRelevanceEvaluator(
        api_key=Secret.from_env_var("OPENAI_API_KEY")
    ),
    "faithfulness": FaithfulnessEvaluator(
        api_key=Secret.from_env_var("OPENAI_API_KEY")
    ),
    "exact_match": AnswerExactMatchEvaluator(ignore_case=True),
    "semantic_similarity": SASEvaluator()
}

# Evaluation dataset
eval_data = {
    "questions": [
        "What is Python?",
        "How does neural network training work?",
        "What are the benefits of cloud computing?"
    ],
    "contexts": [
        ["Python is a high-level programming language known for its simplicity."],
        ["Neural networks learn by adjusting weights through backpropagation."],
        ["Cloud computing provides scalable resources and reduces infrastructure costs."]
    ],
    "generated_answers": [
        "Python is a programming language that is easy to learn and use.",
        "Neural networks are trained using backpropagation to update weights.",
        "Cloud computing offers flexibility and cost savings for businesses."
    ],
    "reference_answers": [
        ["Python is a programming language."],
        ["Neural networks learn through backpropagation."],
        ["Cloud computing provides scalable and cost-effective resources."]
    ]
}

# Run all evaluations
results = {}

# Context relevance
results["context_relevance"] = evaluators["context_relevance"].run(
    questions=eval_data["questions"],
    contexts=eval_data["contexts"]
)

# Faithfulness
results["faithfulness"] = evaluators["faithfulness"].run(
    questions=eval_data["questions"],
    contexts=eval_data["contexts"],
    responses=eval_data["generated_answers"]
)

# Exact match
results["exact_match"] = evaluators["exact_match"].run(
    expected_answers=eval_data["reference_answers"],
    predicted_answers=eval_data["generated_answers"]
)

# Semantic similarity
results["semantic_similarity"] = evaluators["semantic_similarity"].run(
    predicted_answers=eval_data["generated_answers"],
    ground_truth_answers=eval_data["reference_answers"]
)

# Create evaluation result
eval_result = EvaluationRunResult(
    run_name="RAG_System_Evaluation",
    inputs=eval_data,
    results=results
)

# Generate report
score_report = eval_result.score_report()
print("Evaluation Results:")
for metric, score in score_report.items():
    print(f"{metric}: {score:.3f}")

# Export to CSV
eval_result.to_csv("rag_evaluation_results.csv")
```

### Custom LLM Evaluator
```python
from haystack.components.evaluators import LLMEvaluator
from haystack.utils import Secret

# Create custom evaluator for answer completeness
completeness_evaluator = LLMEvaluator(
    instructions="""
    Evaluate how complete the given answer is for the question.
    Consider whether all important aspects are covered.
    Rate on a scale of 1-5 where:
    1 = Very incomplete, major aspects missing
    2 = Incomplete, some important aspects missing
    3 = Moderately complete, minor aspects missing
    4 = Mostly complete, very minor aspects missing
    5 = Very complete, covers all important aspects
    """,
    inputs=["question", "answer"],
    outputs=["completeness_score", "explanation"],
    examples=[
        {
            "question": "What is photosynthesis?",
            "answer": "Photosynthesis is how plants make food.",
            "completeness_score": "2",
            "explanation": "Answer is too brief and misses key details like light, CO2, oxygen production."
        }
    ],
    api_key=Secret.from_env_var("OPENAI_API_KEY"),
    model="gpt-4"
)

# Use custom evaluator
custom_result = completeness_evaluator.run(
    question="How does machine learning work?",
    answer="Machine learning uses algorithms to learn patterns from data and make predictions."
)

print(f"Completeness score: {custom_result['completeness_score']}")
print(f"Explanation: {custom_result['explanation']}")
```
623
624
### Comparative Evaluation
625
626
```python
# Evaluate two different systems
system_a_results = EvaluationRunResult(
    run_name="System_A",
    inputs=eval_data,
    results=results  # From previous example
)

# Run evaluation for system B (with different answers)
system_b_data = eval_data.copy()
system_b_data["generated_answers"] = [
    "Python is a versatile, high-level programming language.",
    "Neural networks use backpropagation algorithm for training.",
    "Cloud computing delivers computing services over the internet."
]

# ... run evaluations for system B ...
# system_b_results = EvaluationRunResult(...)

# Compare systems
# comparison = system_a_results.comparative_individual_scores_report(system_b_results)
# print("System Comparison:")
# for metric, scores in comparison.items():
#     print(f"{metric}:")
#     print(f"  System A: {scores['System_A']}")
#     print(f"  System B: {scores['System_B']}")
```
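
As a runnable sketch of the commented-out comparison, one option is to reuse `evaluators`, `eval_data`, `results`, and `system_b_data` from the examples above and restrict both runs to the two reference-based metrics that need no extra LLM calls, so the metric sets match:

```python
# Hypothetical comparison sketch: both runs cover the same metrics so the
# comparative report lines up metric by metric.
shared_metrics = ["exact_match", "semantic_similarity"]

system_a_subset = EvaluationRunResult(
    run_name="System_A",
    inputs=eval_data,
    results={name: results[name] for name in shared_metrics}
)

system_b_results = EvaluationRunResult(
    run_name="System_B",
    inputs=system_b_data,
    results={
        "exact_match": evaluators["exact_match"].run(
            expected_answers=eval_data["reference_answers"],
            predicted_answers=system_b_data["generated_answers"]
        ),
        "semantic_similarity": evaluators["semantic_similarity"].run(
            predicted_answers=system_b_data["generated_answers"],
            ground_truth_answers=eval_data["reference_answers"]
        ),
    }
)

comparison = system_a_subset.comparative_individual_scores_report(system_b_results)
print(comparison)
```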
653
654
### Advanced NDCG Evaluation
655
656
```python
from haystack.components.evaluators import DocumentNDCGEvaluator

# Initialize NDCG evaluator with cut-off
ndcg_evaluator = DocumentNDCGEvaluator(
    mode="individual",
    normalize=True,
    k=5  # NDCG@5
)

# Ground truth with relevance scores
ground_truth_docs = [["doc_1", "doc_2", "doc_3", "doc_4"]]
retrieved_docs = [["doc_1", "doc_5", "doc_2", "doc_3", "doc_6"]]

# Optional: provide relevance scores (0-3 scale)
relevance_scores = [[3, 2, 2, 1]]  # Relevance of ground truth docs

# Calculate NDCG
ndcg_result = ndcg_evaluator.run(
    ground_truth_documents=ground_truth_docs,
    retrieved_documents=retrieved_docs,
    relevance_scores=relevance_scores
)

print(f"NDCG@5 score: {ndcg_result['individual_scores'][0]:.3f}")
```
682
683
## Types
684
685
```python { .api }
from typing import List, Dict, Any, Union, Optional, Literal
from enum import Enum
from haystack.utils import Secret

class EvaluationMode(Enum):
    INDIVIDUAL = "individual"
    AVERAGE = "average"

class MetricType(Enum):
    RELEVANCE = "relevance"
    FAITHFULNESS = "faithfulness"
    SIMILARITY = "similarity"
    RETRIEVAL = "retrieval"
    CUSTOM = "custom"

class EvaluationMetric:
    name: str
    type: MetricType
    score: float
    details: Dict[str, Any]
```