# Evaluation & Utilities

Evaluation metrics, model evaluation tools, and utility functions for assessing pipeline performance. Haystack provides built-in evaluation methods on its Pipeline classes, along with utility functions for development and testing.

## Core Imports

```python
from haystack import Pipeline
from haystack.schema import EvaluationResult, MultiLabel
from haystack.utils import launch_es, launch_opensearch, print_answers, print_documents
from haystack.pipelines.utils import print_eval_report
```

## Capabilities

### Pipeline Evaluation Methods

Built-in evaluation methods available on Pipeline instances for assessing performance on labeled datasets.

```python { .api }
class Pipeline:
    def eval(
        self,
        labels: List[MultiLabel],
        documents: Optional[List[List[Document]]] = None,
        params: Optional[dict] = None,
        sas_model_name_or_path: Optional[str] = None,
        sas_batch_size: int = 32,
        sas_use_gpu: bool = True,
        add_isolated_node_eval: bool = False,
        custom_document_id_field: Optional[str] = None,
        context_matching_min_length: int = 100,
        context_matching_boost_split_overlaps: bool = True,
        context_matching_threshold: float = 65.0,
    ) -> EvaluationResult:
        """
        Evaluate pipeline performance on labeled data.

        Args:
            labels: Ground truth labels for evaluation
            documents: Optional documents to use instead of retrieving
            params: Parameters to pass to the pipeline during evaluation
            sas_model_name_or_path: Model for semantic answer similarity
            sas_batch_size: Batch size for SAS model
            sas_use_gpu: Use GPU for SAS evaluation
            add_isolated_node_eval: Include individual node evaluation
            custom_document_id_field: Custom field for document identification
            context_matching_min_length: Minimum context length for matching
            context_matching_boost_split_overlaps: Boost overlapping splits
            context_matching_threshold: Threshold for context matching

        Returns:
            EvaluationResult containing metrics and analysis
        """

    def eval_batch(
        self,
        labels: List[MultiLabel],
        documents: Optional[List[List[Document]]] = None,
        params: Optional[dict] = None,
        sas_model_name_or_path: Optional[str] = None,
        sas_batch_size: int = 32,
        sas_use_gpu: bool = True,
        add_isolated_node_eval: bool = False,
        custom_document_id_field: Optional[str] = None,
        context_matching_min_length: int = 100,
        context_matching_boost_split_overlaps: bool = True,
        context_matching_threshold: float = 65.0,
    ) -> EvaluationResult:
        """Batch evaluation version for better performance on large datasets."""

    @classmethod
    def eval_beir(
        cls,
        index_pipeline: Pipeline,
        query_pipeline: Pipeline,
        index_params: Optional[Dict] = None,
        query_params: Optional[Dict] = None,
        dataset: str = "scifact",
        dataset_dir: Path = Path("."),
        num_documents: Optional[int] = None,
        top_k_values: Optional[List[int]] = None,
        keep_index: bool = False,
    ) -> Dict[str, float]:
        """Evaluate pipelines using BEIR benchmark datasets."""
```
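
A minimal sketch of how these arguments combine, assuming `pipeline` and `eval_labels` (a `List[MultiLabel]`) already exist; the `params` dict follows the same node-name convention as `Pipeline.run()`:

```python
# Sketch only: evaluate with per-node parameters and isolated node evaluation.
eval_result = pipeline.eval(
    labels=eval_labels,
    params={"Retriever": {"top_k": 5}},  # same node-name convention as Pipeline.run()
    add_isolated_node_eval=True,         # additionally score each node on gold inputs
)
```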

### EvaluationResult

Container for evaluation metrics and detailed analysis results.

```python { .api }
class EvaluationResult:
    def __init__(self):
        """Container for evaluation metrics and results."""
        self.retriever_metrics: Dict[str, float] = {}
        self.reader_metrics: Dict[str, float] = {}
        self.pipeline_metrics: Dict[str, float] = {}

    def calculate_metrics(self) -> Dict[str, float]:
        """Calculate and return all evaluation metrics."""

    def to_dict(self) -> Dict[str, Any]:
        """Convert evaluation result to dictionary format."""

    def save_to_file(self, file_path: str) -> None:
        """Save evaluation results to file."""
```
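
A short sketch of how the container is typically inspected, using only the interface documented above (`eval_result` is assumed to come from `pipeline.eval()`):

```python
# Sketch: inspecting an EvaluationResult via the interface documented above.
metrics = eval_result.calculate_metrics()            # aggregate metrics, e.g. f1 / exact_match
print("Retriever metrics:", eval_result.retriever_metrics)
print("Reader metrics:", eval_result.reader_metrics)

eval_result.save_to_file("eval_run.json")            # persist for later comparison
result_dict = eval_result.to_dict()                  # plain-dict form for custom reporting
```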

### Utility Functions

Development and debugging utilities for working with Haystack components.

```python { .api }
def launch_es(sleep: int = 15, delete_existing: bool = False) -> None:
    """
    Launch Elasticsearch in a Docker container for development.

    Args:
        sleep: Seconds to wait for startup
        delete_existing: Remove existing container first
    """

def launch_opensearch(sleep: int = 15, delete_existing: bool = False) -> None:
    """
    Launch OpenSearch in a Docker container for development.

    Args:
        sleep: Seconds to wait for startup
        delete_existing: Remove existing container first
    """

def print_answers(results: Dict, details: str = "minimal") -> None:
    """
    Print formatted answers from pipeline results.

    Args:
        results: Pipeline output dictionary with 'answers' key
        details: Detail level ("minimal", "medium", "all")
    """

def print_documents(results: Dict, max_text_len: int = 200) -> None:
    """
    Print formatted documents from pipeline results.

    Args:
        results: Pipeline output dictionary with 'documents' key
        max_text_len: Maximum text length to display per document
    """

def print_eval_report(eval_result: EvaluationResult) -> None:
    """
    Print formatted evaluation report.

    Args:
        eval_result: EvaluationResult object to format and print
    """
```
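
`launch_opensearch` mirrors `launch_es` and is not shown in the usage examples below; a minimal sketch of swapping it in during development (assuming a `pipeline` object has already been built):

```python
from haystack.utils import launch_opensearch, print_answers

# Start a local OpenSearch container instead of Elasticsearch;
# a longer sleep gives the container more time to become ready.
launch_opensearch(sleep=30, delete_existing=True)

# `pipeline` is assumed to exist already (see the usage examples below).
results = pipeline.run(query="What is machine learning?")
print_answers(results, details="all")  # "all" prints every available answer field
```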

## Usage Examples

### Basic Pipeline Evaluation

```python
from haystack import Pipeline
from haystack.schema import Label, MultiLabel, Answer, Document
from haystack.pipelines import ExtractiveQAPipeline
from haystack.nodes import BM25Retriever, FARMReader
from haystack.document_stores import InMemoryDocumentStore

# Set up pipeline
doc_store = InMemoryDocumentStore()
# Index the documents referenced by the labels so retrieval has something to find
doc_store.write_documents([
    Document(content="Python is a high-level programming language...")
])
retriever = BM25Retriever(document_store=doc_store)
reader = FARMReader("deepset/roberta-base-squad2")
pipeline = ExtractiveQAPipeline(reader=reader, retriever=retriever)

# Create evaluation labels
labels = [
    MultiLabel(
        labels=[
            Label(
                query="What is Python?",
                answer=Answer(answer="Python is a programming language"),
                document=Document(content="Python is a high-level programming language..."),
                is_correct_answer=True,
                is_correct_document=True,
                origin="gold-label"
            )
        ]
    )
]

# Evaluate pipeline
eval_result = pipeline.eval(
    labels=labels,
    sas_model_name_or_path="sentence-transformers/all-MiniLM-L6-v2",
    add_isolated_node_eval=True
)

# Print results
from haystack.pipelines.utils import print_eval_report
print_eval_report(eval_result)
```

### Batch Evaluation for Large Datasets

```python
# Load large evaluation dataset
import json
from haystack.schema import MultiLabel

with open("large_eval_dataset.json", "r") as f:
    eval_data = json.load(f)

# Convert to MultiLabel format
labels = []
for item in eval_data:
    # Convert your data format to MultiLabel objects (one possible converter is sketched below)
    label = create_multilabel_from_data(item)  # Your conversion function
    labels.append(label)

# Batch evaluation for better performance
eval_result = pipeline.eval_batch(
    labels=labels,
    sas_model_name_or_path="sentence-transformers/all-MiniLM-L6-v2",
    sas_batch_size=64,
    sas_use_gpu=True
)

# Save results
eval_result.save_to_file("evaluation_results.json")
print(f"Overall F1: {eval_result.pipeline_metrics.get('f1', 'N/A')}")
print(f"Exact Match: {eval_result.pipeline_metrics.get('exact_match', 'N/A')}")
```
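
The snippet above leaves `create_multilabel_from_data` to you. A minimal sketch of such a converter, assuming (hypothetically) that each record carries `question`, `answer`, and `context` keys; adjust the key names to your dataset:

```python
from haystack.schema import Answer, Document, Label, MultiLabel

def create_multilabel_from_data(item: dict) -> MultiLabel:
    """Hypothetical converter: map one JSON record to a MultiLabel."""
    label = Label(
        query=item["question"],
        answer=Answer(answer=item["answer"]),
        document=Document(content=item["context"]),
        is_correct_answer=True,
        is_correct_document=True,
        origin="gold-label",
    )
    return MultiLabel(labels=[label])
```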

### BEIR Benchmark Evaluation

```python
from haystack import Pipeline
from pathlib import Path

# Evaluate using the BEIR benchmark
# (indexing_pipeline and query_pipeline are built beforehand; see the sketch below)
beir_results = Pipeline.eval_beir(
    index_pipeline=indexing_pipeline,
    query_pipeline=query_pipeline,
    dataset="scifact",  # BEIR dataset name
    dataset_dir=Path("./beir_data"),
    top_k_values=[1, 5, 10],
    num_documents=1000,  # Limit for faster testing
    keep_index=False
)

print("BEIR Results:")
for metric, value in beir_results.items():
    print(f"{metric}: {value:.4f}")
```
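
The call above assumes `indexing_pipeline` and `query_pipeline` already exist. One possible wiring for a BM25 baseline, sketched here with an Elasticsearch backend (component names are illustrative):

```python
from haystack import Pipeline
from haystack.document_stores import ElasticsearchDocumentStore
from haystack.nodes import BM25Retriever, PreProcessor, TextConverter

document_store = ElasticsearchDocumentStore()
retriever = BM25Retriever(document_store=document_store)

# Indexing pipeline: convert raw files, preprocess, then write to the store.
indexing_pipeline = Pipeline()
indexing_pipeline.add_node(component=TextConverter(), name="TextConverter", inputs=["File"])
indexing_pipeline.add_node(component=PreProcessor(), name="PreProcessor", inputs=["TextConverter"])
indexing_pipeline.add_node(component=document_store, name="DocumentStore", inputs=["PreProcessor"])

# Query pipeline: retrieval only, which is what the BEIR metrics score.
query_pipeline = Pipeline()
query_pipeline.add_node(component=retriever, name="Retriever", inputs=["Query"])
```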

### Development Utilities

```python
from haystack.utils import launch_es, print_answers, print_documents

# Launch Elasticsearch for development
launch_es(sleep=20, delete_existing=True)

# Test pipeline and examine outputs
results = pipeline.run(query="What is machine learning?")

# Print formatted answers
print_answers(results, details="medium")

# Print retrieved documents
print_documents(results, max_text_len=300)

# Examine raw results structure
print("Raw results keys:", results.keys())
print("Number of answers:", len(results.get("answers", [])))
print("Number of documents:", len(results.get("documents", [])))
```

### Custom Evaluation Metrics

```python
from haystack.modeling.evaluation.squad import compute_f1, compute_exact

def custom_evaluation(pipeline, test_queries, ground_truth_answers):
    """Custom evaluation function using Haystack's metric functions."""
    f1_scores = []
    em_scores = []

    for query, true_answer in zip(test_queries, ground_truth_answers):
        result = pipeline.run(query=query)
        if result["answers"]:
            predicted_answer = result["answers"][0].answer

            # Use Haystack's evaluation functions
            f1 = compute_f1(true_answer, predicted_answer)
            em = compute_exact(true_answer, predicted_answer)

            f1_scores.append(f1)
            em_scores.append(em)
        else:
            # Count unanswered queries as a miss so the averages stay comparable
            f1_scores.append(0.0)
            em_scores.append(0.0)

    return {
        "average_f1": sum(f1_scores) / len(f1_scores) if f1_scores else 0.0,
        "average_em": sum(em_scores) / len(em_scores) if em_scores else 0.0,
        "total_queries": len(test_queries)
    }

# Run custom evaluation on your own query/answer pairs
test_queries = ["What is Python?"]                   # example data
ground_truth = ["Python is a programming language"]  # example data
custom_results = custom_evaluation(pipeline, test_queries, ground_truth)
print("Custom Evaluation Results:", custom_results)
```

## Types

```python { .api }
from typing import Dict, List, Optional, Any, Union
from pathlib import Path
from haystack.schema import Answer, Document

# Evaluation data structures
class MultiLabel:
    """Container for multiple labels associated with a query."""
    labels: List[Label]

class Label:
    """Individual evaluation label with query, answer, and metadata."""
    query: str
    answer: Answer
    document: Document
    is_correct_answer: bool
    is_correct_document: bool
    origin: str

# Metric calculation results
MetricsDict = Dict[str, Union[float, int, str]]
EvalResults = Dict[str, Any]
```