# Evaluation & Utilities

Evaluation metrics, model evaluation tools, and utility functions for assessing pipeline performance. Haystack provides built-in evaluation methods on its Pipeline classes, along with utility functions for development and testing.

## Core Imports

```python
from haystack import Pipeline
from haystack.schema import EvaluationResult, MultiLabel
from haystack.utils import launch_es, launch_opensearch, print_answers, print_documents
from haystack.pipelines.utils import print_eval_report
```

## Capabilities

### Pipeline Evaluation Methods

Built-in evaluation methods available on Pipeline instances for assessing performance on labeled datasets.

```python { .api }
class Pipeline:
    def eval(
        self,
        labels: List[MultiLabel],
        documents: Optional[List[List[Document]]] = None,
        params: Optional[dict] = None,
        sas_model_name_or_path: Optional[str] = None,
        sas_batch_size: int = 32,
        sas_use_gpu: bool = True,
        add_isolated_node_eval: bool = False,
        custom_document_id_field: Optional[str] = None,
        context_matching_min_length: int = 100,
        context_matching_boost_split_overlaps: bool = True,
        context_matching_threshold: float = 65.0,
    ) -> EvaluationResult:
        """
        Evaluate pipeline performance on labeled data.

        Args:
            labels: Ground truth labels for evaluation
            documents: Optional documents to use instead of retrieving
            params: Parameters to pass to the pipeline during evaluation
            sas_model_name_or_path: Model for semantic answer similarity
            sas_batch_size: Batch size for SAS model
            sas_use_gpu: Use GPU for SAS evaluation
            add_isolated_node_eval: Include individual node evaluation
            custom_document_id_field: Custom field for document identification
            context_matching_min_length: Minimum context length for matching
            context_matching_boost_split_overlaps: Boost overlapping splits
            context_matching_threshold: Threshold for context matching

        Returns:
            EvaluationResult containing metrics and analysis
        """

    def eval_batch(
        self,
        labels: List[MultiLabel],
        documents: Optional[List[List[Document]]] = None,
        params: Optional[dict] = None,
        sas_model_name_or_path: Optional[str] = None,
        sas_batch_size: int = 32,
        sas_use_gpu: bool = True,
        add_isolated_node_eval: bool = False,
        custom_document_id_field: Optional[str] = None,
        context_matching_min_length: int = 100,
        context_matching_boost_split_overlaps: bool = True,
        context_matching_threshold: float = 65.0,
    ) -> EvaluationResult:
        """Batch evaluation version for better performance on large datasets."""

    @classmethod
    def eval_beir(
        cls,
        index_pipeline: Pipeline,
        query_pipeline: Pipeline,
        index_params: Optional[Dict] = None,
        query_params: Optional[Dict] = None,
        dataset: str = "scifact",
        dataset_dir: Path = Path("."),
        num_documents: Optional[int] = None,
        top_k_values: Optional[List[int]] = None,
        keep_index: bool = False,
    ) -> Dict[str, float]:
        """Evaluate pipelines using BEIR benchmark datasets."""
```
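
A minimal sketch of how these arguments combine, assuming `pipeline` and `eval_labels` (a `List[MultiLabel]`) already exist; the `params` dict follows the same node-name convention as `Pipeline.run()`:

```python
# Sketch only: evaluate with per-node parameters and isolated node evaluation.
eval_result = pipeline.eval(
    labels=eval_labels,
    params={"Retriever": {"top_k": 5}},  # same node-name convention as Pipeline.run()
    add_isolated_node_eval=True,         # additionally score each node on gold inputs
)
```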

### EvaluationResult

Container for evaluation metrics and detailed analysis results.

```python { .api }
class EvaluationResult:
    def __init__(self):
        """Container for evaluation metrics and results."""
        self.retriever_metrics: Dict[str, float] = {}
        self.reader_metrics: Dict[str, float] = {}
        self.pipeline_metrics: Dict[str, float] = {}

    def calculate_metrics(self) -> Dict[str, float]:
        """Calculate and return all evaluation metrics."""

    def to_dict(self) -> Dict[str, Any]:
        """Convert evaluation result to dictionary format."""

    def save_to_file(self, file_path: str) -> None:
        """Save evaluation results to file."""
```
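
A short sketch of how the container is typically inspected, using only the interface documented above (`eval_result` is assumed to come from `pipeline.eval()`):

```python
# Sketch: inspecting an EvaluationResult via the interface documented above.
metrics = eval_result.calculate_metrics()            # aggregate metrics, e.g. f1 / exact_match
print("Retriever metrics:", eval_result.retriever_metrics)
print("Reader metrics:", eval_result.reader_metrics)

eval_result.save_to_file("eval_run.json")            # persist for later comparison
result_dict = eval_result.to_dict()                  # plain-dict form for custom reporting
```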

### Utility Functions

Development and debugging utilities for working with Haystack components.

```python { .api }
def launch_es(sleep: int = 15, delete_existing: bool = False) -> None:
    """
    Launch Elasticsearch in a Docker container for development.

    Args:
        sleep: Seconds to wait for startup
        delete_existing: Remove existing container first
    """

def launch_opensearch(sleep: int = 15, delete_existing: bool = False) -> None:
    """
    Launch OpenSearch in a Docker container for development.

    Args:
        sleep: Seconds to wait for startup
        delete_existing: Remove existing container first
    """

def print_answers(results: Dict, details: str = "minimal") -> None:
    """
    Print formatted answers from pipeline results.

    Args:
        results: Pipeline output dictionary with 'answers' key
        details: Detail level ("minimal", "medium", "all")
    """

def print_documents(results: Dict, max_text_len: int = 200) -> None:
    """
    Print formatted documents from pipeline results.

    Args:
        results: Pipeline output dictionary with 'documents' key
        max_text_len: Maximum text length to display per document
    """

def print_eval_report(eval_result: EvaluationResult) -> None:
    """
    Print formatted evaluation report.

    Args:
        eval_result: EvaluationResult object to format and print
    """
```
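
`launch_opensearch` mirrors `launch_es` and is not shown in the usage examples below; a minimal sketch of swapping it in during development (assuming a `pipeline` object has already been built):

```python
from haystack.utils import launch_opensearch, print_answers

# Start a local OpenSearch container instead of Elasticsearch;
# a longer sleep gives the container more time to become ready.
launch_opensearch(sleep=30, delete_existing=True)

# `pipeline` is assumed to exist already (see the usage examples below).
results = pipeline.run(query="What is machine learning?")
print_answers(results, details="all")  # "all" prints every available answer field
```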

## Usage Examples

### Basic Pipeline Evaluation

```python
from haystack import Pipeline
from haystack.schema import Label, MultiLabel, Answer, Document
from haystack.pipelines import ExtractiveQAPipeline
from haystack.nodes import BM25Retriever, FARMReader
from haystack.document_stores import InMemoryDocumentStore

# Set up pipeline
doc_store = InMemoryDocumentStore()
# Index the documents referenced by the labels so retrieval has something to find
doc_store.write_documents([
    Document(content="Python is a high-level programming language...")
])
retriever = BM25Retriever(document_store=doc_store)
reader = FARMReader("deepset/roberta-base-squad2")
pipeline = ExtractiveQAPipeline(reader=reader, retriever=retriever)

# Create evaluation labels
labels = [
    MultiLabel(
        labels=[
            Label(
                query="What is Python?",
                answer=Answer(answer="Python is a programming language"),
                document=Document(content="Python is a high-level programming language..."),
                is_correct_answer=True,
                is_correct_document=True,
                origin="gold-label"
            )
        ]
    )
]

# Evaluate pipeline
eval_result = pipeline.eval(
    labels=labels,
    sas_model_name_or_path="sentence-transformers/all-MiniLM-L6-v2",
    add_isolated_node_eval=True
)

# Print results
from haystack.pipelines.utils import print_eval_report
print_eval_report(eval_result)
```

### Batch Evaluation for Large Datasets

```python
# Load large evaluation dataset
import json
from haystack.schema import MultiLabel

with open("large_eval_dataset.json", "r") as f:
    eval_data = json.load(f)

# Convert to MultiLabel format
labels = []
for item in eval_data:
    # Convert your data format to MultiLabel objects (one possible converter is sketched below)
    label = create_multilabel_from_data(item)  # Your conversion function
    labels.append(label)

# Batch evaluation for better performance
eval_result = pipeline.eval_batch(
    labels=labels,
    sas_model_name_or_path="sentence-transformers/all-MiniLM-L6-v2",
    sas_batch_size=64,
    sas_use_gpu=True
)

# Save results
eval_result.save_to_file("evaluation_results.json")
print(f"Overall F1: {eval_result.pipeline_metrics.get('f1', 'N/A')}")
print(f"Exact Match: {eval_result.pipeline_metrics.get('exact_match', 'N/A')}")
```
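
The snippet above leaves `create_multilabel_from_data` to you. A minimal sketch of such a converter, assuming (hypothetically) that each record carries `question`, `answer`, and `context` keys; adjust the key names to your dataset:

```python
from haystack.schema import Answer, Document, Label, MultiLabel

def create_multilabel_from_data(item: dict) -> MultiLabel:
    """Hypothetical converter: map one JSON record to a MultiLabel."""
    label = Label(
        query=item["question"],
        answer=Answer(answer=item["answer"]),
        document=Document(content=item["context"]),
        is_correct_answer=True,
        is_correct_document=True,
        origin="gold-label",
    )
    return MultiLabel(labels=[label])
```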

### BEIR Benchmark Evaluation

```python
from haystack import Pipeline
from pathlib import Path

# Evaluate using the BEIR benchmark
# (indexing_pipeline and query_pipeline are built beforehand; see the sketch below)
beir_results = Pipeline.eval_beir(
    index_pipeline=indexing_pipeline,
    query_pipeline=query_pipeline,
    dataset="scifact",  # BEIR dataset name
    dataset_dir=Path("./beir_data"),
    top_k_values=[1, 5, 10],
    num_documents=1000,  # Limit for faster testing
    keep_index=False
)

print("BEIR Results:")
for metric, value in beir_results.items():
    print(f"{metric}: {value:.4f}")
```
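
The call above assumes `indexing_pipeline` and `query_pipeline` already exist. One possible wiring for a BM25 baseline, sketched here with an Elasticsearch backend (component names are illustrative):

```python
from haystack import Pipeline
from haystack.document_stores import ElasticsearchDocumentStore
from haystack.nodes import BM25Retriever, PreProcessor, TextConverter

document_store = ElasticsearchDocumentStore()
retriever = BM25Retriever(document_store=document_store)

# Indexing pipeline: convert raw files, preprocess, then write to the store.
indexing_pipeline = Pipeline()
indexing_pipeline.add_node(component=TextConverter(), name="TextConverter", inputs=["File"])
indexing_pipeline.add_node(component=PreProcessor(), name="PreProcessor", inputs=["TextConverter"])
indexing_pipeline.add_node(component=document_store, name="DocumentStore", inputs=["PreProcessor"])

# Query pipeline: retrieval only, which is what the BEIR metrics score.
query_pipeline = Pipeline()
query_pipeline.add_node(component=retriever, name="Retriever", inputs=["Query"])
```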

### Development Utilities

```python
from haystack.utils import launch_es, print_answers, print_documents

# Launch Elasticsearch for development
launch_es(sleep=20, delete_existing=True)

# Test pipeline and examine outputs
results = pipeline.run(query="What is machine learning?")

# Print formatted answers
print_answers(results, details="medium")

# Print retrieved documents
print_documents(results, max_text_len=300)

# Examine raw results structure
print("Raw results keys:", results.keys())
print("Number of answers:", len(results.get("answers", [])))
print("Number of documents:", len(results.get("documents", [])))
```

### Custom Evaluation Metrics

```python
from haystack.modeling.evaluation.squad import compute_f1, compute_exact

def custom_evaluation(pipeline, test_queries, ground_truth_answers):
    """Custom evaluation function using Haystack's metric functions."""
    f1_scores = []
    em_scores = []

    for query, true_answer in zip(test_queries, ground_truth_answers):
        result = pipeline.run(query=query)
        if result["answers"]:
            predicted_answer = result["answers"][0].answer

            # Use Haystack's evaluation functions
            f1 = compute_f1(true_answer, predicted_answer)
            em = compute_exact(true_answer, predicted_answer)

            f1_scores.append(f1)
            em_scores.append(em)
        else:
            # Count unanswered queries as a miss so the averages stay comparable
            f1_scores.append(0.0)
            em_scores.append(0.0)

    return {
        "average_f1": sum(f1_scores) / len(f1_scores) if f1_scores else 0.0,
        "average_em": sum(em_scores) / len(em_scores) if em_scores else 0.0,
        "total_queries": len(test_queries)
    }

# Run custom evaluation on your own query/answer pairs
test_queries = ["What is Python?"]                   # example data
ground_truth = ["Python is a programming language"]  # example data
custom_results = custom_evaluation(pipeline, test_queries, ground_truth)
print("Custom Evaluation Results:", custom_results)
```

## Types

```python { .api }
from typing import Dict, List, Optional, Any, Union
from pathlib import Path
from haystack.schema import Answer, Document

# Evaluation data structures
class MultiLabel:
    """Container for multiple labels associated with a query."""
    labels: List[Label]

class Label:
    """Individual evaluation label with query, answer, and metadata."""
    query: str
    answer: Answer
    document: Document
    is_correct_answer: bool
    is_correct_document: bool
    origin: str

# Metric calculation results
MetricsDict = Dict[str, Union[float, int, str]]
EvalResults = Dict[str, Any]
```