# Evaluation & Utilities

Evaluation metrics, model evaluation tools, and utility functions for assessing pipeline performance and data processing. Haystack provides built-in evaluation methods through Pipeline classes and various utility functions for development and testing.

## Core Imports

```python
from haystack import Pipeline
from haystack.schema import EvaluationResult, MultiLabel
from haystack.utils import launch_es, launch_opensearch, print_answers, print_documents
from haystack.pipelines.utils import print_eval_report
```

## Capabilities

### Pipeline Evaluation Methods

Built-in evaluation methods available on Pipeline instances for assessing performance on labeled datasets.

```python { .api }
class Pipeline:
    def eval(
        self,
        labels: List[MultiLabel],
        documents: Optional[List[List[Document]]] = None,
        params: Optional[dict] = None,
        sas_model_name_or_path: Optional[str] = None,
        sas_batch_size: int = 32,
        sas_use_gpu: bool = True,
        add_isolated_node_eval: bool = False,
        custom_document_id_field: Optional[str] = None,
        context_matching_min_length: int = 100,
        context_matching_boost_split_overlaps: bool = True,
        context_matching_threshold: float = 65.0,
    ) -> EvaluationResult:
        """
        Evaluate pipeline performance on labeled data.

        Args:
            labels: Ground truth labels for evaluation
            documents: Optional documents to use instead of retrieving
            params: Parameters to pass to pipeline during evaluation
            sas_model_name_or_path: Model for semantic answer similarity
            sas_batch_size: Batch size for SAS model
            sas_use_gpu: Use GPU for SAS evaluation
            add_isolated_node_eval: Include individual node evaluation
            custom_document_id_field: Custom field for document identification
            context_matching_min_length: Minimum context length for matching
            context_matching_boost_split_overlaps: Boost overlapping splits
            context_matching_threshold: Threshold for context matching

        Returns:
            EvaluationResult containing metrics and analysis
        """

    def eval_batch(
        self,
        labels: List[MultiLabel],
        documents: Optional[List[List[Document]]] = None,
        params: Optional[dict] = None,
        sas_model_name_or_path: Optional[str] = None,
        sas_batch_size: int = 32,
        sas_use_gpu: bool = True,
        add_isolated_node_eval: bool = False,
        custom_document_id_field: Optional[str] = None,
        context_matching_min_length: int = 100,
        context_matching_boost_split_overlaps: bool = True,
        context_matching_threshold: float = 65.0,
    ) -> EvaluationResult:
        """Batch evaluation version for better performance on large datasets."""

    @classmethod
    def eval_beir(
        cls,
        index_pipeline: Pipeline,
        query_pipeline: Pipeline,
        index_params: Optional[Dict] = None,
        query_params: Optional[Dict] = None,
        dataset: str = "scifact",
        dataset_dir: Path = Path("."),
        num_documents: Optional[int] = None,
        top_k_values: Optional[List[int]] = None,
        keep_index: bool = False,
    ) -> Dict[str, float]:
        """Evaluate pipelines using BEIR benchmark datasets."""
```
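
The `params` argument takes the same per-node dictionary that `Pipeline.run` accepts, which is handy for pinning `top_k` values while evaluating. A minimal sketch, assuming a `pipeline` and `labels` set up as in the usage examples below, with nodes named "Retriever" and "Reader" (an assumption that must match your pipeline's actual node names):

```python
# Sketch: restrict retrieval/reading depth during evaluation via `params`
# (node names are assumptions; adjust them to your pipeline).
eval_result = pipeline.eval(
    labels=labels,
    params={"Retriever": {"top_k": 5}, "Reader": {"top_k": 3}},
    add_isolated_node_eval=True,  # also score each node on ideal inputs from the labels
)
```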

### EvaluationResult

Container for evaluation metrics and detailed analysis results.

```python { .api }
class EvaluationResult:
    def __init__(self):
        """Container for evaluation metrics and results."""
        self.retriever_metrics: Dict[str, float] = {}
        self.reader_metrics: Dict[str, float] = {}
        self.pipeline_metrics: Dict[str, float] = {}

    def calculate_metrics(self) -> Dict[str, float]:
        """Calculate and return all evaluation metrics."""

    def to_dict(self) -> Dict[str, Any]:
        """Convert evaluation result to dictionary format."""

    def save_to_file(self, file_path: str) -> None:
        """Save evaluation results to file."""
```
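
A short sketch of how these pieces fit together; it follows the interface documented in the block above (attribute and method names are taken from this spec, not verified against any particular Haystack release) and assumes `eval_result` came from `Pipeline.eval`:

```python
# Assumes `eval_result` is an EvaluationResult returned by pipeline.eval(...)
metrics = eval_result.calculate_metrics()        # aggregate metrics as a dict
print("Retriever metrics:", eval_result.retriever_metrics)
print("Reader metrics:", eval_result.reader_metrics)
print("Pipeline metrics:", eval_result.pipeline_metrics)

# Persist or export the full result for later comparison
eval_result.save_to_file("eval_run.json")
result_dict = eval_result.to_dict()              # e.g. for logging or dashboards
```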

### Utility Functions

Development and debugging utilities for working with Haystack components.

```python { .api }
def launch_es(sleep: int = 15, delete_existing: bool = False) -> None:
    """
    Launch Elasticsearch in a Docker container for development.

    Args:
        sleep: Seconds to wait for startup
        delete_existing: Remove existing container first
    """

def launch_opensearch(sleep: int = 15, delete_existing: bool = False) -> None:
    """
    Launch OpenSearch in a Docker container for development.

    Args:
        sleep: Seconds to wait for startup
        delete_existing: Remove existing container first
    """

def print_answers(results: Dict, details: str = "minimal") -> None:
    """
    Print formatted answers from pipeline results.

    Args:
        results: Pipeline output dictionary with 'answers' key
        details: Detail level ("minimal", "medium", "all")
    """

def print_documents(results: Dict, max_text_len: int = 200) -> None:
    """
    Print formatted documents from pipeline results.

    Args:
        results: Pipeline output dictionary with 'documents' key
        max_text_len: Maximum text length to display per document
    """

def print_eval_report(eval_result: EvaluationResult) -> None:
    """
    Print formatted evaluation report.

    Args:
        eval_result: EvaluationResult object to format and print
    """
```
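
`launch_es` and `launch_opensearch` only start a local Docker container; pointing a document store at it is a separate step. A minimal sketch for OpenSearch, assuming Docker is available locally and that the default `OpenSearchDocumentStore` connection settings match the launched container (adjust host, port, and credentials for your environment):

```python
from haystack.utils import launch_opensearch
from haystack.document_stores import OpenSearchDocumentStore

# Start a local OpenSearch container; allow extra startup time on slower machines
launch_opensearch(sleep=30, delete_existing=True)

# Connect a document store to the local instance (defaults assumed here;
# adjust host/port/credentials to match your setup)
document_store = OpenSearchDocumentStore(host="localhost")
```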

## Usage Examples

### Basic Pipeline Evaluation

```python
from haystack import Pipeline
from haystack.schema import MultiLabel, Label, Answer, Document
from haystack.pipelines import ExtractiveQAPipeline
from haystack.nodes import BM25Retriever, FARMReader
from haystack.document_stores import InMemoryDocumentStore

# Set up pipeline
doc_store = InMemoryDocumentStore()
retriever = BM25Retriever(document_store=doc_store)
reader = FARMReader("deepset/roberta-base-squad2")
pipeline = ExtractiveQAPipeline(reader=reader, retriever=retriever)

# Create evaluation labels
labels = [
    MultiLabel(
        labels=[
            Label(
                query="What is Python?",
                answer=Answer(answer="Python is a programming language"),
                document=Document(content="Python is a high-level programming language..."),
                is_correct_answer=True,
                is_correct_document=True,
                origin="gold-label"
            )
        ]
    )
]

# Evaluate pipeline
eval_result = pipeline.eval(
    labels=labels,
    sas_model_name_or_path="sentence-transformers/all-MiniLM-L6-v2",
    add_isolated_node_eval=True
)

# Print results
from haystack.pipelines.utils import print_eval_report
print_eval_report(eval_result)
```

### Batch Evaluation for Large Datasets

```python
# Load large evaluation dataset
import json
from haystack.schema import MultiLabel

with open("large_eval_dataset.json", "r") as f:
    eval_data = json.load(f)

# Convert to MultiLabel format
labels = []
for item in eval_data:
    # Convert your data format to MultiLabel objects
    label = create_multilabel_from_data(item)  # Your conversion function
    labels.append(label)

# Batch evaluation for better performance
eval_result = pipeline.eval_batch(
    labels=labels,
    sas_model_name_or_path="sentence-transformers/all-MiniLM-L6-v2",
    sas_batch_size=64,
    sas_use_gpu=True
)

# Save results
eval_result.save_to_file("evaluation_results.json")
print(f"Overall F1: {eval_result.pipeline_metrics.get('f1', 'N/A')}")
print(f"Exact Match: {eval_result.pipeline_metrics.get('exact_match', 'N/A')}")
```

### BEIR Benchmark Evaluation

```python
from haystack import Pipeline
from pathlib import Path

# Evaluate using BEIR benchmark
beir_results = Pipeline.eval_beir(
    index_pipeline=indexing_pipeline,
    query_pipeline=query_pipeline,
    dataset="scifact",  # BEIR dataset name
    dataset_dir=Path("./beir_data"),
    top_k_values=[1, 5, 10],
    num_documents=1000,  # Limit for faster testing
    keep_index=False
)

print("BEIR Results:")
for metric, value in beir_results.items():
    print(f"{metric}: {value:.4f}")
```

### Development Utilities

```python
from haystack.utils import launch_es, print_answers, print_documents

# Launch Elasticsearch for development
launch_es(sleep=20, delete_existing=True)

# Test pipeline and examine outputs
results = pipeline.run(query="What is machine learning?")

# Print formatted answers
print_answers(results, details="medium")

# Print retrieved documents
print_documents(results, max_text_len=300)

# Examine raw results structure
print("Raw results keys:", results.keys())
print("Number of answers:", len(results.get("answers", [])))
print("Number of documents:", len(results.get("documents", [])))
```

### Custom Evaluation Metrics

```python
from haystack.modeling.evaluation.squad import compute_f1, compute_exact

def custom_evaluation(pipeline, test_queries, ground_truth_answers):
    """Custom evaluation function using Haystack's metric functions."""
    f1_scores = []
    em_scores = []

    for query, true_answer in zip(test_queries, ground_truth_answers):
        result = pipeline.run(query=query)
        if result["answers"]:
            predicted_answer = result["answers"][0].answer

            # Use Haystack's evaluation functions
            f1 = compute_f1(true_answer, predicted_answer)
            em = compute_exact(true_answer, predicted_answer)
        else:
            # Score unanswered queries as 0 so the averages cover every query
            f1, em = 0.0, 0.0

        f1_scores.append(f1)
        em_scores.append(em)

    return {
        "average_f1": sum(f1_scores) / len(f1_scores),
        "average_em": sum(em_scores) / len(em_scores),
        "total_queries": len(test_queries)
    }

# Run custom evaluation
custom_results = custom_evaluation(pipeline, test_queries, ground_truth)
print("Custom Evaluation Results:", custom_results)
```

## Types

```python { .api }
from typing import Dict, List, Optional, Any, Union
from pathlib import Path

# Evaluation data structures
class MultiLabel:
    """Container for multiple labels associated with a query."""
    labels: List[Label]

class Label:
    """Individual evaluation label with query, answer, and metadata."""
    query: str
    answer: Answer
    document: Document
    is_correct_answer: bool
    is_correct_document: bool
    origin: str

# Metric calculation results
MetricsDict = Dict[str, Union[float, int, str]]
EvalResults = Dict[str, Any]
```
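
As one possible shape for the conversion step used in the batch-evaluation example, the hypothetical helper below turns simple dict records (with illustrative `question`, `answer`, and `context` keys, not a required format) into the `MultiLabel` objects that `eval` and `eval_batch` expect, using the fields listed above:

```python
from haystack.schema import MultiLabel, Label, Answer, Document

def multilabel_from_record(record: dict) -> MultiLabel:
    """Hypothetical converter: one raw record -> one MultiLabel."""
    return MultiLabel(
        labels=[
            Label(
                query=record["question"],
                answer=Answer(answer=record["answer"]),
                document=Document(content=record["context"]),
                is_correct_answer=True,
                is_correct_document=True,
                origin="gold-label",
            )
        ]
    )

labels = [
    multilabel_from_record(
        {
            "question": "What is Python?",
            "answer": "Python is a programming language",
            "context": "Python is a high-level programming language...",
        }
    )
]
```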