# Evaluation

Metrics and evaluation components for assessing pipeline performance, answer quality, and retrieval effectiveness. Haystack provides evaluation tools that cover LLM-based quality judgments (context relevance, faithfulness, custom criteria), exact and semantic answer comparison, and standard information retrieval metrics (MAP, MRR, NDCG, recall).

## Capabilities

### Context Relevance Evaluation

Evaluate how relevant retrieved contexts are to given questions using LLM-based assessment.

```python { .api }
class ContextRelevanceEvaluator:
    def __init__(
        self,
        api_key: Secret,
        model: str = "gpt-3.5-turbo",
        instructions: Optional[str] = None,
        inputs: Optional[List[str]] = None,
        outputs: Optional[List[str]] = None,
        examples: Optional[List[Dict[str, str]]] = None,
        api: Literal["openai", "azure"] = "openai",
        azure_endpoint: Optional[str] = None,
        azure_deployment: Optional[str] = None,
        api_version: Optional[str] = None
    ) -> None:
        """
        Initialize context relevance evaluator.

        Args:
            api_key: API key for the LLM service
            model: Model name to use for evaluation
            instructions: Custom evaluation instructions
            inputs: Input field names
            outputs: Output field names
            examples: Few-shot examples for evaluation
            api: API service to use (openai or azure)
            azure_endpoint: Azure OpenAI endpoint
            azure_deployment: Azure OpenAI deployment name
            api_version: Azure OpenAI API version
        """

    def run(
        self,
        questions: List[str],
        contexts: List[List[str]]
    ) -> Dict[str, List[float]]:
        """
        Evaluate context relevance for question-context pairs.

        Args:
            questions: List of questions
            contexts: List of context lists, one per question

        Returns:
            Dictionary with 'individual_scores' containing relevance scores (0-1)
        """
```

### Faithfulness Evaluation

Assess whether generated answers are faithful to the provided context and don't contain hallucinations.

```python { .api }
class FaithfulnessEvaluator:
    def __init__(
        self,
        api_key: Secret,
        model: str = "gpt-3.5-turbo",
        instructions: Optional[str] = None,
        inputs: Optional[List[str]] = None,
        outputs: Optional[List[str]] = None,
        examples: Optional[List[Dict[str, str]]] = None,
        api: Literal["openai", "azure"] = "openai",
        azure_endpoint: Optional[str] = None,
        azure_deployment: Optional[str] = None,
        api_version: Optional[str] = None
    ) -> None:
        """Initialize faithfulness evaluator."""

    def run(
        self,
        questions: List[str],
        contexts: List[List[str]],
        responses: List[str]
    ) -> Dict[str, List[float]]:
        """
        Evaluate faithfulness of responses to contexts.

        Args:
            questions: List of questions
            contexts: List of context lists, one per question
            responses: List of generated responses

        Returns:
            Dictionary with 'individual_scores' containing faithfulness scores (0-1)
        """
```

### Answer Exact Match Evaluation

Compare generated answers with reference answers using exact string matching.

```python { .api }
class AnswerExactMatchEvaluator:
    def __init__(
        self,
        ignore_case: bool = False,
        ignore_punctuation: bool = False,
        ignore_whitespace: bool = False,
        regex_pattern: Optional[str] = None
    ) -> None:
        """
        Initialize exact match evaluator.

        Args:
            ignore_case: Whether to ignore case differences
            ignore_punctuation: Whether to ignore punctuation differences
            ignore_whitespace: Whether to ignore whitespace differences
            regex_pattern: Optional regex pattern for custom matching
        """

    def run(
        self,
        expected_answers: List[List[str]],
        predicted_answers: List[str]
    ) -> Dict[str, List[int]]:
        """
        Evaluate exact match between predicted and expected answers.

        Args:
            expected_answers: List of expected answer lists
            predicted_answers: List of predicted answers

        Returns:
            Dictionary with 'individual_scores' containing match scores (0 or 1)
        """
```
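
A minimal usage sketch based on the signature above; it assumes a prediction scores 1 when it equals any of the expected answers for that question after the configured normalization:

```python
from haystack.components.evaluators import AnswerExactMatchEvaluator

# Case-insensitive matching; other flags follow the signature above
exact_match = AnswerExactMatchEvaluator(ignore_case=True)

result = exact_match.run(
    expected_answers=[["Paris"], ["Berlin"]],
    predicted_answers=["paris", "Munich"]
)

# One 0/1 score per predicted answer, e.g. [1, 0]
print(result["individual_scores"])
```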

### Document Retrieval Evaluation

Evaluate retrieval performance using information retrieval metrics.

```python { .api }
class DocumentMAPEvaluator:
    def __init__(
        self,
        mode: Literal["individual", "average"] = "individual"
    ) -> None:
        """
        Initialize Mean Average Precision evaluator.

        Args:
            mode: Whether to return individual scores or average
        """

    def run(
        self,
        ground_truth_documents: List[List[str]],
        retrieved_documents: List[List[str]]
    ) -> Dict[str, Union[List[float], float]]:
        """
        Calculate Mean Average Precision for retrieval results.

        Args:
            ground_truth_documents: List of relevant document ID lists
            retrieved_documents: List of retrieved document ID lists

        Returns:
            Dictionary with MAP scores
        """

class DocumentMRREvaluator:
    def __init__(
        self,
        mode: Literal["individual", "average"] = "individual"
    ) -> None:
        """Initialize Mean Reciprocal Rank evaluator."""

    def run(
        self,
        ground_truth_documents: List[List[str]],
        retrieved_documents: List[List[str]]
    ) -> Dict[str, Union[List[float], float]]:
        """Calculate Mean Reciprocal Rank for retrieval results."""

class DocumentNDCGEvaluator:
    def __init__(
        self,
        mode: Literal["individual", "average"] = "individual",
        normalize: bool = True,
        k: Optional[int] = None
    ) -> None:
        """
        Initialize Normalized Discounted Cumulative Gain evaluator.

        Args:
            mode: Whether to return individual scores or average
            normalize: Whether to normalize NDCG scores
            k: Cut-off rank for NDCG@k calculation
        """

    def run(
        self,
        ground_truth_documents: List[List[str]],
        retrieved_documents: List[List[str]],
        relevance_scores: Optional[List[List[float]]] = None
    ) -> Dict[str, Union[List[float], float]]:
        """
        Calculate NDCG for retrieval results.

        Args:
            ground_truth_documents: List of relevant document ID lists
            retrieved_documents: List of retrieved document ID lists
            relevance_scores: Optional relevance scores for documents

        Returns:
            Dictionary with NDCG scores
        """

class DocumentRecallEvaluator:
    def __init__(
        self,
        mode: Literal["individual", "average"] = "individual"
    ) -> None:
        """Initialize document recall evaluator."""

    def run(
        self,
        ground_truth_documents: List[List[str]],
        retrieved_documents: List[List[str]]
    ) -> Dict[str, Union[List[float], float]]:
        """Calculate recall for retrieval results."""
```
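
`DocumentMRREvaluator` follows the same call pattern as the MAP and recall evaluators above. A minimal sketch, assuming document IDs are compared as plain strings and scores are returned under `individual_scores` as with the other retrieval evaluators:

```python
from haystack.components.evaluators import DocumentMRREvaluator

mrr_evaluator = DocumentMRREvaluator(mode="individual")

# One list of relevant IDs and one list of retrieved IDs per query
mrr_result = mrr_evaluator.run(
    ground_truth_documents=[["doc_1", "doc_3"], ["doc_2"]],
    retrieved_documents=[["doc_4", "doc_1", "doc_3"], ["doc_2", "doc_5"]]
)

# Reciprocal rank of the first relevant document per query, e.g. [0.5, 1.0]
print(mrr_result["individual_scores"])
```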

### Semantic Answer Similarity Evaluation

Evaluate semantic similarity between generated and reference answers.

```python { .api }
class SASEvaluator:
    def __init__(
        self,
        model: str = "sentence-transformers/all-MiniLM-L6-v2",
        device: Optional[str] = None,
        token: Optional[Secret] = None,
        similarity_threshold: float = 0.8
    ) -> None:
        """
        Initialize Semantic Answer Similarity evaluator.

        Args:
            model: Sentence transformer model for embeddings
            device: Device to run the model on
            token: HuggingFace token for private models
            similarity_threshold: Threshold for binary classification
        """

    def run(
        self,
        predicted_answers: List[str],
        ground_truth_answers: List[List[str]]
    ) -> Dict[str, List[float]]:
        """
        Calculate semantic similarity between answers.

        Args:
            predicted_answers: List of predicted answers
            ground_truth_answers: List of reference answer lists

        Returns:
            Dictionary with similarity scores
        """
```
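
A minimal standalone sketch based on the signature above; it assumes the default sentence-transformers model can be downloaded in your environment and that scores are returned under `individual_scores`, matching the pattern of the other evaluators (some versions may also require a `warm_up()` call before the first `run`):

```python
from haystack.components.evaluators import SASEvaluator

sas_evaluator = SASEvaluator(
    model="sentence-transformers/all-MiniLM-L6-v2",
    similarity_threshold=0.8
)

sas_result = sas_evaluator.run(
    predicted_answers=["Python is a high-level programming language."],
    ground_truth_answers=[["Python is a programming language."]]
)

# One similarity score per predicted answer
print(sas_result["individual_scores"])
```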

### LLM-Based Custom Evaluation

Create custom evaluation metrics using language models.

```python { .api }
class LLMEvaluator:
    def __init__(
        self,
        instructions: str,
        inputs: List[str],
        outputs: List[str],
        examples: Optional[List[Dict[str, str]]] = None,
        api_key: Optional[Secret] = None,
        model: str = "gpt-3.5-turbo",
        api: Literal["openai", "azure"] = "openai",
        azure_endpoint: Optional[str] = None,
        azure_deployment: Optional[str] = None,
        api_version: Optional[str] = None,
        raise_on_failure: bool = True
    ) -> None:
        """
        Initialize custom LLM evaluator.

        Args:
            instructions: Evaluation instructions for the LLM
            inputs: List of input field names
            outputs: List of output field names
            examples: Few-shot examples for the evaluator
            api_key: API key for the LLM service
            model: Model name to use
            api: API service to use
            azure_endpoint: Azure OpenAI endpoint
            azure_deployment: Azure deployment name
            api_version: Azure API version
            raise_on_failure: Whether to raise on evaluation failures
        """

    def run(self, **inputs) -> Dict[str, Any]:
        """
        Run custom LLM evaluation.

        Args:
            **inputs: Input values for evaluation

        Returns:
            Dictionary with evaluation results
        """
```

### Evaluation Run Results

Aggregate and manage evaluation results across multiple metrics.

```python { .api }
class EvaluationRunResult:
    def __init__(
        self,
        run_name: str,
        inputs: Dict[str, List[Any]],
        results: Dict[str, Dict[str, Any]]
    ) -> None:
        """
        Initialize evaluation run result.

        Args:
            run_name: Name of the evaluation run
            inputs: Input data used for evaluation
            results: Evaluation results by metric, as returned by each evaluator's run()
        """

    def score_report(self) -> Dict[str, float]:
        """
        Generate aggregate score report.

        Returns:
            Dictionary with average scores by metric
        """

    def comparative_individual_scores_report(
        self,
        other_result: "EvaluationRunResult"
    ) -> Dict[str, Dict[str, List[float]]]:
        """
        Compare individual scores with another evaluation result.

        Args:
            other_result: Another evaluation result to compare with

        Returns:
            Comparative score report
        """

    def to_pandas(self) -> "DataFrame":
        """Convert results to pandas DataFrame."""

    def to_csv(self, csv_path: str) -> None:
        """Export results to CSV file."""
```
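
A minimal reporting sketch; the run name and the shapes of `inputs` and `results` here are illustrative, with `results` holding each evaluator's `run()` output keyed by metric name (see the comprehensive example below):

```python
from haystack.evaluation import EvaluationRunResult

run_result = EvaluationRunResult(
    run_name="baseline_run",
    inputs={"questions": ["What is Python?"]},
    results={"exact_match": {"individual_scores": [1]}}
)

print(run_result.score_report())       # aggregate score per metric
df = run_result.to_pandas()            # per-sample results as a pandas DataFrame
run_result.to_csv("baseline_run.csv")  # persist for later comparison
```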

## Usage Examples

### Basic Context Relevance Evaluation

```python
from haystack.components.evaluators import ContextRelevanceEvaluator
from haystack.utils import Secret

# Initialize evaluator
context_evaluator = ContextRelevanceEvaluator(
    api_key=Secret.from_env_var("OPENAI_API_KEY"),
    model="gpt-3.5-turbo"
)

# Prepare evaluation data
questions = [
    "What is Python?",
    "How does machine learning work?"
]

contexts = [
    ["Python is a programming language.", "JavaScript is also popular."],
    ["ML uses algorithms to find patterns.", "Python has many libraries."]
]

# Run evaluation
result = context_evaluator.run(
    questions=questions,
    contexts=contexts
)

# Print results
for i, score in enumerate(result["individual_scores"]):
    print(f"Question {i+1} context relevance: {score:.3f}")

# Calculate average
avg_relevance = sum(result["individual_scores"]) / len(result["individual_scores"])
print(f"Average context relevance: {avg_relevance:.3f}")
```

### Faithfulness Evaluation Pipeline

```python
from haystack import Pipeline
from haystack.components.evaluators import FaithfulnessEvaluator
from haystack.utils import Secret

# Create evaluation pipeline
eval_pipeline = Pipeline()

# Add faithfulness evaluator
faithfulness_evaluator = FaithfulnessEvaluator(
    api_key=Secret.from_env_var("OPENAI_API_KEY"),
    model="gpt-4"
)

eval_pipeline.add_component("faithfulness", faithfulness_evaluator)

# Evaluation data
questions = ["What programming language should I learn?"]
contexts = [["Python is beginner-friendly and versatile."]]
responses = ["I recommend learning Python because it's easy to learn and widely used."]

# Run evaluation
result = eval_pipeline.run({
    "faithfulness": {
        "questions": questions,
        "contexts": contexts,
        "responses": responses
    }
})

faithfulness_score = result["faithfulness"]["individual_scores"][0]
print(f"Faithfulness score: {faithfulness_score:.3f}")
```

### Retrieval Performance Evaluation

```python
from haystack.components.evaluators import DocumentMAPEvaluator, DocumentRecallEvaluator

# Initialize retrieval evaluators
map_evaluator = DocumentMAPEvaluator(mode="individual")
recall_evaluator = DocumentRecallEvaluator(mode="individual")

# Ground truth: relevant documents for each query
ground_truth = [
    ["doc_1", "doc_3", "doc_5"],  # Query 1 relevant docs
    ["doc_2", "doc_4"],           # Query 2 relevant docs
    ["doc_1", "doc_2", "doc_6"]   # Query 3 relevant docs
]

# Retrieved documents from system
retrieved = [
    ["doc_1", "doc_2", "doc_3"],  # Query 1 retrieved docs
    ["doc_2", "doc_3", "doc_4"],  # Query 2 retrieved docs
    ["doc_1", "doc_7", "doc_2"]   # Query 3 retrieved docs
]

# Calculate MAP
map_result = map_evaluator.run(
    ground_truth_documents=ground_truth,
    retrieved_documents=retrieved
)

# Calculate Recall
recall_result = recall_evaluator.run(
    ground_truth_documents=ground_truth,
    retrieved_documents=retrieved
)

# Print results
for i, (map_score, recall_score) in enumerate(zip(
    map_result["individual_scores"],
    recall_result["individual_scores"]
)):
    print(f"Query {i+1} - MAP: {map_score:.3f}, Recall: {recall_score:.3f}")
```

### Comprehensive RAG Evaluation

```python
from haystack.evaluation import EvaluationRunResult
from haystack.utils import Secret
from haystack.components.evaluators import (
    ContextRelevanceEvaluator,
    FaithfulnessEvaluator,
    AnswerExactMatchEvaluator,
    SASEvaluator
)

# Initialize all evaluators
evaluators = {
    "context_relevance": ContextRelevanceEvaluator(
        api_key=Secret.from_env_var("OPENAI_API_KEY")
    ),
    "faithfulness": FaithfulnessEvaluator(
        api_key=Secret.from_env_var("OPENAI_API_KEY")
    ),
    "exact_match": AnswerExactMatchEvaluator(ignore_case=True),
    "semantic_similarity": SASEvaluator()
}

# Evaluation dataset
eval_data = {
    "questions": [
        "What is Python?",
        "How does neural network training work?",
        "What are the benefits of cloud computing?"
    ],
    "contexts": [
        ["Python is a high-level programming language known for its simplicity."],
        ["Neural networks learn by adjusting weights through backpropagation."],
        ["Cloud computing provides scalable resources and reduces infrastructure costs."]
    ],
    "generated_answers": [
        "Python is a programming language that is easy to learn and use.",
        "Neural networks are trained using backpropagation to update weights.",
        "Cloud computing offers flexibility and cost savings for businesses."
    ],
    "reference_answers": [
        ["Python is a programming language."],
        ["Neural networks learn through backpropagation."],
        ["Cloud computing provides scalable and cost-effective resources."]
    ]
}

# Run all evaluations
results = {}

# Context relevance
results["context_relevance"] = evaluators["context_relevance"].run(
    questions=eval_data["questions"],
    contexts=eval_data["contexts"]
)

# Faithfulness
results["faithfulness"] = evaluators["faithfulness"].run(
    questions=eval_data["questions"],
    contexts=eval_data["contexts"],
    responses=eval_data["generated_answers"]
)

# Exact match
results["exact_match"] = evaluators["exact_match"].run(
    expected_answers=eval_data["reference_answers"],
    predicted_answers=eval_data["generated_answers"]
)

# Semantic similarity
results["semantic_similarity"] = evaluators["semantic_similarity"].run(
    predicted_answers=eval_data["generated_answers"],
    ground_truth_answers=eval_data["reference_answers"]
)

# Create evaluation result
eval_result = EvaluationRunResult(
    run_name="RAG_System_Evaluation",
    inputs=eval_data,
    results=results
)

# Generate report
score_report = eval_result.score_report()
print("Evaluation Results:")
for metric, score in score_report.items():
    print(f"{metric}: {score:.3f}")

# Export to CSV
eval_result.to_csv("rag_evaluation_results.csv")
```

### Custom LLM Evaluator

```python
from haystack.components.evaluators import LLMEvaluator
from haystack.utils import Secret

# Create custom evaluator for answer completeness
completeness_evaluator = LLMEvaluator(
    instructions="""
    Evaluate how complete the given answer is for the question.
    Consider whether all important aspects are covered.
    Rate on a scale of 1-5 where:
    1 = Very incomplete, major aspects missing
    2 = Incomplete, some important aspects missing
    3 = Moderately complete, minor aspects missing
    4 = Mostly complete, very minor aspects missing
    5 = Very complete, covers all important aspects
    """,
    inputs=["question", "answer"],
    outputs=["completeness_score", "explanation"],
    examples=[
        {
            "question": "What is photosynthesis?",
            "answer": "Photosynthesis is how plants make food.",
            "completeness_score": "2",
            "explanation": "Answer is too brief and misses key details like light, CO2, oxygen production."
        }
    ],
    api_key=Secret.from_env_var("OPENAI_API_KEY"),
    model="gpt-4"
)

# Use custom evaluator
custom_result = completeness_evaluator.run(
    question="How does machine learning work?",
    answer="Machine learning uses algorithms to learn patterns from data and make predictions."
)

print(f"Completeness score: {custom_result['completeness_score']}")
print(f"Explanation: {custom_result['explanation']}")
```

### Comparative Evaluation

```python
# Evaluate two different systems
system_a_results = EvaluationRunResult(
    run_name="System_A",
    inputs=eval_data,
    results=results  # From previous example
)

# Run evaluation for system B (with different answers)
system_b_data = eval_data.copy()
system_b_data["generated_answers"] = [
    "Python is a versatile, high-level programming language.",
    "Neural networks use backpropagation algorithm for training.",
    "Cloud computing delivers computing services over the internet."
]

# ... run evaluations for system B ...
# system_b_results = EvaluationRunResult(...)

# Compare systems
# comparison = system_a_results.comparative_individual_scores_report(system_b_results)
# print("System Comparison:")
# for metric, scores in comparison.items():
#     print(f"{metric}:")
#     print(f"  System A: {scores['System_A']}")
#     print(f"  System B: {scores['System_B']}")
```

### Advanced NDCG Evaluation

```python
from haystack.components.evaluators import DocumentNDCGEvaluator

# Initialize NDCG evaluator with cut-off
ndcg_evaluator = DocumentNDCGEvaluator(
    mode="individual",
    normalize=True,
    k=5  # NDCG@5
)

# Ground truth with relevance scores
ground_truth_docs = [["doc_1", "doc_2", "doc_3", "doc_4"]]
retrieved_docs = [["doc_1", "doc_5", "doc_2", "doc_3", "doc_6"]]

# Optional: provide relevance scores (0-3 scale)
relevance_scores = [[3, 2, 2, 1]]  # Relevance of ground truth docs

# Calculate NDCG
ndcg_result = ndcg_evaluator.run(
    ground_truth_documents=ground_truth_docs,
    retrieved_documents=retrieved_docs,
    relevance_scores=relevance_scores
)

print(f"NDCG@5 score: {ndcg_result['individual_scores'][0]:.3f}")
```

## Types

```python { .api }
from typing import List, Dict, Any, Union, Optional, Literal
from enum import Enum
from haystack.utils import Secret

class EvaluationMode(Enum):
    INDIVIDUAL = "individual"
    AVERAGE = "average"

class MetricType(Enum):
    RELEVANCE = "relevance"
    FAITHFULNESS = "faithfulness"
    SIMILARITY = "similarity"
    RETRIEVAL = "retrieval"
    CUSTOM = "custom"

class EvaluationMetric:
    name: str
    type: MetricType
    score: float
    details: Dict[str, Any]
```