or run

npx @tessl/cli init
Log in

Version

Tile

Overview

Evals

Files

Files

docs

agents-tools.md documents-nodes.md evaluation.md index.md indices.md llms-embeddings.md node-parsers.md postprocessors.md prompts.md query-engines.md retrievers.md settings.md storage.md

docs/evaluation.md

0

# Evaluation

1

2

Comprehensive evaluation capabilities for RAG systems including retrieval metrics, response quality assessment, and dataset generation. The evaluation framework provides tools for measuring system performance, generating synthetic test data, and conducting systematic evaluations of LLM applications.

3

4

## Capabilities

5

6

### Base Evaluation Interfaces

7

8

Foundation interfaces for evaluation operations with standardized result structures and metrics computation.

9

10

```python { .api }

11

class BaseEvaluator:

12

"""

13

Base interface for evaluation implementations.

14

15

Evaluators assess different aspects of system performance including

16

response quality, retrieval accuracy, and adherence to guidelines.

17

"""

18

def __init__(self, **kwargs): ...

19

20

def evaluate(

21

self,

22

query: Optional[str] = None,

23

contexts: Optional[Sequence[str]] = None,

24

response: Optional[str] = None,

25

**kwargs

26

) -> EvaluationResult:

27

"""

28

Evaluate system performance on given inputs.

29

30

Parameters:

31

- query: Optional[str], input query or question

32

- contexts: Optional[Sequence[str]], retrieved context passages

33

- response: Optional[str], system response to evaluate

34

- **kwargs: additional evaluation parameters

35

36

Returns:

37

- EvaluationResult, evaluation result with score and feedback

38

"""

39

40

async def aevaluate(

41

self,

42

query: Optional[str] = None,

43

contexts: Optional[Sequence[str]] = None,

44

response: Optional[str] = None,

45

**kwargs

46

) -> EvaluationResult:

47

"""Asynchronous evaluation method."""

48

49

class EvaluationResult:

50

"""

51

Result of evaluation with score and detailed feedback.

52

53

Parameters:

54

- score: Optional[float], numerical evaluation score

55

- passing: Optional[bool], whether evaluation passed threshold

56

- feedback: Optional[str], detailed feedback and explanation

57

- metadata: Optional[dict], additional evaluation metadata

58

"""

59

def __init__(

60

self,

61

score: Optional[float] = None,

62

passing: Optional[bool] = None,

63

feedback: Optional[str] = None,

64

metadata: Optional[dict] = None,

65

**kwargs

66

): ...

67

68

def __str__(self) -> str:

69

"""String representation of evaluation result."""

70

```

71

72

### Response Quality Evaluators

73

74

Evaluators for assessing the quality, relevance, and accuracy of system responses.

75

76

```python { .api }

77

class FaithfulnessEvaluator(BaseEvaluator):

78

"""

79

Evaluator for response faithfulness to retrieved contexts.

80

81

Assesses whether the response is grounded in and consistent with

82

the provided context information without hallucination.

83

84

Parameters:

85

- llm: Optional[LLM], language model for faithfulness assessment

86

- raise_error: bool, whether to raise errors on evaluation failure

87

- eval_template: Optional[BasePromptTemplate], evaluation prompt template

88

"""

89

def __init__(

90

self,

91

llm: Optional[LLM] = None,

92

raise_error: bool = False,

93

eval_template: Optional[BasePromptTemplate] = None,

94

**kwargs

95

): ...

96

97

class AnswerRelevancyEvaluator(BaseEvaluator):

98

"""

99

Evaluator for answer relevancy to the input query.

100

101

Measures how well the response addresses the specific question asked

102

and whether it provides relevant information.

103

104

Parameters:

105

- llm: Optional[LLM], language model for relevancy assessment

106

- embed_model: Optional[BaseEmbedding], embedding model for similarity

107

- raise_error: bool, whether to raise errors on evaluation failure

108

"""

109

def __init__(

110

self,

111

llm: Optional[LLM] = None,

112

embed_model: Optional[BaseEmbedding] = None,

113

raise_error: bool = False,

114

**kwargs

115

): ...

116

117

class ContextRelevancyEvaluator(BaseEvaluator):

118

"""

119

Evaluator for context relevancy to the query.

120

121

Assesses whether the retrieved context passages are relevant

122

to answering the input query.

123

124

Parameters:

125

- llm: Optional[LLM], language model for relevancy assessment

126

- raise_error: bool, whether to raise errors on evaluation failure

127

"""

128

def __init__(

129

self,

130

llm: Optional[LLM] = None,

131

raise_error: bool = False,

132

**kwargs

133

): ...

134

135

class CorrectnessEvaluator(BaseEvaluator):

136

"""

137

Evaluator for response correctness against reference answers.

138

139

Compares system responses to ground truth answers to assess

140

factual accuracy and completeness.

141

142

Parameters:

143

- llm: Optional[LLM], language model for correctness assessment

144

- raise_error: bool, whether to raise errors on evaluation failure

145

- eval_template: Optional[BasePromptTemplate], evaluation prompt template

146

"""

147

def __init__(

148

self,

149

llm: Optional[LLM] = None,

150

raise_error: bool = False,

151

eval_template: Optional[BasePromptTemplate] = None,

152

**kwargs

153

): ...

154

155

class SemanticSimilarityEvaluator(BaseEvaluator):

156

"""

157

Evaluator for semantic similarity between responses and references.

158

159

Uses embedding models to assess semantic similarity between

160

generated responses and reference answers.

161

162

Parameters:

163

- embed_model: Optional[BaseEmbedding], embedding model for similarity

164

- similarity_threshold: float, threshold for similarity scoring

165

"""

166

def __init__(

167

self,

168

embed_model: Optional[BaseEmbedding] = None,

169

similarity_threshold: float = 0.8,

170

**kwargs

171

): ...

172

173

class RelevancyEvaluator(BaseEvaluator):

174

"""

175

General relevancy evaluator for response-query alignment.

176

177

Provides flexible relevancy assessment with customizable

178

evaluation criteria and scoring methods.

179

180

Parameters:

181

- llm: Optional[LLM], language model for relevancy assessment

182

- raise_error: bool, whether to raise errors on evaluation failure

183

"""

184

def __init__(

185

self,

186

llm: Optional[LLM] = None,

187

raise_error: bool = False,

188

**kwargs

189

): ...

190

```

191

192

### Guideline & Criteria Evaluators

193

194

Evaluators for adherence to specific guidelines, criteria, or custom evaluation frameworks.

195

196

```python { .api }

197

class GuidelineEvaluator(BaseEvaluator):

198

"""

199

Evaluator for adherence to specific guidelines or criteria.

200

201

Assesses whether responses follow specified guidelines,

202

style requirements, or content policies.

203

204

Parameters:

205

- guidelines: str, guidelines text or criteria description

206

- llm: Optional[LLM], language model for guideline assessment

207

- eval_template: Optional[BasePromptTemplate], evaluation prompt template

208

"""

209

def __init__(

210

self,

211

guidelines: str,

212

llm: Optional[LLM] = None,

213

eval_template: Optional[BasePromptTemplate] = None,

214

**kwargs

215

): ...

216

217

class PairwiseComparisonEvaluator(BaseEvaluator):

218

"""

219

Evaluator for pairwise comparison between responses.

220

221

Compares two responses to determine which better satisfies

222

the evaluation criteria or user requirements.

223

224

Parameters:

225

- llm: Optional[LLM], language model for comparison assessment

226

- eval_template: Optional[BasePromptTemplate], comparison prompt template

227

"""

228

def __init__(

229

self,

230

llm: Optional[LLM] = None,

231

eval_template: Optional[BasePromptTemplate] = None,

232

**kwargs

233

): ...

234

235

def evaluate(

236

self,

237

query: str,

238

response_a: str,

239

response_b: str,

240

contexts: Optional[Sequence[str]] = None,

241

**kwargs

242

) -> EvaluationResult:

243

"""

244

Compare two responses for the same query.

245

246

Parameters:

247

- query: str, input query for comparison

248

- response_a: str, first response to compare

249

- response_b: str, second response to compare

250

- contexts: Optional[Sequence[str]], context for evaluation

251

252

Returns:

253

- EvaluationResult, comparison result indicating preferred response

254

"""

255

```

256

257

### Retrieval Evaluation

258

259

Specialized evaluation framework for retrieval system performance and accuracy.

260

261

```python { .api }

262

class BaseRetrievalEvaluator:

263

"""

264

Base interface for retrieval evaluation implementations.

265

266

Retrieval evaluators assess the quality of information retrieval

267

systems including accuracy, coverage, and ranking quality.

268

"""

269

def __init__(self, **kwargs): ...

270

271

def evaluate(

272

self,

273

query: str,

274

expected_ids: List[str],

275

retrieved_ids: List[str],

276

**kwargs

277

) -> RetrievalEvalResult:

278

"""

279

Evaluate retrieval performance.

280

281

Parameters:

282

- query: str, input query

283

- expected_ids: List[str], expected relevant document IDs

284

- retrieved_ids: List[str], actually retrieved document IDs

285

286

Returns:

287

- RetrievalEvalResult, retrieval evaluation result

288

"""

289

290

class RetrievalEvalResult:

291

"""

292

Result of retrieval evaluation with multiple metrics.

293

294

Parameters:

295

- query: str, evaluated query

296

- expected_ids: List[str], expected relevant IDs

297

- retrieved_ids: List[str], retrieved IDs

298

- metric_vals_dict: Dict[str, float], metric name to value mapping

299

"""

300

def __init__(

301

self,

302

query: str,

303

expected_ids: List[str],

304

retrieved_ids: List[str],

305

metric_vals_dict: Dict[str, float],

306

**kwargs

307

): ...

308

309

class RetrieverEvaluator(BaseRetrievalEvaluator):

310

"""

311

Comprehensive retriever evaluation with multiple metrics.

312

313

Parameters:

314

- metrics: Optional[List[BaseMetric]], metrics to compute

315

- retriever: Optional[BaseRetriever], retriever to evaluate

316

- node_postprocessors: Optional[List[BaseNodePostprocessor]], postprocessors

317

"""

318

def __init__(

319

self,

320

metrics: Optional[List[BaseMetric]] = None,

321

retriever: Optional[BaseRetriever] = None,

322

node_postprocessors: Optional[List[BaseNodePostprocessor]] = None,

323

**kwargs

324

): ...

325

326

async def aevaluate_dataset(

327

self,

328

dataset: Any,

329

workers: int = 2,

330

show_progress: bool = False,

331

**kwargs

332

) -> List[RetrievalEvalResult]:

333

"""Asynchronously evaluate entire dataset."""

334

335

class MultiModalRetrieverEvaluator(BaseRetrievalEvaluator):

336

"""

337

Evaluator for multi-modal retrieval systems.

338

339

Assesses retrieval performance for systems handling multiple

340

content modalities including text, images, and other media.

341

342

Parameters:

343

- metrics: Optional[List[BaseMetric]], evaluation metrics

344

- retriever: Optional[BaseRetriever], multi-modal retriever to evaluate

345

"""

346

def __init__(

347

self,

348

metrics: Optional[List[BaseMetric]] = None,

349

retriever: Optional[BaseRetriever] = None,

350

**kwargs

351

): ...

352

```

353

354

### Retrieval Metrics

355

356

Specific metrics for measuring retrieval system performance and quality.

357

358

```python { .api }

359

class RetrievalMetricResult:

360

"""

361

Result container for individual retrieval metrics.

362

363

Parameters:

364

- metric_name: str, name of the computed metric

365

- score: float, metric score value

366

- metadata: Optional[dict], additional metric metadata

367

"""

368

def __init__(

369

self,

370

metric_name: str,

371

score: float,

372

metadata: Optional[dict] = None

373

): ...

374

375

class HitRate:

376

"""

377

Hit rate metric for retrieval evaluation.

378

379

Measures the fraction of queries for which at least one

380

relevant document was retrieved in the top-k results.

381

"""

382

def __init__(self): ...

383

384

def compute(

385

self,

386

query: str,

387

expected_ids: List[str],

388

retrieved_ids: List[str]

389

) -> RetrievalMetricResult:

390

"""

391

Compute hit rate for single query.

392

393

Parameters:

394

- query: str, input query

395

- expected_ids: List[str], relevant document IDs

396

- retrieved_ids: List[str], retrieved document IDs

397

398

Returns:

399

- RetrievalMetricResult, hit rate result (1.0 if hit, 0.0 if miss)

400

"""

401

402

class MRR:

403

"""

404

Mean Reciprocal Rank (MRR) metric for retrieval evaluation.

405

406

Measures the quality of ranking by computing the reciprocal

407

of the rank of the first relevant document retrieved.

408

"""

409

def __init__(self): ...

410

411

def compute(

412

self,

413

query: str,

414

expected_ids: List[str],

415

retrieved_ids: List[str]

416

) -> RetrievalMetricResult:

417

"""

418

Compute MRR for single query.

419

420

Parameters:

421

- query: str, input query

422

- expected_ids: List[str], relevant document IDs

423

- retrieved_ids: List[str], retrieved document IDs

424

425

Returns:

426

- RetrievalMetricResult, MRR score (1/rank of first relevant doc)

427

"""

428

429

def resolve_metrics(metric_names: List[str]) -> List[BaseMetric]:

430

"""

431

Resolve metric instances from metric names.

432

433

Parameters:

434

- metric_names: List[str], names of metrics to resolve

435

436

Returns:

437

- List[BaseMetric], resolved metric instances

438

"""

439

```

440

441

### Dataset Generation

442

443

Tools for generating synthetic evaluation datasets and test cases for system validation.

444

445

```python { .api }

446

class DatasetGenerator:

447

"""

448

Generator for synthetic evaluation datasets.

449

450

Creates question-answer pairs, retrieval test cases, and other

451

evaluation data from source documents and knowledge bases.

452

453

Parameters:

454

- nodes: List[BaseNode], source nodes for dataset generation

455

- llm: Optional[LLM], language model for question generation

456

- num_questions_per_chunk: int, questions to generate per text chunk

457

- text_question_template: Optional[BasePromptTemplate], question generation prompt

458

- text_qa_template: Optional[BasePromptTemplate], QA pair generation prompt

459

"""

460

def __init__(

461

self,

462

nodes: List[BaseNode],

463

llm: Optional[LLM] = None,

464

num_questions_per_chunk: int = 2,

465

text_question_template: Optional[BasePromptTemplate] = None,

466

text_qa_template: Optional[BasePromptTemplate] = None,

467

**kwargs

468

): ...

469

470

def generate_questions_from_nodes(self, num: Optional[int] = None) -> List[str]:

471

"""

472

Generate questions from source nodes.

473

474

Parameters:

475

- num: Optional[int], number of questions to generate

476

477

Returns:

478

- List[str], generated questions

479

"""

480

481

def generate_dataset_from_nodes(self, num: Optional[int] = None) -> "QueryResponseDataset":

482

"""

483

Generate complete QA dataset from nodes.

484

485

Parameters:

486

- num: Optional[int], number of QA pairs to generate

487

488

Returns:

489

- QueryResponseDataset, generated dataset with questions and answers

490

"""

491

492

async def agenerate_questions_from_nodes(self, num: Optional[int] = None) -> List[str]:

493

"""Async version of question generation."""

494

495

async def agenerate_dataset_from_nodes(self, num: Optional[int] = None) -> "QueryResponseDataset":

496

"""Async version of dataset generation."""

497

498

class QueryResponseDataset:

499

"""

500

Dataset container for query-response evaluation pairs.

501

502

Parameters:

503

- queries: List[str], evaluation queries

504

- responses: List[str], expected responses

505

- relevant_docs: Optional[List[List[str]]], relevant document IDs per query

506

"""

507

def __init__(

508

self,

509

queries: List[str],

510

responses: List[str],

511

relevant_docs: Optional[List[List[str]]] = None,

512

**kwargs

513

): ...

514

515

def save_json(self, path: str) -> None:

516

"""Save dataset to JSON file."""

517

518

@classmethod

519

def from_json(cls, path: str) -> "QueryResponseDataset":

520

"""Load dataset from JSON file."""

521

522

def __len__(self) -> int:

523

"""Get dataset size."""

524

525

def __getitem__(self, idx: int) -> Dict[str, Any]:

526

"""Get dataset item by index."""

527

528

class EmbeddingQAFinetuneDataset:

529

"""

530

Dataset for embedding model fine-tuning with query-context pairs.

531

532

Parameters:

533

- queries: List[str], training queries

534

- corpus: List[str], text corpus for contexts

535

- relevant_docs: List[List[str]], relevant document mapping

536

"""

537

def __init__(

538

self,

539

queries: List[str],

540

corpus: List[str],

541

relevant_docs: List[List[str]],

542

**kwargs

543

): ...

544

545

def save_json(self, path: str) -> None:

546

"""Save fine-tuning dataset to JSON file."""

547

548

@classmethod

549

def from_json(cls, path: str) -> "EmbeddingQAFinetuneDataset":

550

"""Load fine-tuning dataset from JSON file."""

551

```

552

553

### Batch Evaluation

554

555

Tools for running large-scale evaluations across multiple queries and systems.

556

557

```python { .api }

558

class BatchEvalRunner:

559

"""

560

Batch evaluation runner for systematic evaluation across datasets.

561

562

Coordinates evaluation of multiple queries, responses, and systems

563

with parallel processing and result aggregation.

564

565

Parameters:

566

- evaluators: Dict[str, BaseEvaluator], named evaluators to run

567

- workers: int, number of worker processes for parallel evaluation

568

- show_progress: bool, whether to show evaluation progress

569

"""

570

def __init__(

571

self,

572

evaluators: Dict[str, BaseEvaluator],

573

workers: int = 2,

574

show_progress: bool = True,

575

**kwargs

576

): ...

577

578

def evaluate_queries(

579

self,

580

queries: List[str],

581

responses: List[str],

582

contexts_list: Optional[List[List[str]]] = None,

583

**kwargs

584

) -> Dict[str, List[EvaluationResult]]:

585

"""

586

Evaluate multiple queries in batch.

587

588

Parameters:

589

- queries: List[str], evaluation queries

590

- responses: List[str], system responses to evaluate

591

- contexts_list: Optional[List[List[str]]], contexts per query

592

593

Returns:

594

- Dict[str, List[EvaluationResult]], evaluator name to results mapping

595

"""

596

597

async def aevaluate_queries(

598

self,

599

queries: List[str],

600

responses: List[str],

601

contexts_list: Optional[List[List[str]]] = None,

602

**kwargs

603

) -> Dict[str, List[EvaluationResult]]:

604

"""Async batch evaluation."""

605

```

606

607

### Utility Functions

608

609

Helper functions for evaluation dataset generation and result processing.

610

611

```python { .api }

612

def generate_qa_embedding_pairs(

613

nodes: List[BaseNode],

614

llm: LLM,

615

qa_generate_prompt_tmpl: str,

616

num_questions_per_chunk: int = 2

617

) -> EmbeddingQAFinetuneDataset:

618

"""

619

Generate QA pairs for embedding fine-tuning.

620

621

Parameters:

622

- nodes: List[BaseNode], source nodes for generation

623

- llm: LLM, language model for question generation

624

- qa_generate_prompt_tmpl: str, prompt template for QA generation

625

- num_questions_per_chunk: int, questions per text chunk

626

627

Returns:

628

- EmbeddingQAFinetuneDataset, generated QA dataset for fine-tuning

629

"""

630

631

def generate_question_context_pairs(

632

nodes: List[BaseNode],

633

llm: LLM,

634

num_questions_per_chunk: int = 2,

635

question_gen_query: str = "Generate questions from context"

636

) -> List[Tuple[str, str]]:

637

"""

638

Generate question-context pairs for evaluation.

639

640

Parameters:

641

- nodes: List[BaseNode], source nodes

642

- llm: LLM, language model for generation

643

- num_questions_per_chunk: int, questions per chunk

644

- question_gen_query: str, query for question generation

645

646

Returns:

647

- List[Tuple[str, str]], question-context pairs

648

"""

649

650

def get_retrieval_results_df(

651

names: List[str],

652

results_arr: List[List[RetrievalEvalResult]]

653

) -> "pd.DataFrame":

654

"""

655

Convert retrieval results to pandas DataFrame for analysis.

656

657

Parameters:

658

- names: List[str], names for result sets

659

- results_arr: List[List[RetrievalEvalResult]], evaluation results

660

661

Returns:

662

- pd.DataFrame, results formatted as DataFrame

663

"""

664

```

665

666

### Legacy Compatibility

667

668

Legacy evaluator interfaces maintained for backward compatibility.

669

670

```python { .api }

671

# Legacy aliases and classes maintained for compatibility

672

QueryResponseEvaluator = BaseEvaluator

673

ResponseEvaluator = BaseEvaluator

674

LabelledQADataset = QueryResponseDataset

675

```

676

677

## Usage Examples

678

679

### Basic Response Evaluation

680

681

```python

682

from llama_index.core.evaluation import FaithfulnessEvaluator, RelevancyEvaluator

683

from llama_index.core.llms import MockLLM

684

685

# Initialize evaluators

686

llm = MockLLM()

687

faithfulness_evaluator = FaithfulnessEvaluator(llm=llm)

688

relevancy_evaluator = RelevancyEvaluator(llm=llm)

689

690

# Sample data for evaluation

691

query = "What is machine learning?"

692

contexts = [

693

"Machine learning is a subset of artificial intelligence that enables computers to learn from data.",

694

"Deep learning uses neural networks with multiple layers to process complex patterns."

695

]

696

response = "Machine learning is a branch of AI that allows systems to automatically learn from data without being explicitly programmed."

697

698

# Evaluate faithfulness

699

faithfulness_result = faithfulness_evaluator.evaluate(

700

query=query,

701

contexts=contexts,

702

response=response

703

)

704

705

print(f"Faithfulness Score: {faithfulness_result.score}")

706

print(f"Faithfulness Feedback: {faithfulness_result.feedback}")

707

708

# Evaluate relevancy

709

relevancy_result = relevancy_evaluator.evaluate(

710

query=query,

711

response=response

712

)

713

714

print(f"Relevancy Score: {relevancy_result.score}")

715

print(f"Relevancy Feedback: {relevancy_result.feedback}")

716

```

717

718

### Retrieval System Evaluation

719

720

```python

721

from llama_index.core.evaluation import RetrieverEvaluator, HitRate, MRR

722

from llama_index.core import VectorStoreIndex, Document

723

724

# Create test index and retriever

725

documents = [

726

Document(text="Machine learning algorithms learn patterns from data.", metadata={"doc_id": "doc1"}),

727

Document(text="Deep learning uses neural networks for complex tasks.", metadata={"doc_id": "doc2"}),

728

Document(text="Natural language processing handles text understanding.", metadata={"doc_id": "doc3"})

729

]

730

731

index = VectorStoreIndex.from_documents(documents)

732

retriever = index.as_retriever(similarity_top_k=2)

733

734

# Initialize retrieval evaluator

735

metrics = [HitRate(), MRR()]

736

retrieval_evaluator = RetrieverEvaluator(

737

metrics=metrics,

738

retriever=retriever

739

)

740

741

# Evaluate single query

742

query = "What are neural networks used for?"

743

expected_ids = ["doc2"] # Ground truth relevant documents

744

745

# Get retrieved results

746

retrieved_nodes = retriever.retrieve(query)

747

retrieved_ids = [node.node.metadata["doc_id"] for node in retrieved_nodes]

748

749

# Evaluate retrieval performance

750

eval_result = retrieval_evaluator.evaluate(

751

query=query,

752

expected_ids=expected_ids,

753

retrieved_ids=retrieved_ids

754

)

755

756

print(f"Hit Rate: {eval_result.metric_vals_dict.get('hit_rate', 0)}")

757

print(f"MRR: {eval_result.metric_vals_dict.get('mrr', 0)}")

758

```

759

760

### Dataset Generation for Evaluation

761

762

```python

763

from llama_index.core.evaluation import DatasetGenerator

764

from llama_index.core.node_parser import SentenceSplitter

765

766

# Parse documents into nodes

767

parser = SentenceSplitter(chunk_size=512)

768

nodes = parser.get_nodes_from_documents(documents)

769

770

# Generate evaluation dataset

771

dataset_generator = DatasetGenerator(

772

nodes=nodes,

773

llm=llm,

774

num_questions_per_chunk=3

775

)

776

777

# Generate questions and QA dataset

778

questions = dataset_generator.generate_questions_from_nodes(num=5)

779

print("Generated Questions:")

780

for i, question in enumerate(questions, 1):

781

print(f"{i}. {question}")

782

783

# Generate complete QA dataset

784

qa_dataset = dataset_generator.generate_dataset_from_nodes(num=5)

785

print(f"\nGenerated {len(qa_dataset)} QA pairs")

786

787

for i in range(min(2, len(qa_dataset))):

788

item = qa_dataset[i]

789

print(f"Q{i+1}: {item['query']}")

790

print(f"A{i+1}: {item['response']}")

791

```

792

793

### Batch Evaluation

794

795

```python

796

from llama_index.core.evaluation import BatchEvalRunner

797

798

# Setup multiple evaluators

799

evaluators = {

800

"faithfulness": FaithfulnessEvaluator(llm=llm),

801

"relevancy": RelevancyEvaluator(llm=llm),

802

"answer_relevancy": AnswerRelevancyEvaluator(llm=llm)

803

}

804

805

# Create batch runner

806

batch_runner = BatchEvalRunner(

807

evaluators=evaluators,

808

workers=2,

809

show_progress=True

810

)

811

812

# Prepare evaluation data

813

eval_queries = [

814

"What is machine learning?",

815

"How do neural networks work?",

816

"What is natural language processing?"

817

]

818

819

eval_responses = [

820

"Machine learning is AI that learns from data automatically.",

821

"Neural networks are computing systems inspired by biological neural networks.",

822

"NLP is a field focused on interaction between computers and human language."

823

]

824

825

eval_contexts = [

826

["Machine learning enables computers to learn from data without explicit programming."],

827

["Neural networks consist of interconnected nodes that process information."],

828

["Natural language processing combines linguistics and computer science."]

829

]

830

831

# Run batch evaluation

832

batch_results = batch_runner.evaluate_queries(

833

queries=eval_queries,

834

responses=eval_responses,

835

contexts_list=eval_contexts

836

)

837

838

# Process results

839

for evaluator_name, results in batch_results.items():

840

avg_score = sum(r.score or 0 for r in results) / len(results)

841

print(f"{evaluator_name.title()} - Average Score: {avg_score:.3f}")

842

```

843

844

### Guideline-Based Evaluation

845

846

```python

847

from llama_index.core.evaluation import GuidelineEvaluator

848

849

# Define evaluation guidelines

850

guidelines = """

851

Response Quality Guidelines:

852

1. Answers should be concise and directly address the question

853

2. Technical terms should be explained simply

854

3. Responses should be factual and avoid speculation

855

4. Include examples when helpful

856

5. Maintain a helpful and professional tone

857

"""

858

859

# Create guideline evaluator

860

guideline_evaluator = GuidelineEvaluator(

861

guidelines=guidelines,

862

llm=llm

863

)

864

865

# Evaluate response against guidelines

866

response_to_evaluate = "Machine learning is super complicated stuff that uses math and computers and data and things."

867

868

guideline_result = guideline_evaluator.evaluate(

869

query="What is machine learning?",

870

response=response_to_evaluate

871

)

872

873

print(f"Guideline Adherence Score: {guideline_result.score}")

874

print(f"Guideline Feedback: {guideline_result.feedback}")

875

```

876

877

### Pairwise Comparison

878

879

```python

880

from llama_index.core.evaluation import PairwiseComparisonEvaluator

881

882

# Create pairwise evaluator

883

pairwise_evaluator = PairwiseComparisonEvaluator(llm=llm)

884

885

# Compare two different responses

886

response_a = "Machine learning is a subset of AI that learns from data."

887

response_b = "Machine learning uses algorithms to find patterns in data and make predictions automatically."

888

889

comparison_result = pairwise_evaluator.evaluate(

890

query="What is machine learning?",

891

response_a=response_a,

892

response_b=response_b

893

)

894

895

print(f"Preferred Response: {comparison_result.feedback}")

896

print(f"Comparison Score: {comparison_result.score}")

897

```

898

899

### Custom Metric Implementation

900

901

```python

902

from llama_index.core.evaluation import RetrievalMetricResult

903

904

class Precision:

905

"""Custom precision metric for retrieval evaluation."""

906

907

def __init__(self, k: int = 10):

908

self.k = k

909

910

def compute(

911

self,

912

query: str,

913

expected_ids: List[str],

914

retrieved_ids: List[str]

915

) -> RetrievalMetricResult:

916

"""Compute precision@k."""

917

# Take top k retrieved documents

918

top_k_retrieved = retrieved_ids[:self.k]

919

920

# Count relevant documents in top k

921

relevant_in_top_k = len(set(top_k_retrieved) & set(expected_ids))

922

923

# Calculate precision

924

precision = relevant_in_top_k / len(top_k_retrieved) if top_k_retrieved else 0.0

925

926

return RetrievalMetricResult(

927

metric_name=f"precision_at_{self.k}",

928

score=precision

929

)

930

931

# Use custom metric

932

precision_metric = Precision(k=5)

933

precision_result = precision_metric.compute(

934

query="test query",

935

expected_ids=["doc1", "doc3"],

936

retrieved_ids=["doc1", "doc2", "doc3", "doc4", "doc5"]

937

)

938

939

print(f"Precision@5: {precision_result.score}")

940

```

941

942

### Evaluation Results Analysis

943

944

```python

945

# Collect evaluation results across different queries

946

all_results = []

947

948

for query, response in zip(eval_queries, eval_responses):

949

faithfulness = faithfulness_evaluator.evaluate(

950

query=query,

951

contexts=eval_contexts[eval_queries.index(query)],

952

response=response

953

)

954

955

relevancy = relevancy_evaluator.evaluate(

956

query=query,

957

response=response

958

)

959

960

all_results.append({

961

"query": query,

962

"faithfulness_score": faithfulness.score,

963

"relevancy_score": relevancy.score,

964

"average_score": (faithfulness.score + relevancy.score) / 2

965

})

966

967

# Analyze results

968

for result in all_results:

969

print(f"Query: {result['query']}")

970

print(f" Faithfulness: {result['faithfulness_score']:.3f}")

971

print(f" Relevancy: {result['relevancy_score']:.3f}")

972

print(f" Average: {result['average_score']:.3f}")

973

print()

974

```

975

976

## Configuration & Types

977

978

```python { .api }

979

# Evaluation configuration

980

DEFAULT_EVAL_BATCH_SIZE = 20

981

DEFAULT_WORKERS = 2

982

DEFAULT_SIMILARITY_THRESHOLD = 0.8

983

984

# Evaluation modes

985

class EvaluationMode(str, Enum):

986

SINGLE = "single"

987

BATCH = "batch"

988

STREAMING = "streaming"

989

990

# Metric types

991

BaseMetric = Union[HitRate, MRR, Any]

992

993

# Dataset formats

994

SUPPORTED_DATASET_FORMATS = ["json", "csv", "jsonl"]

995

996

# Evaluation result types

997

EvalResultType = Union[EvaluationResult, RetrievalEvalResult]

998

```