# Evaluation

Comprehensive evaluation capabilities for RAG systems including retrieval metrics, response quality assessment, and dataset generation. The evaluation framework provides tools for measuring system performance, generating synthetic test data, and conducting systematic evaluations of LLM applications.

## Capabilities

### Base Evaluation Interfaces

Foundation interfaces for evaluation operations with standardized result structures and metrics computation.

```python { .api }
class BaseEvaluator:
    """
    Base interface for evaluation implementations.

    Evaluators assess different aspects of system performance including
    response quality, retrieval accuracy, and adherence to guidelines.
    """
    def __init__(self, **kwargs): ...

    def evaluate(
        self,
        query: Optional[str] = None,
        contexts: Optional[Sequence[str]] = None,
        response: Optional[str] = None,
        **kwargs
    ) -> EvaluationResult:
        """
        Evaluate system performance on given inputs.

        Parameters:
        - query: Optional[str], input query or question
        - contexts: Optional[Sequence[str]], retrieved context passages
        - response: Optional[str], system response to evaluate
        - **kwargs: additional evaluation parameters

        Returns:
        - EvaluationResult, evaluation result with score and feedback
        """

    async def aevaluate(
        self,
        query: Optional[str] = None,
        contexts: Optional[Sequence[str]] = None,
        response: Optional[str] = None,
        **kwargs
    ) -> EvaluationResult:
        """Asynchronous evaluation method."""

class EvaluationResult:
    """
    Result of evaluation with score and detailed feedback.

    Parameters:
    - score: Optional[float], numerical evaluation score
    - passing: Optional[bool], whether evaluation passed threshold
    - feedback: Optional[str], detailed feedback and explanation
    - metadata: Optional[dict], additional evaluation metadata
    """
    def __init__(
        self,
        score: Optional[float] = None,
        passing: Optional[bool] = None,
        feedback: Optional[str] = None,
        metadata: Optional[dict] = None,
        **kwargs
    ): ...

    def __str__(self) -> str:
        """String representation of evaluation result."""
```

### Response Quality Evaluators

Evaluators for assessing the quality, relevance, and accuracy of system responses.

```python { .api }
class FaithfulnessEvaluator(BaseEvaluator):
    """
    Evaluator for response faithfulness to retrieved contexts.

    Assesses whether the response is grounded in and consistent with
    the provided context information without hallucination.

    Parameters:
    - llm: Optional[LLM], language model for faithfulness assessment
    - raise_error: bool, whether to raise errors on evaluation failure
    - eval_template: Optional[BasePromptTemplate], evaluation prompt template
    """
    def __init__(
        self,
        llm: Optional[LLM] = None,
        raise_error: bool = False,
        eval_template: Optional[BasePromptTemplate] = None,
        **kwargs
    ): ...

class AnswerRelevancyEvaluator(BaseEvaluator):
    """
    Evaluator for answer relevancy to the input query.

    Measures how well the response addresses the specific question asked
    and whether it provides relevant information.

    Parameters:
    - llm: Optional[LLM], language model for relevancy assessment
    - embed_model: Optional[BaseEmbedding], embedding model for similarity
    - raise_error: bool, whether to raise errors on evaluation failure
    """
    def __init__(
        self,
        llm: Optional[LLM] = None,
        embed_model: Optional[BaseEmbedding] = None,
        raise_error: bool = False,
        **kwargs
    ): ...

class ContextRelevancyEvaluator(BaseEvaluator):
    """
    Evaluator for context relevancy to the query.

    Assesses whether the retrieved context passages are relevant
    to answering the input query.

    Parameters:
    - llm: Optional[LLM], language model for relevancy assessment
    - raise_error: bool, whether to raise errors on evaluation failure
    """
    def __init__(
        self,
        llm: Optional[LLM] = None,
        raise_error: bool = False,
        **kwargs
    ): ...

class CorrectnessEvaluator(BaseEvaluator):
    """
    Evaluator for response correctness against reference answers.

    Compares system responses to ground truth answers to assess
    factual accuracy and completeness.

    Parameters:
    - llm: Optional[LLM], language model for correctness assessment
    - raise_error: bool, whether to raise errors on evaluation failure
    - eval_template: Optional[BasePromptTemplate], evaluation prompt template
    """
    def __init__(
        self,
        llm: Optional[LLM] = None,
        raise_error: bool = False,
        eval_template: Optional[BasePromptTemplate] = None,
        **kwargs
    ): ...

class SemanticSimilarityEvaluator(BaseEvaluator):
    """
    Evaluator for semantic similarity between responses and references.

    Uses embedding models to assess semantic similarity between
    generated responses and reference answers.

    Parameters:
    - embed_model: Optional[BaseEmbedding], embedding model for similarity
    - similarity_threshold: float, threshold for similarity scoring
    """
    def __init__(
        self,
        embed_model: Optional[BaseEmbedding] = None,
        similarity_threshold: float = 0.8,
        **kwargs
    ): ...

class RelevancyEvaluator(BaseEvaluator):
    """
    General relevancy evaluator for response-query alignment.

    Provides flexible relevancy assessment with customizable
    evaluation criteria and scoring methods.

    Parameters:
    - llm: Optional[LLM], language model for relevancy assessment
    - raise_error: bool, whether to raise errors on evaluation failure
    """
    def __init__(
        self,
        llm: Optional[LLM] = None,
        raise_error: bool = False,
        **kwargs
    ): ...
```

### Guideline & Criteria Evaluators

Evaluators for adherence to specific guidelines, criteria, or custom evaluation frameworks.

```python { .api }
class GuidelineEvaluator(BaseEvaluator):
    """
    Evaluator for adherence to specific guidelines or criteria.

    Assesses whether responses follow specified guidelines,
    style requirements, or content policies.

    Parameters:
    - guidelines: str, guidelines text or criteria description
    - llm: Optional[LLM], language model for guideline assessment
    - eval_template: Optional[BasePromptTemplate], evaluation prompt template
    """
    def __init__(
        self,
        guidelines: str,
        llm: Optional[LLM] = None,
        eval_template: Optional[BasePromptTemplate] = None,
        **kwargs
    ): ...

class PairwiseComparisonEvaluator(BaseEvaluator):
    """
    Evaluator for pairwise comparison between responses.

    Compares two responses to determine which better satisfies
    the evaluation criteria or user requirements.

    Parameters:
    - llm: Optional[LLM], language model for comparison assessment
    - eval_template: Optional[BasePromptTemplate], comparison prompt template
    """
    def __init__(
        self,
        llm: Optional[LLM] = None,
        eval_template: Optional[BasePromptTemplate] = None,
        **kwargs
    ): ...

    def evaluate(
        self,
        query: str,
        response_a: str,
        response_b: str,
        contexts: Optional[Sequence[str]] = None,
        **kwargs
    ) -> EvaluationResult:
        """
        Compare two responses for the same query.

        Parameters:
        - query: str, input query for comparison
        - response_a: str, first response to compare
        - response_b: str, second response to compare
        - contexts: Optional[Sequence[str]], context for evaluation

        Returns:
        - EvaluationResult, comparison result indicating preferred response
        """
```

### Retrieval Evaluation

Specialized evaluation framework for retrieval system performance and accuracy.

```python { .api }
class BaseRetrievalEvaluator:
    """
    Base interface for retrieval evaluation implementations.

    Retrieval evaluators assess the quality of information retrieval
    systems including accuracy, coverage, and ranking quality.
    """
    def __init__(self, **kwargs): ...

    def evaluate(
        self,
        query: str,
        expected_ids: List[str],
        retrieved_ids: List[str],
        **kwargs
    ) -> RetrievalEvalResult:
        """
        Evaluate retrieval performance.

        Parameters:
        - query: str, input query
        - expected_ids: List[str], expected relevant document IDs
        - retrieved_ids: List[str], actually retrieved document IDs

        Returns:
        - RetrievalEvalResult, retrieval evaluation result
        """

class RetrievalEvalResult:
    """
    Result of retrieval evaluation with multiple metrics.

    Parameters:
    - query: str, evaluated query
    - expected_ids: List[str], expected relevant IDs
    - retrieved_ids: List[str], retrieved IDs
    - metric_vals_dict: Dict[str, float], metric name to value mapping
    """
    def __init__(
        self,
        query: str,
        expected_ids: List[str],
        retrieved_ids: List[str],
        metric_vals_dict: Dict[str, float],
        **kwargs
    ): ...

class RetrieverEvaluator(BaseRetrievalEvaluator):
    """
    Comprehensive retriever evaluation with multiple metrics.

    Parameters:
    - metrics: Optional[List[BaseMetric]], metrics to compute
    - retriever: Optional[BaseRetriever], retriever to evaluate
    - node_postprocessors: Optional[List[BaseNodePostprocessor]], postprocessors
    """
    def __init__(
        self,
        metrics: Optional[List[BaseMetric]] = None,
        retriever: Optional[BaseRetriever] = None,
        node_postprocessors: Optional[List[BaseNodePostprocessor]] = None,
        **kwargs
    ): ...

    async def aevaluate_dataset(
        self,
        dataset: Any,
        workers: int = 2,
        show_progress: bool = False,
        **kwargs
    ) -> List[RetrievalEvalResult]:
        """Asynchronously evaluate entire dataset."""

class MultiModalRetrieverEvaluator(BaseRetrievalEvaluator):
    """
    Evaluator for multi-modal retrieval systems.

    Assesses retrieval performance for systems handling multiple
    content modalities including text, images, and other media.

    Parameters:
    - metrics: Optional[List[BaseMetric]], evaluation metrics
    - retriever: Optional[BaseRetriever], multi-modal retriever to evaluate
    """
    def __init__(
        self,
        metrics: Optional[List[BaseMetric]] = None,
        retriever: Optional[BaseRetriever] = None,
        **kwargs
    ): ...
```

### Retrieval Metrics

Specific metrics for measuring retrieval system performance and quality.

```python { .api }
class RetrievalMetricResult:
    """
    Result container for individual retrieval metrics.

    Parameters:
    - metric_name: str, name of the computed metric
    - score: float, metric score value
    - metadata: Optional[dict], additional metric metadata
    """
    def __init__(
        self,
        metric_name: str,
        score: float,
        metadata: Optional[dict] = None
    ): ...

class HitRate:
    """
    Hit rate metric for retrieval evaluation.

    Measures the fraction of queries for which at least one
    relevant document was retrieved in the top-k results.
    """
    def __init__(self): ...

    def compute(
        self,
        query: str,
        expected_ids: List[str],
        retrieved_ids: List[str]
    ) -> RetrievalMetricResult:
        """
        Compute hit rate for single query.

        Parameters:
        - query: str, input query
        - expected_ids: List[str], relevant document IDs
        - retrieved_ids: List[str], retrieved document IDs

        Returns:
        - RetrievalMetricResult, hit rate result (1.0 if hit, 0.0 if miss)
        """

class MRR:
    """
    Mean Reciprocal Rank (MRR) metric for retrieval evaluation.

    Measures the quality of ranking by computing the reciprocal
    of the rank of the first relevant document retrieved.
    """
    def __init__(self): ...

    def compute(
        self,
        query: str,
        expected_ids: List[str],
        retrieved_ids: List[str]
    ) -> RetrievalMetricResult:
        """
        Compute MRR for single query.

        Parameters:
        - query: str, input query
        - expected_ids: List[str], relevant document IDs
        - retrieved_ids: List[str], retrieved document IDs

        Returns:
        - RetrievalMetricResult, MRR score (1/rank of first relevant doc)
        """

def resolve_metrics(metric_names: List[str]) -> List[BaseMetric]:
    """
    Resolve metric instances from metric names.

    Parameters:
    - metric_names: List[str], names of metrics to resolve

    Returns:
    - List[BaseMetric], resolved metric instances
    """
```

### Dataset Generation

Tools for generating synthetic evaluation datasets and test cases for system validation.

```python { .api }
class DatasetGenerator:
    """
    Generator for synthetic evaluation datasets.

    Creates question-answer pairs, retrieval test cases, and other
    evaluation data from source documents and knowledge bases.

    Parameters:
    - nodes: List[BaseNode], source nodes for dataset generation
    - llm: Optional[LLM], language model for question generation
    - num_questions_per_chunk: int, questions to generate per text chunk
    - text_question_template: Optional[BasePromptTemplate], question generation prompt
    - text_qa_template: Optional[BasePromptTemplate], QA pair generation prompt
    """
    def __init__(
        self,
        nodes: List[BaseNode],
        llm: Optional[LLM] = None,
        num_questions_per_chunk: int = 2,
        text_question_template: Optional[BasePromptTemplate] = None,
        text_qa_template: Optional[BasePromptTemplate] = None,
        **kwargs
    ): ...

    def generate_questions_from_nodes(self, num: Optional[int] = None) -> List[str]:
        """
        Generate questions from source nodes.

        Parameters:
        - num: Optional[int], number of questions to generate

        Returns:
        - List[str], generated questions
        """

    def generate_dataset_from_nodes(self, num: Optional[int] = None) -> "QueryResponseDataset":
        """
        Generate complete QA dataset from nodes.

        Parameters:
        - num: Optional[int], number of QA pairs to generate

        Returns:
        - QueryResponseDataset, generated dataset with questions and answers
        """

    async def agenerate_questions_from_nodes(self, num: Optional[int] = None) -> List[str]:
        """Async version of question generation."""

    async def agenerate_dataset_from_nodes(self, num: Optional[int] = None) -> "QueryResponseDataset":
        """Async version of dataset generation."""

class QueryResponseDataset:
    """
    Dataset container for query-response evaluation pairs.

    Parameters:
    - queries: List[str], evaluation queries
    - responses: List[str], expected responses
    - relevant_docs: Optional[List[List[str]]], relevant document IDs per query
    """
    def __init__(
        self,
        queries: List[str],
        responses: List[str],
        relevant_docs: Optional[List[List[str]]] = None,
        **kwargs
    ): ...

    def save_json(self, path: str) -> None:
        """Save dataset to JSON file."""

    @classmethod
    def from_json(cls, path: str) -> "QueryResponseDataset":
        """Load dataset from JSON file."""

    def __len__(self) -> int:
        """Get dataset size."""

    def __getitem__(self, idx: int) -> Dict[str, Any]:
        """Get dataset item by index."""

class EmbeddingQAFinetuneDataset:
    """
    Dataset for embedding model fine-tuning with query-context pairs.

    Parameters:
    - queries: List[str], training queries
    - corpus: List[str], text corpus for contexts
    - relevant_docs: List[List[str]], relevant document mapping
    """
    def __init__(
        self,
        queries: List[str],
        corpus: List[str],
        relevant_docs: List[List[str]],
        **kwargs
    ): ...

    def save_json(self, path: str) -> None:
        """Save fine-tuning dataset to JSON file."""

    @classmethod
    def from_json(cls, path: str) -> "EmbeddingQAFinetuneDataset":
        """Load fine-tuning dataset from JSON file."""
```

### Batch Evaluation

Tools for running large-scale evaluations across multiple queries and systems.

```python { .api }
class BatchEvalRunner:
    """
    Batch evaluation runner for systematic evaluation across datasets.

    Coordinates evaluation of multiple queries, responses, and systems
    with parallel processing and result aggregation.

    Parameters:
    - evaluators: Dict[str, BaseEvaluator], named evaluators to run
    - workers: int, number of worker processes for parallel evaluation
    - show_progress: bool, whether to show evaluation progress
    """
    def __init__(
        self,
        evaluators: Dict[str, BaseEvaluator],
        workers: int = 2,
        show_progress: bool = True,
        **kwargs
    ): ...

    def evaluate_queries(
        self,
        queries: List[str],
        responses: List[str],
        contexts_list: Optional[List[List[str]]] = None,
        **kwargs
    ) -> Dict[str, List[EvaluationResult]]:
        """
        Evaluate multiple queries in batch.

        Parameters:
        - queries: List[str], evaluation queries
        - responses: List[str], system responses to evaluate
        - contexts_list: Optional[List[List[str]]], contexts per query

        Returns:
        - Dict[str, List[EvaluationResult]], evaluator name to results mapping
        """

    async def aevaluate_queries(
        self,
        queries: List[str],
        responses: List[str],
        contexts_list: Optional[List[List[str]]] = None,
        **kwargs
    ) -> Dict[str, List[EvaluationResult]]:
        """Async batch evaluation."""
```

### Utility Functions

Helper functions for evaluation dataset generation and result processing.

```python { .api }
def generate_qa_embedding_pairs(
    nodes: List[BaseNode],
    llm: LLM,
    qa_generate_prompt_tmpl: str,
    num_questions_per_chunk: int = 2
) -> EmbeddingQAFinetuneDataset:
    """
    Generate QA pairs for embedding fine-tuning.

    Parameters:
    - nodes: List[BaseNode], source nodes for generation
    - llm: LLM, language model for question generation
    - qa_generate_prompt_tmpl: str, prompt template for QA generation
    - num_questions_per_chunk: int, questions per text chunk

    Returns:
    - EmbeddingQAFinetuneDataset, generated QA dataset for fine-tuning
    """

def generate_question_context_pairs(
    nodes: List[BaseNode],
    llm: LLM,
    num_questions_per_chunk: int = 2,
    question_gen_query: str = "Generate questions from context"
) -> List[Tuple[str, str]]:
    """
    Generate question-context pairs for evaluation.

    Parameters:
    - nodes: List[BaseNode], source nodes
    - llm: LLM, language model for generation
    - num_questions_per_chunk: int, questions per chunk
    - question_gen_query: str, query for question generation

    Returns:
    - List[Tuple[str, str]], question-context pairs
    """

def get_retrieval_results_df(
    names: List[str],
    results_arr: List[List[RetrievalEvalResult]]
) -> "pd.DataFrame":
    """
    Convert retrieval results to pandas DataFrame for analysis.

    Parameters:
    - names: List[str], names for result sets
    - results_arr: List[List[RetrievalEvalResult]], evaluation results

    Returns:
    - pd.DataFrame, results formatted as DataFrame
    """
```

### Legacy Compatibility

Legacy evaluator interfaces maintained for backward compatibility.

```python { .api }
# Legacy aliases and classes maintained for compatibility
QueryResponseEvaluator = BaseEvaluator
ResponseEvaluator = BaseEvaluator
LabelledQADataset = QueryResponseDataset
```

## Usage Examples

### Basic Response Evaluation

```python
from llama_index.core.evaluation import FaithfulnessEvaluator, RelevancyEvaluator
from llama_index.core.llms import MockLLM

# Initialize evaluators
llm = MockLLM()
faithfulness_evaluator = FaithfulnessEvaluator(llm=llm)
relevancy_evaluator = RelevancyEvaluator(llm=llm)

# Sample data for evaluation
query = "What is machine learning?"
contexts = [
    "Machine learning is a subset of artificial intelligence that enables computers to learn from data.",
    "Deep learning uses neural networks with multiple layers to process complex patterns."
]
response = "Machine learning is a branch of AI that allows systems to automatically learn from data without being explicitly programmed."

# Evaluate faithfulness
faithfulness_result = faithfulness_evaluator.evaluate(
    query=query,
    contexts=contexts,
    response=response
)

print(f"Faithfulness Score: {faithfulness_result.score}")
print(f"Faithfulness Feedback: {faithfulness_result.feedback}")

# Evaluate relevancy
relevancy_result = relevancy_evaluator.evaluate(
    query=query,
    response=response
)

print(f"Relevancy Score: {relevancy_result.score}")
print(f"Relevancy Feedback: {relevancy_result.feedback}")
```

### Retrieval System Evaluation

```python
from llama_index.core.evaluation import RetrieverEvaluator, HitRate, MRR
from llama_index.core import VectorStoreIndex, Document

# Create test index and retriever
documents = [
    Document(text="Machine learning algorithms learn patterns from data.", metadata={"doc_id": "doc1"}),
    Document(text="Deep learning uses neural networks for complex tasks.", metadata={"doc_id": "doc2"}),
    Document(text="Natural language processing handles text understanding.", metadata={"doc_id": "doc3"})
]

index = VectorStoreIndex.from_documents(documents)
retriever = index.as_retriever(similarity_top_k=2)

# Initialize retrieval evaluator
metrics = [HitRate(), MRR()]
retrieval_evaluator = RetrieverEvaluator(
    metrics=metrics,
    retriever=retriever
)

# Evaluate single query
query = "What are neural networks used for?"
expected_ids = ["doc2"]  # Ground truth relevant documents

# Get retrieved results
retrieved_nodes = retriever.retrieve(query)
retrieved_ids = [node.node.metadata["doc_id"] for node in retrieved_nodes]

# Evaluate retrieval performance
eval_result = retrieval_evaluator.evaluate(
    query=query,
    expected_ids=expected_ids,
    retrieved_ids=retrieved_ids
)

print(f"Hit Rate: {eval_result.metric_vals_dict.get('hit_rate', 0)}")
print(f"MRR: {eval_result.metric_vals_dict.get('mrr', 0)}")
```

### Dataset Generation for Evaluation

```python
from llama_index.core.evaluation import DatasetGenerator
from llama_index.core.node_parser import SentenceSplitter

# Parse documents into nodes
parser = SentenceSplitter(chunk_size=512)
nodes = parser.get_nodes_from_documents(documents)

# Generate evaluation dataset
dataset_generator = DatasetGenerator(
    nodes=nodes,
    llm=llm,
    num_questions_per_chunk=3
)

# Generate questions and QA dataset
questions = dataset_generator.generate_questions_from_nodes(num=5)
print("Generated Questions:")
for i, question in enumerate(questions, 1):
    print(f"{i}. {question}")

# Generate complete QA dataset
qa_dataset = dataset_generator.generate_dataset_from_nodes(num=5)
print(f"\nGenerated {len(qa_dataset)} QA pairs")

for i in range(min(2, len(qa_dataset))):
    item = qa_dataset[i]
    print(f"Q{i+1}: {item['query']}")
    print(f"A{i+1}: {item['response']}")
```

### Batch Evaluation

```python
from llama_index.core.evaluation import BatchEvalRunner

# Setup multiple evaluators
evaluators = {
    "faithfulness": FaithfulnessEvaluator(llm=llm),
    "relevancy": RelevancyEvaluator(llm=llm),
    "answer_relevancy": AnswerRelevancyEvaluator(llm=llm)
}

# Create batch runner
batch_runner = BatchEvalRunner(
    evaluators=evaluators,
    workers=2,
    show_progress=True
)

# Prepare evaluation data
eval_queries = [
    "What is machine learning?",
    "How do neural networks work?",
    "What is natural language processing?"
]

eval_responses = [
    "Machine learning is AI that learns from data automatically.",
    "Neural networks are computing systems inspired by biological neural networks.",
    "NLP is a field focused on interaction between computers and human language."
]

eval_contexts = [
    ["Machine learning enables computers to learn from data without explicit programming."],
    ["Neural networks consist of interconnected nodes that process information."],
    ["Natural language processing combines linguistics and computer science."]
]

# Run batch evaluation
batch_results = batch_runner.evaluate_queries(
    queries=eval_queries,
    responses=eval_responses,
    contexts_list=eval_contexts
)

# Process results
for evaluator_name, results in batch_results.items():
    avg_score = sum(r.score or 0 for r in results) / len(results)
    print(f"{evaluator_name.title()} - Average Score: {avg_score:.3f}")
```

### Guideline-Based Evaluation

```python
from llama_index.core.evaluation import GuidelineEvaluator

# Define evaluation guidelines
guidelines = """
Response Quality Guidelines:
1. Answers should be concise and directly address the question
2. Technical terms should be explained simply
3. Responses should be factual and avoid speculation
4. Include examples when helpful
5. Maintain a helpful and professional tone
"""

# Create guideline evaluator
guideline_evaluator = GuidelineEvaluator(
    guidelines=guidelines,
    llm=llm
)

# Evaluate response against guidelines
response_to_evaluate = "Machine learning is super complicated stuff that uses math and computers and data and things."

guideline_result = guideline_evaluator.evaluate(
    query="What is machine learning?",
    response=response_to_evaluate
)

print(f"Guideline Adherence Score: {guideline_result.score}")
print(f"Guideline Feedback: {guideline_result.feedback}")
```

### Pairwise Comparison

```python
from llama_index.core.evaluation import PairwiseComparisonEvaluator

# Create pairwise evaluator
pairwise_evaluator = PairwiseComparisonEvaluator(llm=llm)

# Compare two different responses
response_a = "Machine learning is a subset of AI that learns from data."
response_b = "Machine learning uses algorithms to find patterns in data and make predictions automatically."

comparison_result = pairwise_evaluator.evaluate(
    query="What is machine learning?",
    response_a=response_a,
    response_b=response_b
)

print(f"Preferred Response: {comparison_result.feedback}")
print(f"Comparison Score: {comparison_result.score}")
```
898
899
### Custom Metric Implementation
900
901
```python
902
from llama_index.core.evaluation import RetrievalMetricResult
903
904
class Precision:
905
"""Custom precision metric for retrieval evaluation."""
906
907
def __init__(self, k: int = 10):
908
self.k = k
909
910
def compute(
911
self,
912
query: str,
913
expected_ids: List[str],
914
retrieved_ids: List[str]
915
) -> RetrievalMetricResult:
916
"""Compute precision@k."""
917
# Take top k retrieved documents
918
top_k_retrieved = retrieved_ids[:self.k]
919
920
# Count relevant documents in top k
921
relevant_in_top_k = len(set(top_k_retrieved) & set(expected_ids))
922
923
# Calculate precision
924
precision = relevant_in_top_k / len(top_k_retrieved) if top_k_retrieved else 0.0
925
926
return RetrievalMetricResult(
927
metric_name=f"precision_at_{self.k}",
928
score=precision
929
)
930
931
# Use custom metric
932
precision_metric = Precision(k=5)
933
precision_result = precision_metric.compute(
934
query="test query",
935
expected_ids=["doc1", "doc3"],
936
retrieved_ids=["doc1", "doc2", "doc3", "doc4", "doc5"]
937
)
938
939
print(f"Precision@5: {precision_result.score}")
940
```

### Evaluation Results Analysis

```python
# Collect evaluation results across different queries
all_results = []

for query, response in zip(eval_queries, eval_responses):
    faithfulness = faithfulness_evaluator.evaluate(
        query=query,
        contexts=eval_contexts[eval_queries.index(query)],
        response=response
    )

    relevancy = relevancy_evaluator.evaluate(
        query=query,
        response=response
    )

    all_results.append({
        "query": query,
        "faithfulness_score": faithfulness.score,
        "relevancy_score": relevancy.score,
        "average_score": (faithfulness.score + relevancy.score) / 2
    })

# Analyze results
for result in all_results:
    print(f"Query: {result['query']}")
    print(f" Faithfulness: {result['faithfulness_score']:.3f}")
    print(f" Relevancy: {result['relevancy_score']:.3f}")
    print(f" Average: {result['average_score']:.3f}")
    print()
```

## Configuration & Types

```python { .api }
# Evaluation configuration
DEFAULT_EVAL_BATCH_SIZE = 20
DEFAULT_WORKERS = 2
DEFAULT_SIMILARITY_THRESHOLD = 0.8

# Evaluation modes
class EvaluationMode(str, Enum):
    SINGLE = "single"
    BATCH = "batch"
    STREAMING = "streaming"

# Metric types
BaseMetric = Union[HitRate, MRR, Any]

# Dataset formats
SUPPORTED_DATASET_FORMATS = ["json", "csv", "jsonl"]

# Evaluation result types
EvalResultType = Union[EvaluationResult, RetrievalEvalResult]
```