# Core Evaluation

Core evaluation functions for running metrics against test cases. DeepEval supports pytest integration, standalone evaluation, model comparison, and flexible configuration for async execution, caching, and error handling.

## Imports

```python
from deepeval import evaluate, assert_test, compare
from deepeval.evaluate import AsyncConfig, DisplayConfig, CacheConfig, ErrorConfig
```

## Capabilities

### Evaluate Function

Evaluates test cases against specified metrics in batch. Returns detailed results and optionally syncs with Confident AI platform.

```python { .api }
def evaluate(
    test_cases: Union[List[LLMTestCase], List[ConversationalTestCase], List[MLLMTestCase], EvaluationDataset],
    metrics: Optional[Union[List[BaseMetric], List[BaseConversationalMetric], List[BaseMultimodalMetric]]] = None,
    metric_collection: Optional[str] = None,
    hyperparameters: Optional[Dict[str, Union[str, int, float, Prompt]]] = None,
    identifier: Optional[str] = None,
    async_config: Optional[AsyncConfig] = None,
    display_config: Optional[DisplayConfig] = None,
    cache_config: Optional[CacheConfig] = None,
    error_config: Optional[ErrorConfig] = None
) -> EvaluationResult:
    """
    Evaluates test cases against specified metrics.

    Parameters:
    - test_cases: Test cases to evaluate (can be a list or EvaluationDataset)
    - metrics: Metrics to use for evaluation
    - metric_collection: Name of metric collection on Confident AI
    - hyperparameters: Hyperparameters to log (e.g., model params, prompts)
    - identifier: Identifier for the evaluation run
    - async_config: Configuration for async execution
    - display_config: Configuration for display/output
    - cache_config: Configuration for caching
    - error_config: Configuration for error handling

    Returns:
    - EvaluationResult: Contains test results, Confident AI link, and test run ID
    """
```

Usage example:

```python
from deepeval import evaluate
from deepeval.metrics import AnswerRelevancyMetric, FaithfulnessMetric
from deepeval.test_case import LLMTestCase
from deepeval.evaluate import AsyncConfig, DisplayConfig

# Create test cases
test_cases = [
    LLMTestCase(
        input="What's the return policy?",
        actual_output="We offer 30-day returns.",
        retrieval_context=["30-day return policy applies to all items"]
    ),
    LLMTestCase(
        input="How long does shipping take?",
        actual_output="Shipping takes 3-5 business days.",
        retrieval_context=["Standard shipping: 3-5 business days"]
    )
]

# Define metrics
metrics = [
    AnswerRelevancyMetric(threshold=0.7),
    FaithfulnessMetric(threshold=0.8)
]

# Evaluate with custom configuration
result = evaluate(
    test_cases,
    metrics,
    identifier="customer-support-v1",
    hyperparameters={
        "model": "gpt-4",
        "temperature": 0.7,
        "prompt_version": "v2.1"
    },
    async_config=AsyncConfig(
        run_async=True,
        max_concurrent=10
    ),
    display_config=DisplayConfig(
        print_results=True,
        verbose_mode=True
    )
)

print(f"Evaluation complete. View results at: {result.confident_link}")
print(f"Test run ID: {result.test_run_id}")
```

Evaluating a dataset:

```python
from deepeval import evaluate
from deepeval.dataset import EvaluationDataset, Golden
from deepeval.metrics import GEval
from deepeval.test_case import LLMTestCase, LLMTestCaseParams

# Create dataset
dataset = EvaluationDataset(
    goldens=[
        Golden(input="What is AI?", expected_output="Artificial Intelligence..."),
        Golden(input="What is ML?", expected_output="Machine Learning...")
    ]
)

# Generate test cases
for golden in dataset.goldens:
    test_case = LLMTestCase(
        input=golden.input,
        actual_output=your_llm_function(golden.input),
        expected_output=golden.expected_output
    )
    dataset.add_test_case(test_case)

# Evaluate
result = evaluate(
    dataset,
    [GEval(
        name="Correctness",
        criteria="Determine if actual output matches expected output",
        evaluation_params=[LLMTestCaseParams.ACTUAL_OUTPUT, LLMTestCaseParams.EXPECTED_OUTPUT]
    )]
)
```

### Assert Test Function

Asserts that a single test case passes specified metrics. Designed for pytest integration but can be used standalone. Raises AssertionError if metrics fail.

```python { .api }
def assert_test(
    test_case: Optional[Union[LLMTestCase, ConversationalTestCase, MLLMTestCase]] = None,
    metrics: Optional[Union[List[BaseMetric], List[BaseConversationalMetric], List[BaseMultimodalMetric]]] = None,
    golden: Optional[Golden] = None,
    observed_callback: Optional[Union[Callable, Awaitable]] = None,
    run_async: bool = True
):
    """
    Asserts that a single test case passes specified metrics.

    Parameters:
    - test_case: Test case to assert
    - metrics: Metrics to evaluate
    - golden: Golden data for agentic evaluation
    - observed_callback: Callback function to execute (for component-level evaluation)
    - run_async: Whether to run asynchronously (default: True)

    Raises:
    - AssertionError: If any metric fails (score below threshold)
    """
```

Usage with pytest:

```python
import pytest
from deepeval import assert_test
from deepeval.metrics import AnswerRelevancyMetric, FaithfulnessMetric
from deepeval.test_case import LLMTestCase

def test_customer_support_responses():
    """Test customer support chatbot responses."""
    test_case = LLMTestCase(
        input="How do I reset my password?",
        actual_output="Click 'Forgot Password' on the login page and follow the instructions.",
        retrieval_context=["Password reset instructions available on login page"]
    )

    metrics = [
        AnswerRelevancyMetric(threshold=0.7),
        FaithfulnessMetric(threshold=0.8)
    ]

    assert_test(test_case, metrics)

@pytest.mark.parametrize("input_text,expected", [
    ("What's your return policy?", "return policy"),
    ("How long is shipping?", "shipping time"),
])
def test_topic_relevance(input_text, expected):
    """Test that responses are relevant to topics."""
    test_case = LLMTestCase(
        input=input_text,
        actual_output=your_llm_function(input_text)
    )

    assert_test(test_case, [AnswerRelevancyMetric(threshold=0.7)])
```

Usage for component-level evaluation with observed callback:

```python
from deepeval import assert_test
from deepeval.tracing import observe, update_current_span
from deepeval.dataset import Golden
from deepeval.metrics import GEval
from deepeval.test_case import LLMTestCase, LLMTestCaseParams

correctness = GEval(
    name="Correctness",
    criteria="Evaluate if the output is correct",
    evaluation_params=[LLMTestCaseParams.INPUT, LLMTestCaseParams.ACTUAL_OUTPUT]
)

@observe(metrics=[correctness])
def my_llm_component(input_text):
    """A component to be evaluated."""
    output = process_with_llm(input_text)
    update_current_span(
        test_case=LLMTestCase(input=input_text, actual_output=output)
    )
    return output

# Assert the component passes the metric
golden = Golden(input="What is 2+2?")
assert_test(
    golden=golden,
    observed_callback=my_llm_component
)
```

### Compare Function

Compares multiple contestants in arena-style evaluation to determine which performs better. Useful for A/B testing different models, prompts, or configurations.

```python { .api }
def compare(
    test_cases: List[ArenaTestCase],
    metric: ArenaGEval,
    async_config: Optional[AsyncConfig] = None,
    display_config: Optional[DisplayConfig] = None,
    error_config: Optional[ErrorConfig] = None
) -> Dict[str, int]:
    """
    Compares multiple contestants using arena-style evaluation.

    Parameters:
    - test_cases: List of ArenaTestCase instances containing contestants to compare
    - metric: ArenaGEval metric for judging contestants
    - async_config: Configuration for async execution
    - display_config: Configuration for display/output
    - error_config: Configuration for error handling

    Returns:
    - Dict[str, int]: Dictionary mapping contestant names to win counts
    """
```

Usage example:

```python
from deepeval import compare
from deepeval.metrics import ArenaGEval
from deepeval.test_case import ArenaTestCase, LLMTestCase, LLMTestCaseParams

# Create arena test cases
arena_test_cases = [
    ArenaTestCase(
        contestants={
            "gpt-4": LLMTestCase(
                input="Explain quantum computing",
                actual_output="Quantum computing uses quantum bits..."
            ),
            "claude-3": LLMTestCase(
                input="Explain quantum computing",
                actual_output="Quantum computing leverages quantum mechanics..."
            ),
            "gemini-pro": LLMTestCase(
                input="Explain quantum computing",
                actual_output="Quantum computing is based on quantum physics..."
            )
        }
    ),
    ArenaTestCase(
        contestants={
            "gpt-4": LLMTestCase(
                input="What is machine learning?",
                actual_output="Machine learning is a subset of AI..."
            ),
            "claude-3": LLMTestCase(
                input="What is machine learning?",
                actual_output="Machine learning enables computers to learn..."
            ),
            "gemini-pro": LLMTestCase(
                input="What is machine learning?",
                actual_output="Machine learning algorithms improve automatically..."
            )
        }
    )
]

# Create arena metric
arena_metric = ArenaGEval(
    name="Answer Quality",
    criteria="Determine which answer is most clear, accurate, and helpful",
    evaluation_params=[LLMTestCaseParams.INPUT, LLMTestCaseParams.ACTUAL_OUTPUT]
)

# Compare contestants
result = compare(
    test_cases=arena_test_cases,
    metric=arena_metric
)

# result is Dict[str, int] showing win counts
print(f"Results: {result}")  # e.g., {'gpt-4': 5, 'claude-3': 8, 'gemini-pro': 3}
```

### Evaluation Result

Container for evaluation results returned by `evaluate()`.

```python { .api }
class EvaluationResult:
    """
    Container for evaluation results.

    Attributes:
    - test_results (List[TestResult]): List of individual test results
    - confident_link (str, optional): Link to Confident AI results page
    - test_run_id (str, optional): Test run ID on Confident AI
    """
```

Usage example:

```python
from deepeval import evaluate

result = evaluate(test_cases, metrics)

# Access results
for test_result in result.test_results:
    print(f"Test: {test_result.name}")
    print(f"Success: {test_result.success}")
    for metric_result in test_result.metrics:
        print(f"  {metric_result.name}: {metric_result.score}")

# Access Confident AI link
if result.confident_link:
    print(f"View detailed results: {result.confident_link}")
```

### Configuration Classes

Configuration objects for customizing evaluation behavior.

```python { .api }
class AsyncConfig:
    """
    Configuration for asynchronous execution.

    Parameters:
    - run_async (bool): Whether to run asynchronously (default: True)
    - throttle_value (int): Throttle value in seconds (default: 0)
    - max_concurrent (int): Maximum concurrent tasks (default: 20)
    """

class DisplayConfig:
    """
    Configuration for display/output.

    Parameters:
    - show_indicator (bool): Show progress indicator (default: True)
    - print_results (bool): Print results (default: True)
    - verbose_mode (bool, optional): Verbose mode (default: None)
    - display_option (TestRunResultDisplay, optional): Display option (default: ALL)
    - file_output_dir (str, optional): Directory for file output
    """

class CacheConfig:
    """
    Configuration for caching.

    Parameters:
    - write_cache (bool): Write to cache (default: True)
    - use_cache (bool): Use cache for reading (default: False)
    """

class ErrorConfig:
    """
    Configuration for error handling.

    Parameters:
    - ignore_errors (bool): Ignore errors and continue (default: False)
    - skip_on_missing_params (bool): Skip metrics when test case params are missing (default: False)
    """
```

Usage example:

```python
from deepeval import evaluate
from deepeval.evaluate import AsyncConfig, DisplayConfig, CacheConfig, ErrorConfig

result = evaluate(
    test_cases,
    metrics,
    async_config=AsyncConfig(
        run_async=True,
        max_concurrent=10,
        throttle_value=1  # 1 second between batches
    ),
    display_config=DisplayConfig(
        show_indicator=True,
        print_results=True,
        verbose_mode=True,
        file_output_dir="./evaluation_results"
    ),
    cache_config=CacheConfig(
        use_cache=True,  # Reuse cached results
        write_cache=True
    ),
    error_config=ErrorConfig(
        ignore_errors=False,  # Fail on first error
        skip_on_missing_params=True  # Skip metrics if params missing
    )
)
```

### Pytest Integration

DeepEval integrates seamlessly with pytest for test organization and execution.

Running tests:

```bash
# Run all tests
deepeval test run test_my_llm.py

# Run tests in parallel
deepeval test run test_my_llm.py -n 4

# Run specific test
deepeval test run test_my_llm.py::test_customer_support

# Run with pytest directly
pytest test_my_llm.py -v
```

Test file structure:

```python
import pytest
from deepeval import assert_test
from deepeval.metrics import AnswerRelevancyMetric
from deepeval.test_case import LLMTestCase
from deepeval.dataset import EvaluationDataset

# Setup fixtures
@pytest.fixture
def metrics():
    return [AnswerRelevancyMetric(threshold=0.7)]

@pytest.fixture
def dataset():
    # Load or create dataset
    return EvaluationDataset(...)

# Individual test
def test_single_response(metrics):
    test_case = LLMTestCase(
        input="What is AI?",
        actual_output=your_llm("What is AI?")
    )
    assert_test(test_case, metrics)

# Parametrized tests
@pytest.mark.parametrize("test_case", dataset.test_cases)
def test_dataset(test_case, metrics):
    assert_test(test_case, metrics)

# Test classes for organization
class TestCustomerSupport:
    def test_refund_questions(self, metrics):
        test_case = LLMTestCase(...)
        assert_test(test_case, metrics)

    def test_shipping_questions(self, metrics):
        test_case = LLMTestCase(...)
        assert_test(test_case, metrics)
```

### Dataset Evaluation Methods

The `EvaluationDataset` class also provides evaluation methods:

```python { .api }
class EvaluationDataset:
    def evaluate(
        self,
        metrics: List[BaseMetric],
        **kwargs
    ) -> EvaluationResult:
        """
        Evaluate the dataset with specified metrics.

        Parameters:
        - metrics: Metrics to use for evaluation
        - **kwargs: Additional arguments passed to evaluate()

        Returns:
        - EvaluationResult: Evaluation results
        """
```

Usage example:

```python
from deepeval.dataset import EvaluationDataset
from deepeval.metrics import AnswerRelevancyMetric
from deepeval.test_case import LLMTestCase

dataset = EvaluationDataset(...)

# Add test cases
for golden in dataset.goldens:
    dataset.add_test_case(
        LLMTestCase(
            input=golden.input,
            actual_output=your_llm(golden.input)
        )
    )

# Evaluate using dataset method
result = dataset.evaluate([AnswerRelevancyMetric(threshold=0.7)])
```
537