Comprehensive LLM evaluation framework with 50+ metrics for testing RAG, chatbots, and AI agents
Core evaluation functions for running metrics against test cases. DeepEval supports pytest integration, standalone evaluation, model comparison, and flexible configuration for async execution, caching, and error handling.
from deepeval import evaluate, assert_test, compare
from deepeval.evaluate import AsyncConfig, DisplayConfig, CacheConfig, ErrorConfig

Evaluates test cases against specified metrics in batch. Returns detailed results and optionally syncs with the Confident AI platform.
def evaluate(
test_cases: Union[List[LLMTestCase], List[ConversationalTestCase], List[MLLMTestCase], EvaluationDataset],
metrics: Optional[Union[List[BaseMetric], List[BaseConversationalMetric], List[BaseMultimodalMetric]]] = None,
metric_collection: Optional[str] = None,
hyperparameters: Optional[Dict[str, Union[str, int, float, Prompt]]] = None,
identifier: Optional[str] = None,
async_config: Optional[AsyncConfig] = None,
display_config: Optional[DisplayConfig] = None,
cache_config: Optional[CacheConfig] = None,
error_config: Optional[ErrorConfig] = None
) -> EvaluationResult:
"""
Evaluates test cases against specified metrics.
Parameters:
- test_cases: Test cases to evaluate (can be a list or EvaluationDataset)
- metrics: Metrics to use for evaluation
- metric_collection: Name of metric collection on Confident AI
- hyperparameters: Hyperparameters to log (e.g., model params, prompts)
- identifier: Identifier for the evaluation run
- async_config: Configuration for async execution
- display_config: Configuration for display/output
- cache_config: Configuration for caching
- error_config: Configuration for error handling
Returns:
- EvaluationResult: Contains test results, Confident AI link, and test run ID
"""Usage example:
from deepeval import evaluate
from deepeval.metrics import AnswerRelevancyMetric, FaithfulnessMetric
from deepeval.test_case import LLMTestCase
from deepeval.evaluate import AsyncConfig, DisplayConfig
# Create test cases
test_cases = [
LLMTestCase(
input="What's the return policy?",
actual_output="We offer 30-day returns.",
retrieval_context=["30-day return policy applies to all items"]
),
LLMTestCase(
input="How long does shipping take?",
actual_output="Shipping takes 3-5 business days.",
retrieval_context=["Standard shipping: 3-5 business days"]
)
]
# Define metrics
metrics = [
AnswerRelevancyMetric(threshold=0.7),
FaithfulnessMetric(threshold=0.8)
]
# Evaluate with custom configuration
result = evaluate(
test_cases,
metrics,
identifier="customer-support-v1",
hyperparameters={
"model": "gpt-4",
"temperature": 0.7,
"prompt_version": "v2.1"
},
async_config=AsyncConfig(
run_async=True,
max_concurrent=10
),
display_config=DisplayConfig(
print_results=True,
verbose_mode=True
)
)
print(f"Evaluation complete. View results at: {result.confident_link}")
print(f"Test run ID: {result.test_run_id}")Evaluating a dataset:
Evaluating a dataset:

from deepeval import evaluate
from deepeval.dataset import EvaluationDataset, Golden
from deepeval.metrics import GEval
from deepeval.test_case import LLMTestCase, LLMTestCaseParams
# Create dataset
dataset = EvaluationDataset(
goldens=[
Golden(input="What is AI?", expected_output="Artificial Intelligence..."),
Golden(input="What is ML?", expected_output="Machine Learning...")
]
)
# Generate test cases
for golden in dataset.goldens:
test_case = LLMTestCase(
input=golden.input,
actual_output=your_llm_function(golden.input),
expected_output=golden.expected_output
)
dataset.add_test_case(test_case)
# Evaluate
result = evaluate(
dataset,
[GEval(
name="Correctness",
criteria="Determine if actual output matches expected output",
evaluation_params=[LLMTestCaseParams.ACTUAL_OUTPUT, LLMTestCaseParams.EXPECTED_OUTPUT]
)]
)

Asserts that a single test case passes specified metrics. Designed for pytest integration but can be used standalone. Raises AssertionError if metrics fail.
def assert_test(
test_case: Optional[Union[LLMTestCase, ConversationalTestCase, MLLMTestCase]] = None,
metrics: Optional[Union[List[BaseMetric], List[BaseConversationalMetric], List[BaseMultimodalMetric]]] = None,
golden: Optional[Golden] = None,
observed_callback: Optional[Union[Callable, Awaitable]] = None,
run_async: bool = True
):
"""
Asserts that a single test case passes specified metrics.
Parameters:
- test_case: Test case to assert
- metrics: Metrics to evaluate
- golden: Golden data for agentic evaluation
- observed_callback: Callback function to execute (for component-level evaluation)
- run_async: Whether to run asynchronously (default: True)
Raises:
- AssertionError: If any metric fails (score below threshold)
"""Usage with pytest:
import pytest
from deepeval import assert_test
from deepeval.metrics import AnswerRelevancyMetric, FaithfulnessMetric
from deepeval.test_case import LLMTestCase
def test_customer_support_responses():
"""Test customer support chatbot responses."""
test_case = LLMTestCase(
input="How do I reset my password?",
actual_output="Click 'Forgot Password' on the login page and follow the instructions.",
retrieval_context=["Password reset instructions available on login page"]
)
metrics = [
AnswerRelevancyMetric(threshold=0.7),
FaithfulnessMetric(threshold=0.8)
]
assert_test(test_case, metrics)
@pytest.mark.parametrize("input_text,expected", [
("What's your return policy?", "return policy"),
("How long is shipping?", "shipping time"),
])
def test_topic_relevance(input_text, expected):
"""Test that responses are relevant to topics."""
test_case = LLMTestCase(
input=input_text,
actual_output=your_llm_function(input_text)
)
assert_test(test_case, [AnswerRelevancyMetric(threshold=0.7)])
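assert_test also works standalone, outside a pytest run; it simply raises AssertionError when any metric score falls below its threshold. A minimal sketch:

from deepeval import assert_test
from deepeval.metrics import AnswerRelevancyMetric
from deepeval.test_case import LLMTestCase

test_case = LLMTestCase(
    input="Do you ship internationally?",
    actual_output="Yes, we ship to over 50 countries."
)

try:
    # Raises AssertionError if the relevancy score is below 0.7
    assert_test(test_case, [AnswerRelevancyMetric(threshold=0.7)])
    print("Test passed")
except AssertionError as e:
    print(f"Test failed: {e}")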
Usage for component-level evaluation with observed callback:

from deepeval import assert_test
from deepeval.tracing import observe, update_current_span
from deepeval.dataset import Golden
from deepeval.metrics import GEval
from deepeval.test_case import LLMTestCase, LLMTestCaseParams
correctness = GEval(
name="Correctness",
criteria="Evaluate if the output is correct",
evaluation_params=[LLMTestCaseParams.INPUT, LLMTestCaseParams.ACTUAL_OUTPUT]
)
@observe(metrics=[correctness])
def my_llm_component(input_text):
"""A component to be evaluated."""
output = process_with_llm(input_text)
update_current_span(
test_case=LLMTestCase(input=input_text, actual_output=output)
)
return output
# Assert the component passes the metric
golden = Golden(input="What is 2+2?")
assert_test(
golden=golden,
observed_callback=my_llm_component
)

Compares multiple contestants in arena-style evaluation to determine which performs better. Useful for A/B testing different models, prompts, or configurations.
def compare(
test_cases: List[ArenaTestCase],
metric: ArenaGEval,
async_config: Optional[AsyncConfig] = None,
display_config: Optional[DisplayConfig] = None,
error_config: Optional[ErrorConfig] = None
) -> Dict[str, int]:
"""
Compares multiple contestants using arena-style evaluation.
Parameters:
- test_cases: List of ArenaTestCase instances containing contestants to compare
- metric: ArenaGEval metric for judging contestants
- async_config: Configuration for async execution
- display_config: Configuration for display/output
- error_config: Configuration for error handling
Returns:
- Dict[str, int]: Dictionary mapping contestant names to win counts
"""Usage example:
from deepeval import compare
from deepeval.metrics import ArenaGEval
from deepeval.test_case import ArenaTestCase, LLMTestCase, LLMTestCaseParams
# Create arena test cases
arena_test_cases = [
ArenaTestCase(
contestants={
"gpt-4": LLMTestCase(
input="Explain quantum computing",
actual_output="Quantum computing uses quantum bits..."
),
"claude-3": LLMTestCase(
input="Explain quantum computing",
actual_output="Quantum computing leverages quantum mechanics..."
),
"gemini-pro": LLMTestCase(
input="Explain quantum computing",
actual_output="Quantum computing is based on quantum physics..."
)
}
),
ArenaTestCase(
contestants={
"gpt-4": LLMTestCase(
input="What is machine learning?",
actual_output="Machine learning is a subset of AI..."
),
"claude-3": LLMTestCase(
input="What is machine learning?",
actual_output="Machine learning enables computers to learn..."
),
"gemini-pro": LLMTestCase(
input="What is machine learning?",
actual_output="Machine learning algorithms improve automatically..."
)
}
)
]
# Create arena metric
arena_metric = ArenaGEval(
name="Answer Quality",
criteria="Determine which answer is most clear, accurate, and helpful",
evaluation_params=[LLMTestCaseParams.INPUT, LLMTestCaseParams.ACTUAL_OUTPUT]
)
# Compare contestants
result = compare(
test_cases=arena_test_cases,
metric=arena_metric
)
# result is Dict[str, int] showing win counts
print(f"Results: {result}") # e.g., {'gpt-4': 5, 'claude-3': 8, 'gemini-pro': 3}Container for evaluation results returned by evaluate().
Container for evaluation results returned by evaluate().

class EvaluationResult:
"""
Container for evaluation results.
Attributes:
- test_results (List[TestResult]): List of individual test results
- confident_link (str, optional): Link to Confident AI results page
- test_run_id (str, optional): Test run ID on Confident AI
"""Usage example:
from deepeval import evaluate
result = evaluate(test_cases, metrics)
# Access results
for test_result in result.test_results:
print(f"Test: {test_result.name}")
print(f"Success: {test_result.success}")
for metric_result in test_result.metrics:
print(f" {metric_result.name}: {metric_result.score}")
# Access Confident AI link
if result.confident_link:
print(f"View detailed results: {result.confident_link}")Configuration objects for customizing evaluation behavior.
class AsyncConfig:
"""
Configuration for asynchronous execution.
Parameters:
- run_async (bool): Whether to run asynchronously (default: True)
- throttle_value (int): Throttle value in seconds (default: 0)
- max_concurrent (int): Maximum concurrent tasks (default: 20)
"""
class DisplayConfig:
"""
Configuration for display/output.
Parameters:
- show_indicator (bool): Show progress indicator (default: True)
- print_results (bool): Print results (default: True)
- verbose_mode (bool, optional): Verbose mode (default: None)
- display_option (TestRunResultDisplay, optional): Display option (default: ALL)
- file_output_dir (str, optional): Directory for file output
"""
class CacheConfig:
"""
Configuration for caching.
Parameters:
- write_cache (bool): Write to cache (default: True)
- use_cache (bool): Use cache for reading (default: False)
"""
class ErrorConfig:
"""
Configuration for error handling.
Parameters:
- ignore_errors (bool): Ignore errors and continue (default: False)
- skip_on_missing_params (bool): Skip metrics when test case params are missing (default: False)
"""Usage example:
from deepeval import evaluate
from deepeval.evaluate import AsyncConfig, DisplayConfig, CacheConfig, ErrorConfig
result = evaluate(
test_cases,
metrics,
async_config=AsyncConfig(
run_async=True,
max_concurrent=10,
throttle_value=1 # 1 second between batches
),
display_config=DisplayConfig(
show_indicator=True,
print_results=True,
verbose_mode=True,
file_output_dir="./evaluation_results"
),
cache_config=CacheConfig(
use_cache=True, # Reuse cached results
write_cache=True
),
error_config=ErrorConfig(
ignore_errors=False, # Fail on first error
skip_on_missing_params=True # Skip metrics if params missing
)
)
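One case where skip_on_missing_params matters: FaithfulnessMetric requires retrieval_context, so a test case without it would otherwise cause an error. A short sketch of the behavior under that setting:

from deepeval import evaluate
from deepeval.evaluate import ErrorConfig
from deepeval.metrics import AnswerRelevancyMetric, FaithfulnessMetric
from deepeval.test_case import LLMTestCase

# No retrieval_context, which FaithfulnessMetric needs
test_case = LLMTestCase(
    input="What's the return policy?",
    actual_output="We offer 30-day returns."
)

result = evaluate(
    [test_case],
    [AnswerRelevancyMetric(threshold=0.7), FaithfulnessMetric(threshold=0.8)],
    # FaithfulnessMetric is skipped for this test case instead of raising;
    # AnswerRelevancyMetric still runs normally.
    error_config=ErrorConfig(skip_on_missing_params=True)
)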
DeepEval integrates seamlessly with pytest for test organization and execution.

Running tests:
# Run all tests
deepeval test run test_my_llm.py
# Run tests in parallel
deepeval test run test_my_llm.py -n 4
# Run specific test
deepeval test run test_my_llm.py::test_customer_support
# Run with pytest directly
pytest test_my_llm.py -v

Test file structure:
import pytest
from deepeval import assert_test
from deepeval.metrics import AnswerRelevancyMetric
from deepeval.test_case import LLMTestCase
from deepeval.dataset import EvaluationDataset
# Setup fixtures
@pytest.fixture
def metrics():
return [AnswerRelevancyMetric(threshold=0.7)]
# Load or create dataset at module level so it can be referenced in parametrize below
dataset = EvaluationDataset(...)
# Individual test
def test_single_response(metrics):
test_case = LLMTestCase(
input="What is AI?",
actual_output=your_llm("What is AI?")
)
assert_test(test_case, metrics)
# Parametrized tests
@pytest.mark.parametrize("test_case", dataset.test_cases)
def test_dataset(test_case, metrics):
assert_test(test_case, metrics)
# Test classes for organization
class TestCustomerSupport:
def test_refund_questions(self, metrics):
test_case = LLMTestCase(...)
assert_test(test_case, metrics)
def test_shipping_questions(self, metrics):
test_case = LLMTestCase(...)
assert_test(test_case, metrics)

The EvaluationDataset class also provides evaluation methods:
class EvaluationDataset:
def evaluate(
self,
metrics: List[BaseMetric],
**kwargs
) -> EvaluationResult:
"""
Evaluate the dataset with specified metrics.
Parameters:
- metrics: Metrics to use for evaluation
- **kwargs: Additional arguments passed to evaluate()
Returns:
- EvaluationResult: Evaluation results
"""Usage example:
from deepeval.dataset import EvaluationDataset
from deepeval.metrics import AnswerRelevancyMetric
dataset = EvaluationDataset(...)
# Add test cases
for golden in dataset.goldens:
dataset.add_test_case(
LLMTestCase(
input=golden.input,
actual_output=your_llm(golden.input)
)
)
# Evaluate using dataset method
result = dataset.evaluate([AnswerRelevancyMetric(threshold=0.7)])
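Because extra keyword arguments are forwarded to evaluate(), the same configuration objects can be passed through the dataset method, for example:

from deepeval.evaluate import AsyncConfig, DisplayConfig

# Config objects are forwarded to evaluate() via **kwargs
result = dataset.evaluate(
    [AnswerRelevancyMetric(threshold=0.7)],
    async_config=AsyncConfig(max_concurrent=5),
    display_config=DisplayConfig(print_results=False)
)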