Comprehensive LLM evaluation framework with 50+ metrics for testing RAG pipelines, chatbots, and AI agents.
Tools for managing collections of test cases and "golden" examples. Supports batch evaluation, synthetic data generation, dataset persistence, and integration with the Confident AI platform.
from deepeval.dataset import EvaluationDataset, Golden, ConversationalGolden
from deepeval.contextvars import get_current_golden
Manages collections of test cases and goldens for batch evaluation.
class EvaluationDataset:
"""
Manages collections of test cases and goldens for evaluation.
Parameters:
- goldens (Union[List[Golden], List[ConversationalGolden]], optional): Initial goldens
Properties:
- goldens: Getter/setter for goldens list
- test_cases: Getter/setter for test cases list
Methods:
- add_test_case(test_case): Add a test case
- add_golden(golden): Add a golden
- add_test_cases_from_csv_file(file_path, **kwargs): Load test cases from CSV
- add_test_cases_from_json_file(file_path, **kwargs): Load test cases from JSON
- add_goldens_from_csv_file(file_path, **kwargs): Load goldens from CSV
- add_goldens_from_json_file(file_path, **kwargs): Load goldens from JSON
- push(alias, finalized=True): Push to Confident AI
- pull(alias, finalized=True, auto_convert_goldens_to_test_cases=False): Pull from Confident AI
- queue(alias, goldens): Queue goldens to Confident AI
- delete(alias): Delete dataset from Confident AI
- generate_goldens_from_docs(document_paths, **kwargs) -> List[Golden]
- generate_goldens_from_contexts(contexts, **kwargs) -> List[Golden]
- generate_goldens_from_scratch(num_goldens, **kwargs) -> List[Golden]
- save_as(file_type, directory, file_name=None, include_test_cases=False): Save dataset to file
- evals_iterator(metrics, **kwargs): Iterator for agentic evaluations
- evaluate(metrics, **kwargs) -> EvaluationResult: Evaluate dataset
"""Usage example:
from deepeval.dataset import EvaluationDataset, Golden
from deepeval.metrics import AnswerRelevancyMetric
from deepeval.test_case import LLMTestCase
# Create dataset
dataset = EvaluationDataset(
    goldens=[
        Golden(
            input="What is Python?",
            expected_output="Python is a high-level programming language"
        ),
        Golden(
            input="What is JavaScript?",
            expected_output="JavaScript is a scripting language for web development"
        )
    ]
)
# Add more goldens
dataset.add_golden(Golden(input="What is Java?", expected_output="..."))
# Generate test cases from goldens
for golden in dataset.goldens:
    test_case = LLMTestCase(
        input=golden.input,
        actual_output=your_llm_app(golden.input),
        expected_output=golden.expected_output
    )
    dataset.add_test_case(test_case)
# Evaluate
result = dataset.evaluate([AnswerRelevancyMetric(threshold=0.7)])
print(f"Results: {result.confident_link}")
# Save dataset
dataset.save_as(
    file_type="json",
    directory="./datasets",
    file_name="my_dataset",
    include_test_cases=True
)
Loading from files:
from deepeval.dataset import EvaluationDataset
# Load from CSV
dataset = EvaluationDataset()
dataset.add_goldens_from_csv_file(
    file_path="./data/goldens.csv",
    input_col="question",
    expected_output_col="answer"
)
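# Example goldens.csv layout matching the column mapping above (the rows are
# illustrative, not part of the library):
#   question,answer
#   What is Python?,Python is a high-level programming language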
# Load from JSON
dataset.add_goldens_from_json_file(
    file_path="./data/goldens.json"
)
# Load test cases
dataset.add_test_cases_from_json_file(
    file_path="./data/test_cases.json"
)
Represents a "golden" test case with expected input/output pairs.
class Golden:
"""
Represents a "golden" test case - expected input/output pairs.
Parameters:
- input (str): Input prompt
- actual_output (str, optional): Actual output
- expected_output (str, optional): Expected output
- context (List[str], optional): Context information
- retrieval_context (List[str], optional): Retrieved context
- additional_metadata (Dict, optional): Additional metadata
- comments (str, optional): Comments
- tools_called (List[ToolCall], optional): Tools called
- expected_tools (List[ToolCall], optional): Expected tools
- source_file (str, optional): Source file path
- name (str, optional): Name
- custom_column_key_values (Dict[str, str], optional): Custom columns
"""Usage example:
from deepeval.dataset import Golden
# Simple golden
golden = Golden(
    input="What is the return policy?",
    expected_output="30-day full refund"
)
# Golden with context
golden_with_context = Golden(
    input="How long does shipping take?",
    expected_output="3-5 business days",
    context=["Standard shipping timeline"],
    retrieval_context=["Shipping takes 3-5 business days for US orders"]
)
# Golden with metadata
golden_with_metadata = Golden(
    input="Product inquiry",
    expected_output="Product details",
    additional_metadata={
        "category": "support",
        "priority": "high"
    },
    comments="Test case for product inquiry flow",
    name="product_inquiry_test"
)
Golden test case for conversational interactions.
class ConversationalGolden:
"""
Represents a "golden" conversational test case.
Parameters:
- scenario (str): Scenario description
- expected_outcome (str, optional): Expected outcome
- user_description (str, optional): User description
- context (List[str], optional): Context information
- additional_metadata (Dict, optional): Additional metadata
- comments (str, optional): Comments
- name (str, optional): Name
- custom_column_key_values (Dict[str, str], optional): Custom columns
- turns (List[Turn], optional): Conversation turns
"""Usage example:
from deepeval.dataset import ConversationalGolden
from deepeval.test_case import Turn
conversational_golden = ConversationalGolden(
    scenario="Customer wants to track order",
    expected_outcome="Customer receives tracking information",
    user_description="Existing customer with pending order",
    context=["Order placed 2 days ago"],
    turns=[
        Turn(role="user", content="Where is my order?"),
        Turn(role="assistant", content="Let me check your order status...")
    ]
)
Sync datasets with Confident AI platform for team collaboration.
from deepeval.dataset import EvaluationDataset
dataset = EvaluationDataset(goldens=[...])
# Push to Confident AI
dataset.push(
    alias="customer-support-v1",
    finalized=True  # Mark as finalized/ready for use
)
# Pull from Confident AI
dataset_from_cloud = EvaluationDataset()
dataset_from_cloud.pull(
    alias="customer-support-v1",
    auto_convert_goldens_to_test_cases=False
)
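# Queue additional goldens onto an existing remote dataset without re-pushing
# everything; a sketch based on the queue(alias, goldens) signature listed in
# the class docstring above (verify against your deepeval version)
dataset.queue(
    alias="customer-support-v1",
    goldens=[...]  # additional Golden objects to append
)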
# Delete from Confident AI
dataset.delete(alias="customer-support-v1")
Generate goldens from documents or contexts.
from deepeval.dataset import EvaluationDataset
dataset = EvaluationDataset()
# Generate from documents
goldens = dataset.generate_goldens_from_docs(
    document_paths=[
        "./docs/product_guide.pdf",
        "./docs/faq.txt"
    ],
    max_goldens_per_document=10,
    include_expected_output=True
)
# Generate from contexts
goldens = dataset.generate_goldens_from_contexts(
    contexts=[
        ["Context about returns and refunds"],
        ["Context about shipping policies"]
    ],
    max_goldens_per_context=5,
    include_expected_output=True
)
# Generate from scratch (using styling config)
goldens = dataset.generate_goldens_from_scratch(
    num_goldens=20
)
print(f"Generated {len(goldens)} goldens")
Use goldens for agentic evaluation workflows.
from deepeval.dataset import EvaluationDataset, Golden, get_current_golden
from deepeval.tracing import observe
from deepeval.metrics import GEval
from deepeval.test_case import LLMTestCase, LLMTestCaseParams
# Define evaluation metric
metric = GEval(
    name="Correctness",
    criteria="Evaluate correctness of output",
    evaluation_params=[LLMTestCaseParams.INPUT, LLMTestCaseParams.ACTUAL_OUTPUT]
)
# Create agent function
@observe(metrics=[metric])
def my_agent(input_text: str):
    # Get current golden from context
    golden = get_current_golden()
    # Process with agent
    output = process_with_agent(input_text)
    # Update span with test case
    from deepeval.tracing import update_current_span
    update_current_span(
        test_case=LLMTestCase(
            input=input_text,
            actual_output=output,
            expected_output=golden.expected_output if golden else None
        )
    )
    return output
# Create dataset
dataset = EvaluationDataset(goldens=[
    Golden(input="Question 1", expected_output="Answer 1"),
    Golden(input="Question 2", expected_output="Answer 2")
])
# Run the agent over each golden and evaluate the traced spans
from deepeval import evaluate
result = evaluate(
    observed_callback=my_agent,
    goldens=dataset.goldens
)
Iterate over dataset for batch processing.
from deepeval.dataset import EvaluationDataset
dataset = EvaluationDataset()
dataset.pull(alias="my-dataset")
# Iterate over goldens
for golden in dataset.goldens:
    print(f"Input: {golden.input}")
    print(f"Expected: {golden.expected_output}")
# Iterate over test cases
for test_case in dataset.test_cases:
    print(f"Input: {test_case.input}")
    print(f"Output: {test_case.actual_output}")
# Use with pytest parametrize
import pytest
from deepeval import assert_test
from deepeval.metrics import AnswerRelevancyMetric
metrics = [AnswerRelevancyMetric(threshold=0.7)]
@pytest.mark.parametrize("test_case", dataset.test_cases)
def test_dataset(test_case):
    assert_test(test_case, metrics)
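# The parametrized test above is typically executed through deepeval's pytest
# wrapper, e.g. `deepeval test run test_dataset.py` (test_dataset.py is a
# hypothetical file name for this test module).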