Comprehensive LLM evaluation framework with 50+ metrics for testing RAG, chatbots, and AI agents
Test cases are structured containers representing LLM interactions to be evaluated. DeepEval provides specialized test case classes for different evaluation scenarios: standard LLM tests, multi-turn conversations, multimodal inputs, and arena-style comparisons.
from deepeval.test_case import (
LLMTestCase,
LLMTestCaseParams,
ConversationalTestCase,
Turn,
TurnParams,
MLLMTestCase,
MLLMImage,
MLLMTestCaseParams,
ArenaTestCase,
Arena,
ToolCall,
ToolCallParams,
MCPServer,
MCPToolCall,
MCPPromptCall,
MCPResourceCall
)

Standard test case for evaluating single LLM interactions, supporting inputs, outputs, context, and tool usage.
class LLMTestCase:
"""
Represents a test case for evaluating LLM outputs.
Parameters:
- input (str): Input prompt to the LLM
- actual_output (str, optional): Actual output from the LLM
- expected_output (str, optional): Expected output
- context (List[str], optional): Context information
- retrieval_context (List[str], optional): Retrieved context for RAG applications
- additional_metadata (Dict, optional): Additional metadata
- tools_called (List[ToolCall], optional): Tools called by the LLM
- expected_tools (List[ToolCall], optional): Expected tools to be called
- comments (str, optional): Comments about the test case
- token_cost (float, optional): Cost in tokens
- completion_time (float, optional): Time to complete in seconds
- name (str, optional): Name of the test case
- tags (List[str], optional): Tags for organization
- mcp_servers (List[MCPServer], optional): MCP servers configuration
- mcp_tools_called (List[MCPToolCall], optional): MCP tools called
- mcp_resources_called (List[MCPResourceCall], optional): MCP resources called
- mcp_prompts_called (List[MCPPromptCall], optional): MCP prompts called
"""

Usage example:
from deepeval.test_case import LLMTestCase
# Basic test case
test_case = LLMTestCase(
input="What is the capital of France?",
actual_output="The capital of France is Paris.",
expected_output="Paris"
)
# RAG test case with retrieval context
rag_test_case = LLMTestCase(
input="What's our refund policy?",
actual_output="We offer a 30-day full refund at no extra cost.",
expected_output="30-day full refund policy",
retrieval_context=[
"All customers are eligible for a 30 day full refund at no extra costs.",
"Refunds are processed within 5-7 business days."
],
context=["Customer support FAQ"]
)
# Agentic test case with tool calls
agentic_test_case = LLMTestCase(
input="What's the weather in New York?",
actual_output="The current weather in New York is 72°F and sunny.",
tools_called=[
ToolCall(
name="get_weather",
input_parameters={"location": "New York", "unit": "fahrenheit"},
output={"temperature": 72, "condition": "sunny"}
)
],
expected_tools=[
ToolCall(name="get_weather", input_parameters={"location": "New York"})
]
)

Enumeration of test case parameters for use with metrics.
class LLMTestCaseParams:
"""
Enumeration of test case parameters.
Values:
- INPUT: "input"
- ACTUAL_OUTPUT: "actual_output"
- EXPECTED_OUTPUT: "expected_output"
- CONTEXT: "context"
- RETRIEVAL_CONTEXT: "retrieval_context"
- TOOLS_CALLED: "tools_called"
- EXPECTED_TOOLS: "expected_tools"
- MCP_SERVERS: "mcp_servers"
- MCP_TOOLS_CALLED: "mcp_tools_called"
- MCP_RESOURCES_CALLED: "mcp_resources_called"
- MCP_PROMPTS_CALLED: "mcp_prompts_called"
"""

Usage example:
from deepeval.metrics import GEval
from deepeval.test_case import LLMTestCaseParams
# Use params to specify what to evaluate
metric = GEval(
name="Answer Relevancy",
criteria="Determine if the actual output is relevant to the input.",
evaluation_params=[LLMTestCaseParams.INPUT, LLMTestCaseParams.ACTUAL_OUTPUT]
)

Represents a tool call made by an LLM or expected to be called.
class ToolCall:
"""
Represents a tool call made by an LLM.
Parameters:
- name (str): Name of the tool
- description (str, optional): Description of the tool
- reasoning (str, optional): Reasoning for calling the tool
- output (Any, optional): Output from the tool
- input_parameters (Dict[str, Any], optional): Input parameters to the tool
"""

Usage example:
from deepeval.test_case import ToolCall
# Define a tool call
tool_call = ToolCall(
name="search_database",
description="Searches the product database",
reasoning="Need to find product information",
input_parameters={
"query": "wireless headphones",
"max_results": 10
},
output=[
{"id": 1, "name": "Premium Wireless Headphones"},
{"id": 2, "name": "Budget Wireless Headphones"}
]
)

Enumeration of tool call parameters.
class ToolCallParams:
"""
Enumeration of tool call parameters.
Values:
- INPUT_PARAMETERS: "input_parameters"
- OUTPUT: "output"
"""

Test case for evaluating multi-turn conversational interactions.
class ConversationalTestCase:
"""
Represents a multi-turn conversational test case.
Parameters:
- turns (List[Turn]): List of conversation turns
- scenario (str, optional): Scenario description
- context (List[str], optional): Context information
- name (str, optional): Name of the test case
- user_description (str, optional): Description of the user
- expected_outcome (str, optional): Expected outcome of the conversation
- chatbot_role (str, optional): Role of the chatbot
- additional_metadata (Dict, optional): Additional metadata
- comments (str, optional): Comments
- tags (List[str], optional): Tags for organization
- mcp_servers (List[MCPServer], optional): MCP servers configuration
"""

Usage example:
from deepeval.test_case import ConversationalTestCase, Turn
# Multi-turn customer support conversation
conversation = ConversationalTestCase(
scenario="Customer inquiring about product return",
chatbot_role="Customer support agent",
user_description="Customer who wants to return a product",
expected_outcome="Customer understands return process and is satisfied",
context=["30-day return policy", "Free return shipping"],
turns=[
Turn(
role="user",
content="I want to return my purchase"
),
Turn(
role="assistant",
content="I'd be happy to help with your return. Can you provide your order number?"
),
Turn(
role="user",
content="My order number is #12345"
),
Turn(
role="assistant",
content="Thank you. I've initiated your return. You'll receive a prepaid return label via email within 24 hours.",
retrieval_context=["Order #12345 placed on 2024-01-15"]
)
]
)

Represents a single turn in a conversation.
class Turn:
"""
Represents a single turn in a conversation.
Parameters:
- role (Literal["user", "assistant"]): Role of the speaker
- content (str): Content of the turn
- user_id (str, optional): User identifier
- retrieval_context (List[str], optional): Retrieved context for this turn
- tools_called (List[ToolCall], optional): Tools called during this turn
- mcp_tools_called (List[MCPToolCall], optional): MCP tools called
- mcp_resources_called (List[MCPResourceCall], optional): MCP resources called
- mcp_prompts_called (List[MCPPromptCall], optional): MCP prompts called
- additional_metadata (Dict, optional): Additional metadata
"""

Usage example:
from deepeval.test_case import Turn, ToolCall
# Assistant turn with tool usage
turn = Turn(
role="assistant",
content="I've checked the weather for you. It's currently 72°F and sunny in New York.",
tools_called=[
ToolCall(
name="get_weather",
input_parameters={"city": "New York"},
output={"temp": 72, "condition": "sunny"}
)
],
retrieval_context=["User prefers Fahrenheit for temperature"]
)

Enumeration of turn parameters for use with conversational metrics.
class TurnParams:
"""
Enumeration of turn parameters.
Values:
- ROLE: "role"
- CONTENT: "content"
- SCENARIO: "scenario"
- EXPECTED_OUTCOME: "expected_outcome"
- RETRIEVAL_CONTEXT: "retrieval_context"
- TOOLS_CALLED: "tools_called"
- MCP_TOOLS: "mcp_tools_called"
- MCP_RESOURCES: "mcp_resources_called"
- MCP_PROMPTS: "mcp_prompts_called"
"""

Test case for evaluating multimodal LLM interactions involving text and images.
class MLLMTestCase:
"""
Represents a test case for multimodal LLMs (text + images).
Parameters:
- input (List[Union[str, MLLMImage]]): Input with text and images
- actual_output (List[Union[str, MLLMImage]]): Actual output
- expected_output (List[Union[str, MLLMImage]], optional): Expected output
- context (List[Union[str, MLLMImage]], optional): Context
- retrieval_context (List[Union[str, MLLMImage]], optional): Retrieved context
- additional_metadata (Dict, optional): Additional metadata
- comments (str, optional): Comments
- tools_called (List[ToolCall], optional): Tools called
- expected_tools (List[ToolCall], optional): Expected tools
- token_cost (float, optional): Token cost
- completion_time (float, optional): Completion time in seconds
- name (str, optional): Name
"""

Usage example:
from deepeval.test_case import MLLMTestCase, MLLMImage
# Image description test case
mllm_test_case = MLLMTestCase(
input=[
"Describe what you see in this image:",
MLLMImage(url="path/to/image.jpg", local=True)
],
actual_output=["A golden retriever playing in a park with a red ball."],
expected_output=["A dog playing with a ball in a park."]
)
# Visual question answering
vqa_test_case = MLLMTestCase(
input=[
"What color is the car in the image?",
MLLMImage(url="https://example.com/car.jpg")
],
actual_output=["The car is red."],
expected_output=["Red"]
)

Represents an image in a multimodal test case.
class MLLMImage:
"""
Represents an image in a multimodal test case.
Parameters:
- url (str): URL or file path to the image
- local (bool, optional): Whether the image is local (default: False)
Computed Attributes (only populated for local images):
- filename (Optional[str]): Filename extracted from URL
- mimeType (Optional[str]): MIME type of the image
- dataBase64 (Optional[str]): Base64 encoded image data
Static Methods:
- process_url(url: str) -> str: Processes a URL and returns the processed path
- is_local_path(url: str) -> bool: Determines if a URL is a local file path
"""

Usage example:
from deepeval.test_case import MLLMImage
# Local image
local_image = MLLMImage(
url="/path/to/local/image.png",
local=True
)
# Remote image
remote_image = MLLMImage(
url="https://example.com/image.jpg"
)

Enumeration of multimodal test case parameters.
class MLLMTestCaseParams:
"""
Enumeration of multimodal test case parameters.
Values:
- INPUT: "input"
- ACTUAL_OUTPUT: "actual_output"
- EXPECTED_OUTPUT: "expected_output"
- CONTEXT: "context"
- RETRIEVAL_CONTEXT: "retrieval_context"
- TOOLS_CALLED: "tools_called"
- EXPECTED_TOOLS: "expected_tools"
"""

Test case for comparing multiple LLM outputs in arena-style evaluation.
class ArenaTestCase:
"""
Represents a test case for comparing multiple LLM outputs (arena-style).
Parameters:
- contestants (Dict[str, LLMTestCase]): Dictionary mapping contestant names to test cases
"""

Usage example:
from deepeval.test_case import ArenaTestCase, LLMTestCase
from deepeval.metrics import ArenaGEval
# Compare outputs from different models
arena_test = ArenaTestCase(
contestants={
"gpt-4": LLMTestCase(
input="Write a haiku about coding",
actual_output="Lines of code flow\\nBugs emerge, then disappear\\nSoftware takes its form"
),
"claude-3": LLMTestCase(
input="Write a haiku about coding",
actual_output="Keys click through the night\\nAlgorithms come alive\\nCode compiles at dawn"
),
"gemini-pro": LLMTestCase(
input="Write a haiku about coding",
actual_output="Functions nested deep\\nVariables dance in loops\\nPrograms bloom to life"
)
}
)
# Evaluate which is best
arena_metric = ArenaGEval(
name="Haiku Quality",
criteria="Determine which haiku best captures the essence of coding"
)
arena_metric.measure(arena_test)
print(f"Winner: {arena_metric.winner}") # Returns name of winning contestant

Container for multiple arena test cases.
class Arena:
"""
Container for managing multiple arena test cases.
Parameters:
- test_cases (List[ArenaTestCase]): List of arena test cases to manage
"""

Usage example:
from deepeval.test_case import Arena, ArenaTestCase, LLMTestCase
# Create multiple arena test cases
arena = Arena(test_cases=[
ArenaTestCase(contestants={
"model-a": LLMTestCase(input="Question 1", actual_output="Answer A1"),
"model-b": LLMTestCase(input="Question 1", actual_output="Answer B1")
}),
ArenaTestCase(contestants={
"model-a": LLMTestCase(input="Question 2", actual_output="Answer A2"),
"model-b": LLMTestCase(input="Question 2", actual_output="Answer B2")
})
])

Model Context Protocol (MCP) support for advanced tool and resource management.
class MCPServer:
"""
Represents an MCP (Model Context Protocol) server configuration.
Parameters:
- server_name (str): Name of the server
- transport (Literal["stdio", "sse", "streamable-http"], optional): Transport protocol
- available_tools (List, optional): Available tools
- available_resources (List, optional): Available resources
- available_prompts (List, optional): Available prompts
"""
class MCPToolCall(BaseModel):
"""
Represents an MCP tool call.
Parameters:
- name (str): Name of the tool
- args (Dict): Tool arguments
- result (object): Tool execution result
"""
class MCPResourceCall(BaseModel):
"""
Represents an MCP resource call.
Parameters:
- uri (AnyUrl): URI of the resource (pydantic AnyUrl type)
- result (object): Resource retrieval result
"""
class MCPPromptCall(BaseModel):
"""
Represents an MCP prompt call.
Parameters:
- name (str): Name of the prompt
- result (object): Prompt execution result
"""

Usage example:
from deepeval.test_case import LLMTestCase, MCPServer, MCPToolCall
# Test case with MCP server usage
mcp_test_case = LLMTestCase(
input="Search for Python tutorials",
actual_output="Here are the top Python tutorials I found...",
mcp_servers=[
MCPServer(
server_name="search-server",
transport="stdio",
available_tools=["web_search", "database_query"]
)
],
mcp_tools_called=[
MCPToolCall(
name="web_search",
args={"query": "Python tutorials", "limit": 10},
result={"count": 10, "results": [...]}
)
]
)