Comprehensive LLM evaluation framework with 50+ metrics for testing RAG, chatbots, and AI agents
Metrics designed for evaluating multi-turn conversations, measuring relevancy, completeness, role adherence, and conversational quality. These metrics work with ConversationalTestCase objects.
from deepeval.metrics import (
ConversationalGEval,
TurnRelevancyMetric,
ConversationCompletenessMetric,
RoleAdherenceMetric,
MultiTurnMCPUseMetric,
ConversationalDAGMetric
)

G-Eval for conversational test cases, allowing custom evaluation criteria for multi-turn conversations.
class ConversationalGEval:
"""
G-Eval for conversational test cases.
Parameters:
- name (str): Name of the metric
- criteria (str): Evaluation criteria
- evaluation_params (List[TurnParams]): Parameters to evaluate
- evaluation_steps (List[str], optional): Steps for evaluation
- threshold (float): Success threshold (default: 0.5)
- model (Union[str, DeepEvalBaseLLM], optional): Evaluation model
- async_mode (bool): Async mode (default: True)
- strict_mode (bool): Strict mode (default: False)
- verbose_mode (bool): Verbose mode (default: False)
Attributes:
- score (float): Evaluation score (0-1)
- reason (str): Explanation of the score
- success (bool): Whether score meets threshold
"""

Usage example:
from deepeval.metrics import ConversationalGEval
from deepeval.test_case import ConversationalTestCase, Turn, TurnParams
# Create custom conversational metric
metric = ConversationalGEval(
name="Customer Satisfaction",
criteria="Evaluate if the conversation leads to customer satisfaction",
evaluation_params=[
TurnParams.CONTENT,
TurnParams.SCENARIO,
TurnParams.EXPECTED_OUTCOME
],
evaluation_steps=[
"Analyze if agent addressed customer concerns",
"Check if agent was polite and professional",
"Evaluate if the expected outcome was achieved"
],
threshold=0.7
)
# Create conversational test case
conversation = ConversationalTestCase(
scenario="Customer wants to return a defective product",
expected_outcome="Customer receives return label and is satisfied",
turns=[
Turn(role="user", content="My product arrived broken"),
Turn(role="assistant", content="I'm sorry to hear that. Can you provide your order number?"),
Turn(role="user", content="Order #12345"),
Turn(role="assistant", content="I've initiated a return. You'll receive a prepaid label via email.")
]
)
# Evaluate
metric.measure(conversation)
print(f"Customer satisfaction score: {metric.score:.2f}")
print(f"Reason: {metric.reason}")

Measures relevancy of conversation turns to the overall scenario and context.
class TurnRelevancyMetric:
"""
Measures relevancy of conversation turns.
Parameters:
- threshold (float): Success threshold (0-1, default: 0.5)
- model (Union[str, DeepEvalBaseLLM], optional): Evaluation model
- include_reason (bool): Include reason in output (default: True)
- async_mode (bool): Async mode (default: True)
Required Test Case Parameters:
- TURNS
- SCENARIO
Attributes:
- score (float): Turn relevancy score (0-1)
- reason (str): Explanation identifying irrelevant turns
- success (bool): Whether score meets threshold
"""

Usage example:
from deepeval.metrics import TurnRelevancyMetric
from deepeval.test_case import ConversationalTestCase, Turn
metric = TurnRelevancyMetric(threshold=0.8)
conversation = ConversationalTestCase(
scenario="Customer inquiring about shipping status",
turns=[
Turn(role="user", content="Where is my order?"),
Turn(role="assistant", content="Let me check. What's your order number?"),
Turn(role="user", content="#12345"),
Turn(role="assistant", content="By the way, did you know we have a new product line?"), # Irrelevant
Turn(role="assistant", content="Your order is out for delivery today")
]
)
metric.measure(conversation)
if not metric.success:
print(f"Irrelevant turns detected: {metric.reason}")

Evaluates completeness of conversations based on expected outcomes and scenario requirements.
class ConversationCompletenessMetric:
"""
Evaluates completeness of conversations.
Parameters:
- threshold (float): Success threshold (0-1, default: 0.5)
- model (Union[str, DeepEvalBaseLLM], optional): Evaluation model
- include_reason (bool): Include reason in output (default: True)
- async_mode (bool): Async mode (default: True)
Required Test Case Parameters:
- TURNS
- SCENARIO
- EXPECTED_OUTCOME
Attributes:
- score (float): Completeness score (0-1)
- reason (str): Explanation of what's incomplete
- success (bool): Whether score meets threshold
"""

Usage example:
from deepeval.metrics import ConversationCompletenessMetric
from deepeval.test_case import ConversationalTestCase, Turn
metric = ConversationCompletenessMetric(threshold=0.8)
# Incomplete conversation
incomplete_conversation = ConversationalTestCase(
scenario="Customer wants to change shipping address",
expected_outcome="Shipping address is updated and confirmed",
turns=[
Turn(role="user", content="I need to change my shipping address"),
Turn(role="assistant", content="I can help with that. What's your order number?"),
Turn(role="user", content="#12345")
# Conversation ends without address change
]
)
metric.measure(incomplete_conversation)
if not metric.success:
print(f"Incomplete: {metric.reason}")
# Example: "Expected outcome 'address is updated' was not achieved"
# Complete conversation
complete_conversation = ConversationalTestCase(
scenario="Customer wants to change shipping address",
expected_outcome="Shipping address is updated and confirmed",
turns=[
Turn(role="user", content="I need to change my shipping address"),
Turn(role="assistant", content="I can help with that. What's your order number?"),
Turn(role="user", content="#12345"),
Turn(role="assistant", content="What's the new address?"),
Turn(role="user", content="123 Main St, New York, NY 10001"),
Turn(role="assistant", content="Updated! Your order will ship to 123 Main St, New York, NY 10001")
]
)
metric.measure(complete_conversation)
print(f"Completeness: {metric.score:.2f}")

Measures adherence to assigned role in conversations.
class RoleAdherenceMetric:
"""
Measures adherence to assigned role in conversations.
Parameters:
- threshold (float): Success threshold (0-1, default: 0.5)
- model (Union[str, DeepEvalBaseLLM], optional): Evaluation model
- include_reason (bool): Include reason in output (default: True)
Required Test Case Parameters:
- TURNS
- CHATBOT_ROLE or role defined in test case
Attributes:
- score (float): Role adherence score (0-1)
- reason (str): Explanation of role violations
- success (bool): Whether score meets threshold
"""

Usage example:
from deepeval.metrics import RoleAdherenceMetric
from deepeval.test_case import ConversationalTestCase, Turn
metric = RoleAdherenceMetric(threshold=0.8)
conversation = ConversationalTestCase(
scenario="Technical support for printer issue",
chatbot_role="Technical support specialist for printers",
turns=[
Turn(role="user", content="My printer won't print"),
Turn(role="assistant", content="Let me help you troubleshoot. Is the printer powered on?"),
Turn(role="user", content="Yes, it's on"),
Turn(role="assistant", content="Check the paper tray and ink levels"),
Turn(role="user", content="How's the weather today?"),
Turn(role="assistant", content="The weather is sunny, 75°F.") # Role violation
]
)
metric.measure(conversation)
if not metric.success:
print(f"Role violation: {metric.reason}")

Evaluates MCP usage across multiple conversation turns.
class MultiTurnMCPUseMetric:
"""
Evaluates MCP usage across multiple turns.
Parameters:
- threshold (float): Success threshold (0-1, default: 0.5)
- model (Union[str, DeepEvalBaseLLM], optional): Evaluation model
Required Test Case Parameters:
- TURNS (with MCP tools/resources/prompts)
- MCP_SERVERS
Attributes:
- score (float): MCP usage score (0-1)
- reason (str): Explanation of MCP usage quality
- success (bool): Whether score meets threshold
"""

Usage example:
from deepeval.metrics import MultiTurnMCPUseMetric
from deepeval.test_case import (
ConversationalTestCase,
Turn,
MCPServer,
MCPToolCall
)
metric = MultiTurnMCPUseMetric(threshold=0.7)
conversation = ConversationalTestCase(
scenario="Research assistant helping with data analysis",
mcp_servers=[
MCPServer(
server_name="data-server",
available_tools=["query_database", "generate_chart"]
)
],
turns=[
Turn(
role="user",
content="Show me sales data for Q1"
),
Turn(
role="assistant",
content="Here's the Q1 sales data...",
mcp_tools_called=[
MCPToolCall(
server_name="data-server",
tool_name="query_database",
arguments={"query": "SELECT * FROM sales WHERE quarter='Q1'"}
)
]
),
Turn(
role="user",
content="Can you create a chart?"
),
Turn(
role="assistant",
content="Here's a chart of the data...",
mcp_tools_called=[
MCPToolCall(
server_name="data-server",
tool_name="generate_chart",
arguments={"data": [...], "type": "bar"}
)
]
)
]
)
metric.measure(conversation)

DAG (Deep Acyclic Graph) metric for conversational flows.
class ConversationalDAGMetric:
"""
DAG metric for conversational flows.
Parameters:
- name (str): Name of the metric
- dag (DeepAcyclicGraph): DAG structure for conversation evaluation
- threshold (float): Success threshold (default: 0.5)
Required Test Case Parameters:
- TURNS
Attributes:
- score (float): DAG compliance score (0-1)
- reason (str): Explanation of DAG evaluation
- success (bool): Whether score meets threshold
"""

Usage example:
from deepeval.metrics import ConversationalDAGMetric, DeepAcyclicGraph
from deepeval.test_case import ConversationalTestCase, Turn
# Define conversation flow DAG
conversation_dag = DeepAcyclicGraph()
conversation_dag.add_node("greeting", "Agent greets customer")
conversation_dag.add_node("identify_issue", "Identify customer issue")
conversation_dag.add_node("resolve_issue", "Resolve the issue")
conversation_dag.add_node("confirm_resolution", "Confirm issue is resolved")
conversation_dag.add_edge("greeting", "identify_issue")
conversation_dag.add_edge("identify_issue", "resolve_issue")
conversation_dag.add_edge("resolve_issue", "confirm_resolution")
# Create metric
metric = ConversationalDAGMetric(
name="Support Flow",
dag=conversation_dag,
threshold=0.8
)
# Evaluate conversation against DAG
conversation = ConversationalTestCase(
scenario="Customer support interaction",
turns=[
Turn(role="assistant", content="Hello! How can I help you today?"), # greeting
Turn(role="user", content="My order hasn't arrived"),
Turn(role="assistant", content="Let me look that up for you"), # identify_issue
Turn(role="assistant", content="I've located your order and will expedite it"), # resolve_issue
Turn(role="assistant", content="Is there anything else I can help with?") # confirm_resolution
]
)
metric.measure(conversation)

Evaluate all conversational aspects together:
from deepeval import evaluate
from deepeval.metrics import (
ConversationalGEval,
TurnRelevancyMetric,
ConversationCompletenessMetric,
RoleAdherenceMetric
)
from deepeval.test_case import ConversationalTestCase, Turn, TurnParams
# Create comprehensive conversational metrics
conv_metrics = [
ConversationalGEval(
name="Overall Quality",
criteria="Evaluate conversation quality and helpfulness",
evaluation_params=[TurnParams.CONTENT, TurnParams.SCENARIO],
threshold=0.7
),
TurnRelevancyMetric(threshold=0.8),
ConversationCompletenessMetric(threshold=0.8),
RoleAdherenceMetric(threshold=0.8)
]
# Test conversations
conversations = [
ConversationalTestCase(
scenario="Product inquiry",
chatbot_role="Sales assistant",
expected_outcome="Customer receives product information",
turns=[...]
),
# ... more conversations
]
# Evaluate
result = evaluate(conversations, conv_metrics)
# Analyze results
for test_result in result.test_results:
print(f"\nConversation: {test_result.name}")
for metric_name, metric_result in test_result.metrics.items():
status = "✓" if metric_result.success else "✗"
print(f" {status} {metric_name}: {metric_result.score:.2f}")

Use ConversationalGEval to evaluate personality traits:
from deepeval.metrics import ConversationalGEval
from deepeval.test_case import ConversationalTestCase, TurnParams
# Evaluate empathy
empathy_metric = ConversationalGEval(
name="Empathy",
criteria="Evaluate if the chatbot shows empathy and understanding of user emotions",
evaluation_params=[TurnParams.CONTENT],
threshold=0.8
)
# Evaluate professionalism
professionalism_metric = ConversationalGEval(
name="Professionalism",
criteria="Evaluate if the chatbot maintains professional tone and language",
evaluation_params=[TurnParams.CONTENT],
threshold=0.8
)
# Evaluate helpfulness
helpfulness_metric = ConversationalGEval(
name="Helpfulness",
criteria="Evaluate if the chatbot provides helpful and actionable information",
evaluation_params=[TurnParams.CONTENT, TurnParams.EXPECTED_OUTCOME],
threshold=0.8
)
personality_metrics = [empathy_metric, professionalism_metric, helpfulness_metric]
# Evaluate chatbot personality
result = evaluate(conversations, personality_metrics)