Comprehensive LLM evaluation framework with 50+ metrics for testing RAG, chatbots, and AI agents
Metrics designed for evaluating multi-turn conversations, measuring relevancy, completeness, role adherence, and conversational quality. These metrics work with ConversationalTestCase objects.
from deepeval.metrics import (
ConversationalGEval,
TurnRelevancyMetric,
ConversationCompletenessMetric,
RoleAdherenceMetric,
MultiTurnMCPUseMetric,
ConversationalDAGMetric
)

G-Eval for conversational test cases, allowing custom evaluation criteria for multi-turn conversations.
class ConversationalGEval:
"""
G-Eval for conversational test cases.
Parameters:
- name (str): Name of the metric
- criteria (str): Evaluation criteria
- evaluation_params (List[TurnParams]): Parameters to evaluate
- evaluation_steps (List[str], optional): Steps for evaluation
- threshold (float): Success threshold (default: 0.5)
- model (Union[str, DeepEvalBaseLLM], optional): Evaluation model
- async_mode (bool): Async mode (default: True)
- strict_mode (bool): Strict mode (default: False)
- verbose_mode (bool): Verbose mode (default: False)
Attributes:
- score (float): Evaluation score (0-1)
- reason (str): Explanation of the score
- success (bool): Whether score meets threshold
"""

Usage example:
from deepeval.metrics import ConversationalGEval
from deepeval.test_case import ConversationalTestCase, Turn, TurnParams
# Create custom conversational metric
metric = ConversationalGEval(
name="Customer Satisfaction",
criteria="Evaluate if the conversation leads to customer satisfaction",
evaluation_params=[
TurnParams.CONTENT,
TurnParams.SCENARIO,
TurnParams.EXPECTED_OUTCOME
],
evaluation_steps=[
"Analyze if agent addressed customer concerns",
"Check if agent was polite and professional",
"Evaluate if the expected outcome was achieved"
],
threshold=0.7
)
# Create conversational test case
conversation = ConversationalTestCase(
scenario="Customer wants to return a defective product",
expected_outcome="Customer receives return label and is satisfied",
turns=[
Turn(role="user", content="My product arrived broken"),
Turn(role="assistant", content="I'm sorry to hear that. Can you provide your order number?"),
Turn(role="user", content="Order #12345"),
Turn(role="assistant", content="I've initiated a return. You'll receive a prepaid label via email.")
]
)
# Evaluate
metric.measure(conversation)
print(f"Customer satisfaction score: {metric.score:.2f}")
print(f"Reason: {metric.reason}")

Measures relevancy of conversation turns to the overall scenario and context.
class TurnRelevancyMetric:
"""
Measures relevancy of conversation turns.
Parameters:
- threshold (float): Success threshold (0-1, default: 0.5)
- model (Union[str, DeepEvalBaseLLM], optional): Evaluation model
- include_reason (bool): Include reason in output (default: True)
- async_mode (bool): Async mode (default: True)
Required Test Case Parameters:
- TURNS
- SCENARIO
Attributes:
- score (float): Turn relevancy score (0-1)
- reason (str): Explanation identifying irrelevant turns
- success (bool): Whether score meets threshold
"""

Usage example:
from deepeval.metrics import TurnRelevancyMetric
from deepeval.test_case import ConversationalTestCase, Turn
metric = TurnRelevancyMetric(threshold=0.8)
conversation = ConversationalTestCase(
scenario="Customer inquiring about shipping status",
turns=[
Turn(role="user", content="Where is my order?"),
Turn(role="assistant", content="Let me check. What's your order number?"),
Turn(role="user", content="#12345"),
Turn(role="assistant", content="By the way, did you know we have a new product line?"), # Irrelevant
Turn(role="assistant", content="Your order is out for delivery today")
]
)
metric.measure(conversation)
if not metric.success:
print(f"Irrelevant turns detected: {metric.reason}")

Evaluates completeness of conversations based on expected outcomes and scenario requirements.
class ConversationCompletenessMetric:
"""
Evaluates completeness of conversations.
Parameters:
- threshold (float): Success threshold (0-1, default: 0.5)
- model (Union[str, DeepEvalBaseLLM], optional): Evaluation model
- include_reason (bool): Include reason in output (default: True)
- async_mode (bool): Async mode (default: True)
Required Test Case Parameters:
- TURNS
- SCENARIO
- EXPECTED_OUTCOME
Attributes:
- score (float): Completeness score (0-1)
- reason (str): Explanation of what's incomplete
- success (bool): Whether score meets threshold
"""

Usage example:
from deepeval.metrics import ConversationCompletenessMetric
from deepeval.test_case import ConversationalTestCase, Turn
metric = ConversationCompletenessMetric(threshold=0.8)
# Incomplete conversation
incomplete_conversation = ConversationalTestCase(
scenario="Customer wants to change shipping address",
expected_outcome="Shipping address is updated and confirmed",
turns=[
Turn(role="user", content="I need to change my shipping address"),
Turn(role="assistant", content="I can help with that. What's your order number?"),
Turn(role="user", content="#12345")
# Conversation ends without address change
]
)
metric.measure(incomplete_conversation)
if not metric.success:
print(f"Incomplete: {metric.reason}")
# Example: "Expected outcome 'address is updated' was not achieved"
# Complete conversation
complete_conversation = ConversationalTestCase(
scenario="Customer wants to change shipping address",
expected_outcome="Shipping address is updated and confirmed",
turns=[
Turn(role="user", content="I need to change my shipping address"),
Turn(role="assistant", content="I can help with that. What's your order number?"),
Turn(role="user", content="#12345"),
Turn(role="assistant", content="What's the new address?"),
Turn(role="user", content="123 Main St, New York, NY 10001"),
Turn(role="assistant", content="Updated! Your order will ship to 123 Main St, New York, NY 10001")
]
)
metric.measure(complete_conversation)
print(f"Completeness: {metric.score:.2f}")

Measures adherence to assigned role in conversations.
class RoleAdherenceMetric:
"""
Measures adherence to assigned role in conversations.
Parameters:
- threshold (float): Success threshold (0-1, default: 0.5)
- model (Union[str, DeepEvalBaseLLM], optional): Evaluation model
- include_reason (bool): Include reason in output (default: True)
Required Test Case Parameters:
- TURNS
- CHATBOT_ROLE or role defined in test case
Attributes:
- score (float): Role adherence score (0-1)
- reason (str): Explanation of role violations
- success (bool): Whether score meets threshold
"""

Usage example:
from deepeval.metrics import RoleAdherenceMetric
from deepeval.test_case import ConversationalTestCase, Turn
metric = RoleAdherenceMetric(threshold=0.8)
conversation = ConversationalTestCase(
scenario="Technical support for printer issue",
chatbot_role="Technical support specialist for printers",
turns=[
Turn(role="user", content="My printer won't print"),
Turn(role="assistant", content="Let me help you troubleshoot. Is the printer powered on?"),
Turn(role="user", content="Yes, it's on"),
Turn(role="assistant", content="Check the paper tray and ink levels"),
Turn(role="user", content="How's the weather today?"),
Turn(role="assistant", content="The weather is sunny, 75°F.") # Role violation
]
)
metric.measure(conversation)
if not metric.success:
print(f"Role violation: {metric.reason}")

Evaluates MCP usage across multiple conversation turns.
class MultiTurnMCPUseMetric:
"""
Evaluates MCP usage across multiple turns.
Parameters:
- threshold (float): Success threshold (0-1, default: 0.5)
- model (Union[str, DeepEvalBaseLLM], optional): Evaluation model
Required Test Case Parameters:
- TURNS (with MCP tools/resources/prompts)
- MCP_SERVERS
Attributes:
- score (float): MCP usage score (0-1)
- reason (str): Explanation of MCP usage quality
- success (bool): Whether score meets threshold
"""

Usage example:
from deepeval.metrics import MultiTurnMCPUseMetric
from deepeval.test_case import (
ConversationalTestCase,
Turn,
MCPServer,
MCPToolCall
)
metric = MultiTurnMCPUseMetric(threshold=0.7)
conversation = ConversationalTestCase(
scenario="Research assistant helping with data analysis",
mcp_servers=[
MCPServer(
server_name="data-server",
available_tools=["query_database", "generate_chart"]
)
],
turns=[
Turn(
role="user",
content="Show me sales data for Q1"
),
Turn(
role="assistant",
content="Here's the Q1 sales data...",
mcp_tools_called=[
MCPToolCall(
server_name="data-server",
tool_name="query_database",
arguments={"query": "SELECT * FROM sales WHERE quarter='Q1'"}
)
]
),
Turn(
role="user",
content="Can you create a chart?"
),
Turn(
role="assistant",
content="Here's a chart of the data...",
mcp_tools_called=[
MCPToolCall(
server_name="data-server",
tool_name="generate_chart",
arguments={"data": [...], "type": "bar"}
)
]
)
]
)
metric.measure(conversation)

DAG (Deep Acyclic Graph) metric for conversational flows.
class ConversationalDAGMetric:
"""
DAG metric for conversational flows.
Parameters:
- name (str): Name of the metric
- dag (DeepAcyclicGraph): DAG structure for conversation evaluation
- threshold (float): Success threshold (default: 0.5)
Required Test Case Parameters:
- TURNS
Attributes:
- score (float): DAG compliance score (0-1)
- reason (str): Explanation of DAG evaluation
- success (bool): Whether score meets threshold
"""

Usage example:
from deepeval.metrics import ConversationalDAGMetric, DeepAcyclicGraph
from deepeval.test_case import ConversationalTestCase, Turn
# Define conversation flow DAG
conversation_dag = DeepAcyclicGraph()
conversation_dag.add_node("greeting", "Agent greets customer")
conversation_dag.add_node("identify_issue", "Identify customer issue")
conversation_dag.add_node("resolve_issue", "Resolve the issue")
conversation_dag.add_node("confirm_resolution", "Confirm issue is resolved")
conversation_dag.add_edge("greeting", "identify_issue")
conversation_dag.add_edge("identify_issue", "resolve_issue")
conversation_dag.add_edge("resolve_issue", "confirm_resolution")
# Create metric
metric = ConversationalDAGMetric(
name="Support Flow",
dag=conversation_dag,
threshold=0.8
)
# Evaluate conversation against DAG
conversation = ConversationalTestCase(
scenario="Customer support interaction",
turns=[
Turn(role="assistant", content="Hello! How can I help you today?"), # greeting
Turn(role="user", content="My order hasn't arrived"),
Turn(role="assistant", content="Let me look that up for you"), # identify_issue
Turn(role="assistant", content="I've located your order and will expedite it"), # resolve_issue
Turn(role="assistant", content="Is there anything else I can help with?") # confirm_resolution
]
)
metric.measure(conversation)

Evaluate all conversational aspects together:
from deepeval import evaluate
from deepeval.metrics import (
ConversationalGEval,
TurnRelevancyMetric,
ConversationCompletenessMetric,
RoleAdherenceMetric
)
from deepeval.test_case import ConversationalTestCase, Turn, TurnParams
# Create comprehensive conversational metrics
conv_metrics = [
ConversationalGEval(
name="Overall Quality",
criteria="Evaluate conversation quality and helpfulness",
evaluation_params=[TurnParams.CONTENT, TurnParams.SCENARIO],
threshold=0.7
),
TurnRelevancyMetric(threshold=0.8),
ConversationCompletenessMetric(threshold=0.8),
RoleAdherenceMetric(threshold=0.8)
]
# Test conversations
conversations = [
ConversationalTestCase(
scenario="Product inquiry",
chatbot_role="Sales assistant",
expected_outcome="Customer receives product information",
turns=[...]
),
# ... more conversations
]
# Evaluate
result = evaluate(conversations, conv_metrics)
# Analyze results
for test_result in result.test_results:
print(f"\nConversation: {test_result.name}")
for metric_name, metric_result in test_result.metrics.items():
status = "✓" if metric_result.success else "✗"
print(f" {status} {metric_name}: {metric_result.score:.2f}")

Use ConversationalGEval to evaluate personality traits:
from deepeval.metrics import ConversationalGEval
from deepeval.test_case import ConversationalTestCase, TurnParams
# Evaluate empathy
empathy_metric = ConversationalGEval(
name="Empathy",
criteria="Evaluate if the chatbot shows empathy and understanding of user emotions",
evaluation_params=[TurnParams.CONTENT],
threshold=0.8
)
# Evaluate professionalism
professionalism_metric = ConversationalGEval(
name="Professionalism",
criteria="Evaluate if the chatbot maintains professional tone and language",
evaluation_params=[TurnParams.CONTENT],
threshold=0.8
)
# Evaluate helpfulness
helpfulness_metric = ConversationalGEval(
name="Helpfulness",
criteria="Evaluate if the chatbot provides helpful and actionable information",
evaluation_params=[TurnParams.CONTENT, TurnParams.EXPECTED_OUTCOME],
threshold=0.8
)
personality_metrics = [empathy_metric, professionalism_metric, helpfulness_metric]
# Evaluate chatbot personality
result = evaluate(conversations, personality_metrics)