tessl/pypi-deepeval

Comprehensive LLM evaluation framework with 50+ metrics for testing RAG, chatbots, and AI agents


Agentic Metrics

Metrics for evaluating AI agents, including tool usage, task completion, plan quality, and goal achievement. These metrics assess how well agents perform complex, multi-step tasks.

Imports

from deepeval.metrics import (
    ToolCorrectnessMetric,
    TaskCompletionMetric,
    ToolUseMetric,
    PlanQualityMetric,
    PlanAdherenceMetric,
    StepEfficiencyMetric,
    GoalAccuracyMetric,
    MCPTaskCompletionMetric,
    MCPUseMetric
)

Capabilities

Tool Correctness Metric

Evaluates whether the correct tools were called with correct parameters.

class ToolCorrectnessMetric:
    """
    Evaluates whether the correct tools were called with correct parameters.

    Parameters:
    - threshold (float): Success threshold (0-1, default: 0.5)
    - model (Union[str, DeepEvalBaseLLM], optional): Evaluation model
    - include_reason (bool): Include reason in output (default: True)
    - async_mode (bool): Async mode (default: True)

    Required Test Case Parameters:
    - TOOLS_CALLED
    - EXPECTED_TOOLS

    Attributes:
    - score (float): Tool correctness score (0-1)
    - reason (str): Explanation of tool usage issues
    - success (bool): Whether score meets threshold
    """

Usage example:

from deepeval.metrics import ToolCorrectnessMetric
from deepeval.test_case import LLMTestCase, ToolCall

metric = ToolCorrectnessMetric(threshold=0.8)

test_case = LLMTestCase(
    input="What's the weather in New York and London?",
    actual_output="New York: 72°F, sunny. London: 18°C, cloudy.",
    tools_called=[
        ToolCall(
            name="get_weather",
            input_parameters={"city": "New York", "unit": "fahrenheit"},
            output={"temp": 72, "condition": "sunny"}
        ),
        ToolCall(
            name="get_weather",
            input_parameters={"city": "London", "unit": "celsius"},
            output={"temp": 18, "condition": "cloudy"}
        )
    ],
    expected_tools=[
        ToolCall(name="get_weather", input_parameters={"city": "New York"}),
        ToolCall(name="get_weather", input_parameters={"city": "London"})
    ]
)

metric.measure(test_case)

if metric.success:
    print("Tools used correctly")
else:
    print(f"Tool usage issues: {metric.reason}")

Task Completion Metric

Evaluates whether the task was completed successfully based on the goal.

class TaskCompletionMetric:
    """
    Evaluates whether the task was completed successfully.

    Parameters:
    - threshold (float): Success threshold (0-1, default: 0.5)
    - model (Union[str, DeepEvalBaseLLM], optional): Evaluation model
    - include_reason (bool): Include reason in output (default: True)

    Required Test Case Parameters:
    - INPUT (task description)
    - ACTUAL_OUTPUT (task result)

    Attributes:
    - score (float): Task completion score (0-1)
    - reason (str): Explanation of completion status
    - success (bool): Whether score meets threshold
    """

Usage example:

from deepeval.metrics import TaskCompletionMetric
from deepeval.test_case import LLMTestCase, ToolCall

metric = TaskCompletionMetric(threshold=0.8)

test_case = LLMTestCase(
    input="Book a flight from NYC to LAX for next Monday, find a hotel near LAX, and create an itinerary.",
    actual_output="I've booked flight UA123 departing Monday 8am, reserved Hilton LAX for 3 nights, and created a 3-day itinerary including beach visits and city tours.",
    tools_called=[
        ToolCall(name="book_flight", input_parameters={"from": "NYC", "to": "LAX"}),
        ToolCall(name="book_hotel", input_parameters={"location": "LAX"}),
        ToolCall(name="create_itinerary", input_parameters={"days": 3})
    ]
)

metric.measure(test_case)
print(f"Task completion: {metric.score:.2f}")

Tool Use Metric

Evaluates appropriate use of available tools.

class ToolUseMetric:
    """
    Evaluates appropriate use of available tools.

    Parameters:
    - threshold (float): Success threshold (0-1, default: 0.5)
    - model (Union[str, DeepEvalBaseLLM], optional): Evaluation model
    - include_reason (bool): Include reason in output (default: True)

    Required Test Case Parameters:
    - INPUT
    - ACTUAL_OUTPUT
    - TOOLS_CALLED

    Attributes:
    - score (float): Tool use appropriateness score (0-1)
    - reason (str): Explanation of tool usage
    - success (bool): Whether score meets threshold
    """

Usage example:

from deepeval.metrics import ToolUseMetric
from deepeval.test_case import LLMTestCase, ToolCall

metric = ToolUseMetric(threshold=0.7)

test_case = LLMTestCase(
    input="Find restaurants near me and check the weather",
    actual_output="Here are 5 restaurants near you. The weather is sunny, 75°F.",
    tools_called=[
        ToolCall(
            name="search_restaurants",
            input_parameters={"location": "current"}
        ),
        ToolCall(
            name="get_weather",
            input_parameters={"location": "current"}
        )
    ]
)

metric.measure(test_case)
print(f"Tool use: {metric.score:.2f}")

Plan Quality Metric

Evaluates the quality of generated plans.

class PlanQualityMetric:
    """
    Evaluates the quality of generated plans.

    Parameters:
    - threshold (float): Success threshold (0-1, default: 0.5)
    - model (Union[str, DeepEvalBaseLLM], optional): Evaluation model
    - include_reason (bool): Include reason in output (default: True)

    Required Test Case Parameters:
    - INPUT (task/goal)
    - ACTUAL_OUTPUT (plan)

    Attributes:
    - score (float): Plan quality score (0-1)
    - reason (str): Explanation of plan quality
    - success (bool): Whether score meets threshold
    """

Usage example:

from deepeval.metrics import PlanQualityMetric
from deepeval.test_case import LLMTestCase

metric = PlanQualityMetric(threshold=0.7)

test_case = LLMTestCase(
    input="Create a plan to migrate our database from MySQL to PostgreSQL",
    actual_output="""
    Plan:
    1. Audit current MySQL schema and data
    2. Set up PostgreSQL instance
    3. Create migration scripts for schema
    4. Test migration on staging environment
    5. Execute migration during maintenance window
    6. Validate data integrity post-migration
    7. Monitor performance for 48 hours
    """
)

metric.measure(test_case)
print(f"Plan quality: {metric.score:.2f}")

Plan Adherence Metric

Measures adherence to a predefined plan.

class PlanAdherenceMetric:
    """
    Measures adherence to a predefined plan.

    Parameters:
    - threshold (float): Success threshold (0-1, default: 0.5)
    - model (Union[str, DeepEvalBaseLLM], optional): Evaluation model
    - include_reason (bool): Include reason in output (default: True)

    Required Test Case Parameters:
    - EXPECTED_OUTPUT (plan)
    - ACTUAL_OUTPUT (execution)

    Attributes:
    - score (float): Plan adherence score (0-1)
    - reason (str): Explanation of deviations from plan
    - success (bool): Whether score meets threshold
    """

Usage example:

from deepeval.metrics import PlanAdherenceMetric
from deepeval.test_case import LLMTestCase, ToolCall

metric = PlanAdherenceMetric(threshold=0.8)

test_case = LLMTestCase(
    input="Execute the database migration plan",
    expected_output="Plan: 1) Backup data, 2) Stop services, 3) Migrate, 4) Validate, 5) Restart",
    actual_output="Executed: Backed up data, stopped services, ran migration, validated results, restarted services",
    tools_called=[
        ToolCall(name="backup_database"),
        ToolCall(name="stop_services"),
        ToolCall(name="migrate_database"),
        ToolCall(name="validate_data"),
        ToolCall(name="start_services")
    ]
)

metric.measure(test_case)

Step Efficiency Metric

Evaluates efficiency of steps taken to complete a task.

class StepEfficiencyMetric:
    """
    Evaluates efficiency of steps taken to complete a task.

    Parameters:
    - threshold (float): Success threshold (0-1, default: 0.5)
    - model (Union[str, DeepEvalBaseLLM], optional): Evaluation model
    - include_reason (bool): Include reason in output (default: True)

    Required Test Case Parameters:
    - INPUT (task)
    - TOOLS_CALLED or ACTUAL_OUTPUT

    Attributes:
    - score (float): Efficiency score (0-1)
    - reason (str): Explanation of efficiency issues
    - success (bool): Whether score meets threshold
    """

Usage example:

from deepeval.metrics import StepEfficiencyMetric
from deepeval.test_case import LLMTestCase, ToolCall

metric = StepEfficiencyMetric(threshold=0.7)

# Inefficient agent
test_case_inefficient = LLMTestCase(
    input="What's 2 + 2?",
    actual_output="4",
    tools_called=[
        ToolCall(name="search_web", input_parameters={"query": "2+2"}),
        ToolCall(name="calculator", input_parameters={"expression": "2+2"}),
        ToolCall(name="verify_answer", input_parameters={"answer": 4})
    ]
)

metric.measure(test_case_inefficient)
# Low score: Too many steps for simple calculation

# Efficient agent
test_case_efficient = LLMTestCase(
    input="What's 2 + 2?",
    actual_output="4",
    tools_called=[
        ToolCall(name="calculator", input_parameters={"expression": "2+2"})
    ]
)

metric.measure(test_case_efficient)
# High score: Direct and efficient

Goal Accuracy Metric

Measures accuracy in achieving specified goals.

class GoalAccuracyMetric:
    """
    Measures accuracy in achieving specified goals.

    Parameters:
    - threshold (float): Success threshold (0-1, default: 0.5)
    - model (Union[str, DeepEvalBaseLLM], optional): Evaluation model
    - include_reason (bool): Include reason in output (default: True)

    Required Test Case Parameters:
    - INPUT (goal)
    - ACTUAL_OUTPUT (result)
    - EXPECTED_OUTPUT (optional, expected result)

    Attributes:
    - score (float): Goal accuracy score (0-1)
    - reason (str): Explanation of goal achievement
    - success (bool): Whether score meets threshold
    """

Usage example:

from deepeval.metrics import GoalAccuracyMetric
from deepeval.test_case import LLMTestCase

metric = GoalAccuracyMetric(threshold=0.8)

test_case = LLMTestCase(
    input="Goal: Increase website traffic by 20% through SEO optimization",
    actual_output="Implemented SEO changes: optimized meta tags, improved page speed, created quality backlinks. Result: 23% increase in traffic.",
    expected_output="20% increase in website traffic"
)

metric.measure(test_case)
print(f"Goal achievement: {metric.score:.2f}")

MCP Task Completion Metric

Evaluates task completion using Model Context Protocol (MCP) tools.

class MCPTaskCompletionMetric:
    """
    Evaluates task completion using MCP.

    Parameters:
    - threshold (float): Success threshold (0-1, default: 0.5)
    - model (Union[str, DeepEvalBaseLLM], optional): Evaluation model

    Required Test Case Parameters:
    - INPUT
    - ACTUAL_OUTPUT
    - MCP_TOOLS_CALLED

    Attributes:
    - score (float): MCP task completion score (0-1)
    - reason (str): Explanation of task completion with MCP
    - success (bool): Whether score meets threshold
    """

Usage example:

from deepeval.metrics import MCPTaskCompletionMetric
from deepeval.test_case import LLMTestCase, MCPToolCall, MCPServer

metric = MCPTaskCompletionMetric(threshold=0.8)

test_case = LLMTestCase(
    input="Search for Python tutorials and save the top 5 results",
    actual_output="Found and saved 5 Python tutorials",
    mcp_servers=[
        MCPServer(
            server_name="search-server",
            available_tools=["web_search", "save_results"]
        )
    ],
    mcp_tools_called=[
        MCPToolCall(
            server_name="search-server",
            tool_name="web_search",
            arguments={"query": "Python tutorials", "limit": 5}
        ),
        MCPToolCall(
            server_name="search-server",
            tool_name="save_results",
            arguments={"results": [...]}
        )
    ]
)

metric.measure(test_case)

MCP Use Metric

Evaluates proper MCP usage.

class MCPUseMetric:
    """
    Evaluates proper MCP usage.

    Parameters:
    - threshold (float): Success threshold (0-1, default: 0.5)
    - model (Union[str, DeepEvalBaseLLM], optional): Evaluation model

    Required Test Case Parameters:
    - INPUT
    - MCP_TOOLS_CALLED
    - MCP_SERVERS

    Attributes:
    - score (float): MCP usage score (0-1)
    - reason (str): Explanation of MCP usage
    - success (bool): Whether score meets threshold
    """

Comprehensive Agentic Evaluation

Evaluate all agentic capabilities:

from deepeval import evaluate
from deepeval.metrics import (
    ToolCorrectnessMetric,
    TaskCompletionMetric,
    ToolUseMetric,
    PlanQualityMetric,
    StepEfficiencyMetric,
    GoalAccuracyMetric
)
from deepeval.test_case import LLMTestCase

# Create agentic metrics suite
agentic_metrics = [
    ToolCorrectnessMetric(threshold=0.8),
    TaskCompletionMetric(threshold=0.8),
    ToolUseMetric(threshold=0.7),
    PlanQualityMetric(threshold=0.7),
    StepEfficiencyMetric(threshold=0.7),
    GoalAccuracyMetric(threshold=0.8)
]

# Test agent on a complex task
# (agent_execute and get_agent_tools_called below are placeholders for your
#  own agent invocation and tool-call capture logic)
test_cases = [
    LLMTestCase(
        input="Plan and execute a customer onboarding workflow",
        actual_output=agent_execute("Plan and execute a customer onboarding workflow"),
        tools_called=get_agent_tools_called(),
        expected_tools=[...]
    )
]

# Evaluate agent
result = evaluate(test_cases, agentic_metrics)

# Analyze agent performance
for test_result in result.test_results:
    print("Agent Performance Report:")
    for metric_name, metric_result in test_result.metrics.items():
        status = "✓" if metric_result.success else "✗"
        print(f"  {status} {metric_name}: {metric_result.score:.2f}")
        if not metric_result.success:
            print(f"     Issue: {metric_result.reason}")

Multi-Step Agent Evaluation

Evaluate agents on complex multi-step tasks:

from deepeval.test_case import LLMTestCase, ToolCall
from deepeval.metrics import (
    TaskCompletionMetric,
    ToolCorrectnessMetric,
    StepEfficiencyMetric
)

# Complex multi-step task
test_case = LLMTestCase(
    input="""
    Research the top 3 competitors in the AI LLM space,
    create a comparison table of their features,
    and draft an executive summary
    """,
    actual_output="""
    Researched OpenAI, Anthropic, and Google.
    Created comparison table with pricing, capabilities, and APIs.
    Executive summary: [detailed summary]
    """,
    tools_called=[
        ToolCall(name="web_search", input_parameters={"query": "AI LLM providers"}),
        ToolCall(name="web_search", input_parameters={"query": "OpenAI features"}),
        ToolCall(name="web_search", input_parameters={"query": "Anthropic features"}),
        ToolCall(name="web_search", input_parameters={"query": "Google AI features"}),
        ToolCall(name="create_table", input_parameters={"data": [...]}),
        ToolCall(name="generate_summary", input_parameters={"content": [...]})
    ]
)

# Evaluate with multiple metrics
metrics = [
    TaskCompletionMetric(threshold=0.8),
    StepEfficiencyMetric(threshold=0.7),
    ToolCorrectnessMetric(threshold=0.8)
]

for metric in metrics:
    metric.measure(test_case)
    print(f"{metric.__class__.__name__}: {metric.score:.2f} - {metric.reason}")

Install with Tessl CLI

npx tessl i tessl/pypi-deepeval