CtrlK
Blog · Docs · Log in · Get started
Tessl Logo

tessl/pypi-deepeval

Comprehensive LLM evaluation framework with 50+ metrics for testing RAG, chatbots, and AI agents

Overview
Eval results
Files

docs/custom-metrics.md

Custom Metrics

Framework for creating custom evaluation metrics using G-Eval, DAG (Deep Acyclic Graph), or by extending base metric classes. Build metrics tailored to your specific evaluation needs.

Imports

from deepeval.metrics import GEval, DAGMetric, DeepAcyclicGraph
from deepeval.metrics import (
    BaseMetric,
    BaseConversationalMetric,
    BaseMultimodalMetric,
    BaseArenaMetric
)
from deepeval.test_case import LLMTestCaseParams

Capabilities

G-Eval Metric

Customizable metric based on the G-Eval framework for LLM-based evaluation with custom criteria.

class GEval:
    """
    Customizable metric based on the G-Eval framework for LLM evaluation.

    Parameters:
    - name (str): Name of the metric
    - evaluation_params (List[LLMTestCaseParams]): Parameters to evaluate
    - criteria (str, optional): Evaluation criteria description
    - evaluation_steps (List[str], optional): Steps for evaluation
    - rubric (List[Rubric], optional): Scoring rubric
    - model (Union[str, DeepEvalBaseLLM], optional): Evaluation model
    - threshold (float): Success threshold (default: 0.5)
    - top_logprobs (int): Number of log probabilities to consider (default: 20)
    - async_mode (bool): Async mode (default: True)
    - strict_mode (bool): Strict mode (default: False)
    - verbose_mode (bool): Verbose mode (default: False)
    - evaluation_template (Type[GEvalTemplate]): Custom template (default: GEvalTemplate)

    Attributes:
    - score (float): Evaluation score (0-1)
    - reason (str): Explanation of the score
    - success (bool): Whether score meets threshold
    """

Usage example - Simple criteria:

from deepeval.metrics import GEval
from deepeval.test_case import LLMTestCase, LLMTestCaseParams

# Create custom metric with simple criteria
coherence_metric = GEval(
    name="Coherence",
    criteria="Determine if the response is coherent and logically structured.",
    evaluation_params=[LLMTestCaseParams.ACTUAL_OUTPUT],
    threshold=0.7
)

test_case = LLMTestCase(
    input="Explain quantum computing",
    actual_output="Quantum computing uses quantum bits or qubits..."
)

coherence_metric.measure(test_case)
print(f"Coherence score: {coherence_metric.score:.2f}")

Usage example - With evaluation steps:

from deepeval.metrics import GEval
from deepeval.test_case import LLMTestCaseParams

# Create metric with detailed evaluation steps
completeness_metric = GEval(
    name="Answer Completeness",
    criteria="Evaluate if the answer completely addresses all parts of the question.",
    evaluation_params=[
        LLMTestCaseParams.INPUT,
        LLMTestCaseParams.ACTUAL_OUTPUT
    ],
    evaluation_steps=[
        "Identify all parts of the question in the input",
        "Check if each part is addressed in the output",
        "Evaluate the depth and detail of each answer component",
        "Determine overall completeness score"
    ],
    threshold=0.8,
    model="gpt-4"
)

test_case = LLMTestCase(
    input="What is Python and what is it used for?",
    actual_output="Python is a high-level programming language. It's used for web development, data science, automation, and AI/ML applications."
)

completeness_metric.measure(test_case)

Usage example - With scoring rubric:

from deepeval.metrics import GEval
from deepeval.metrics.g_eval import Rubric
from deepeval.test_case import LLMTestCaseParams

# Create metric with a detailed scoring rubric.
# Note: rubric takes a List[Rubric] (see the GEval parameter docs above),
# where each Rubric maps a score range to an expected outcome.
code_quality_metric = GEval(
    name="Code Quality",
    criteria="Evaluate the quality of the code solution.",
    evaluation_params=[
        LLMTestCaseParams.INPUT,
        LLMTestCaseParams.ACTUAL_OUTPUT
    ],
    rubric=[
        Rubric(score_range=(0, 2), expected_outcome="Code is incorrect or does not solve the problem."),
        Rubric(score_range=(3, 6), expected_outcome="Code works but is inefficient or hard to read."),
        Rubric(score_range=(7, 9), expected_outcome="Correct and readable, with only minor issues."),
        Rubric(score_range=(10, 10), expected_outcome="Correct, efficient, readable, and follows best practices.")
    ],
    threshold=0.8
)

test_case = LLMTestCase(
    input="Write a function to find the nth Fibonacci number",
    actual_output="""
def fibonacci(n):
    if n <= 1:
        return n
    return fibonacci(n-1) + fibonacci(n-2)
"""
)

code_quality_metric.measure(test_case)

DAG Metric

Deep Acyclic Graph metric for evaluating structured reasoning and multi-step processes.

class DAGMetric:
    """
    Deep Acyclic Graph metric for evaluating structured reasoning.

    Parameters:
    - name (str): Name of the metric
    - dag (DeepAcyclicGraph): DAG structure for evaluation
    - threshold (float): Success threshold (default: 0.5)
    - model (Union[str, DeepEvalBaseLLM], optional): Evaluation model

    Attributes:
    - score (float): DAG compliance score (0-1)
    - reason (str): Explanation of DAG evaluation
    - success (bool): Whether score meets threshold
    """

class DeepAcyclicGraph:
    """
    Helper class for DAG construction and validation.

    Methods:
    - add_node(id: str, description: str): Add a node to the DAG
    - add_edge(from_id: str, to_id: str): Add an edge between nodes
    - validate(): Validate DAG structure (no cycles)
    """

Usage example:

from deepeval.metrics import DAGMetric, DeepAcyclicGraph
from deepeval.test_case import LLMTestCase

# Define reasoning DAG
reasoning_dag = DeepAcyclicGraph()

# Add nodes for reasoning steps
reasoning_dag.add_node("understand", "Understand the problem")
reasoning_dag.add_node("analyze", "Analyze requirements")
reasoning_dag.add_node("plan", "Create solution plan")
reasoning_dag.add_node("implement", "Implement solution")
reasoning_dag.add_node("verify", "Verify solution correctness")

# Define dependencies
reasoning_dag.add_edge("understand", "analyze")
reasoning_dag.add_edge("analyze", "plan")
reasoning_dag.add_edge("plan", "implement")
reasoning_dag.add_edge("implement", "verify")

# Create metric
dag_metric = DAGMetric(
    name="Problem Solving Process",
    dag=reasoning_dag,
    threshold=0.8
)

# Evaluate reasoning process
test_case = LLMTestCase(
    input="Solve: Find the maximum sum of a contiguous subarray",
    actual_output="""
First, I understand this is the maximum subarray problem.
Let me analyze: we need to find the subarray with largest sum.
I'll plan to use Kadane's algorithm for O(n) solution.
Here's the implementation: [code]
Verifying: tested with [-2,1,-3,4,-1,2,1,-5,4], got 6 (correct).
"""
)

dag_metric.measure(test_case)
print(f"Reasoning process score: {dag_metric.score:.2f}")

Arena G-Eval

G-Eval for arena-style comparison between multiple outputs.

class ArenaGEval:
    """
    Arena-style comparison using G-Eval methodology.

    Parameters:
    - name (str): Name of the metric
    - criteria (str): Evaluation criteria
    - model (Union[str, DeepEvalBaseLLM], optional): Evaluation model

    Attributes:
    - winner (str): Name of winning contestant
    - reason (str): Explanation of why winner was chosen
    - success (bool): Always True after evaluation
    """

Usage example:

from deepeval.metrics import ArenaGEval
from deepeval.test_case import ArenaTestCase, LLMTestCase

# Create arena metric
arena_metric = ArenaGEval(
    name="Response Quality",
    criteria="Determine which response is more helpful, accurate, and well-written"
)

# Compare multiple model outputs
arena_test = ArenaTestCase(
    contestants={
        "model_a": LLMTestCase(
            input="Explain neural networks",
            actual_output="Neural networks are computational models inspired by biological brains..."
        ),
        "model_b": LLMTestCase(
            input="Explain neural networks",
            actual_output="A neural network is like... umm... it's a type of AI thing..."
        ),
        "model_c": LLMTestCase(
            input="Explain neural networks",
            actual_output="Neural networks are ML models with interconnected layers..."
        )
    }
)

arena_metric.measure(arena_test)
print(f"Winner: {arena_metric.winner}")
print(f"Reason: {arena_metric.reason}")

Base Metric Classes

Extend base classes to create fully custom metrics.

class BaseMetric:
    """
    Base class for all LLM test case metrics.

    Attributes:
    - threshold (float): Threshold for success
    - score (float, optional): Score from evaluation
    - reason (str, optional): Reason for the score
    - success (bool, optional): Whether the metric passed
    - strict_mode (bool): Whether to use strict mode
    - async_mode (bool): Whether to use async mode
    - verbose_mode (bool): Whether to use verbose mode

    Abstract Methods:
    - measure(test_case: LLMTestCase, *args, **kwargs) -> float
    - a_measure(test_case: LLMTestCase, *args, **kwargs) -> float
    - is_successful() -> bool
    """

class BaseConversationalMetric:
    """
    Base class for conversational metrics.

    Abstract Methods:
    - measure(test_case: ConversationalTestCase, *args, **kwargs) -> float
    - a_measure(test_case: ConversationalTestCase, *args, **kwargs) -> float
    - is_successful() -> bool
    """

class BaseMultimodalMetric:
    """
    Base class for multimodal metrics.

    Abstract Methods:
    - measure(test_case: MLLMTestCase, *args, **kwargs) -> float
    - a_measure(test_case: MLLMTestCase, *args, **kwargs) -> float
    - is_successful() -> bool
    """

class BaseArenaMetric:
    """
    Base class for arena-style comparison metrics.

    Abstract Methods:
    - measure(test_case: ArenaTestCase, *args, **kwargs) -> str
    - a_measure(test_case: ArenaTestCase, *args, **kwargs) -> str
    - is_successful() -> bool
    """

Usage example - Custom metric:

from deepeval.metrics import BaseMetric
from deepeval.test_case import LLMTestCase
import re

class WordCountMetric(BaseMetric):
    """Custom metric that checks if a response meets word count requirements.

    The score is binary: 1.0 when the actual output's word count lies within
    [min_words, max_words] (inclusive), otherwise 0.0. Success means the
    score meets or exceeds ``threshold``.
    """

    def __init__(self, min_words: int, max_words: int, threshold: float = 1.0):
        self.min_words = min_words
        self.max_words = max_words
        self.threshold = threshold
        # Initialize result attributes so is_successful() and attribute
        # access are safe even if called before measure() has run.
        self.score: float | None = None
        self.reason: str | None = None
        self.success: bool = False

    def measure(self, test_case: LLMTestCase) -> float:
        """Measure if word count is within range and return the binary score."""
        # str.split() with no argument splits on any whitespace run, so
        # multiple spaces/newlines do not inflate the count.
        words = len(test_case.actual_output.split())

        if self.min_words <= words <= self.max_words:
            self.score = 1.0
            self.reason = f"Word count {words} is within range [{self.min_words}, {self.max_words}]"
        else:
            self.score = 0.0
            self.reason = f"Word count {words} is outside range [{self.min_words}, {self.max_words}]"

        self.success = self.score >= self.threshold
        return self.score

    async def a_measure(self, test_case: LLMTestCase) -> float:
        """Async version of measure (computation is synchronous and cheap)."""
        return self.measure(test_case)

    def is_successful(self) -> bool:
        """Check if the metric passed; False until measure() has been run."""
        return self.success

# Use custom metric
word_count_metric = WordCountMetric(min_words=50, max_words=100)

test_case = LLMTestCase(
    input="Write a brief summary of quantum computing",
    actual_output="Quantum computing uses quantum mechanics..." * 15  # ~75 words
)

word_count_metric.measure(test_case)
print(f"Success: {word_count_metric.success}")

Advanced custom metric with LLM:

from deepeval.metrics import BaseMetric
from deepeval.models import GPTModel
from deepeval.test_case import LLMTestCase

class CustomToneMetric(BaseMetric):
    """Custom metric that uses an LLM judge to evaluate the tone of a response.

    Prompts the judge model to rate how well the actual output matches
    ``expected_tone`` on a 0.0-1.0 scale; success means the score meets or
    exceeds ``threshold``.
    """

    def __init__(self, expected_tone: str, threshold: float = 0.7):
        self.expected_tone = expected_tone
        self.threshold = threshold
        self.model = GPTModel(model="gpt-4")
        # Initialize result attributes so is_successful() is safe pre-measure.
        self.score: float | None = None
        self.reason: str | None = None
        self.success: bool = False

    def measure(self, test_case: LLMTestCase) -> float:
        """Evaluate tone using the judge LLM.

        Raises:
            ValueError: if no numeric score can be parsed from the model reply.
        """
        import re  # local import keeps the snippet self-contained

        prompt = f"""
        Evaluate if the following text has a {self.expected_tone} tone.
        Rate from 0.0 to 1.0 where 1.0 means perfect tone match.

        Text: {test_case.actual_output}

        Provide ONLY a number between 0.0 and 1.0.
        """

        response = self.model.generate(prompt)
        # LLMs often wrap the number in extra text ("Score: 0.8"); extract the
        # first numeric token instead of assuming a bare float, then clamp to
        # the documented [0.0, 1.0] range.
        match = re.search(r"\d*\.?\d+", response)
        if match is None:
            raise ValueError(
                f"Could not parse a numeric score from model response: {response!r}"
            )
        self.score = max(0.0, min(1.0, float(match.group())))
        self.success = self.score >= self.threshold
        self.reason = f"Tone match score: {self.score:.2f} for {self.expected_tone} tone"

        return self.score

    async def a_measure(self, test_case: LLMTestCase) -> float:
        """Async version; delegates to the synchronous measure()."""
        return self.measure(test_case)

    def is_successful(self) -> bool:
        # False until measure() has been run.
        return self.success

# Use custom tone metric
friendly_tone = CustomToneMetric(expected_tone="friendly and professional")

test_case = LLMTestCase(
    input="Respond to customer complaint",
    actual_output="I sincerely apologize for the inconvenience. Let me help resolve this right away!"
)

friendly_tone.measure(test_case)

Non-LLM Metrics

Simple pattern-based metrics without LLM evaluation.

class ExactMatchMetric:
    """
    Simple exact string matching metric.

    Parameters:
    - threshold (float): Success threshold (default: 1.0)

    Required Test Case Parameters:
    - ACTUAL_OUTPUT
    - EXPECTED_OUTPUT
    """

class PatternMatchMetric:
    """
    Pattern matching using regular expressions.

    Parameters:
    - pattern (str): Regular expression pattern
    - threshold (float): Success threshold (default: 1.0)

    Required Test Case Parameters:
    - ACTUAL_OUTPUT
    """

Usage example:

from deepeval.metrics import ExactMatchMetric, PatternMatchMetric
from deepeval.test_case import LLMTestCase

# Exact match
exact_metric = ExactMatchMetric()
test_case = LLMTestCase(
    input="What is 2+2?",
    actual_output="4",
    expected_output="4"
)
exact_metric.measure(test_case)

# Pattern match
email_pattern = PatternMatchMetric(pattern=r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b')
test_case = LLMTestCase(
    input="Extract email",
    actual_output="Contact us at support@example.com"
)
email_pattern.measure(test_case)
print(f"Email found: {email_pattern.success}")

Install with Tessl CLI

npx tessl i tessl/pypi-deepeval

docs

agentic-metrics.md

benchmarks.md

content-quality-metrics.md

conversational-metrics.md

core-evaluation.md

custom-metrics.md

dataset.md

index.md

integrations.md

models.md

multimodal-metrics.md

rag-metrics.md

synthesizer.md

test-cases.md

tracing.md

tile.json