Comprehensive LLM evaluation framework with 50+ metrics for testing RAG, chatbots, and AI agents
npx @tessl/cli install tessl/pypi-deepeval@3.7.0

A comprehensive Python framework for evaluating and testing large language model (LLM) systems. DeepEval provides 50+ research-backed metrics for evaluating RAG pipelines, chatbots, AI agents, and other LLM applications. It operates like Pytest but is specialized for LLM evaluation, supporting both end-to-end and component-level testing.
pip install -U deepeval

import deepeval

Common imports for evaluation:
from deepeval import evaluate, assert_test
from deepeval.test_case import LLMTestCase, ConversationalTestCase
from deepeval.metrics import GEval, AnswerRelevancyMetric, FaithfulnessMetric
from deepeval.dataset import EvaluationDataset, Golden

Evaluating a single test case:

from deepeval import assert_test
from deepeval.metrics import AnswerRelevancyMetric
from deepeval.test_case import LLMTestCase
# Create a test case
test_case = LLMTestCase(
    input="What if these shoes don't fit?",
    actual_output="You have 30 days to get a full refund at no extra cost.",
    expected_output="We offer a 30-day full refund at no extra costs.",
    retrieval_context=["All customers are eligible for a 30 day full refund at no extra costs."]
)

# Create and run a metric
metric = AnswerRelevancyMetric(threshold=0.7)
assert_test(test_case, [metric])

Evaluating multiple test cases:
from deepeval import evaluate
from deepeval.dataset import EvaluationDataset, Golden
from deepeval.metrics import FaithfulnessMetric
from deepeval.test_case import LLMTestCase

# Create a dataset of goldens (expected inputs/outputs plus the context that grounds them)
dataset = EvaluationDataset(
    goldens=[
        Golden(
            input="What's the refund policy?",
            expected_output="30-day full refund",
            retrieval_context=["All customers are eligible for a 30 day full refund at no extra costs."]
        ),
        Golden(
            input="How do I return items?",
            expected_output="Contact support for a return label",
            retrieval_context=["Customers can contact support to receive a prepaid return shipping label."]
        )
    ]
)

# Generate test cases by running your LLM application over each golden
for golden in dataset.goldens:
    test_case = LLMTestCase(
        input=golden.input,
        actual_output=your_llm_app(golden.input),  # replace with a call to your LLM application
        expected_output=golden.expected_output,
        retrieval_context=golden.retrieval_context  # FaithfulnessMetric requires retrieval context
    )
    dataset.add_test_case(test_case)

# Evaluate the entire dataset
evaluate(test_cases=dataset.test_cases, metrics=[FaithfulnessMetric()])
DeepEval's architecture consists of several key layers:

- Test cases (LLMTestCase, ConversationalTestCase, MLLMTestCase) that capture the interactions to evaluate
- Evaluation entry points (evaluate, assert_test) plus 50+ built-in and custom metrics
- Datasets and goldens for batch evaluation, synthetic data generation, and benchmarks
- Model integrations for the LLMs that power evaluation
- Component-level tracing via the @observe decorator for nested evaluations

Test cases are structured containers representing LLM interactions to be evaluated. DeepEval supports standard LLM tests, multi-turn conversations, multimodal inputs, and arena-style comparisons.
class LLMTestCase:
    """
    Represents a test case for evaluating LLM outputs.

    Parameters:
    - input (str): Input prompt to the LLM
    - actual_output (str, optional): Actual output from the LLM
    - expected_output (str, optional): Expected output
    - context (List[str], optional): Context information
    - retrieval_context (List[str], optional): Retrieved context for RAG
    - additional_metadata (Dict, optional): Additional metadata
    - tools_called (List[ToolCall], optional): Tools called by the LLM
    - expected_tools (List[ToolCall], optional): Expected tools to be called
    - comments (str, optional): Comments about the test case
    - name (str, optional): Name of the test case
    - tags (List[str], optional): Tags for organization
    """

class ConversationalTestCase:
    """
    Represents a multi-turn conversational test case.

    Parameters:
    - turns (List[Turn]): List of conversation turns
    - scenario (str, optional): Scenario description
    - context (List[str], optional): Context information
    - expected_outcome (str, optional): Expected outcome
    - name (str, optional): Name of the test case
    """

class MLLMTestCase:
    """
    Represents a test case for multimodal LLMs (text + images).

    Parameters:
    - input (List[Union[str, MLLMImage]]): Input with text and images
    - actual_output (List[Union[str, MLLMImage]]): Actual output
    - expected_output (List[Union[str, MLLMImage]], optional): Expected output
    - context (List[Union[str, MLLMImage]], optional): Context
    """
Core evaluation functions for running metrics against test cases, either individually or in batch. Supports pytest integration, standalone evaluation, and comparison between models.

def evaluate(
    test_cases: Union[List[LLMTestCase], List[ConversationalTestCase], List[MLLMTestCase]],
    metrics: Optional[Union[List[BaseMetric], List[BaseConversationalMetric], List[BaseMultimodalMetric]]] = None,
    hyperparameters: Optional[Dict[str, Union[str, int, float]]] = None,
    identifier: Optional[str] = None,
    async_config: Optional[AsyncConfig] = None,
    display_config: Optional[DisplayConfig] = None,
    cache_config: Optional[CacheConfig] = None,
    error_config: Optional[ErrorConfig] = None
) -> EvaluationResult:
    """
    Evaluates test cases against specified metrics.

    Returns:
    - EvaluationResult: Contains test results, Confident AI link, and test run ID
    """

def assert_test(
    test_case: Optional[Union[LLMTestCase, ConversationalTestCase, MLLMTestCase]],
    metrics: Optional[Union[List[BaseMetric], List[BaseConversationalMetric], List[BaseMultimodalMetric]]] = None,
    run_async: bool = True
):
    """
    Asserts that a single test case passes specified metrics.

    Raises:
    - AssertionError: If metrics fail
    """

def compare(
    test_cases: List[List[LLMTestCase]],
    metrics: List[BaseMetric]
) -> ComparisonResult:
    """
    Compares multiple test results to determine which performs better.
    """
Metrics specifically designed for evaluating Retrieval-Augmented Generation (RAG) systems, measuring answer quality, faithfulness to context, and retrieval effectiveness.

class AnswerRelevancyMetric:
    """
    Measures whether the answer is relevant to the input question.

    Parameters:
    - threshold (float): Success threshold (default: 0.5)
    - model (Union[str, DeepEvalBaseLLM], optional): Evaluation model
    - include_reason (bool): Include reason in output (default: True)
    """

class FaithfulnessMetric:
    """
    Measures whether the answer is faithful to the context (no hallucinations).
    """

class ContextualRecallMetric:
    """
    Measures whether the retrieved context contains all information needed.
    """

class ContextualRelevancyMetric:
    """
    Measures whether the retrieved context is relevant to the input.
    """

class ContextualPrecisionMetric:
    """
    Measures whether relevant context nodes are ranked higher than irrelevant ones.
    """
Metrics for evaluating content safety, quality, and compliance, detecting issues like hallucinations, bias, toxicity, and PII leakage.

class HallucinationMetric:
    """
    Detects hallucinations in the output.

    Parameters:
    - threshold (float): Success threshold
    - model (Union[str, DeepEvalBaseLLM], optional): Evaluation model
    """

class BiasMetric:
    """
    Detects bias in the output.
    """

class ToxicityMetric:
    """
    Detects toxic content in the output.
    """

class SummarizationMetric:
    """
    Evaluates the quality of summaries.
    """

class PIILeakageMetric:
    """
    Detects personally identifiable information (PII) leakage.
    """
Metrics for evaluating AI agents, including tool usage, task completion, plan quality, and goal achievement.

class ToolCorrectnessMetric:
    """
    Evaluates whether the correct tools were called with correct parameters.

    Parameters:
    - threshold (float): Success threshold
    """

class TaskCompletionMetric:
    """
    Evaluates whether the task was completed successfully.
    """

class ToolUseMetric:
    """
    Evaluates appropriate use of available tools.
    """

class PlanQualityMetric:
    """
    Evaluates the quality of generated plans.
    """

class GoalAccuracyMetric:
    """
    Measures accuracy in achieving specified goals.
    """
Metrics designed for evaluating multi-turn conversations, measuring relevancy, completeness, and role adherence.

class ConversationalGEval:
    """
    G-Eval for conversational test cases.

    Parameters:
    - name (str): Name of the metric
    - criteria (str): Evaluation criteria
    - evaluation_params (List[TurnParams]): Parameters to evaluate
    - threshold (float): Success threshold
    """

class TurnRelevancyMetric:
    """
    Measures relevancy of conversation turns.
    """

class ConversationCompletenessMetric:
    """
    Evaluates completeness of conversations.
    """

class RoleAdherenceMetric:
    """
    Measures adherence to assigned role in conversations.
    """
Metrics for evaluating multimodal LLM outputs involving text and images, including generation quality and contextual understanding.

class MultimodalGEval:
    """
    G-Eval for multimodal test cases.

    Parameters:
    - name (str): Name of the metric
    - criteria (str): Evaluation criteria
    - evaluation_params (List[MLLMTestCaseParams]): Parameters to evaluate
    """

class TextToImageMetric:
    """
    Evaluates text-to-image generation quality.
    """

class ImageCoherenceMetric:
    """
    Evaluates coherence of images in context.
    """

class MultimodalAnswerRelevancyMetric:
    """
    Answer relevancy for multimodal inputs.
    """

class MultimodalFaithfulnessMetric:
    """
    Faithfulness for multimodal outputs.
    """
Framework for creating custom evaluation metrics using G-Eval, DAG (Deep Acyclic Graph), or by extending base metric classes.

class GEval:
    """
    Customizable metric based on the G-Eval framework for LLM evaluation.

    Parameters:
    - name (str): Name of the metric
    - evaluation_params (List[LLMTestCaseParams]): Parameters to evaluate
    - criteria (str, optional): Evaluation criteria
    - evaluation_steps (List[str], optional): Steps for evaluation
    - rubric (List[Rubric], optional): Scoring rubric
    - threshold (float): Success threshold (default: 0.5)
    """

class DAGMetric:
    """
    Deep Acyclic Graph metric for evaluating structured reasoning.

    Parameters:
    - name (str): Name of the metric
    - dag (DeepAcyclicGraph): DAG structure for evaluation
    - threshold (float): Success threshold
    """

class BaseMetric:
    """
    Base class for all LLM test case metrics.

    Abstract Methods:
    - measure(test_case: LLMTestCase) -> float
    - a_measure(test_case: LLMTestCase) -> float
    - is_successful() -> bool
    """
Tools for managing collections of test cases and "golden" examples, supporting batch evaluation, synthetic data generation, and dataset persistence.

class EvaluationDataset:
    """
    Manages collections of test cases and goldens for evaluation.

    Parameters:
    - goldens (Union[List[Golden], List[ConversationalGolden]]): Initial goldens

    Methods:
    - add_test_case(test_case): Add a test case
    - add_golden(golden): Add a golden
    - generate_goldens_from_docs(document_paths, ...): Generate goldens from documents
    - evaluate(metrics): Evaluate with metrics
    - push(alias): Push to Confident AI
    - pull(alias): Pull from Confident AI
    """

class Golden:
    """
    Represents a "golden" test case - expected input/output pairs.

    Parameters:
    - input (str): Input prompt
    - expected_output (str, optional): Expected output
    - context (List[str], optional): Context
    - retrieval_context (List[str], optional): Retrieved context
    """

class ConversationalGolden:
    """
    Represents a "golden" conversational test case.

    Parameters:
    - scenario (str): Scenario description
    - expected_outcome (str, optional): Expected outcome
    - turns (List[Turn], optional): Conversation turns
    """
Model abstraction layer supporting 15+ LLM providers, multimodal models, and embedding models with a unified interface.

class DeepEvalBaseLLM:
    """
    Base class for LLM integrations.

    Abstract Methods:
    - generate(prompt: str) -> str
    - a_generate(prompt: str) -> str
    - get_model_name() -> str
    """

class GPTModel:
    """
    OpenAI GPT model integration.

    Parameters:
    - model (str): Model name (e.g., "gpt-4", "gpt-3.5-turbo")
    - api_key (str, optional): OpenAI API key
    """

class AnthropicModel:
    """
    Anthropic Claude integration.
    """

class GeminiModel:
    """
    Google Gemini integration.
    """

class OllamaModel:
    """
    Ollama model integration for local models.
    """

class DeepEvalBaseMLLM:
    """
    Base class for multimodal LLM integrations.
    """
Synthetic test data generation using various evolution strategies (reasoning, multi-context, concretizing, etc.) to create diverse and challenging test cases.

class Synthesizer:
    """
    Generates synthetic test data and goldens.

    Parameters:
    - model (Union[str, DeepEvalBaseLLM], optional): Model for generation
    - async_mode (bool): Async mode (default: True)
    - filtration_config (FiltrationConfig, optional): Filtration configuration
    - evolution_config (EvolutionConfig, optional): Evolution configuration
    - styling_config (StylingConfig, optional): Styling configuration

    Methods:
    - generate_goldens_from_docs(document_paths, ...) -> List[Golden]
    - generate_goldens_from_contexts(contexts, ...) -> List[Golden]
    - generate_goldens_from_scratch(num_goldens, ...) -> List[Golden]
    - save_as(file_type, directory, ...)
    """

class Evolution:
    """
    Enum of input evolution strategies.

    Values:
    - REASONING: Add reasoning complexity
    - MULTICONTEXT: Require multiple contexts
    - CONCRETIZING: Make more concrete
    - CONSTRAINED: Add constraints
    - COMPARATIVE: Add comparisons
    - HYPOTHETICAL: Make hypothetical
    """
Pre-built benchmarks for evaluating LLMs on standard datasets like MMLU, HellaSwag, GSM8K, HumanEval, and more.

class MMLU:
    """
    Massive Multitask Language Understanding benchmark.

    Parameters:
    - tasks (List[MMLUTask], optional): Specific tasks to evaluate
    - n_shots (int): Number of few-shot examples
    """

class HellaSwag:
    """
    HellaSwag benchmark for commonsense reasoning.
    """

class GSM8K:
    """
    Grade School Math 8K benchmark.
    """

class HumanEval:
    """
    HumanEval benchmark for code generation.
    """

class BigBenchHard:
    """
    Big Bench Hard benchmark.
    """
Component-level observability for evaluating nested LLM components using the @observe decorator and trace management.

def observe(
    metrics: Optional[List[BaseMetric]] = None,
    name: Optional[str] = None
):
    """
    Decorator for observing function execution and applying metrics.

    Parameters:
    - metrics (List[BaseMetric], optional): Metrics to apply
    - name (str, optional): Name for the span
    """

def update_current_span(
    test_case: Optional[LLMTestCase] = None,
    **kwargs
):
    """
    Updates the current span with additional data.

    Parameters:
    - test_case (LLMTestCase, optional): Test case data
    - **kwargs: Additional span attributes
    """

def evaluate_trace(
    trace_id: str,
    metrics: List[BaseMetric]
):
    """
    Evaluates a specific trace with metrics.
    """
Native integrations with popular LLM frameworks for automatic tracing and evaluation.

# LangChain Integration
class CallbackHandler:
    """
    LangChain callback handler for DeepEval tracing.
    """

def tool(func):
    """
    Decorator for marking LangChain tools for tracing.
    """

# LlamaIndex Integration
def instrument_llama_index():
    """
    Instruments LlamaIndex for automatic tracing.
    """

# CrewAI Integration
def instrument_crewai():
    """
    Instruments CrewAI for automatic tracing.
    """

# PydanticAI Integration
def instrument_pydantic_ai():
    """
    Instruments PydanticAI for automatic tracing.
    """
"""
Logs into Confident AI with an API key.
Parameters:
- api_key (str, optional): Confident AI API key
"""
def log_hyperparameters(hyperparameters: Dict):
"""
Logs hyperparameters for the current test run.
Parameters:
- hyperparameters (Dict): Dictionary of hyperparameters to log
"""
def on_test_run_end(callback: Callable):
"""
Registers a callback to be executed when a test run ends.
Parameters:
- callback (Callable): Function to execute at test run end
"""