Comprehensive LLM evaluation framework with 50+ metrics for testing RAG, chatbots, and AI agents
Metrics specifically designed for evaluating Retrieval-Augmented Generation (RAG) systems. These metrics measure answer quality, faithfulness to context, and retrieval effectiveness using LLM-based evaluation.
from deepeval.metrics import (
    AnswerRelevancyMetric,
    FaithfulnessMetric,
    ContextualRecallMetric,
    ContextualRelevancyMetric,
    ContextualPrecisionMetric
)

Measures whether the answer is relevant to the input question. Evaluates if the LLM's response addresses what was asked.
class AnswerRelevancyMetric:
    """
    Measures whether the answer is relevant to the input question.

    Parameters:
    - threshold (float): Success threshold (0-1, default: 0.5)
    - model (Union[str, DeepEvalBaseLLM], optional): Evaluation model
    - include_reason (bool): Include reason in output (default: True)
    - async_mode (bool): Async mode (default: True)
    - strict_mode (bool): Strict mode (default: False)
    - verbose_mode (bool): Verbose mode (default: False)
    - evaluation_template (Type[AnswerRelevancyTemplate], optional): Custom evaluation template

    Required Test Case Parameters:
    - INPUT
    - ACTUAL_OUTPUT

    Attributes:
    - score (float): Relevancy score (0-1)
    - reason (str): Explanation of the score
    - success (bool): Whether score meets threshold
    - statements (List[str]): Generated statements from actual output
    - verdicts (List[AnswerRelevancyVerdict]): Verdicts for each statement
    """

Usage example:
from deepeval.metrics import AnswerRelevancyMetric
from deepeval.test_case import LLMTestCase
# Create metric
metric = AnswerRelevancyMetric(
    threshold=0.7,
    model="gpt-4",
    include_reason=True
)
# Create test case
test_case = LLMTestCase(
    input="What is the capital of France?",
    actual_output="The capital of France is Paris. It's known as the City of Light."
)
# Evaluate
metric.measure(test_case)
print(f"Score: {metric.score}") # e.g., 0.95
print(f"Reason: {metric.reason}") # Explanation
print(f"Success: {metric.success}") # True if score >= 0.7Measures whether the answer is faithful to the context, detecting hallucinations by checking if all claims in the output are supported by the provided context.
Measures whether the answer is faithful to the context, detecting hallucinations by checking if all claims in the output are supported by the provided context.

class FaithfulnessMetric:
    """
    Measures whether the answer is faithful to the context (no hallucinations).

    Parameters:
    - threshold (float): Success threshold (0-1, default: 0.5)
    - model (Union[str, DeepEvalBaseLLM], optional): Evaluation model
    - include_reason (bool): Include reason in output (default: True)
    - async_mode (bool): Async mode (default: True)
    - strict_mode (bool): Strict mode (default: False)
    - verbose_mode (bool): Verbose mode (default: False)
    - truths_extraction_limit (int, optional): Limit number of truths extracted from context
    - penalize_ambiguous_claims (bool): Penalize ambiguous claims (default: False)
    - evaluation_template (Type[FaithfulnessTemplate], optional): Custom evaluation template

    Required Test Case Parameters:
    - ACTUAL_OUTPUT
    - RETRIEVAL_CONTEXT or CONTEXT

    Attributes:
    - score (float): Faithfulness score (0-1)
    - reason (str): Explanation with unfaithful claims if any
    - success (bool): Whether score meets threshold
    - truths (List[str]): Extracted truths from context
    - claims (List[str]): Extracted claims from output
    - verdicts (List[FaithfulnessVerdict]): Verdicts for each claim
    """

Usage example:
from deepeval.metrics import FaithfulnessMetric
from deepeval.test_case import LLMTestCase
# Create metric
metric = FaithfulnessMetric(threshold=0.8)
# Test case with retrieval context
test_case = LLMTestCase(
    input="What is the refund policy?",
    actual_output="We offer a 30-day full refund at no extra cost.",
    retrieval_context=[
        "All customers are eligible for a 30 day full refund at no extra costs.",
        "Refunds are processed within 5-7 business days."
    ]
)
# Evaluate faithfulness
metric.measure(test_case)
if metric.success:
    print("Output is faithful to context")
else:
    print(f"Hallucination detected: {metric.reason}")
Measures whether the retrieved context contains all information needed to answer the question. Evaluates the completeness of the retrieval system.

class ContextualRecallMetric:
    """
    Measures whether the retrieved context contains all information needed to answer.

    Parameters:
    - threshold (float): Success threshold (0-1, default: 0.5)
    - model (Union[str, DeepEvalBaseLLM], optional): Evaluation model
    - include_reason (bool): Include reason in output (default: True)
    - async_mode (bool): Async mode (default: True)
    - strict_mode (bool): Strict mode (default: False)
    - verbose_mode (bool): Verbose mode (default: False)

    Required Test Case Parameters:
    - INPUT
    - EXPECTED_OUTPUT
    - RETRIEVAL_CONTEXT

    Attributes:
    - score (float): Recall score (0-1)
    - reason (str): Explanation of what's missing if any
    - success (bool): Whether score meets threshold
    """

Usage example:
from deepeval.metrics import ContextualRecallMetric
from deepeval.test_case import LLMTestCase
# Create metric
metric = ContextualRecallMetric(threshold=0.7)
# Test case with expected output
test_case = LLMTestCase(
    input="How do I reset my password?",
    expected_output="Click 'Forgot Password' on the login page and check your email for reset link.",
    retrieval_context=[
        "Password reset: Click 'Forgot Password' on login page",
        "Reset link sent to registered email address"
    ]
)
# Evaluate recall
metric.measure(test_case)
if not metric.success:
    print(f"Missing information: {metric.reason}")
Measures whether the retrieved context is relevant to the input question. Evaluates the precision of the retrieval system by identifying irrelevant context.

class ContextualRelevancyMetric:
    """
    Measures whether the retrieved context is relevant to the input.

    Parameters:
    - threshold (float): Success threshold (0-1, default: 0.5)
    - model (Union[str, DeepEvalBaseLLM], optional): Evaluation model
    - include_reason (bool): Include reason in output (default: True)
    - async_mode (bool): Async mode (default: True)
    - strict_mode (bool): Strict mode (default: False)
    - verbose_mode (bool): Verbose mode (default: False)

    Required Test Case Parameters:
    - INPUT
    - RETRIEVAL_CONTEXT

    Attributes:
    - score (float): Relevancy score (0-1)
    - reason (str): Explanation identifying irrelevant context
    - success (bool): Whether score meets threshold
    """

Usage example:
from deepeval.metrics import ContextualRelevancyMetric
from deepeval.test_case import LLMTestCase
# Create metric
metric = ContextualRelevancyMetric(threshold=0.7)
# Test case
test_case = LLMTestCase(
    input="What are the shipping costs to California?",
    retrieval_context=[
        "Shipping to California: $5.99 for standard, $12.99 for express",
        "California has over 39 million residents",  # Irrelevant
        "Free shipping on orders over $50"
    ]
)
# Evaluate relevancy
metric.measure(test_case)
if not metric.success:
    print(f"Irrelevant context detected: {metric.reason}")
Measures whether relevant context nodes are ranked higher than irrelevant ones in the retrieval context. Evaluates the ranking quality of the retrieval system.

class ContextualPrecisionMetric:
    """
    Measures whether relevant context nodes are ranked higher than irrelevant ones.

    Parameters:
    - threshold (float): Success threshold (0-1, default: 0.5)
    - model (Union[str, DeepEvalBaseLLM], optional): Evaluation model
    - include_reason (bool): Include reason in output (default: True)
    - async_mode (bool): Async mode (default: True)
    - strict_mode (bool): Strict mode (default: False)
    - verbose_mode (bool): Verbose mode (default: False)

    Required Test Case Parameters:
    - INPUT
    - EXPECTED_OUTPUT
    - RETRIEVAL_CONTEXT (order matters)

    Attributes:
    - score (float): Precision score (0-1)
    - reason (str): Explanation of ranking issues
    - success (bool): Whether score meets threshold
    """

Usage example:
from deepeval.metrics import ContextualPrecisionMetric
from deepeval.test_case import LLMTestCase
# Create metric
metric = ContextualPrecisionMetric(threshold=0.7)
# Test case with ordered retrieval context
test_case = LLMTestCase(
    input="What is the return policy?",
    expected_output="30-day return policy with full refund",
    retrieval_context=[
        "California sales tax rate is 7.25%",  # Irrelevant (ranked too high)
        "All products have a 30-day return policy",  # Relevant (should be first)
        "Returns are processed within 5 business days"  # Relevant
    ]
)
# Evaluate precision
metric.measure(test_case)
if not metric.success:
    print(f"Ranking issue: {metric.reason}")
Evaluate all RAG aspects together:

from deepeval import evaluate
from deepeval.metrics import (
    AnswerRelevancyMetric,
    FaithfulnessMetric,
    ContextualRecallMetric,
    ContextualRelevancyMetric,
    ContextualPrecisionMetric
)
from deepeval.test_case import LLMTestCase
# Create comprehensive RAG metrics
rag_metrics = [
    AnswerRelevancyMetric(threshold=0.7),
    FaithfulnessMetric(threshold=0.8),
    ContextualRecallMetric(threshold=0.7),
    ContextualRelevancyMetric(threshold=0.7),
    ContextualPrecisionMetric(threshold=0.7)
]
# Test cases for RAG pipeline
# rag_pipeline and get_retrieval_context are placeholders for your own RAG system
test_cases = [
    LLMTestCase(
        input="What's the shipping policy?",
        actual_output=rag_pipeline("What's the shipping policy?"),
        expected_output="Free shipping on orders over $50, 3-5 business days",
        retrieval_context=get_retrieval_context("What's the shipping policy?")
    ),
    # ... more test cases
]
# Evaluate entire RAG pipeline
result = evaluate(test_cases, rag_metrics)
# Analyze results by metric type. Each TestResult exposes per-metric results
# as the metrics_data list (attribute name may vary by deepeval version)
for metric_name in ["Answer Relevancy", "Faithfulness", "Contextual Recall",
                    "Contextual Relevancy", "Contextual Precision"]:
    scores = [md.score for tr in result.test_results
              for md in tr.metrics_data if md.name == metric_name]
    avg_score = sum(scores) / len(scores)
    print(f"{metric_name}: {avg_score:.2f}")
Customize metrics with specific models and configurations:

from deepeval.metrics import AnswerRelevancyMetric, FaithfulnessMetric
from deepeval.models import GPTModel
from deepeval.test_case import LLMTestCase

# Use specific model for evaluation
custom_model = GPTModel(model="gpt-4-turbo")
answer_relevancy = AnswerRelevancyMetric(
    threshold=0.75,
    model=custom_model,
    include_reason=True,
    strict_mode=True,   # More stringent evaluation
    verbose_mode=True   # Print detailed logs
)
faithfulness = FaithfulnessMetric(
    threshold=0.85,
    model=custom_model
)
# Use in evaluation
test_case = LLMTestCase(...)
answer_relevancy.measure(test_case)
faithfulness.measure(test_case)
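Beyond model-name strings, the `model` parameter also accepts a `DeepEvalBaseLLM` subclass, which lets you evaluate with a self-hosted model. A minimal sketch, assuming the base class's documented interface (`load_model`, `generate`, `a_generate`, `get_model_name`) matches your deepeval version; `call_my_model` is a hypothetical stand-in for your own inference call:

from deepeval.models import DeepEvalBaseLLM

class MyLocalJudge(DeepEvalBaseLLM):
    """Hypothetical wrapper around a self-hosted evaluation model."""

    def load_model(self):
        return None  # load and return your model handle here

    def generate(self, prompt: str) -> str:
        return call_my_model(prompt)  # hypothetical inference call

    async def a_generate(self, prompt: str) -> str:
        return call_my_model(prompt)  # hypothetical; use an async client in practice

    def get_model_name(self) -> str:
        return "my-local-judge"

# Any metric accepts the wrapper through its model parameter
metric = AnswerRelevancyMetric(threshold=0.7, model=MyLocalJudge())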
While DeepEval provides individual RAG metrics, you can compute a RAGAS-style composite score:

from deepeval import evaluate
from deepeval.metrics import (
    AnswerRelevancyMetric,
    FaithfulnessMetric,
    ContextualRecallMetric,
    ContextualPrecisionMetric
)
# Evaluate with RAGAS component metrics
result = evaluate(test_cases, [
    AnswerRelevancyMetric(),
    FaithfulnessMetric(),
    ContextualRecallMetric(),
    ContextualPrecisionMetric()
])
# Compute RAGAS score (harmonic mean of component scores; the harmonic
# mean is 0 whenever any component score is 0)
for test_result in result.test_results:
    scores = [md.score for md in test_result.metrics_data]
    if all(s > 0 for s in scores):
        ragas_score = len(scores) / sum(1 / s for s in scores)
    else:
        ragas_score = 0.0
    print(f"RAGAS Score: {ragas_score:.3f}")