# Comprehensive LLM evaluation framework with 50+ metrics for testing RAG,
# chatbots, and AI agents.
#
# Metrics for evaluating multimodal LLM outputs involving text and images.
# These metrics assess image generation quality, visual question answering,
# image coherence, and multimodal RAG systems.
from deepeval.metrics import (
MultimodalGEval,
TextToImageMetric,
ImageEditingMetric,
ImageCoherenceMetric,
ImageHelpfulnessMetric,
ImageReferenceMetric,
MultimodalContextualRecallMetric,
MultimodalContextualRelevancyMetric,
MultimodalContextualPrecisionMetric,
MultimodalAnswerRelevancyMetric,
MultimodalFaithfulnessMetric,
MultimodalToolCorrectnessMetric
)

# G-Eval for multimodal test cases with custom evaluation criteria.
class MultimodalGEval:
    """G-Eval for multimodal test cases with custom evaluation criteria.

    Parameters:
        name (str): Name of the metric.
        criteria (str): Evaluation criteria.
        evaluation_params (List[MLLMTestCaseParams]): Parameters to evaluate.
        evaluation_steps (List[str], optional): Steps for evaluation.
        threshold (float): Success threshold (default: 0.5).
        model (Union[str, DeepEvalBaseMLLM], optional): Multimodal evaluation model.
        async_mode (bool): Async mode (default: True).

    Attributes:
        score (float): Evaluation score (0-1).
        reason (str): Explanation.
        success (bool): Whether score meets threshold.
    """


# Evaluates text-to-image generation quality.
class TextToImageMetric:
    """Evaluates text-to-image generation quality.

    Parameters:
        threshold (float): Success threshold (default: 0.5).
        model (Union[str, DeepEvalBaseMLLM], optional): Evaluation model.
        include_reason (bool): Include reason (default: True).

    Required Test Case Parameters:
        - INPUT (text prompt)
        - ACTUAL_OUTPUT (generated image)

    Attributes:
        score (float): Image quality score (0-1).
        reason (str): Explanation.
        success (bool): Whether score meets threshold.
    """


# Evaluates coherence of images in context.
class ImageCoherenceMetric:
    """Evaluates coherence of images in context.

    Parameters:
        threshold (float): Success threshold (default: 0.5).
        model (Union[str, DeepEvalBaseMLLM], optional): Evaluation model.

    Required Test Case Parameters:
        - INPUT
        - ACTUAL_OUTPUT (images)
        - CONTEXT

    Attributes:
        score (float): Coherence score (0-1).
        reason (str): Explanation.
        success (bool): Whether score meets threshold.
    """


# Evaluates helpfulness of images in responses.
class ImageHelpfulnessMetric:
    """Evaluates helpfulness of images in responses.

    Parameters:
        threshold (float): Success threshold (default: 0.5).
        model (Union[str, DeepEvalBaseMLLM], optional): Evaluation model.

    Required Test Case Parameters:
        - INPUT
        - ACTUAL_OUTPUT (response with images)

    Attributes:
        score (float): Helpfulness score (0-1).
        reason (str): Explanation.
        success (bool): Whether score meets threshold.
    """


# RAG metrics adapted for multimodal inputs and outputs.
class MultimodalAnswerRelevancyMetric:
    """Answer relevancy for multimodal inputs.

    Parameters:
        threshold (float): Success threshold (default: 0.5).
        model (Union[str, DeepEvalBaseMLLM], optional): Evaluation model.
    """
class MultimodalFaithfulnessMetric:
    """Faithfulness for multimodal outputs.

    Parameters:
        threshold (float): Success threshold (default: 0.5).
        model (Union[str, DeepEvalBaseMLLM], optional): Evaluation model.
    """
class MultimodalContextualRecallMetric:
    """Contextual recall for multimodal inputs.

    Parameters:
        threshold (float): Success threshold (default: 0.5).
        model (Union[str, DeepEvalBaseMLLM], optional): Evaluation model.
    """
class MultimodalContextualRelevancyMetric:
    """Contextual relevancy for multimodal inputs.

    Parameters:
        threshold (float): Success threshold (default: 0.5).
        model (Union[str, DeepEvalBaseMLLM], optional): Evaluation model.
    """
class MultimodalContextualPrecisionMetric:
    """Contextual precision for multimodal inputs.

    Parameters:
        threshold (float): Success threshold (default: 0.5).
        model (Union[str, DeepEvalBaseMLLM], optional): Evaluation model.
    """


# Usage example:
from deepeval.metrics import (
    MultimodalAnswerRelevancyMetric,
    MultimodalFaithfulnessMetric
)
from deepeval.test_case import MLLMTestCase, MLLMImage

# Visual QA with retrieval: the input interleaves a text question with a
# local image, and the retrieval context mixes text snippets with a
# reference image.
test_case = MLLMTestCase(
    input=[
        "What safety equipment is visible in this image?",
        MLLMImage(url="construction_site.jpg", local=True)
    ],
    actual_output=["Hard hats, safety vests, and steel-toed boots are visible."],
    retrieval_context=[
        "Safety requirements: hard hats, safety vests, steel-toed boots",
        MLLMImage(url="safety_guide.jpg")
    ]
)

# Run both metrics against the same test case and report their scores.
metrics = [
    MultimodalAnswerRelevancyMetric(threshold=0.7),
    MultimodalFaithfulnessMetric(threshold=0.8)
]

for metric in metrics:
    metric.measure(test_case)
    print(f"{metric.__class__.__name__}: {metric.score:.2f}")

# Tool correctness for multimodal contexts.
class MultimodalToolCorrectnessMetric:
    """Tool correctness for multimodal contexts.

    Parameters:
        threshold (float): Success threshold (default: 0.5).
        model (Union[str, DeepEvalBaseMLLM], optional): Evaluation model.

    Required Test Case Parameters:
        - TOOLS_CALLED
        - EXPECTED_TOOLS
    """