"""Comprehensive LLM evaluation framework with 50+ metrics for testing RAG
pipelines, chatbots, and AI agents.

Model abstraction layer supporting 15+ LLM providers, multimodal models, and
embedding models with a unified interface. Use custom models for metric
evaluation or integrate with existing LLM applications.
"""
from deepeval.models import (
    # Base classes
    DeepEvalBaseLLM,
    DeepEvalBaseMLLM,
    DeepEvalBaseEmbeddingModel,
    # LLM implementations
    GPTModel,
    AnthropicModel,
    GeminiModel,
    OllamaModel,
    LocalModel,
    AzureOpenAIModel,
    LiteLLMModel,
    AmazonBedrockModel,
    KimiModel,
    GrokModel,
    DeepSeekModel,
    # Multimodal models
    MultimodalOpenAIModel,
    MultimodalGeminiModel,
    MultimodalOllamaModel,
    # Embedding models
    OpenAIEmbeddingModel,
    AzureOpenAIEmbeddingModel,
    LocalEmbeddingModel,
    OllamaEmbeddingModel
)

# Abstract base class for LLM integrations.
class DeepEvalBaseLLM:
    """
    Base class for LLM integrations.

    Attributes:
        model_name (str, optional): Name of the model.
        model (Any): The underlying model instance.

    Abstract Methods:
        load_model(*args, **kwargs): Load the model.
        generate(prompt: str, **kwargs) -> str: Generate text.
        a_generate(prompt: str, **kwargs) -> str: Async generate.
        get_model_name() -> str: Get model name.

    Optional Methods:
        batch_generate(prompts: List[str], **kwargs) -> List[str]: Batch generation.
    """


class GPTModel:
"""
OpenAI GPT model integration.
Parameters:
- model (str, optional): Model name (default: "gpt-4o")
- api_key (str, optional): OpenAI API key
- *args, **kwargs: Additional arguments for OpenAI client
Methods:
- generate(prompt: str) -> str
- a_generate(prompt: str) -> str
- get_model_name() -> str
"""Usage:
# Example: plug a GPT judge model into a metric.
from deepeval.models import GPTModel
from deepeval.metrics import AnswerRelevancyMetric

# Use GPT-4 for evaluation
model = GPTModel(model="gpt-4")
metric = AnswerRelevancyMetric(
    threshold=0.7,
    model=model
)


class AnthropicModel:
"""
Anthropic Claude integration.
Parameters:
- model (str, optional): Model name (default: "claude-3-5-sonnet-20241022")
- api_key (str, optional): Anthropic API key
"""class GeminiModel:
"""
Google Gemini integration.
Parameters:
- model (str, optional): Model name (default: "gemini-2.0-flash-exp")
- api_key (str, optional): Google API key
"""class OllamaModel:
"""
Ollama model integration for local models.
Parameters:
- model (str, optional): Model name (default: "llama3.2")
- base_url (str, optional): Ollama server URL
"""
class LocalModel:
    """
    Local model integration (e.g., HuggingFace).

    Parameters:
        model (Any): HuggingFace model or pipeline.
        tokenizer (Any, optional): Tokenizer.
    """


class AzureOpenAIModel:
"""
Azure OpenAI integration.
Parameters:
- deployment_name (str): Azure deployment name
- api_key (str, optional): Azure API key
- azure_endpoint (str, optional): Azure endpoint URL
- api_version (str, optional): API version
"""class LiteLLMModel:
"""
LiteLLM integration for unified API across providers.
Parameters:
- model (str): Model name (e.g., "anthropic/claude-3-opus")
"""
class AmazonBedrockModel:
    """Amazon Bedrock integration.

    Reference stub — constructor parameters are not documented in this file.
    """
class KimiModel:
    """Kimi model integration.

    Reference stub — constructor parameters are not documented in this file.
    """
class GrokModel:
    """Grok model integration.

    Reference stub — constructor parameters are not documented in this file.
    """
class DeepSeekModel:
    """DeepSeek model integration."""


class DeepEvalBaseMLLM:
"""
Base class for multimodal LLM integrations.
Abstract Methods:
- generate(messages: List, **kwargs) -> str: Generate from multimodal input
- a_generate(messages: List, **kwargs) -> str: Async generate
- get_model_name() -> str: Get model name
"""
class MultimodalOpenAIModel:
    """
    OpenAI multimodal integration (GPT-4V, etc.).

    Parameters:
        model (str, optional): Model name (default: "gpt-4o").
    """
class MultimodalGeminiModel:
    """Gemini multimodal integration."""
class MultimodalOllamaModel:
    """Ollama multimodal integration."""


class DeepEvalBaseEmbeddingModel:
"""
Base class for embedding model integrations.
Abstract Methods:
- embed_text(text: str) -> List[float]: Embed single text
- a_embed_text(text: str) -> List[float]: Async embed single text
- embed_texts(texts: List[str]) -> List[List[float]]: Embed multiple texts
- a_embed_texts(texts: List[str]) -> List[List[float]]: Async embed multiple
- get_model_name() -> str: Get model name
"""
class OpenAIEmbeddingModel:
    """
    OpenAI embeddings integration.

    Parameters:
        model (str, optional): Model name (default: "text-embedding-3-small").
    """
class AzureOpenAIEmbeddingModel:
    """Azure OpenAI embeddings integration."""
class LocalEmbeddingModel:
    """Local embedding model integration."""
class OllamaEmbeddingModel:
    """Ollama embeddings integration."""


# Example: use different judge models for different metrics.
from deepeval.models import GPTModel, AnthropicModel
from deepeval.metrics import AnswerRelevancyMetric, FaithfulnessMetric

# Use GPT-4 for one metric
gpt4_metric = AnswerRelevancyMetric(
    model=GPTModel(model="gpt-4"),
    threshold=0.7
)
# Use Claude for another
claude_metric = FaithfulnessMetric(
    model=AnthropicModel(model="claude-3-5-sonnet-20241022"),
    threshold=0.8
)

# Example: use a local Ollama model as the judge.
from deepeval.models import OllamaModel
from deepeval.metrics import GEval
from deepeval.test_case import LLMTestCaseParams

# Use local Llama model for evaluation
local_model = OllamaModel(
    model="llama3.2",
    base_url="http://localhost:11434"
)
metric = GEval(
    name="Quality",
    criteria="Evaluate response quality",
    evaluation_params=[LLMTestCaseParams.ACTUAL_OUTPUT],
    model=local_model
)

# Example: implement a custom judge model.
from deepeval.models import DeepEvalBaseLLM
class CustomModel(DeepEvalBaseLLM):
    """Example custom judge model backed by a plain HTTP endpoint.

    Parameters:
        api_endpoint (str): URL of a text-generation service that accepts a
            JSON body of the form {"prompt": ...} and returns {"output": ...}.

    NOTE(review): this snippet references the third-party ``requests`` package
    but never imports it — add ``import requests`` when using it for real.
    """

    def __init__(self, api_endpoint: str):
        self.api_endpoint = api_endpoint
        self.model_name = "custom-model-v1"

    def load_model(self):
        # Initialize your model (no-op for a remote HTTP service).
        pass

    def generate(self, prompt: str) -> str:
        """Call the model API and return the generated text.

        Raises whatever ``requests`` raises on connection errors or timeout,
        and KeyError if the response JSON lacks an "output" field.
        """
        # Fix: the original had no timeout, so an unresponsive server would
        # hang the evaluation forever.
        response = requests.post(
            self.api_endpoint,
            json={"prompt": prompt},
            timeout=30,
        )
        return response.json()["output"]

    async def a_generate(self, prompt: str) -> str:
        # Async version — delegates to the synchronous implementation.
        # NOTE(review): this blocks the event loop; use
        # asyncio.to_thread(self.generate, prompt) for true async behavior.
        return self.generate(prompt)

    def get_model_name(self) -> str:
        return self.model_name


# Use custom model
custom_model = CustomModel(api_endpoint="https://api.example.com/generate")
metric = AnswerRelevancyMetric(model=custom_model)

# Example: multimodal evaluation with GPT-4V.
from deepeval.models import MultimodalOpenAIModel
from deepeval.metrics import MultimodalGEval
from deepeval.test_case import MLLMTestCase, MLLMImage, MLLMTestCaseParams

# Use GPT-4V for multimodal evaluation
mllm = MultimodalOpenAIModel(model="gpt-4o")
metric = MultimodalGEval(
    name="Image Description Quality",
    criteria="Evaluate if the description accurately represents the image",
    evaluation_params=[MLLMTestCaseParams.INPUT, MLLMTestCaseParams.ACTUAL_OUTPUT],
    model=mllm
)
# Build a multimodal test case pairing a text prompt + local image with the
# output under evaluation.
test_case = MLLMTestCase(
    input=["Describe this image:", MLLMImage(url="photo.jpg", local=True)],
    actual_output=["A golden retriever playing in a park"]
)
# Execute the evaluation for this test case.
metric.measure(test_case)