CtrlK
Blog · Docs · Log in · Get started
Tessl Logo

tessl/pypi-deepeval

Comprehensive LLM evaluation framework with 50+ metrics for testing RAG, chatbots, and AI agents

Overview
Eval results
Files

docs/models.md

Models

Model abstraction layer supporting 15+ LLM providers, multimodal models, and embedding models with a unified interface. Use custom models for metric evaluation or integrate with existing LLM applications.

Imports

from deepeval.models import (
    # Base classes
    DeepEvalBaseLLM,
    DeepEvalBaseMLLM,
    DeepEvalBaseEmbeddingModel,
    # LLM implementations
    GPTModel,
    AnthropicModel,
    GeminiModel,
    OllamaModel,
    LocalModel,
    AzureOpenAIModel,
    LiteLLMModel,
    AmazonBedrockModel,
    KimiModel,
    GrokModel,
    DeepSeekModel,
    # Multimodal models
    MultimodalOpenAIModel,
    MultimodalGeminiModel,
    MultimodalOllamaModel,
    # Embedding models
    OpenAIEmbeddingModel,
    AzureOpenAIEmbeddingModel,
    LocalEmbeddingModel,
    OllamaEmbeddingModel
)

Capabilities

Base LLM Class

Abstract base class for LLM integrations.

class DeepEvalBaseLLM:
    """
    Base class for LLM integrations.

    Attributes:
    - model_name (str, optional): Name of the model
    - model (Any): The underlying model instance

    Abstract Methods:
    - load_model(*args, **kwargs): Load the model
    - generate(prompt: str, **kwargs) -> str: Generate text
    - a_generate(prompt: str, **kwargs) -> str: Async generate
    - get_model_name() -> str: Get model name

    Optional Methods:
    - batch_generate(prompts: List[str], **kwargs) -> List[str]: Batch generation
    """

LLM Implementations

OpenAI GPT Models

class GPTModel:
    """
    OpenAI GPT model integration.

    Parameters:
    - model (str, optional): Model name (default: "gpt-4o")
    - api_key (str, optional): OpenAI API key
    - *args, **kwargs: Additional arguments for OpenAI client

    Methods:
    - generate(prompt: str) -> str
    - a_generate(prompt: str) -> str
    - get_model_name() -> str
    """

Usage:

from deepeval.models import GPTModel
from deepeval.metrics import AnswerRelevancyMetric

# Use GPT-4 for evaluation
model = GPTModel(model="gpt-4")

metric = AnswerRelevancyMetric(
    threshold=0.7,
    model=model
)

Anthropic Claude

class AnthropicModel:
    """
    Anthropic Claude integration.

    Parameters:
    - model (str, optional): Model name (default: "claude-3-5-sonnet-20241022")
    - api_key (str, optional): Anthropic API key
    """

Google Gemini

class GeminiModel:
    """
    Google Gemini integration.

    Parameters:
    - model (str, optional): Model name (default: "gemini-2.0-flash-exp")
    - api_key (str, optional): Google API key
    """

Local/Ollama Models

class OllamaModel:
    """
    Ollama model integration for local models.

    Parameters:
    - model (str, optional): Model name (default: "llama3.2")
    - base_url (str, optional): Ollama server URL
    """

class LocalModel:
    """
    Local model integration (e.g., HuggingFace).

    Parameters:
    - model (Any): HuggingFace model or pipeline
    - tokenizer (Any, optional): Tokenizer
    """

Azure OpenAI

class AzureOpenAIModel:
    """
    Azure OpenAI integration.

    Parameters:
    - deployment_name (str): Azure deployment name
    - api_key (str, optional): Azure API key
    - azure_endpoint (str, optional): Azure endpoint URL
    - api_version (str, optional): API version
    """

Other Providers

class LiteLLMModel:
    """
    LiteLLM integration for unified API across providers.

    Parameters:
    - model (str): Model name (e.g., "anthropic/claude-3-opus")
    """

class AmazonBedrockModel:
    """Amazon Bedrock integration."""

class KimiModel:
    """Kimi model integration."""

class GrokModel:
    """Grok model integration."""

class DeepSeekModel:
    """DeepSeek model integration."""

Multimodal LLM Class

class DeepEvalBaseMLLM:
    """
    Base class for multimodal LLM integrations.

    Abstract Methods:
    - generate(messages: List, **kwargs) -> str: Generate from multimodal input
    - a_generate(messages: List, **kwargs) -> str: Async generate
    - get_model_name() -> str: Get model name
    """

class MultimodalOpenAIModel:
    """
    OpenAI multimodal integration (GPT-4V, etc.).

    Parameters:
    - model (str, optional): Model name (default: "gpt-4o")
    """

class MultimodalGeminiModel:
    """Gemini multimodal integration."""

class MultimodalOllamaModel:
    """Ollama multimodal integration."""

Embedding Models

class DeepEvalBaseEmbeddingModel:
    """
    Base class for embedding model integrations.

    Abstract Methods:
    - embed_text(text: str) -> List[float]: Embed single text
    - a_embed_text(text: str) -> List[float]: Async embed single text
    - embed_texts(texts: List[str]) -> List[List[float]]: Embed multiple texts
    - a_embed_texts(texts: List[str]) -> List[List[float]]: Async embed multiple texts
    - get_model_name() -> str: Get model name
    """

class OpenAIEmbeddingModel:
    """
    OpenAI embeddings integration.

    Parameters:
    - model (str, optional): Model name (default: "text-embedding-3-small")
    """

class AzureOpenAIEmbeddingModel:
    """Azure OpenAI embeddings integration."""

class LocalEmbeddingModel:
    """Local embedding model integration."""

class OllamaEmbeddingModel:
    """Ollama embeddings integration."""

Usage Examples

Using Custom Models for Metrics

from deepeval.models import GPTModel, AnthropicModel
from deepeval.metrics import AnswerRelevancyMetric, FaithfulnessMetric

# Use GPT-4 for one metric
gpt4_metric = AnswerRelevancyMetric(
    model=GPTModel(model="gpt-4"),
    threshold=0.7
)

# Use Claude for another
claude_metric = FaithfulnessMetric(
    model=AnthropicModel(model="claude-3-5-sonnet-20241022"),
    threshold=0.8
)

Using Local Models

from deepeval.models import OllamaModel
from deepeval.metrics import GEval
from deepeval.test_case import LLMTestCaseParams

# Use local Llama model for evaluation
local_model = OllamaModel(
    model="llama3.2",
    base_url="http://localhost:11434"
)

metric = GEval(
    name="Quality",
    criteria="Evaluate response quality",
    evaluation_params=[LLMTestCaseParams.ACTUAL_OUTPUT],
    model=local_model
)

Creating Custom Model Integration

import requests

from deepeval.models import DeepEvalBaseLLM
from deepeval.metrics import AnswerRelevancyMetric

class CustomModel(DeepEvalBaseLLM):
    def __init__(self, api_endpoint: str):
        self.api_endpoint = api_endpoint
        self.model_name = "custom-model-v1"

    def load_model(self):
        # Initialize your model
        pass

    def generate(self, prompt: str) -> str:
        # Call your model API
        response = requests.post(
            self.api_endpoint,
            json={"prompt": prompt}
        )
        return response.json()["output"]

    async def a_generate(self, prompt: str) -> str:
        # Async version
        return self.generate(prompt)

    def get_model_name(self) -> str:
        return self.model_name

# Use custom model
custom_model = CustomModel(api_endpoint="https://api.example.com/generate")
metric = AnswerRelevancyMetric(model=custom_model)

Multimodal Models

from deepeval.models import MultimodalOpenAIModel
from deepeval.metrics import MultimodalGEval
from deepeval.test_case import MLLMTestCase, MLLMImage, MLLMTestCaseParams

# Use GPT-4V for multimodal evaluation
mllm = MultimodalOpenAIModel(model="gpt-4o")

metric = MultimodalGEval(
    name="Image Description Quality",
    criteria="Evaluate if the description accurately represents the image",
    evaluation_params=[MLLMTestCaseParams.INPUT, MLLMTestCaseParams.ACTUAL_OUTPUT],
    model=mllm
)

test_case = MLLMTestCase(
    input=["Describe this image:", MLLMImage(url="photo.jpg", local=True)],
    actual_output=["A golden retriever playing in a park"]
)

metric.measure(test_case)

Install with Tessl CLI

npx tessl i tessl/pypi-deepeval

docs

agentic-metrics.md

benchmarks.md

content-quality-metrics.md

conversational-metrics.md

core-evaluation.md

custom-metrics.md

dataset.md

index.md

integrations.md

models.md

multimodal-metrics.md

rag-metrics.md

synthesizer.md

test-cases.md

tracing.md

tile.json