Comprehensive LLM evaluation framework with 50+ metrics for testing RAG, chatbots, and AI agents
Native integrations with popular LLM frameworks for automatic tracing and evaluation. DeepEval integrates with LangChain, LlamaIndex, CrewAI, and PydanticAI to provide seamless evaluation capabilities.
# LangChain
from deepeval.integrations.langchain import CallbackHandler, tool
# LlamaIndex
from deepeval.integrations.llama_index import instrument_llama_index
# CrewAI
from deepeval.integrations.crewai import (
instrument_crewai,
Crew,
Agent,
LLM,
tool
)
# PydanticAI
from deepeval.integrations.pydantic_ai import (
instrument_pydantic_ai,
ConfidentInstrumentationSettings,
Agent
)

Integrate DeepEval with LangChain applications using callback handlers.
class CallbackHandler:
    """DeepEval tracing hook for LangChain.

    Add an instance to a chain's or agent's ``callbacks`` list; every
    LangChain execution routed through it is then traced automatically
    and synced with Confident AI.
    """
def tool(func):
"""
Decorator for marking LangChain tools for tracing.
Parameters:
- func: Tool function to decorate
Returns:
- Decorated tool with tracing
"""Usage example:
from langchain.chains import RetrievalQA
from langchain_openai import ChatOpenAI
from deepeval.integrations.langchain import CallbackHandler
# Create callback handler
callback = CallbackHandler()

# Use with LangChain
llm = ChatOpenAI(temperature=0)
# NOTE(review): `retriever` must already be defined by your application
# (e.g. a vector-store retriever) — it is not created in this snippet.
qa_chain = RetrievalQA.from_chain_type(
    llm=llm,
    retriever=retriever,
    callbacks=[callback]
)
# Executions are automatically traced
result = qa_chain.run("What is quantum computing?")

LangChain tool tracing:
from langchain.tools import tool as langchain_tool
from deepeval.integrations.langchain import tool as deepeval_tool
# Stack DeepEval's tool decorator on top of LangChain's so the tool is
# both registered with LangChain and traced by DeepEval.
@deepeval_tool
@langchain_tool
def search_database(query: str) -> str:
    """Search the product database."""
    # NOTE(review): `db` is assumed to be a module-level database client
    # defined elsewhere in your application.
    results = db.search(query)
    return results
# Tool calls are automatically traced

Instrument LlamaIndex applications for automatic tracing.
def instrument_llama_index():
"""
Instruments LlamaIndex for automatic tracing.
Usage:
- Call once at the start of your application
- All LlamaIndex operations are automatically traced
- Traces include retrieval, LLM calls, and synthesis
"""Usage example:
from llama_index.core import VectorStoreIndex, SimpleDirectoryReader
from deepeval.integrations.llama_index import instrument_llama_index
# Instrument LlamaIndex — call once, before any LlamaIndex operations.
instrument_llama_index()

# All operations are now traced
documents = SimpleDirectoryReader('./data').load_data()
index = VectorStoreIndex.from_documents(documents)
query_engine = index.as_query_engine()
response = query_engine.query("What is the main topic?")

Evaluate LlamaIndex with metrics:
from deepeval.integrations.llama_index import instrument_llama_index
from deepeval.tracing import observe
from deepeval.metrics import AnswerRelevancyMetric
from deepeval.test_case import LLMTestCase
from deepeval import evaluate
instrument_llama_index()

answer_relevancy = AnswerRelevancyMetric(threshold=0.7)

@observe(metrics=[answer_relevancy])
def query_llama_index(question: str):
    """Query LlamaIndex with tracing."""
    # NOTE(review): `query_engine` is assumed to be built earlier
    # (see the previous example) — it is not defined in this snippet.
    response = query_engine.query(question)
    # Attach a test case to the current span so the metric can score it.
    from deepeval.tracing import update_current_span
    update_current_span(
        test_case=LLMTestCase(
            input=question,
            actual_output=str(response)
        )
    )
    return response

# Evaluate
from deepeval.dataset import Golden
goldens = [Golden(input="What is AI?")]
result = evaluate(observed_callback=query_llama_index, goldens=goldens)

Instrument CrewAI agents and crews for tracing.
def instrument_crewai():
    """Enable automatic DeepEval tracing for CrewAI.

    Call once at application startup. Afterwards every CrewAI agent
    operation — task execution, tool usage, and agent collaboration —
    is traced automatically.
    """
class Crew:
    """Drop-in replacement for ``crewai.Crew`` with DeepEval tracing.

    Exposes the same API as the upstream Crew class; construction and
    ``kickoff()`` runs are traced automatically.
    """
class Agent:
    """Drop-in replacement for ``crewai.Agent`` with DeepEval tracing.

    Same constructor arguments and behavior as the upstream Agent
    class; agent activity is traced automatically.
    """
class LLM:
    """Drop-in replacement for the CrewAI ``LLM`` class.

    Wraps the upstream LLM so model calls made by CrewAI agents are
    traced by DeepEval.
    """
def tool(func):
"""
Decorator for marking CrewAI tools.
"""Usage example:
from deepeval.integrations.crewai import instrument_crewai, Crew, Agent, tool
# Instrument CrewAI — call once, before creating agents or crews.
instrument_crewai()

# Define tools with tracing
@tool
def search_web(query: str) -> str:
    """Search the web for information."""
    # NOTE(review): `perform_search` is a placeholder for your own
    # search implementation — it is not defined in this snippet.
    return perform_search(query)

# Create agents (automatically traced)
from crewai import Task

researcher = Agent(
    role="Researcher",
    goal="Research information",
    backstory="Expert researcher",
    tools=[search_web]
)
writer = Agent(
    role="Writer",
    goal="Write content",
    backstory="Expert writer"
)

# Create tasks
research_task = Task(
    description="Research quantum computing",
    agent=researcher
)
write_task = Task(
    description="Write an article about quantum computing",
    agent=writer
)

# Create and run crew (automatically traced)
crew = Crew(
    agents=[researcher, writer],
    tasks=[research_task, write_task]
)
result = crew.kickoff()

Alternative using wrapped classes:
from deepeval.integrations.crewai import Crew, Agent, LLM, tool
@tool
def calculator(expression: str) -> float:
    """Calculate mathematical expressions."""
    # WARNING: eval() executes arbitrary Python — never use it on
    # untrusted input. In production prefer ast.literal_eval or a
    # dedicated math-expression parser.
    return eval(expression)

# Use wrapped classes
agent = Agent(
    role="Math Expert",
    goal="Solve math problems",
    tools=[calculator],
    llm=LLM(model="gpt-4")
)
# NOTE(review): `tasks=[...]` is a placeholder — supply real Task objects.
crew = Crew(agents=[agent], tasks=[...])
result = crew.kickoff()
# All operations are traced

Instrument PydanticAI agents for tracing.
def instrument_pydantic_ai(
    settings: Optional[ConfidentInstrumentationSettings] = None
):
    """
    Instruments PydanticAI for automatic tracing.

    Call once at application startup, before creating agents.

    Parameters:
    - settings (ConfidentInstrumentationSettings, optional): Instrumentation
      configuration; default behavior is used when omitted.
    """
class ConfidentInstrumentationSettings:
    """Options controlling what PydanticAI instrumentation records.

    Parameters:
    - trace_runs (bool): trace agent runs (default: True)
    - trace_tools (bool): trace tool calls (default: True)
    - trace_prompts (bool): trace prompts (default: True)
    """
class Agent:
"""
DeepEval-wrapped PydanticAI Agent class.
Usage:
- Use instead of pydantic_ai.Agent for automatic tracing
- Same API as PydanticAI Agent
"""Usage example:
from deepeval.integrations.pydantic_ai import (
instrument_pydantic_ai,
ConfidentInstrumentationSettings
)
# Instrument with settings — call once, before creating agents.
instrument_pydantic_ai(
    settings=ConfidentInstrumentationSettings(
        trace_runs=True,
        trace_tools=True,
        trace_prompts=True
    )
)

# Use PydanticAI as normal
from pydantic_ai import Agent

agent = Agent(
    "openai:gpt-4",
    system_prompt="You are a helpful assistant"
)
# Agent runs are automatically traced
result = agent.run_sync("What is quantum computing?")

Using wrapped Agent:
from deepeval.integrations.pydantic_ai import Agent
# The DeepEval-wrapped Agent takes the same arguments as pydantic_ai.Agent.
agent = Agent(
    "openai:gpt-4",
    system_prompt="You are a helpful assistant"
)
# Automatically traced
result = agent.run_sync("Explain machine learning")

Evaluate LangChain with metrics:

from langchain.chains import RetrievalQA
from deepeval.integrations.langchain import CallbackHandler
from deepeval.metrics import FaithfulnessMetric, AnswerRelevancyMetric
from deepeval import evaluate
from deepeval.test_case import LLMTestCase
from deepeval.dataset import Golden
callback = CallbackHandler()

# NOTE(review): `llm` and `retriever` are assumed to be configured
# earlier — they are not created in this snippet.
qa_chain = RetrievalQA.from_chain_type(
    llm=llm,
    retriever=retriever,
    callbacks=[callback]
)

# Wrap for evaluation
from deepeval.tracing import observe, update_current_span

@observe(metrics=[
    FaithfulnessMetric(threshold=0.8),
    AnswerRelevancyMetric(threshold=0.7)
])
def query_with_langchain(question: str):
    result = qa_chain.run(question)
    # Record input/output on the current span so the metrics can score it.
    update_current_span(
        test_case=LLMTestCase(
            input=question,
            actual_output=result
        )
    )
    return result

# Evaluate
goldens = [Golden(input="What is AI?")]
result = evaluate(observed_callback=query_with_langchain, goldens=goldens)

Evaluate CrewAI with metrics:

from deepeval.integrations.crewai import instrument_crewai, Crew
from deepeval.metrics import TaskCompletionMetric, ToolCorrectnessMetric
from deepeval.tracing import observe, update_current_span
from deepeval.test_case import LLMTestCase
from deepeval import evaluate
instrument_crewai()

task_completion = TaskCompletionMetric(threshold=0.8)

@observe(metrics=[task_completion])
def run_crew_task(task_description: str):
    """Run CrewAI task with evaluation."""
    # NOTE(review): `agents=[...]` and `tasks=[...]` are placeholders —
    # supply real Agent and Task objects.
    crew = Crew(agents=[...], tasks=[...])
    result = crew.kickoff()
    # Record input/output on the current span so the metric can score it.
    update_current_span(
        test_case=LLMTestCase(
            input=task_description,
            actual_output=str(result)
        )
    )
    return result

# Evaluate crew performance
from deepeval.dataset import Golden
goldens = [Golden(input="Research and write about AI")]
result = evaluate(observed_callback=run_crew_task, goldens=goldens)

@observe decorator with metrics for component evaluation