Comprehensive LLM evaluation framework with 50+ metrics for testing RAG, chatbots, and AI agents
Native integrations with popular LLM frameworks for automatic tracing and evaluation. DeepEval integrates with LangChain, LlamaIndex, CrewAI, and PydanticAI to provide seamless evaluation capabilities.
# LangChain
from deepeval.integrations.langchain import CallbackHandler, tool
# LlamaIndex
from deepeval.integrations.llama_index import instrument_llama_index
# CrewAI
from deepeval.integrations.crewai import (
instrument_crewai,
Crew,
Agent,
LLM,
tool
)
# PydanticAI
from deepeval.integrations.pydantic_ai import (
instrument_pydantic_ai,
ConfidentInstrumentationSettings,
Agent
)

Integrate DeepEval with LangChain applications using callback handlers.
class CallbackHandler:
    """DeepEval tracing hook for LangChain.

    Add an instance to a chain's or agent's ``callbacks`` list; every
    LangChain execution routed through it is then traced automatically
    and synced with Confident AI.
    """
def tool(func):
"""
Decorator for marking LangChain tools for tracing.
Parameters:
- func: Tool function to decorate
Returns:
- Decorated tool with tracing
"""Usage example:
from langchain.chains import RetrievalQA
from langchain_openai import ChatOpenAI
from deepeval.integrations.langchain import CallbackHandler
# Create callback handler
callback = CallbackHandler()

# Use with LangChain
llm = ChatOpenAI(temperature=0)
# NOTE(review): `retriever` must already be defined by your application
# (e.g. a vector-store retriever) — it is not created in this snippet.
qa_chain = RetrievalQA.from_chain_type(
    llm=llm,
    retriever=retriever,
    callbacks=[callback]
)
# Executions are automatically traced
result = qa_chain.run("What is quantum computing?")

LangChain tool tracing:
from langchain.tools import tool as langchain_tool
from deepeval.integrations.langchain import tool as deepeval_tool
# Stack DeepEval's tool decorator on top of LangChain's so the tool is
# both registered with LangChain and traced by DeepEval.
@deepeval_tool
@langchain_tool
def search_database(query: str) -> str:
    """Search the product database."""
    # NOTE(review): `db` is assumed to be a module-level database client
    # defined elsewhere in your application.
    results = db.search(query)
    return results
# Tool calls are automatically traced

Instrument LlamaIndex applications for automatic tracing.
def instrument_llama_index():
"""
Instruments LlamaIndex for automatic tracing.
Usage:
- Call once at the start of your application
- All LlamaIndex operations are automatically traced
- Traces include retrieval, LLM calls, and synthesis
"""Usage example:
from llama_index.core import VectorStoreIndex, SimpleDirectoryReader
from deepeval.integrations.llama_index import instrument_llama_index
# Instrument LlamaIndex — call once, before any LlamaIndex operations.
instrument_llama_index()

# All operations are now traced
documents = SimpleDirectoryReader('./data').load_data()
index = VectorStoreIndex.from_documents(documents)
query_engine = index.as_query_engine()
response = query_engine.query("What is the main topic?")

Evaluate LlamaIndex with metrics:
from deepeval.integrations.llama_index import instrument_llama_index
from deepeval.tracing import observe
from deepeval.metrics import AnswerRelevancyMetric
from deepeval.test_case import LLMTestCase
from deepeval import evaluate
instrument_llama_index()

answer_relevancy = AnswerRelevancyMetric(threshold=0.7)

@observe(metrics=[answer_relevancy])
def query_llama_index(question: str):
    """Query LlamaIndex with tracing."""
    # NOTE(review): `query_engine` is assumed to be built earlier
    # (see the previous example) — it is not defined in this snippet.
    response = query_engine.query(question)
    # Attach a test case to the current span so the metric can score it.
    from deepeval.tracing import update_current_span
    update_current_span(
        test_case=LLMTestCase(
            input=question,
            actual_output=str(response)
        )
    )
    return response

# Evaluate
from deepeval.dataset import Golden
goldens = [Golden(input="What is AI?")]
result = evaluate(observed_callback=query_llama_index, goldens=goldens)

Instrument CrewAI agents and crews for tracing.
def instrument_crewai():
    """Enable automatic DeepEval tracing for CrewAI.

    Call once at application startup. Afterwards every CrewAI agent
    operation — task execution, tool usage, and agent collaboration —
    is traced automatically.
    """
class Crew:
    """Drop-in replacement for ``crewai.Crew`` with DeepEval tracing.

    Exposes the same API as the upstream Crew class; construction and
    ``kickoff()`` runs are traced automatically.
    """
class Agent:
    """Drop-in replacement for ``crewai.Agent`` with DeepEval tracing.

    Same constructor arguments and behavior as the upstream Agent
    class; agent activity is traced automatically.
    """
class LLM:
    """Drop-in replacement for the CrewAI ``LLM`` class.

    Wraps the upstream LLM so model calls made by CrewAI agents are
    traced by DeepEval.
    """
def tool(func):
"""
Decorator for marking CrewAI tools.
"""Usage example:
from deepeval.integrations.crewai import instrument_crewai, Crew, Agent, tool
# Instrument CrewAI — call once, before creating agents or crews.
instrument_crewai()

# Define tools with tracing
@tool
def search_web(query: str) -> str:
    """Search the web for information."""
    # NOTE(review): `perform_search` is a placeholder for your own
    # search implementation — it is not defined in this snippet.
    return perform_search(query)

# Create agents (automatically traced)
from crewai import Task

researcher = Agent(
    role="Researcher",
    goal="Research information",
    backstory="Expert researcher",
    tools=[search_web]
)
writer = Agent(
    role="Writer",
    goal="Write content",
    backstory="Expert writer"
)

# Create tasks
research_task = Task(
    description="Research quantum computing",
    agent=researcher
)
write_task = Task(
    description="Write an article about quantum computing",
    agent=writer
)

# Create and run crew (automatically traced)
crew = Crew(
    agents=[researcher, writer],
    tasks=[research_task, write_task]
)
result = crew.kickoff()

Alternative using wrapped classes:
from deepeval.integrations.crewai import Crew, Agent, LLM, tool
@tool
def calculator(expression: str) -> float:
    """Calculate mathematical expressions."""
    # WARNING: eval() executes arbitrary Python — never use it on
    # untrusted input. In production prefer ast.literal_eval or a
    # dedicated math-expression parser.
    return eval(expression)

# Use wrapped classes
agent = Agent(
    role="Math Expert",
    goal="Solve math problems",
    tools=[calculator],
    llm=LLM(model="gpt-4")
)
# NOTE(review): `tasks=[...]` is a placeholder — supply real Task objects.
crew = Crew(agents=[agent], tasks=[...])
result = crew.kickoff()
# All operations are traced

Instrument PydanticAI agents for tracing.
def instrument_pydantic_ai(
    settings: Optional[ConfidentInstrumentationSettings] = None
):
    """
    Instruments PydanticAI for automatic tracing.

    Call once at application startup, before creating agents.

    Parameters:
    - settings (ConfidentInstrumentationSettings, optional): Instrumentation
      configuration; default behavior is used when omitted.
    """
class ConfidentInstrumentationSettings:
    """Options controlling what PydanticAI instrumentation records.

    Parameters:
    - trace_runs (bool): trace agent runs (default: True)
    - trace_tools (bool): trace tool calls (default: True)
    - trace_prompts (bool): trace prompts (default: True)
    """
class Agent:
"""
DeepEval-wrapped PydanticAI Agent class.
Usage:
- Use instead of pydantic_ai.Agent for automatic tracing
- Same API as PydanticAI Agent
"""Usage example:
from deepeval.integrations.pydantic_ai import (
instrument_pydantic_ai,
ConfidentInstrumentationSettings
)
# Instrument with settings — call once, before creating agents.
instrument_pydantic_ai(
    settings=ConfidentInstrumentationSettings(
        trace_runs=True,
        trace_tools=True,
        trace_prompts=True
    )
)

# Use PydanticAI as normal
from pydantic_ai import Agent

agent = Agent(
    "openai:gpt-4",
    system_prompt="You are a helpful assistant"
)
# Agent runs are automatically traced
result = agent.run_sync("What is quantum computing?")

Using wrapped Agent:
from deepeval.integrations.pydantic_ai import Agent
# The DeepEval-wrapped Agent takes the same arguments as pydantic_ai.Agent.
agent = Agent(
    "openai:gpt-4",
    system_prompt="You are a helpful assistant"
)
# Automatically traced
result = agent.run_sync("Explain machine learning")

Evaluate LangChain with metrics:

from langchain.chains import RetrievalQA
from deepeval.integrations.langchain import CallbackHandler
from deepeval.metrics import FaithfulnessMetric, AnswerRelevancyMetric
from deepeval import evaluate
from deepeval.test_case import LLMTestCase
from deepeval.dataset import Golden
callback = CallbackHandler()

# NOTE(review): `llm` and `retriever` are assumed to be configured
# earlier — they are not created in this snippet.
qa_chain = RetrievalQA.from_chain_type(
    llm=llm,
    retriever=retriever,
    callbacks=[callback]
)

# Wrap for evaluation
from deepeval.tracing import observe, update_current_span

@observe(metrics=[
    FaithfulnessMetric(threshold=0.8),
    AnswerRelevancyMetric(threshold=0.7)
])
def query_with_langchain(question: str):
    result = qa_chain.run(question)
    # Record input/output on the current span so the metrics can score it.
    update_current_span(
        test_case=LLMTestCase(
            input=question,
            actual_output=result
        )
    )
    return result

# Evaluate
goldens = [Golden(input="What is AI?")]
result = evaluate(observed_callback=query_with_langchain, goldens=goldens)

Evaluate CrewAI with metrics:

from deepeval.integrations.crewai import instrument_crewai, Crew
from deepeval.metrics import TaskCompletionMetric, ToolCorrectnessMetric
from deepeval.tracing import observe, update_current_span
from deepeval.test_case import LLMTestCase
from deepeval import evaluate
instrument_crewai()

task_completion = TaskCompletionMetric(threshold=0.8)

@observe(metrics=[task_completion])
def run_crew_task(task_description: str):
    """Run CrewAI task with evaluation."""
    # NOTE(review): `agents=[...]` and `tasks=[...]` are placeholders —
    # supply real Agent and Task objects.
    crew = Crew(agents=[...], tasks=[...])
    result = crew.kickoff()
    # Record input/output on the current span so the metric can score it.
    update_current_span(
        test_case=LLMTestCase(
            input=task_description,
            actual_output=str(result)
        )
    )
    return result

# Evaluate crew performance
from deepeval.dataset import Golden
goldens = [Golden(input="Research and write about AI")]
result = evaluate(observed_callback=run_crew_task, goldens=goldens)

@observe decorator with metrics for component evaluation