Comprehensive LLM evaluation framework with 50+ metrics for testing RAG pipelines, chatbots, and AI agents.
Component-level observability for evaluating nested LLM components using the @observe decorator and trace management. Enable tracing to evaluate individual components within your LLM application.
from deepeval.tracing import (
observe,
trace,
trace_manager,
update_current_span,
update_current_trace,
update_retriever_span,
update_llm_span,
evaluate_trace,
evaluate_span,
evaluate_thread
)

Decorator for observing function execution and applying metrics to components.
def observe(
metrics: Optional[List[BaseMetric]] = None,
name: Optional[str] = None,
type: Optional[str] = None
):
"""
Decorator for observing function execution.
Parameters:
- metrics (List[BaseMetric], optional): Metrics to apply to this component
- name (str, optional): Name for the span
- type (str, optional): Type of component (e.g., "llm", "retriever", "tool")
Usage:
- Decorate any function to create a traced span
- Use update_current_span() within function to add test case data
- Metrics are evaluated automatically on the component
"""Usage example:
from deepeval.tracing import observe, update_current_span
from deepeval.metrics import AnswerRelevancyMetric, FaithfulnessMetric
from deepeval.test_case import LLMTestCase
# Define metrics for components
answer_relevancy = AnswerRelevancyMetric(threshold=0.7)
faithfulness = FaithfulnessMetric(threshold=0.8)
@observe(metrics=[answer_relevancy, faithfulness])
def llm_component(query: str, context: list):
"""LLM component that generates answer from context."""
# Your LLM call
answer = call_llm(query, context)
# Update span with test case data
update_current_span(
test_case=LLMTestCase(
input=query,
actual_output=answer,
retrieval_context=context
)
)
return answer
@observe(name="retrieval", type="retriever")
def retriever_component(query: str):
"""Retrieval component."""
results = vector_search(query)
update_retriever_span(
embedder="text-embedding-ada-002",
top_k=10,
chunk_size=512
)
return results
@observe(name="rag_pipeline")
def rag_pipeline(user_query: str):
"""Full RAG pipeline with traced components."""
# Each component is traced
context = retriever_component(user_query)
answer = llm_component(user_query, context)
return answer
# Execute and automatically evaluate components
result = rag_pipeline("What is quantum computing?")Functions to update span data during execution.
def update_current_span(
test_case: Optional[LLMTestCase] = None,
**kwargs
):
"""
Updates the current span with additional data.
Parameters:
- test_case (LLMTestCase, optional): Test case data for the span
- **kwargs: Additional span attributes (metadata, tags, etc.)
"""
def update_current_trace(
**kwargs
):
"""
Updates the current trace with additional data.
Parameters:
- **kwargs: Trace-level attributes
"""
def update_retriever_span(
embedder: Optional[str] = None,
top_k: Optional[int] = None,
chunk_size: Optional[int] = None
):
"""
Updates retriever-specific span data.
Parameters:
- embedder (str, optional): Name of the embedding model used
- top_k (int, optional): Number of top results retrieved
- chunk_size (int, optional): Size of chunks used in retrieval
"""
def update_llm_span(
model: Optional[str] = None,
input_token_count: Optional[float] = None,
output_token_count: Optional[float] = None,
cost_per_input_token: Optional[float] = None,
cost_per_output_token: Optional[float] = None,
token_intervals: Optional[Dict[float, str]] = None,
prompt: Optional[Prompt] = None
):
"""
Updates LLM-specific span data.
Parameters:
- model (str, optional): Model name
- input_token_count (float, optional): Number of input tokens
- output_token_count (float, optional): Number of output tokens
- cost_per_input_token (float, optional): Cost per input token
- cost_per_output_token (float, optional): Cost per output token
- token_intervals (Dict[float, str], optional): Token timing intervals
- prompt (Prompt, optional): Prompt object used
"""Context manager for creating trace scopes.
def trace(name: Optional[str] = None):
"""
Context manager for tracing execution.
Parameters:
- name (str, optional): Name for the trace
"""Usage:
from deepeval.tracing import trace, observe
@observe
def process_document(doc):
# Processing logic
return result
def main():
with trace(name="document_processing"):
for doc in documents:
process_document(doc)Evaluate traces after execution.
def evaluate_trace(
trace_uuid: str,
metric_collection: str
):
"""
Evaluates a specific trace using a Confident AI metric collection.
Parameters:
- trace_uuid (str): UUID of the trace to evaluate
- metric_collection (str): Name of the metric collection on Confident AI
"""
def evaluate_span(
span_uuid: str,
metric_collection: str
):
"""
Evaluates a specific span using a Confident AI metric collection.
Parameters:
- span_uuid (str): UUID of the span to evaluate
- metric_collection (str): Name of the metric collection on Confident AI
"""
def evaluate_thread(
thread_id: str,
metric_collection: str,
overwrite_metrics: bool = False
):
"""
Evaluates a traced thread using a Confident AI metric collection.
Parameters:
- thread_id (str): ID of the thread to evaluate
- metric_collection (str): Name of the metric collection on Confident AI
- overwrite_metrics (bool): Whether to overwrite existing metrics (default: False)
"""from deepeval import evaluate
from deepeval.tracing import observe, update_current_span
from deepeval.metrics import FaithfulnessMetric, AnswerRelevancyMetric
from deepeval.test_case import LLMTestCase
from deepeval.dataset import Golden
# Define component metrics
faithfulness = FaithfulnessMetric(threshold=0.8)
relevancy = AnswerRelevancyMetric(threshold=0.7)
@observe(metrics=[faithfulness, relevancy])
def answer_generator(question: str, context: list):
"""Generate answer from context."""
answer = llm_generate(question, context)
# Provide test case data for evaluation
update_current_span(
test_case=LLMTestCase(
input=question,
actual_output=answer,
retrieval_context=context
)
)
return answer
@observe(name="rag_app")
def rag_application(question: str):
"""Main RAG application."""
context = retrieve_context(question)
answer = answer_generator(question, context)
return answer
# Evaluate using observed callback
goldens = [
Golden(input="What is Python?"),
Golden(input="What is JavaScript?")
]
result = evaluate(
observed_callback=rag_application,
goldens=goldens
)from deepeval.tracing import observe, update_current_span
from deepeval.metrics import ToolCorrectnessMetric
from deepeval.test_case import LLMTestCase, ToolCall
tool_metric = ToolCorrectnessMetric(threshold=0.8)
@observe(name="tool_selector")
def select_tools(query: str):
"""Select appropriate tools."""
tools = analyze_and_select_tools(query)
return tools
@observe(metrics=[tool_metric])
def tool_executor(query: str, tools: list):
"""Execute tools."""
results = []
tool_calls = []
for tool in tools:
result = execute_tool(tool, query)
results.append(result)
tool_calls.append(ToolCall(
name=tool.name,
input_parameters=tool.params,
output=result
))
update_current_span(
test_case=LLMTestCase(
input=query,
actual_output=str(results),
tools_called=tool_calls
)
)
return results
@observe(name="agent")
def agent_pipeline(query: str):
"""Full agent pipeline."""
tools = select_tools(query)
results = tool_executor(query, tools)
final_answer = synthesize_answer(results)
return final_answer
# Execute with tracing
answer = agent_pipeline("Book a flight to NYC")from deepeval.tracing import observe, update_current_span
from deepeval.dataset import Golden, get_current_golden
from deepeval.test_case import LLMTestCase
@observe
def my_component(input_text: str):
"""Component that accesses current golden."""
# Get current golden from context
golden = get_current_golden()
# Process input
output = process(input_text)
# Use golden data in test case
update_current_span(
test_case=LLMTestCase(
input=input_text,
actual_output=output,
expected_output=golden.expected_output if golden else None,
retrieval_context=golden.retrieval_context if golden else None
)
)
return output
# Evaluate with goldens
from deepeval import evaluate
goldens = [Golden(input="test", expected_output="result")]
result = evaluate(observed_callback=my_component, goldens=goldens)from deepeval.tracing import trace_manager
# Get all traces
traces = trace_manager.get_traces()
# Get specific trace
trace = trace_manager.get_trace(trace_id="abc123")
# Get spans for a trace
spans = trace_manager.get_spans(trace_id="abc123")
# Clear traces
trace_manager.clear()Traces are automatically synced to Confident AI when logged in:
deepeval loginfrom deepeval.tracing import observe
@observe
def my_function(input):
# This trace will be synced to Confident AI
return process(input)
my_function("test")
# View traces at app.confident-ai.com