Comprehensive LLM evaluation framework with 50+ metrics for testing RAG pipelines, chatbots, and AI agents.
Component-level observability for evaluating nested LLM components using the @observe decorator and trace management. Enable tracing to evaluate individual components within your LLM application.
from deepeval.tracing import (
observe,
trace,
trace_manager,
update_current_span,
update_current_trace,
update_retriever_span,
update_llm_span,
evaluate_trace,
evaluate_span,
evaluate_thread
)

Decorator for observing function execution and applying metrics to components.
def observe(
metrics: Optional[List[BaseMetric]] = None,
name: Optional[str] = None,
type: Optional[str] = None
):
"""
Decorator for observing function execution.
Parameters:
- metrics (List[BaseMetric], optional): Metrics to apply to this component
- name (str, optional): Name for the span
- type (str, optional): Type of component (e.g., "llm", "retriever", "tool")
Usage:
- Decorate any function to create a traced span
- Use update_current_span() within function to add test case data
- Metrics are evaluated automatically on the component
"""Usage example:
from deepeval.tracing import observe, update_current_span
from deepeval.metrics import AnswerRelevancyMetric, FaithfulnessMetric
from deepeval.test_case import LLMTestCase
# Define metrics for components
answer_relevancy = AnswerRelevancyMetric(threshold=0.7)
faithfulness = FaithfulnessMetric(threshold=0.8)
@observe(metrics=[answer_relevancy, faithfulness])
def llm_component(query: str, context: list):
"""LLM component that generates answer from context."""
# Your LLM call
answer = call_llm(query, context)
# Update span with test case data
update_current_span(
test_case=LLMTestCase(
input=query,
actual_output=answer,
retrieval_context=context
)
)
return answer
@observe(name="retrieval", type="retriever")
def retriever_component(query: str):
"""Retrieval component."""
results = vector_search(query)
update_retriever_span(
embedder="text-embedding-ada-002",
top_k=10,
chunk_size=512
)
return results
@observe(name="rag_pipeline")
def rag_pipeline(user_query: str):
"""Full RAG pipeline with traced components."""
# Each component is traced
context = retriever_component(user_query)
answer = llm_component(user_query, context)
return answer
# Execute and automatically evaluate components
result = rag_pipeline("What is quantum computing?")Functions to update span data during execution.
def update_current_span(
test_case: Optional[LLMTestCase] = None,
**kwargs
):
"""
Updates the current span with additional data.
Parameters:
- test_case (LLMTestCase, optional): Test case data for the span
- **kwargs: Additional span attributes (metadata, tags, etc.)
"""
def update_current_trace(
**kwargs
):
"""
Updates the current trace with additional data.
Parameters:
- **kwargs: Trace-level attributes
"""
def update_retriever_span(
embedder: Optional[str] = None,
top_k: Optional[int] = None,
chunk_size: Optional[int] = None
):
"""
Updates retriever-specific span data.
Parameters:
- embedder (str, optional): Name of the embedding model used
- top_k (int, optional): Number of top results retrieved
- chunk_size (int, optional): Size of chunks used in retrieval
"""
def update_llm_span(
model: Optional[str] = None,
input_token_count: Optional[float] = None,
output_token_count: Optional[float] = None,
cost_per_input_token: Optional[float] = None,
cost_per_output_token: Optional[float] = None,
token_intervals: Optional[Dict[float, str]] = None,
prompt: Optional[Prompt] = None
):
"""
Updates LLM-specific span data.
Parameters:
- model (str, optional): Model name
- input_token_count (float, optional): Number of input tokens
- output_token_count (float, optional): Number of output tokens
- cost_per_input_token (float, optional): Cost per input token
- cost_per_output_token (float, optional): Cost per output token
- token_intervals (Dict[float, str], optional): Token timing intervals
- prompt (Prompt, optional): Prompt object used
"""Context manager for creating trace scopes.
def trace(name: Optional[str] = None):
"""
Context manager for tracing execution.
Parameters:
- name (str, optional): Name for the trace
"""Usage:
from deepeval.tracing import trace, observe
@observe
def process_document(doc):
# Processing logic
return result
def main():
with trace(name="document_processing"):
for doc in documents:
process_document(doc)Evaluate traces after execution.
def evaluate_trace(
trace_uuid: str,
metric_collection: str
):
"""
Evaluates a specific trace using a Confident AI metric collection.
Parameters:
- trace_uuid (str): UUID of the trace to evaluate
- metric_collection (str): Name of the metric collection on Confident AI
"""
def evaluate_span(
span_uuid: str,
metric_collection: str
):
"""
Evaluates a specific span using a Confident AI metric collection.
Parameters:
- span_uuid (str): UUID of the span to evaluate
- metric_collection (str): Name of the metric collection on Confident AI
"""
def evaluate_thread(
thread_id: str,
metric_collection: str,
overwrite_metrics: bool = False
):
"""
Evaluates a traced thread using a Confident AI metric collection.
Parameters:
- thread_id (str): ID of the thread to evaluate
- metric_collection (str): Name of the metric collection on Confident AI
- overwrite_metrics (bool): Whether to overwrite existing metrics (default: False)
"""from deepeval import evaluate
from deepeval.tracing import observe, update_current_span
from deepeval.metrics import FaithfulnessMetric, AnswerRelevancyMetric
from deepeval.test_case import LLMTestCase
from deepeval.dataset import Golden
# Define component metrics
faithfulness = FaithfulnessMetric(threshold=0.8)
relevancy = AnswerRelevancyMetric(threshold=0.7)
@observe(metrics=[faithfulness, relevancy])
def answer_generator(question: str, context: list):
"""Generate answer from context."""
answer = llm_generate(question, context)
# Provide test case data for evaluation
update_current_span(
test_case=LLMTestCase(
input=question,
actual_output=answer,
retrieval_context=context
)
)
return answer
@observe(name="rag_app")
def rag_application(question: str):
"""Main RAG application."""
context = retrieve_context(question)
answer = answer_generator(question, context)
return answer
# Evaluate using observed callback
goldens = [
Golden(input="What is Python?"),
Golden(input="What is JavaScript?")
]
result = evaluate(
observed_callback=rag_application,
goldens=goldens
)from deepeval.tracing import observe, update_current_span
from deepeval.metrics import ToolCorrectnessMetric
from deepeval.test_case import LLMTestCase, ToolCall
tool_metric = ToolCorrectnessMetric(threshold=0.8)
@observe(name="tool_selector")
def select_tools(query: str):
"""Select appropriate tools."""
tools = analyze_and_select_tools(query)
return tools
@observe(metrics=[tool_metric])
def tool_executor(query: str, tools: list):
"""Execute tools."""
results = []
tool_calls = []
for tool in tools:
result = execute_tool(tool, query)
results.append(result)
tool_calls.append(ToolCall(
name=tool.name,
input_parameters=tool.params,
output=result
))
update_current_span(
test_case=LLMTestCase(
input=query,
actual_output=str(results),
tools_called=tool_calls
)
)
return results
@observe(name="agent")
def agent_pipeline(query: str):
"""Full agent pipeline."""
tools = select_tools(query)
results = tool_executor(query, tools)
final_answer = synthesize_answer(results)
return final_answer
# Execute with tracing
answer = agent_pipeline("Book a flight to NYC")from deepeval.tracing import observe, update_current_span
from deepeval.dataset import Golden, get_current_golden
from deepeval.test_case import LLMTestCase
@observe
def my_component(input_text: str):
"""Component that accesses current golden."""
# Get current golden from context
golden = get_current_golden()
# Process input
output = process(input_text)
# Use golden data in test case
update_current_span(
test_case=LLMTestCase(
input=input_text,
actual_output=output,
expected_output=golden.expected_output if golden else None,
retrieval_context=golden.retrieval_context if golden else None
)
)
return output
# Evaluate with goldens
from deepeval import evaluate
goldens = [Golden(input="test", expected_output="result")]
result = evaluate(observed_callback=my_component, goldens=goldens)from deepeval.tracing import trace_manager
# Get all traces
traces = trace_manager.get_traces()
# Get specific trace
trace = trace_manager.get_trace(trace_id="abc123")
# Get spans for a trace
spans = trace_manager.get_spans(trace_id="abc123")
# Clear traces
trace_manager.clear()Traces are automatically synced to Confident AI when logged in:
deepeval loginfrom deepeval.tracing import observe
@observe
def my_function(input):
# This trace will be synced to Confident AI
return process(input)
my_function("test")
# View traces at app.confident-ai.com