# Tracing

Component-level observability for evaluating nested LLM components using the `@observe` decorator and trace management. Enable tracing to evaluate individual components within your LLM application.

## Imports

```python
from deepeval.tracing import (
    observe,
    trace,
    trace_manager,
    update_current_span,
    update_current_trace,
    update_retriever_span,
    update_llm_span,
    evaluate_trace,
    evaluate_span,
    evaluate_thread
)
```

## Capabilities

### Observe Decorator

Decorator for observing function execution and applying metrics to components.

```python { .api }
def observe(
    metrics: Optional[List[BaseMetric]] = None,
    name: Optional[str] = None,
    type: Optional[str] = None
):
    """
    Decorator for observing function execution.

    Parameters:
    - metrics (List[BaseMetric], optional): Metrics to apply to this component
    - name (str, optional): Name for the span
    - type (str, optional): Type of component (e.g., "llm", "retriever", "tool")

    Usage:
    - Decorate any function to create a traced span
    - Use update_current_span() within function to add test case data
    - Metrics are evaluated automatically on the component
    """
```

Usage example:

```python
from deepeval.tracing import observe, update_current_span
from deepeval.metrics import AnswerRelevancyMetric, FaithfulnessMetric
from deepeval.test_case import LLMTestCase

# Define metrics for components
answer_relevancy = AnswerRelevancyMetric(threshold=0.7)
faithfulness = FaithfulnessMetric(threshold=0.8)

@observe(metrics=[answer_relevancy, faithfulness])
def llm_component(query: str, context: list):
    """LLM component that generates answer from context."""
    # Your LLM call
    answer = call_llm(query, context)

    # Update span with test case data
    update_current_span(
        test_case=LLMTestCase(
            input=query,
            actual_output=answer,
            retrieval_context=context
        )
    )

    return answer

@observe(name="retrieval", type="retriever")
def retriever_component(query: str):
    """Retrieval component."""
    results = vector_search(query)

    update_retriever_span(
        embedder="text-embedding-ada-002",
        top_k=10,
        chunk_size=512
    )

    return results

@observe(name="rag_pipeline")
def rag_pipeline(user_query: str):
    """Full RAG pipeline with traced components."""
    # Each component is traced
    context = retriever_component(user_query)
    answer = llm_component(user_query, context)

    return answer

# Execute and automatically evaluate components
result = rag_pipeline("What is quantum computing?")
```

### Update Span Functions

Functions to update span data during execution.

```python { .api }
def update_current_span(
    test_case: Optional[LLMTestCase] = None,
    **kwargs
):
    """
    Updates the current span with additional data.

    Parameters:
    - test_case (LLMTestCase, optional): Test case data for the span
    - **kwargs: Additional span attributes (metadata, tags, etc.)
    """

def update_current_trace(
    **kwargs
):
    """
    Updates the current trace with additional data.

    Parameters:
    - **kwargs: Trace-level attributes
    """

def update_retriever_span(
    embedder: Optional[str] = None,
    top_k: Optional[int] = None,
    chunk_size: Optional[int] = None
):
    """
    Updates retriever-specific span data.

    Parameters:
    - embedder (str, optional): Name of the embedding model used
    - top_k (int, optional): Number of top results retrieved
    - chunk_size (int, optional): Size of chunks used in retrieval
    """

def update_llm_span(
    model: Optional[str] = None,
    input_token_count: Optional[float] = None,
    output_token_count: Optional[float] = None,
    cost_per_input_token: Optional[float] = None,
    cost_per_output_token: Optional[float] = None,
    token_intervals: Optional[Dict[float, str]] = None,
    prompt: Optional[Prompt] = None
):
    """
    Updates LLM-specific span data.

    Parameters:
    - model (str, optional): Model name
    - input_token_count (float, optional): Number of input tokens
    - output_token_count (float, optional): Number of output tokens
    - cost_per_input_token (float, optional): Cost per input token
    - cost_per_output_token (float, optional): Cost per output token
    - token_intervals (Dict[float, str], optional): Token timing intervals
    - prompt (Prompt, optional): Prompt object used
    """
```

### Trace Context Manager

Context manager for creating trace scopes.

```python { .api }
def trace(name: Optional[str] = None):
    """
    Context manager for tracing execution.

    Parameters:
    - name (str, optional): Name for the trace
    """
```

Usage:

```python
from deepeval.tracing import trace, observe

@observe
def process_document(doc):
    # Processing logic
    return result

def main():
    with trace(name="document_processing"):
        for doc in documents:
            process_document(doc)
```

### Offline Evaluation

Evaluate traces after execution.

```python { .api }
def evaluate_trace(
    trace_uuid: str,
    metric_collection: str
):
    """
    Evaluates a specific trace using a Confident AI metric collection.

    Parameters:
    - trace_uuid (str): UUID of the trace to evaluate
    - metric_collection (str): Name of the metric collection on Confident AI
    """

def evaluate_span(
    span_uuid: str,
    metric_collection: str
):
    """
    Evaluates a specific span using a Confident AI metric collection.

    Parameters:
    - span_uuid (str): UUID of the span to evaluate
    - metric_collection (str): Name of the metric collection on Confident AI
    """

def evaluate_thread(
    thread_id: str,
    metric_collection: str,
    overwrite_metrics: bool = False
):
    """
    Evaluates a traced thread using a Confident AI metric collection.

    Parameters:
    - thread_id (str): ID of the thread to evaluate
    - metric_collection (str): Name of the metric collection on Confident AI
    - overwrite_metrics (bool): Whether to overwrite existing metrics (default: False)
    """
```

## Usage Examples

### Component-Level Evaluation

```python
from deepeval import evaluate
from deepeval.tracing import observe, update_current_span
from deepeval.metrics import FaithfulnessMetric, AnswerRelevancyMetric
from deepeval.test_case import LLMTestCase
from deepeval.dataset import Golden

# Define component metrics
faithfulness = FaithfulnessMetric(threshold=0.8)
relevancy = AnswerRelevancyMetric(threshold=0.7)

@observe(metrics=[faithfulness, relevancy])
def answer_generator(question: str, context: list):
    """Generate answer from context."""
    answer = llm_generate(question, context)

    # Provide test case data for evaluation
    update_current_span(
        test_case=LLMTestCase(
            input=question,
            actual_output=answer,
            retrieval_context=context
        )
    )

    return answer

@observe(name="rag_app")
def rag_application(question: str):
    """Main RAG application."""
    context = retrieve_context(question)
    answer = answer_generator(question, context)
    return answer

# Evaluate using observed callback
goldens = [
    Golden(input="What is Python?"),
    Golden(input="What is JavaScript?")
]

result = evaluate(
    observed_callback=rag_application,
    goldens=goldens
)
```

### Multi-Component Pipeline

```python
from deepeval.tracing import observe, update_current_span
from deepeval.metrics import ToolCorrectnessMetric
from deepeval.test_case import LLMTestCase, ToolCall

tool_metric = ToolCorrectnessMetric(threshold=0.8)

@observe(name="tool_selector")
def select_tools(query: str):
    """Select appropriate tools."""
    tools = analyze_and_select_tools(query)
    return tools

@observe(metrics=[tool_metric])
def tool_executor(query: str, tools: list):
    """Execute tools."""
    results = []
    tool_calls = []

    for tool in tools:
        result = execute_tool(tool, query)
        results.append(result)
        tool_calls.append(ToolCall(
            name=tool.name,
            input_parameters=tool.params,
            output=result
        ))

    update_current_span(
        test_case=LLMTestCase(
            input=query,
            actual_output=str(results),
            tools_called=tool_calls
        )
    )

    return results

@observe(name="agent")
def agent_pipeline(query: str):
    """Full agent pipeline."""
    tools = select_tools(query)
    results = tool_executor(query, tools)
    final_answer = synthesize_answer(results)
    return final_answer

# Execute with tracing
answer = agent_pipeline("Book a flight to NYC")
```

### Accessing Current Golden

```python
from deepeval.tracing import observe, update_current_span
from deepeval.dataset import Golden, get_current_golden
from deepeval.test_case import LLMTestCase

@observe
def my_component(input_text: str):
    """Component that accesses current golden."""
    # Get current golden from context
    golden = get_current_golden()

    # Process input
    output = process(input_text)

    # Use golden data in test case
    update_current_span(
        test_case=LLMTestCase(
            input=input_text,
            actual_output=output,
            expected_output=golden.expected_output if golden else None,
            retrieval_context=golden.retrieval_context if golden else None
        )
    )

    return output

# Evaluate with goldens
from deepeval import evaluate

goldens = [Golden(input="test", expected_output="result")]
result = evaluate(observed_callback=my_component, goldens=goldens)
```

### Trace Management

```python
from deepeval.tracing import trace_manager

# Get all traces
traces = trace_manager.get_traces()

# Get specific trace
trace = trace_manager.get_trace(trace_id="abc123")

# Get spans for a trace
spans = trace_manager.get_spans(trace_id="abc123")

# Clear traces
trace_manager.clear()
```

### Integration with Confident AI

Traces are automatically synced to Confident AI when logged in:

```bash
deepeval login
```

```python
from deepeval.tracing import observe

@observe
def my_function(input):
    # This trace will be synced to Confident AI
    return process(input)

my_function("test")
# View traces at app.confident-ai.com
```