or run

npx @tessl/cli init
Log in

Version

Tile

Overview

Evals

Files

Files

docs

agentic-metrics.mdbenchmarks.mdcontent-quality-metrics.mdconversational-metrics.mdcore-evaluation.mdcustom-metrics.mddataset.mdindex.mdintegrations.mdmodels.mdmultimodal-metrics.mdrag-metrics.mdsynthesizer.mdtest-cases.mdtracing.md

tracing.mddocs/

0

# Tracing

1

2

Component-level observability for evaluating nested LLM components using the `@observe` decorator and trace management. Enable tracing to evaluate individual components within your LLM application.

3

4

## Imports

5

6

```python

7

from deepeval.tracing import (

8

observe,

9

trace,

10

trace_manager,

11

update_current_span,

12

update_current_trace,

13

update_retriever_span,

14

update_llm_span,

15

evaluate_trace,

16

evaluate_span,

17

evaluate_thread

18

)

19

```

20

21

## Capabilities

22

23

### Observe Decorator

24

25

Decorator for observing function execution and applying metrics to components.

26

27

```python { .api }

28

def observe(

29

metrics: Optional[List[BaseMetric]] = None,

30

name: Optional[str] = None,

31

type: Optional[str] = None

32

):

33

"""

34

Decorator for observing function execution.

35

36

Parameters:

37

- metrics (List[BaseMetric], optional): Metrics to apply to this component

38

- name (str, optional): Name for the span

39

- type (str, optional): Type of component (e.g., "llm", "retriever", "tool")

40

41

Usage:

42

- Decorate any function to create a traced span

43

- Use update_current_span() within function to add test case data

44

- Metrics are evaluated automatically on the component

45

"""

46

```

47

48

Usage example:

49

50

```python

51

from deepeval.tracing import observe, update_current_span

52

from deepeval.metrics import AnswerRelevancyMetric, FaithfulnessMetric

53

from deepeval.test_case import LLMTestCase

54

55

# Define metrics for components

56

answer_relevancy = AnswerRelevancyMetric(threshold=0.7)

57

faithfulness = FaithfulnessMetric(threshold=0.8)

58

59

@observe(metrics=[answer_relevancy, faithfulness])

60

def llm_component(query: str, context: list):

61

"""LLM component that generates answer from context."""

62

# Your LLM call

63

answer = call_llm(query, context)

64

65

# Update span with test case data

66

update_current_span(

67

test_case=LLMTestCase(

68

input=query,

69

actual_output=answer,

70

retrieval_context=context

71

)

72

)

73

74

return answer

75

76

@observe(name="retrieval", type="retriever")

77

def retriever_component(query: str):

78

"""Retrieval component."""

79

results = vector_search(query)

80

81

update_retriever_span(

82

embedder="text-embedding-ada-002",

83

top_k=10,

84

chunk_size=512

85

)

86

87

return results

88

89

@observe(name="rag_pipeline")

90

def rag_pipeline(user_query: str):

91

"""Full RAG pipeline with traced components."""

92

# Each component is traced

93

context = retriever_component(user_query)

94

answer = llm_component(user_query, context)

95

96

return answer

97

98

# Execute and automatically evaluate components

99

result = rag_pipeline("What is quantum computing?")

100

```

101

102

### Update Span Functions

103

104

Functions to update span data during execution.

105

106

```python { .api }

107

def update_current_span(

108

test_case: Optional[LLMTestCase] = None,

109

**kwargs

110

):

111

"""

112

Updates the current span with additional data.

113

114

Parameters:

115

- test_case (LLMTestCase, optional): Test case data for the span

116

- **kwargs: Additional span attributes (metadata, tags, etc.)

117

"""

118

119

def update_current_trace(

120

**kwargs

121

):

122

"""

123

Updates the current trace with additional data.

124

125

Parameters:

126

- **kwargs: Trace-level attributes

127

"""

128

129

def update_retriever_span(

130

embedder: Optional[str] = None,

131

top_k: Optional[int] = None,

132

chunk_size: Optional[int] = None

133

):

134

"""

135

Updates retriever-specific span data.

136

137

Parameters:

138

- embedder (str, optional): Name of the embedding model used

139

- top_k (int, optional): Number of top results retrieved

140

- chunk_size (int, optional): Size of chunks used in retrieval

141

"""

142

143

def update_llm_span(

144

model: Optional[str] = None,

145

input_token_count: Optional[float] = None,

146

output_token_count: Optional[float] = None,

147

cost_per_input_token: Optional[float] = None,

148

cost_per_output_token: Optional[float] = None,

149

token_intervals: Optional[Dict[float, str]] = None,

150

prompt: Optional[Prompt] = None

151

):

152

"""

153

Updates LLM-specific span data.

154

155

Parameters:

156

- model (str, optional): Model name

157

- input_token_count (float, optional): Number of input tokens

158

- output_token_count (float, optional): Number of output tokens

159

- cost_per_input_token (float, optional): Cost per input token

160

- cost_per_output_token (float, optional): Cost per output token

161

- token_intervals (Dict[float, str], optional): Token timing intervals

162

- prompt (Prompt, optional): Prompt object used

163

"""

164

```

165

166

### Trace Context Manager

167

168

Context manager for creating trace scopes.

169

170

```python { .api }

171

def trace(name: Optional[str] = None):

172

"""

173

Context manager for tracing execution.

174

175

Parameters:

176

- name (str, optional): Name for the trace

177

"""

178

```

179

180

Usage:

181

182

```python

183

from deepeval.tracing import trace, observe

184

185

@observe()

186

def process_document(doc):

187

# Processing logic

188

return result

189

190

def main():

191

with trace(name="document_processing"):

192

for doc in documents:

193

process_document(doc)

194

```

195

196

### Offline Evaluation

197

198

Evaluate traces after execution.

199

200

```python { .api }

201

def evaluate_trace(

202

trace_uuid: str,

203

metric_collection: str

204

):

205

"""

206

Evaluates a specific trace using a Confident AI metric collection.

207

208

Parameters:

209

- trace_uuid (str): UUID of the trace to evaluate

210

- metric_collection (str): Name of the metric collection on Confident AI

211

"""

212

213

def evaluate_span(

214

span_uuid: str,

215

metric_collection: str

216

):

217

"""

218

Evaluates a specific span using a Confident AI metric collection.

219

220

Parameters:

221

- span_uuid (str): UUID of the span to evaluate

222

- metric_collection (str): Name of the metric collection on Confident AI

223

"""

224

225

def evaluate_thread(

226

thread_id: str,

227

metric_collection: str,

228

overwrite_metrics: bool = False

229

):

230

"""

231

Evaluates a traced thread using a Confident AI metric collection.

232

233

Parameters:

234

- thread_id (str): ID of the thread to evaluate

235

- metric_collection (str): Name of the metric collection on Confident AI

236

- overwrite_metrics (bool): Whether to overwrite existing metrics (default: False)

237

"""

238

```

239

240

## Usage Examples

241

242

### Component-Level Evaluation

243

244

```python

245

from deepeval import evaluate

246

from deepeval.tracing import observe, update_current_span

247

from deepeval.metrics import FaithfulnessMetric, AnswerRelevancyMetric

248

from deepeval.test_case import LLMTestCase

249

from deepeval.dataset import Golden

250

251

# Define component metrics

252

faithfulness = FaithfulnessMetric(threshold=0.8)

253

relevancy = AnswerRelevancyMetric(threshold=0.7)

254

255

@observe(metrics=[faithfulness, relevancy])

256

def answer_generator(question: str, context: list):

257

"""Generate answer from context."""

258

answer = llm_generate(question, context)

259

260

# Provide test case data for evaluation

261

update_current_span(

262

test_case=LLMTestCase(

263

input=question,

264

actual_output=answer,

265

retrieval_context=context

266

)

267

)

268

269

return answer

270

271

@observe(name="rag_app")

272

def rag_application(question: str):

273

"""Main RAG application."""

274

context = retrieve_context(question)

275

answer = answer_generator(question, context)

276

return answer

277

278

# Evaluate using observed callback

279

goldens = [

280

Golden(input="What is Python?"),

281

Golden(input="What is JavaScript?")

282

]

283

284

result = evaluate(

285

observed_callback=rag_application,

286

goldens=goldens

287

)

288

```

289

290

### Multi-Component Pipeline

291

292

```python

293

from deepeval.tracing import observe, update_current_span

294

from deepeval.metrics import ToolCorrectnessMetric

295

from deepeval.test_case import LLMTestCase, ToolCall

296

297

tool_metric = ToolCorrectnessMetric(threshold=0.8)

298

299

@observe(name="tool_selector")

300

def select_tools(query: str):

301

"""Select appropriate tools."""

302

tools = analyze_and_select_tools(query)

303

return tools

304

305

@observe(metrics=[tool_metric])

306

def tool_executor(query: str, tools: list):

307

"""Execute tools."""

308

results = []

309

tool_calls = []

310

311

for tool in tools:

312

result = execute_tool(tool, query)

313

results.append(result)

314

tool_calls.append(ToolCall(

315

name=tool.name,

316

input_parameters=tool.params,

317

output=result

318

))

319

320

update_current_span(

321

test_case=LLMTestCase(

322

input=query,

323

actual_output=str(results),

324

tools_called=tool_calls

325

)

326

)

327

328

return results

329

330

@observe(name="agent")

331

def agent_pipeline(query: str):

332

"""Full agent pipeline."""

333

tools = select_tools(query)

334

results = tool_executor(query, tools)

335

final_answer = synthesize_answer(results)

336

return final_answer

337

338

# Execute with tracing

339

answer = agent_pipeline("Book a flight to NYC")

340

```

341

342

### Accessing Current Golden

343

344

```python

345

from deepeval.tracing import observe, update_current_span

346

from deepeval.dataset import Golden, get_current_golden

347

from deepeval.test_case import LLMTestCase

348

349

@observe()

350

def my_component(input_text: str):

351

"""Component that accesses current golden."""

352

# Get current golden from context

353

golden = get_current_golden()

354

355

# Process input

356

output = process(input_text)

357

358

# Use golden data in test case

359

update_current_span(

360

test_case=LLMTestCase(

361

input=input_text,

362

actual_output=output,

363

expected_output=golden.expected_output if golden else None,

364

retrieval_context=golden.retrieval_context if golden else None

365

)

366

)

367

368

return output

369

370

# Evaluate with goldens

371

from deepeval import evaluate

372

373

goldens = [Golden(input="test", expected_output="result")]

374

result = evaluate(observed_callback=my_component, goldens=goldens)

375

```

376

377

### Trace Management

378

379

```python

380

from deepeval.tracing import trace_manager

381

382

# Get all traces

383

traces = trace_manager.get_traces()

384

385

# Get specific trace

386

trace = trace_manager.get_trace(trace_id="abc123")

387

388

# Get spans for a trace

389

spans = trace_manager.get_spans(trace_id="abc123")

390

391

# Clear traces

392

trace_manager.clear()

393

```

394

395

### Integration with Confident AI

396

397

Traces are automatically synced to Confident AI when logged in:

398

399

```bash

400

deepeval login

401

```

402

403

```python

404

from deepeval.tracing import observe

405

406

@observe()

407

def my_function(input):

408

# This trace will be synced to Confident AI

409

return process(input)

410

411

my_function("test")

412

# View traces at app.confident-ai.com

413

```

414