or run

npx @tessl/cli init
Log in

Version

Tile

Overview

Evals

Files

Files

docs

agentic-metrics.md benchmarks.md content-quality-metrics.md conversational-metrics.md core-evaluation.md custom-metrics.md dataset.md index.md integrations.md models.md multimodal-metrics.md rag-metrics.md synthesizer.md test-cases.md tracing.md

docs/dataset.md

# Datasets

Tools for managing collections of test cases and "golden" examples. Supports batch evaluation, synthetic data generation, dataset persistence, and integration with Confident AI platform.

## Imports

```python
from deepeval.dataset import EvaluationDataset, Golden, ConversationalGolden
from deepeval.contextvars import get_current_golden
```

## Capabilities

### Evaluation Dataset

Manages collections of test cases and goldens for batch evaluation.

```python { .api }
class EvaluationDataset:
    """
    Manages collections of test cases and goldens for evaluation.

    Parameters:
    - goldens (Union[List[Golden], List[ConversationalGolden]], optional): Initial goldens

    Properties:
    - goldens: Getter/setter for goldens list
    - test_cases: Getter/setter for test cases list

    Methods:
    - add_test_case(test_case): Add a test case
    - add_golden(golden): Add a golden
    - add_test_cases_from_csv_file(file_path, **kwargs): Load test cases from CSV
    - add_test_cases_from_json_file(file_path, **kwargs): Load test cases from JSON
    - add_goldens_from_csv_file(file_path, **kwargs): Load goldens from CSV
    - add_goldens_from_json_file(file_path, **kwargs): Load goldens from JSON
    - push(alias, finalized=True): Push to Confident AI
    - pull(alias, finalized=True, auto_convert_goldens_to_test_cases=False): Pull from Confident AI
    - queue(alias, goldens): Queue goldens to Confident AI
    - delete(alias): Delete dataset from Confident AI
    - generate_goldens_from_docs(document_paths, **kwargs) -> List[Golden]
    - generate_goldens_from_contexts(contexts, **kwargs) -> List[Golden]
    - generate_goldens_from_scratch(num_goldens, **kwargs) -> List[Golden]
    - save_as(file_type, directory, file_name=None, include_test_cases=False): Save dataset to file
    - evals_iterator(metrics, **kwargs): Iterator for agentic evaluations
    - evaluate(metrics, **kwargs) -> EvaluationResult: Evaluate dataset
    """
```

Usage example:

```python
from deepeval.dataset import EvaluationDataset, Golden
from deepeval.metrics import AnswerRelevancyMetric
from deepeval.test_case import LLMTestCase

# Create dataset
dataset = EvaluationDataset(
    goldens=[
        Golden(
            input="What is Python?",
            expected_output="Python is a high-level programming language"
        ),
        Golden(
            input="What is JavaScript?",
            expected_output="JavaScript is a scripting language for web development"
        )
    ]
)

# Add more goldens
dataset.add_golden(Golden(input="What is Java?", expected_output="..."))

# Generate test cases from goldens
for golden in dataset.goldens:
    test_case = LLMTestCase(
        input=golden.input,
        actual_output=your_llm_app(golden.input),
        expected_output=golden.expected_output
    )
    dataset.add_test_case(test_case)

# Evaluate
result = dataset.evaluate([AnswerRelevancyMetric(threshold=0.7)])
print(f"Results: {result.confident_link}")

# Save dataset
dataset.save_as(
    file_type="json",
    directory="./datasets",
    file_name="my_dataset",
    include_test_cases=True
)
```

Loading from files:

```python
from deepeval.dataset import EvaluationDataset

# Load from CSV
dataset = EvaluationDataset()
dataset.add_goldens_from_csv_file(
    file_path="./data/goldens.csv",
    input_col="question",
    expected_output_col="answer"
)

# Load from JSON
dataset.add_goldens_from_json_file(
    file_path="./data/goldens.json"
)

# Load test cases
dataset.add_test_cases_from_json_file(
    file_path="./data/test_cases.json"
)
```

### Golden

Represents a "golden" test case with expected input/output pairs.

```python { .api }
class Golden:
    """
    Represents a "golden" test case - expected input/output pairs.

    Parameters:
    - input (str): Input prompt
    - actual_output (str, optional): Actual output
    - expected_output (str, optional): Expected output
    - context (List[str], optional): Context information
    - retrieval_context (List[str], optional): Retrieved context
    - additional_metadata (Dict, optional): Additional metadata
    - comments (str, optional): Comments
    - tools_called (List[ToolCall], optional): Tools called
    - expected_tools (List[ToolCall], optional): Expected tools
    - source_file (str, optional): Source file path
    - name (str, optional): Name
    - custom_column_key_values (Dict[str, str], optional): Custom columns
    """
```

Usage example:

```python
from deepeval.dataset import Golden

# Simple golden
golden = Golden(
    input="What is the return policy?",
    expected_output="30-day full refund"
)

# Golden with context
golden_with_context = Golden(
    input="How long does shipping take?",
    expected_output="3-5 business days",
    context=["Standard shipping timeline"],
    retrieval_context=["Shipping takes 3-5 business days for US orders"]
)

# Golden with metadata
golden_with_metadata = Golden(
    input="Product inquiry",
    expected_output="Product details",
    additional_metadata={
        "category": "support",
        "priority": "high"
    },
    comments="Test case for product inquiry flow",
    name="product_inquiry_test"
)
```

### Conversational Golden

Golden test case for conversational interactions.

```python { .api }
class ConversationalGolden:
    """
    Represents a "golden" conversational test case.

    Parameters:
    - scenario (str): Scenario description
    - expected_outcome (str, optional): Expected outcome
    - user_description (str, optional): User description
    - context (List[str], optional): Context information
    - additional_metadata (Dict, optional): Additional metadata
    - comments (str, optional): Comments
    - name (str, optional): Name
    - custom_column_key_values (Dict[str, str], optional): Custom columns
    - turns (List[Turn], optional): Conversation turns
    """
```

Usage example:

```python
from deepeval.dataset import ConversationalGolden
from deepeval.test_case import Turn

conversational_golden = ConversationalGolden(
    scenario="Customer wants to track order",
    expected_outcome="Customer receives tracking information",
    user_description="Existing customer with pending order",
    context=["Order placed 2 days ago"],
    turns=[
        Turn(role="user", content="Where is my order?"),
        Turn(role="assistant", content="Let me check your order status...")
    ]
)
```

### Confident AI Integration

Sync datasets with Confident AI platform for team collaboration.

```python
from deepeval.dataset import EvaluationDataset

dataset = EvaluationDataset(goldens=[...])

# Push to Confident AI
dataset.push(
    alias="customer-support-v1",
    finalized=True  # Mark as finalized/ready for use
)

# Pull from Confident AI
dataset_from_cloud = EvaluationDataset()
dataset_from_cloud.pull(
    alias="customer-support-v1",
    auto_convert_goldens_to_test_cases=False
)

# Delete from Confident AI
dataset.delete(alias="customer-support-v1")
```

### Synthetic Golden Generation

Generate goldens from documents or contexts.

```python
from deepeval.dataset import EvaluationDataset

dataset = EvaluationDataset()

# Generate from documents
goldens = dataset.generate_goldens_from_docs(
    document_paths=[
        "./docs/product_guide.pdf",
        "./docs/faq.txt"
    ],
    max_goldens_per_document=10,
    include_expected_output=True
)

# Generate from contexts
goldens = dataset.generate_goldens_from_contexts(
    contexts=[
        ["Context about returns and refunds"],
        ["Context about shipping policies"]
    ],
    max_goldens_per_context=5,
    include_expected_output=True
)

# Generate from scratch (using styling config)
goldens = dataset.generate_goldens_from_scratch(
    num_goldens=20
)

print(f"Generated {len(goldens)} goldens")
```

### Agentic Evaluation with Goldens

Use goldens for agentic evaluation workflows.

```python
from deepeval.dataset import EvaluationDataset, Golden, get_current_golden
from deepeval.tracing import observe
from deepeval.metrics import GEval
from deepeval.test_case import LLMTestCase, LLMTestCaseParams

# Define evaluation metric
metric = GEval(
    name="Correctness",
    criteria="Evaluate correctness of output",
    evaluation_params=[LLMTestCaseParams.INPUT, LLMTestCaseParams.ACTUAL_OUTPUT]
)

# Create agent function
@observe(metrics=[metric])
def my_agent(input_text: str):
    # Get current golden from context
    golden = get_current_golden()

    # Process with agent
    output = process_with_agent(input_text)

    # Update span with test case
    from deepeval.tracing import update_current_span
    update_current_span(
        test_case=LLMTestCase(
            input=input_text,
            actual_output=output,
            expected_output=golden.expected_output if golden else None
        )
    )
    return output

# Create dataset
dataset = EvaluationDataset(goldens=[
    Golden(input="Question 1", expected_output="Answer 1"),
    Golden(input="Question 2", expected_output="Answer 2")
])

# Evaluate using iterator
from deepeval import evaluate

result = evaluate(
    observed_callback=my_agent,
    goldens=dataset.goldens
)
```

### Dataset Iteration

Iterate over dataset for batch processing.

```python
from deepeval.dataset import EvaluationDataset

dataset = EvaluationDataset()
dataset.pull(alias="my-dataset")

# Iterate over goldens
for golden in dataset.goldens:
    print(f"Input: {golden.input}")
    print(f"Expected: {golden.expected_output}")

# Iterate over test cases
for test_case in dataset.test_cases:
    print(f"Input: {test_case.input}")
    print(f"Output: {test_case.actual_output}")

# Use with pytest parametrize
import pytest
from deepeval import assert_test

@pytest.mark.parametrize("test_case", dataset.test_cases)
def test_dataset(test_case):
    assert_test(test_case, metrics)
```