or run

npx @tessl/cli init
Log in

Version

Tile

Overview

Evals

Files

Files

docs

advanced.md core-tracing.md datasets.md experiments.md index.md integrations.md observation-types.md prompts.md scoring.md

datasets.mddocs/

0

# Dataset Management

Tools for creating, managing, and running experiments on datasets with support for both local data and Langfuse-hosted datasets. Enables systematic data management and experiment tracking.

## Capabilities

### Dataset Client

Manages a complete dataset with experiment running capabilities.

```python { .api }
class DatasetClient:
    def __init__(self, id: str, name: str, description: str = None,
                 metadata: Any = None, project_id: str = None,
                 created_at: datetime = None, updated_at: datetime = None,
                 items: List[DatasetItemClient] = None):
        """Initialize dataset client."""

    def run_experiment(self, *, name: str, task: TaskFunction,
                       evaluators: List[EvaluatorFunction] = None,
                       run_evaluators: List[RunEvaluatorFunction] = None,
                       run_name: str = None, run_description: str = None,
                       experiment_config: Dict[str, Any] = None) -> ExperimentResult:
        """Run experiment on this dataset.

        Args:
            name: Experiment name
            task: Function to execute on each dataset item
            evaluators: List of item-level evaluator functions
            run_evaluators: List of run-level evaluator functions
            run_name: Name for this specific run
            run_description: Description of experiment run
            experiment_config: Configuration metadata

        Returns:
            ExperimentResult with complete results and evaluations
        """

    # Attributes
    id: str
    name: str
    description: Optional[str]
    metadata: Optional[Any]
    project_id: str
    created_at: datetime
    updated_at: datetime
    items: List[DatasetItemClient]
```

48

49

### Dataset Item Client

Represents individual items within a dataset with run context capabilities.

```python { .api }
class DatasetItemClient:
    def __init__(self, id: str, status: DatasetStatus, input: Any = None,
                 expected_output: Any = None, metadata: Any = None,
                 source_trace_id: str = None, source_observation_id: str = None,
                 dataset_id: str = None, dataset_name: str = None,
                 created_at: datetime = None, updated_at: datetime = None):
        """Initialize dataset item client."""

    def run(self, **kwargs) -> ContextManager["DatasetItemClient"]:
        """Create context manager for dataset item runs.

        Returns:
            Context manager for tracking item execution
        """

    # Attributes
    id: str
    status: DatasetStatus
    input: Any
    expected_output: Optional[Any]
    metadata: Optional[Any]
    source_trace_id: Optional[str]
    source_observation_id: Optional[str]
    dataset_id: str
    dataset_name: str
    created_at: datetime
    updated_at: datetime
```

82

83

### Dataset Management

Core methods for managing datasets through the Langfuse client.

```python { .api }
class Langfuse:
    def get_dataset(self, name: str) -> DatasetClient:
        """Retrieve dataset by name.

        Args:
            name: Dataset name

        Returns:
            DatasetClient for the named dataset

        Raises:
            Exception: If dataset not found
        """

    def create_dataset(self, *, name: str, description: str = None,
                       metadata: Any = None) -> DatasetClient:
        """Create a new dataset.

        Args:
            name: Dataset name (must be unique)
            description: Optional dataset description
            metadata: Additional metadata for the dataset

        Returns:
            DatasetClient for the created dataset
        """

    def create_dataset_item(self, *, dataset_name: str, input: Any,
                            expected_output: Any = None, metadata: Any = None,
                            source_trace_id: str = None,
                            source_observation_id: str = None) -> DatasetItemClient:
        """Add item to a dataset.

        Args:
            dataset_name: Name of target dataset
            input: Input data for the item
            expected_output: Expected output for evaluation
            metadata: Additional item metadata
            source_trace_id: Source trace ID if created from existing trace
            source_observation_id: Source observation ID if from existing observation

        Returns:
            DatasetItemClient for the created item
        """
```

133

134

### Status and Model Types

Supporting types for dataset operations.

```python { .api }
# Dataset status enumeration
class DatasetStatus(str, Enum):
    ACTIVE = "ACTIVE"
    ARCHIVED = "ARCHIVED"

# Core model types (re-exported from API)
class Dataset:
    """Dataset model class with full API attributes."""

class DatasetItem:
    """Dataset item model class with full API attributes."""

class DatasetRun:
    """Dataset run model class with execution tracking."""

# Request types for API operations
class CreateDatasetRequest:
    """Request structure for creating datasets."""

class CreateDatasetItemRequest:
    """Request structure for creating dataset items."""

class CreateDatasetRunItemRequest:
    """Request structure for creating dataset run items."""
```

164

165

## Usage Examples

### Creating and Managing Datasets

```python
from langfuse import Langfuse

langfuse = Langfuse()

# Create a new dataset
dataset = langfuse.create_dataset(
    name="qa-evaluation-set",
    description="Question-answering dataset for model evaluation",
    metadata={"domain": "general", "language": "en"}
)

# Add items to the dataset
items = [
    {"input": "What is the capital of France?", "expected_output": "Paris"},
    {"input": "What is the capital of Germany?", "expected_output": "Berlin"},
    {"input": "What is the capital of Italy?", "expected_output": "Rome"}
]

for item_data in items:
    langfuse.create_dataset_item(
        dataset_name="qa-evaluation-set",
        input=item_data["input"],
        expected_output=item_data["expected_output"],
        metadata={"category": "geography", "difficulty": "easy"}
    )
```

196

197

### Working with Existing Datasets

```python
# Retrieve existing dataset
dataset = langfuse.get_dataset("qa-evaluation-set")

print(f"Dataset: {dataset.name}")
print(f"Description: {dataset.description}")
print(f"Items: {len(dataset.items)}")

# Inspect dataset items
for item in dataset.items:
    print(f"Input: {item.input}")
    print(f"Expected: {item.expected_output}")
    print(f"Metadata: {item.metadata}")
    print("---")
```

214

215

### Running Experiments on Datasets

```python
# Define task function
def qa_task(*, item, **kwargs):
    # Access item attributes directly
    question = item.input
    # Your AI model call
    answer = my_llm.generate(question)
    return answer

# Define evaluator
def accuracy_evaluator(*, input, output, expected_output=None, **kwargs):
    from langfuse import Evaluation

    if not expected_output:
        return Evaluation(name="accuracy", value=None)

    is_correct = output.strip().lower() == expected_output.strip().lower()
    return Evaluation(
        name="accuracy",
        value=1.0 if is_correct else 0.0,
        comment="Exact match" if is_correct else "Different answer"
    )

# Run experiment on dataset
dataset = langfuse.get_dataset("qa-evaluation-set")
result = dataset.run_experiment(
    name="GPT-4 QA Evaluation",
    task=qa_task,
    evaluators=[accuracy_evaluator],
    run_description="Testing GPT-4 performance on geography questions"
)

# View results
print(result.format())
if result.dataset_run_url:
    print(f"View detailed results: {result.dataset_run_url}")
```

254

255

### Creating Datasets from Traces

```python
# Create dataset items from existing traces
def extract_qa_pairs_from_traces():
    # Assuming you have traces with Q&A interactions
    traces = get_qa_traces()  # Your method to get traces

    for trace in traces:
        # Extract question and answer from trace
        question = trace.input
        answer = trace.output

        langfuse.create_dataset_item(
            dataset_name="production-qa-samples",
            input=question,
            expected_output=answer,
            source_trace_id=trace.id,
            metadata={
                "source": "production",
                "timestamp": trace.created_at.isoformat()
            }
        )

extract_qa_pairs_from_traces()
```

281

282

### Dataset Item Run Context

```python
def process_item_with_context(dataset_item):
    """Process item with run context for tracking."""

    with dataset_item.run() as item_run:
        # Your processing logic here
        result = qa_task(item=dataset_item)

        # Context automatically tracks the execution
        return result

# Use with individual items
dataset = langfuse.get_dataset("qa-evaluation-set")
for item in dataset.items[:5]:  # Process first 5 items
    result = process_item_with_context(item)
    print(f"Processed: {item.input} -> {result}")
```

301

302

### Complex Dataset Structures

```python
# Create dataset with rich metadata and complex inputs
complex_items = [
    {
        "input": {
            "context": "France is a country in Western Europe...",
            "question": "What is the capital of France?"
        },
        "expected_output": "Paris",
        "metadata": {
            "context_length": 150,
            "difficulty": "easy",
            "topics": ["geography", "europe"],
            "source": "wikipedia"
        }
    },
    {
        "input": {
            "context": "Advanced quantum mechanics principles...",
            "question": "Explain quantum entanglement"
        },
        "expected_output": "Quantum entanglement is a phenomenon...",
        "metadata": {
            "context_length": 500,
            "difficulty": "hard",
            "topics": ["physics", "quantum"],
            "source": "academic_papers"
        }
    }
]

# Create complex dataset
dataset = langfuse.create_dataset(
    name="contextual-qa-dataset",
    description="Q&A with contextual information",
    metadata={
        "format": "context_question",
        "domains": ["geography", "science"],
        "creation_date": "2024-01-15"
    }
)

for item_data in complex_items:
    langfuse.create_dataset_item(
        dataset_name="contextual-qa-dataset",
        **item_data
    )
```

352

353

### Dataset Versioning and Updates

```python
# Add new items to existing dataset
def add_items_to_dataset(dataset_name, new_items):
    for item in new_items:
        langfuse.create_dataset_item(
            dataset_name=dataset_name,
            input=item["input"],
            expected_output=item.get("expected_output"),
            metadata=item.get("metadata", {})
        )

# Refresh dataset to get latest items
def refresh_dataset(dataset_name):
    return langfuse.get_dataset(dataset_name)

# Track dataset changes
original_dataset = langfuse.get_dataset("qa-evaluation-set")
original_count = len(original_dataset.items)

# Add new items
new_items = [
    {"input": "What is the capital of Spain?", "expected_output": "Madrid"},
    {"input": "What is the capital of Portugal?", "expected_output": "Lisbon"}
]
add_items_to_dataset("qa-evaluation-set", new_items)

# Check updated dataset
updated_dataset = refresh_dataset("qa-evaluation-set")
print(f"Items added: {len(updated_dataset.items) - original_count}")
```

385

386

### Comparing Dataset Performance

387

388

```python

389

def compare_models_on_dataset(dataset_name, models):

390

"""Compare multiple models on the same dataset."""

391

dataset = langfuse.get_dataset(dataset_name)

392

results = {}

393

394

for model_name, model_task in models.items():

395

print(f"Testing {model_name}...")

396

397

result = dataset.run_experiment(

398

name=f"Model Comparison - {model_name}",

399

task=model_task,

400

evaluators=[accuracy_evaluator],

401

run_description=f"Performance evaluation of {model_name}"

402

)

403

404

results[model_name] = result

405

406

# Calculate accuracy

407

accuracy_scores = [

408

eval.value for item_result in result.item_results

409

for eval in item_result.evaluations

410

if eval.name == "accuracy" and eval.value is not None

411

]

412

413

avg_accuracy = sum(accuracy_scores) / len(accuracy_scores) if accuracy_scores else 0

414

print(f"{model_name} accuracy: {avg_accuracy:.2%}")

415

416

return results

417

418

# Compare different models

419

models = {

420

"gpt-4": lambda *, item, **kwargs: gpt4_generate(item.input),

421

"gpt-3.5": lambda *, item, **kwargs: gpt35_generate(item.input),

422

"claude": lambda *, item, **kwargs: claude_generate(item.input)

423

}

424

425

comparison_results = compare_models_on_dataset("qa-evaluation-set", models)

426

```