# Dataset Management

Tools for creating, managing, and running experiments on datasets with support for both local data and Langfuse-hosted datasets. Enables systematic data management and experiment tracking.

## Capabilities

### Dataset Client

Manages a complete dataset with experiment running capabilities.

```python { .api }
class DatasetClient:
    def __init__(self, id: str, name: str, description: str = None,
                 metadata: Any = None, project_id: str = None,
                 created_at: datetime = None, updated_at: datetime = None,
                 items: List[DatasetItemClient] = None):
        """Initialize dataset client."""

    def run_experiment(self, *, name: str, task: TaskFunction,
                       evaluators: List[EvaluatorFunction] = None,
                       run_evaluators: List[RunEvaluatorFunction] = None,
                       run_name: str = None, run_description: str = None,
                       experiment_config: Dict[str, Any] = None) -> ExperimentResult:
        """Run experiment on this dataset.

        Args:
            name: Experiment name
            task: Function to execute on each dataset item
            evaluators: List of item-level evaluator functions
            run_evaluators: List of run-level evaluator functions
            run_name: Name for this specific run
            run_description: Description of experiment run
            experiment_config: Configuration metadata

        Returns:
            ExperimentResult with complete results and evaluations
        """

    # Attributes
    id: str
    name: str
    description: Optional[str]
    metadata: Optional[Any]
    project_id: str
    created_at: datetime
    updated_at: datetime
    items: List[DatasetItemClient]
```

### Dataset Item Client

Represents individual items within a dataset with run context capabilities.

```python { .api }
class DatasetItemClient:
    def __init__(self, id: str, status: DatasetStatus, input: Any = None,
                 expected_output: Any = None, metadata: Any = None,
                 source_trace_id: str = None, source_observation_id: str = None,
                 dataset_id: str = None, dataset_name: str = None,
                 created_at: datetime = None, updated_at: datetime = None):
        """Initialize dataset item client."""

    def run(self, **kwargs) -> ContextManager["DatasetItemClient"]:
        """Create context manager for dataset item runs.

        Returns:
            Context manager for tracking item execution
        """

    # Attributes
    id: str
    status: DatasetStatus
    input: Any
    expected_output: Optional[Any]
    metadata: Optional[Any]
    source_trace_id: Optional[str]
    source_observation_id: Optional[str]
    dataset_id: str
    dataset_name: str
    created_at: datetime
    updated_at: datetime
```

### Dataset Management

Core methods for managing datasets through the Langfuse client.

```python { .api }
class Langfuse:
    def get_dataset(self, name: str) -> DatasetClient:
        """Retrieve dataset by name.

        Args:
            name: Dataset name

        Returns:
            DatasetClient for the named dataset

        Raises:
            Exception: If dataset not found
        """

    def create_dataset(self, *, name: str, description: str = None,
                       metadata: Any = None) -> DatasetClient:
        """Create a new dataset.

        Args:
            name: Dataset name (must be unique)
            description: Optional dataset description
            metadata: Additional metadata for the dataset

        Returns:
            DatasetClient for the created dataset
        """

    def create_dataset_item(self, *, dataset_name: str, input: Any,
                            expected_output: Any = None, metadata: Any = None,
                            source_trace_id: str = None,
                            source_observation_id: str = None) -> DatasetItemClient:
        """Add item to a dataset.

        Args:
            dataset_name: Name of target dataset
            input: Input data for the item
            expected_output: Expected output for evaluation
            metadata: Additional item metadata
            source_trace_id: Source trace ID if created from existing trace
            source_observation_id: Source observation ID if from existing observation

        Returns:
            DatasetItemClient for the created item
        """
```

### Status and Model Types

Supporting types for dataset operations.

```python { .api }
# Dataset status enumeration
class DatasetStatus(str, Enum):
    ACTIVE = "ACTIVE"
    ARCHIVED = "ARCHIVED"

# Core model types (re-exported from API)
class Dataset:
    """Dataset model class with full API attributes."""

class DatasetItem:
    """Dataset item model class with full API attributes."""

class DatasetRun:
    """Dataset run model class with execution tracking."""

# Request types for API operations
class CreateDatasetRequest:
    """Request structure for creating datasets."""

class CreateDatasetItemRequest:
    """Request structure for creating dataset items."""

class CreateDatasetRunItemRequest:
    """Request structure for creating dataset run items."""
```

## Usage Examples

### Creating and Managing Datasets

```python
from langfuse import Langfuse

langfuse = Langfuse()

# Create a new dataset
dataset = langfuse.create_dataset(
    name="qa-evaluation-set",
    description="Question-answering dataset for model evaluation",
    metadata={"domain": "general", "language": "en"}
)

# Add items to the dataset
items = [
    {"input": "What is the capital of France?", "expected_output": "Paris"},
    {"input": "What is the capital of Germany?", "expected_output": "Berlin"},
    {"input": "What is the capital of Italy?", "expected_output": "Rome"}
]

for item_data in items:
    langfuse.create_dataset_item(
        dataset_name="qa-evaluation-set",
        input=item_data["input"],
        expected_output=item_data["expected_output"],
        metadata={"category": "geography", "difficulty": "easy"}
    )
```

### Working with Existing Datasets

```python
# Retrieve existing dataset
dataset = langfuse.get_dataset("qa-evaluation-set")

print(f"Dataset: {dataset.name}")
print(f"Description: {dataset.description}")
print(f"Items: {len(dataset.items)}")

# Inspect dataset items
for item in dataset.items:
    print(f"Input: {item.input}")
    print(f"Expected: {item.expected_output}")
    print(f"Metadata: {item.metadata}")
    print("---")
```

### Running Experiments on Datasets

```python
# Define task function
def qa_task(*, item, **kwargs):
    # Access item attributes directly
    question = item.input
    # Your AI model call
    answer = my_llm.generate(question)
    return answer

# Define evaluator
def accuracy_evaluator(*, input, output, expected_output=None, **kwargs):
    from langfuse import Evaluation

    if not expected_output:
        return Evaluation(name="accuracy", value=None)

    is_correct = output.strip().lower() == expected_output.strip().lower()
    return Evaluation(
        name="accuracy",
        value=1.0 if is_correct else 0.0,
        comment="Exact match" if is_correct else "Different answer"
    )

# Run experiment on dataset
dataset = langfuse.get_dataset("qa-evaluation-set")
result = dataset.run_experiment(
    name="GPT-4 QA Evaluation",
    task=qa_task,
    evaluators=[accuracy_evaluator],
    run_description="Testing GPT-4 performance on geography questions"
)

# View results
print(result.format())
if result.dataset_run_url:
    print(f"View detailed results: {result.dataset_run_url}")
```

### Creating Datasets from Traces

```python
# Create dataset items from existing traces
def extract_qa_pairs_from_traces():
    # Assuming you have traces with Q&A interactions
    traces = get_qa_traces()  # Your method to get traces

    for trace in traces:
        # Extract question and answer from trace
        question = trace.input
        answer = trace.output

        langfuse.create_dataset_item(
            dataset_name="production-qa-samples",
            input=question,
            expected_output=answer,
            source_trace_id=trace.id,
            metadata={
                "source": "production",
                "timestamp": trace.created_at.isoformat()
            }
        )

extract_qa_pairs_from_traces()
```

### Dataset Item Run Context

```python
def process_item_with_context(dataset_item):
    """Process item with run context for tracking."""

    with dataset_item.run() as item_run:
        # Your processing logic here
        result = qa_task(item=dataset_item)

        # Context automatically tracks the execution
        return result

# Use with individual items
dataset = langfuse.get_dataset("qa-evaluation-set")
for item in dataset.items[:5]:  # Process first 5 items
    result = process_item_with_context(item)
    print(f"Processed: {item.input} -> {result}")
```

### Complex Dataset Structures

```python
# Create dataset with rich metadata and complex inputs
complex_items = [
    {
        "input": {
            "context": "France is a country in Western Europe...",
            "question": "What is the capital of France?"
        },
        "expected_output": "Paris",
        "metadata": {
            "context_length": 150,
            "difficulty": "easy",
            "topics": ["geography", "europe"],
            "source": "wikipedia"
        }
    },
    {
        "input": {
            "context": "Advanced quantum mechanics principles...",
            "question": "Explain quantum entanglement"
        },
        "expected_output": "Quantum entanglement is a phenomenon...",
        "metadata": {
            "context_length": 500,
            "difficulty": "hard",
            "topics": ["physics", "quantum"],
            "source": "academic_papers"
        }
    }
]

# Create complex dataset
dataset = langfuse.create_dataset(
    name="contextual-qa-dataset",
    description="Q&A with contextual information",
    metadata={
        "format": "context_question",
        "domains": ["geography", "science"],
        "creation_date": "2024-01-15"
    }
)

for item_data in complex_items:
    langfuse.create_dataset_item(
        dataset_name="contextual-qa-dataset",
        **item_data
    )
```

### Dataset Versioning and Updates

```python
# Add new items to existing dataset
def add_items_to_dataset(dataset_name, new_items):
    for item in new_items:
        langfuse.create_dataset_item(
            dataset_name=dataset_name,
            input=item["input"],
            expected_output=item.get("expected_output"),
            metadata=item.get("metadata", {})
        )

# Refresh dataset to get latest items
def refresh_dataset(dataset_name):
    return langfuse.get_dataset(dataset_name)

# Track dataset changes
original_dataset = langfuse.get_dataset("qa-evaluation-set")
original_count = len(original_dataset.items)

# Add new items
new_items = [
    {"input": "What is the capital of Spain?", "expected_output": "Madrid"},
    {"input": "What is the capital of Portugal?", "expected_output": "Lisbon"}
]
add_items_to_dataset("qa-evaluation-set", new_items)

# Check updated dataset
updated_dataset = refresh_dataset("qa-evaluation-set")
print(f"Items added: {len(updated_dataset.items) - original_count}")
```

### Comparing Dataset Performance

```python
def compare_models_on_dataset(dataset_name, models):
    """Compare multiple models on the same dataset."""
    dataset = langfuse.get_dataset(dataset_name)
    results = {}

    for model_name, model_task in models.items():
        print(f"Testing {model_name}...")

        result = dataset.run_experiment(
            name=f"Model Comparison - {model_name}",
            task=model_task,
            evaluators=[accuracy_evaluator],
            run_description=f"Performance evaluation of {model_name}"
        )

        results[model_name] = result

        # Calculate accuracy (note: use `ev`, not `eval`, to avoid shadowing the builtin)
        accuracy_scores = [
            ev.value for item_result in result.item_results
            for ev in item_result.evaluations
            if ev.name == "accuracy" and ev.value is not None
        ]

        avg_accuracy = sum(accuracy_scores) / len(accuracy_scores) if accuracy_scores else 0
        print(f"{model_name} accuracy: {avg_accuracy:.2%}")

    return results

# Compare different models
models = {
    "gpt-4": lambda *, item, **kwargs: gpt4_generate(item.input),
    "gpt-3.5": lambda *, item, **kwargs: gpt35_generate(item.input),
    "claude": lambda *, item, **kwargs: claude_generate(item.input)
}

comparison_results = compare_models_on_dataset("qa-evaluation-set", models)
```