or run

npx @tessl/cli init
Log in

Version

Tile

Overview

Evals

Files

Files

docs

advanced.md core-tracing.md datasets.md experiments.md index.md integrations.md observation-types.md prompts.md scoring.md

datasets.mddocs/

0

# Dataset Management

Tools for creating, managing, and running experiments on datasets with support for both local data and Langfuse-hosted datasets. Enables systematic data management and experiment tracking.

## Capabilities

### Dataset Client

Manages a complete dataset with experiment running capabilities.

```python { .api }
class DatasetClient:
    def __init__(self, id: str, name: str, description: str = None,
                 metadata: Any = None, project_id: str = None,
                 created_at: datetime = None, updated_at: datetime = None,
                 items: List[DatasetItemClient] = None):
        """Initialize dataset client."""

    def run_experiment(self, *, name: str, task: TaskFunction,
                       evaluators: List[EvaluatorFunction] = None,
                       run_evaluators: List[RunEvaluatorFunction] = None,
                       run_name: str = None, run_description: str = None,
                       experiment_config: Dict[str, Any] = None) -> ExperimentResult:
        """Run experiment on this dataset.

        Args:
            name: Experiment name
            task: Function to execute on each dataset item
            evaluators: List of item-level evaluator functions
            run_evaluators: List of run-level evaluator functions
            run_name: Name for this specific run
            run_description: Description of experiment run
            experiment_config: Configuration metadata

        Returns:
            ExperimentResult with complete results and evaluations
        """

    # Attributes
    id: str
    name: str
    description: Optional[str]
    metadata: Optional[Any]
    project_id: str
    created_at: datetime
    updated_at: datetime
    items: List[DatasetItemClient]
```

48

49

### Dataset Item Client

Represents individual items within a dataset with run context capabilities.

```python { .api }
class DatasetItemClient:
    def __init__(self, id: str, status: DatasetStatus, input: Any = None,
                 expected_output: Any = None, metadata: Any = None,
                 source_trace_id: str = None, source_observation_id: str = None,
                 dataset_id: str = None, dataset_name: str = None,
                 created_at: datetime = None, updated_at: datetime = None):
        """Initialize dataset item client."""

    def run(self, **kwargs) -> ContextManager["DatasetItemClient"]:
        """Create context manager for dataset item runs.

        Returns:
            Context manager for tracking item execution
        """

    # Attributes
    id: str
    status: DatasetStatus
    input: Any
    expected_output: Optional[Any]
    metadata: Optional[Any]
    source_trace_id: Optional[str]
    source_observation_id: Optional[str]
    dataset_id: str
    dataset_name: str
    created_at: datetime
    updated_at: datetime
```

82

83

### Dataset Management

Core methods for managing datasets through the Langfuse client.

```python { .api }
class Langfuse:
    def get_dataset(self, name: str) -> DatasetClient:
        """Retrieve dataset by name.

        Args:
            name: Dataset name

        Returns:
            DatasetClient for the named dataset

        Raises:
            Exception: If dataset not found
        """

    def create_dataset(self, *, name: str, description: str = None,
                       metadata: Any = None) -> DatasetClient:
        """Create a new dataset.

        Args:
            name: Dataset name (must be unique)
            description: Optional dataset description
            metadata: Additional metadata for the dataset

        Returns:
            DatasetClient for the created dataset
        """

    def create_dataset_item(self, *, dataset_name: str, input: Any,
                            expected_output: Any = None, metadata: Any = None,
                            source_trace_id: str = None,
                            source_observation_id: str = None) -> DatasetItemClient:
        """Add item to a dataset.

        Args:
            dataset_name: Name of target dataset
            input: Input data for the item
            expected_output: Expected output for evaluation
            metadata: Additional item metadata
            source_trace_id: Source trace ID if created from existing trace
            source_observation_id: Source observation ID if from existing observation

        Returns:
            DatasetItemClient for the created item
        """
```

133

134

### Status and Model Types

Supporting types for dataset operations.

```python { .api }
# Dataset status enumeration
class DatasetStatus(str, Enum):
    ACTIVE = "ACTIVE"
    ARCHIVED = "ARCHIVED"

# Core model types (re-exported from API)
class Dataset:
    """Dataset model class with full API attributes."""

class DatasetItem:
    """Dataset item model class with full API attributes."""

class DatasetRun:
    """Dataset run model class with execution tracking."""

# Request types for API operations
class CreateDatasetRequest:
    """Request structure for creating datasets."""

class CreateDatasetItemRequest:
    """Request structure for creating dataset items."""

class CreateDatasetRunItemRequest:
    """Request structure for creating dataset run items."""
```

164

165

## Usage Examples

### Creating and Managing Datasets

```python
from langfuse import Langfuse

langfuse = Langfuse()

# Create a new dataset
dataset = langfuse.create_dataset(
    name="qa-evaluation-set",
    description="Question-answering dataset for model evaluation",
    metadata={"domain": "general", "language": "en"}
)

# Add items to the dataset
items = [
    {"input": "What is the capital of France?", "expected_output": "Paris"},
    {"input": "What is the capital of Germany?", "expected_output": "Berlin"},
    {"input": "What is the capital of Italy?", "expected_output": "Rome"}
]

for item_data in items:
    langfuse.create_dataset_item(
        dataset_name="qa-evaluation-set",
        input=item_data["input"],
        expected_output=item_data["expected_output"],
        metadata={"category": "geography", "difficulty": "easy"}
    )
```

196

197

### Working with Existing Datasets

```python
# Retrieve existing dataset
dataset = langfuse.get_dataset("qa-evaluation-set")

print(f"Dataset: {dataset.name}")
print(f"Description: {dataset.description}")
print(f"Items: {len(dataset.items)}")

# Inspect dataset items
for item in dataset.items:
    print(f"Input: {item.input}")
    print(f"Expected: {item.expected_output}")
    print(f"Metadata: {item.metadata}")
    print("---")
```

214

215

### Running Experiments on Datasets

```python
# Define task function
def qa_task(*, item, **kwargs):
    # Access item attributes directly
    question = item.input
    # Your AI model call
    answer = my_llm.generate(question)
    return answer

# Define evaluator
def accuracy_evaluator(*, input, output, expected_output=None, **kwargs):
    from langfuse import Evaluation

    if not expected_output:
        return Evaluation(name="accuracy", value=None)

    is_correct = output.strip().lower() == expected_output.strip().lower()
    return Evaluation(
        name="accuracy",
        value=1.0 if is_correct else 0.0,
        comment="Exact match" if is_correct else "Different answer"
    )

# Run experiment on dataset
dataset = langfuse.get_dataset("qa-evaluation-set")
result = dataset.run_experiment(
    name="GPT-4 QA Evaluation",
    task=qa_task,
    evaluators=[accuracy_evaluator],
    run_description="Testing GPT-4 performance on geography questions"
)

# View results
print(result.format())
if result.dataset_run_url:
    print(f"View detailed results: {result.dataset_run_url}")
```

254

255

### Creating Datasets from Traces

```python
# Create dataset items from existing traces
def extract_qa_pairs_from_traces():
    # Assuming you have traces with Q&A interactions
    traces = get_qa_traces()  # Your method to get traces

    for trace in traces:
        # Extract question and answer from trace
        question = trace.input
        answer = trace.output

        langfuse.create_dataset_item(
            dataset_name="production-qa-samples",
            input=question,
            expected_output=answer,
            source_trace_id=trace.id,
            metadata={
                "source": "production",
                "timestamp": trace.created_at.isoformat()
            }
        )

extract_qa_pairs_from_traces()
```

281

282

### Dataset Item Run Context

```python
def process_item_with_context(dataset_item):
    """Process item with run context for tracking."""

    with dataset_item.run() as item_run:
        # Your processing logic here
        result = qa_task(item=dataset_item)

        # Context automatically tracks the execution
        return result

# Use with individual items
dataset = langfuse.get_dataset("qa-evaluation-set")
for item in dataset.items[:5]:  # Process first 5 items
    result = process_item_with_context(item)
    print(f"Processed: {item.input} -> {result}")
```

301

302

### Complex Dataset Structures

```python
# Create dataset with rich metadata and complex inputs
complex_items = [
    {
        "input": {
            "context": "France is a country in Western Europe...",
            "question": "What is the capital of France?"
        },
        "expected_output": "Paris",
        "metadata": {
            "context_length": 150,
            "difficulty": "easy",
            "topics": ["geography", "europe"],
            "source": "wikipedia"
        }
    },
    {
        "input": {
            "context": "Advanced quantum mechanics principles...",
            "question": "Explain quantum entanglement"
        },
        "expected_output": "Quantum entanglement is a phenomenon...",
        "metadata": {
            "context_length": 500,
            "difficulty": "hard",
            "topics": ["physics", "quantum"],
            "source": "academic_papers"
        }
    }
]

# Create complex dataset
dataset = langfuse.create_dataset(
    name="contextual-qa-dataset",
    description="Q&A with contextual information",
    metadata={
        "format": "context_question",
        "domains": ["geography", "science"],
        "creation_date": "2024-01-15"
    }
)

for item_data in complex_items:
    langfuse.create_dataset_item(
        dataset_name="contextual-qa-dataset",
        **item_data
    )
```

352

353

### Dataset Versioning and Updates

```python
# Add new items to existing dataset
def add_items_to_dataset(dataset_name, new_items):
    for item in new_items:
        langfuse.create_dataset_item(
            dataset_name=dataset_name,
            input=item["input"],
            expected_output=item.get("expected_output"),
            metadata=item.get("metadata", {})
        )

# Refresh dataset to get latest items
def refresh_dataset(dataset_name):
    return langfuse.get_dataset(dataset_name)

# Track dataset changes
original_dataset = langfuse.get_dataset("qa-evaluation-set")
original_count = len(original_dataset.items)

# Add new items
new_items = [
    {"input": "What is the capital of Spain?", "expected_output": "Madrid"},
    {"input": "What is the capital of Portugal?", "expected_output": "Lisbon"}
]
add_items_to_dataset("qa-evaluation-set", new_items)

# Check updated dataset
updated_dataset = refresh_dataset("qa-evaluation-set")
print(f"Items added: {len(updated_dataset.items) - original_count}")
```

385

386

### Comparing Dataset Performance

387

388

```python

389

def compare_models_on_dataset(dataset_name, models):

390

"""Compare multiple models on the same dataset."""

391

dataset = langfuse.get_dataset(dataset_name)

392

results = {}

393

394

for model_name, model_task in models.items():

395

print(f"Testing {model_name}...")

396

397

result = dataset.run_experiment(

398

name=f"Model Comparison - {model_name}",

399

task=model_task,

400

evaluators=[accuracy_evaluator],

401

run_description=f"Performance evaluation of {model_name}"

402

)

403

404

results[model_name] = result

405

406

# Calculate accuracy

407

accuracy_scores = [

408

eval.value for item_result in result.item_results

409

for eval in item_result.evaluations

410

if eval.name == "accuracy" and eval.value is not None

411

]

412

413

avg_accuracy = sum(accuracy_scores) / len(accuracy_scores) if accuracy_scores else 0

414

print(f"{model_name} accuracy: {avg_accuracy:.2%}")

415

416

return results

417

418

# Compare different models

419

models = {

420

"gpt-4": lambda *, item, **kwargs: gpt4_generate(item.input),

421

"gpt-3.5": lambda *, item, **kwargs: gpt35_generate(item.input),

422

"claude": lambda *, item, **kwargs: claude_generate(item.input)

423

}

424

425

comparison_results = compare_models_on_dataset("qa-evaluation-set", models)

426

```