Tessl Tile for npm/langfuse@3.38.0

or run

npx @tessl/cli init

Version

Tile

Overview

Evals

Files

docs

configuration.md datasets.md index.md media.md openai-integration.md prompts.md public-api.md tracing.md

datasets.md docs/

0
# Dataset Operations
1

2
Comprehensive dataset management for evaluations, experiments, and testing workflows. A dataset contains items, each with an input and an optional expected output, that can be linked to traces or observations for run tracking and analysis.
3

4
## Capabilities
5

6
### Fetching Datasets
7

8
Retrieve a dataset by name, together with all of its items.
9

10
```typescript { .api }
11
/**
12
 * Fetches a dataset with all its items
13
 * @param name - Dataset name
14
 * @param options - Optional pagination settings
15
 * @returns Dataset with items
16
 */
17
getDataset(
18
  name: string,
19
  options?: { fetchItemsPageSize: number }
20
): Promise<Dataset>;
21

22
interface Dataset {
23
  /** Dataset ID */
24
  id: string;
25
  /** Dataset name */
26
  name: string;
27
  /** Optional description */
28
  description?: string;
29
  /** Custom metadata */
30
  metadata?: any;
31
  /** Project ID */
32
  projectId: string;
33
  /** Dataset items */
34
  items: DatasetItem[];
35
}
36

37
interface DatasetItem {
38
  /** Item ID */
39
  id: string;
40
  /** Status: ACTIVE or ARCHIVED */
41
  status: ApiDatasetStatus;
42
  /** Input data for the item */
43
  input: any;
44
  /** Expected output (ground truth) */
45
  expectedOutput?: any;
46
  /** Custom metadata */
47
  metadata?: any;
48
  /** Source trace ID if created from a trace */
49
  sourceTraceId?: string;
50
  /** Source observation ID if created from an observation */
51
  sourceObservationId?: string;
52
  /** Method to link this item to a run */
53
  link: LinkDatasetItem;
54
}
55

56
type ApiDatasetStatus = "ACTIVE" | "ARCHIVED";
57

58
type LinkDatasetItem = (
59
  obj: LangfuseObjectClient,
60
  runName: string,
61
  runArgs?: {
62
    description?: string;
63
    metadata?: any;
64
  }
65
) => Promise<CreateLangfuseDatasetRunItemResponse>;
66

67
type LangfuseObjectClient =
68
  | LangfuseTraceClient
69
  | LangfuseSpanClient
70
  | LangfuseGenerationClient
71
  | LangfuseEventClient;
72
```
73

74
**Usage Example:**
75

76
```typescript
77
import { Langfuse } from 'langfuse';
78

79
const langfuse = new Langfuse();
80

81
// Fetch dataset with default page size
82
const dataset = await langfuse.getDataset('eval-dataset');
83

84
console.log(dataset.name); // "eval-dataset"
85
console.log(dataset.items.length); // Number of items
86

87
// Fetch with custom page size
88
const largeDataset = await langfuse.getDataset('large-dataset', {
89
  fetchItemsPageSize: 100
90
});
91

92
// Access dataset items
93
for (const item of dataset.items) {
94
  console.log(item.input);
95
  console.log(item.expectedOutput);
96
}
97
```
98

99
### Creating Datasets
100

101
Create new datasets for organizing test cases and evaluations.
102

103
```typescript { .api }
104
/**
105
 * Creates a new dataset
106
 * @param dataset - Dataset name as string or configuration object
107
 * @returns Dataset creation response
108
 */
109
createDataset(dataset: string): Promise<CreateLangfuseDatasetResponse>;
110
createDataset(dataset: CreateLangfuseDatasetBody): Promise<CreateLangfuseDatasetResponse>;
111

112
interface CreateLangfuseDatasetBody {
113
  /** Dataset name (must be unique) */
114
  name: string;
115
  /** Optional description */
116
  description?: string;
117
  /** Custom metadata */
118
  metadata?: any;
119
}
120

121
interface CreateLangfuseDatasetResponse {
122
  /** Dataset ID */
123
  id: string;
124
  /** Dataset name */
125
  name: string;
126
  /** Optional description */
127
  description?: string;
128
  /** Custom metadata */
129
  metadata?: any;
130
  /** Creation timestamp */
131
  createdAt: string;
132
  /** Last update timestamp */
133
  updatedAt: string;
134
}
135
```
136

137
**Usage Example:**
138

139
```typescript
140
// Create a dataset with just a name
141
const simpleDataset = await langfuse.createDataset('qa-evaluation');
142

143
// Or create with full configuration
144
const dataset = await langfuse.createDataset({
145
  name: 'qa-evaluation',
146
  description: 'Question-answering evaluation dataset',
147
  metadata: {
148
    version: '1.0',
149
    created_by: 'eval-team'
150
  }
151
});
152

153
console.log(dataset.id); // Dataset ID
154
console.log(dataset.name); // "qa-evaluation"
155
```
156

157
### Creating Dataset Items
158

159
Add items to datasets with input data and expected outputs.
160

161
```typescript { .api }
162
/**
163
 * Creates a dataset item
164
 * @param body - Dataset item configuration
165
 * @returns Dataset item response
166
 */
167
createDatasetItem(body: CreateLangfuseDatasetItemBody): Promise<CreateLangfuseDatasetItemResponse>;
168

169
interface CreateLangfuseDatasetItemBody {
170
  /** Dataset name to add item to */
171
  datasetName: string;
172
  /** Input data for the item */
173
  input: any;
174
  /** Expected output (ground truth) */
175
  expectedOutput?: any;
176
  /** Custom metadata */
177
  metadata?: any;
178
  /** Source trace ID if creating from a trace */
179
  sourceTraceId?: string;
180
  /** Source observation ID if creating from an observation */
181
  sourceObservationId?: string;
182
}
183

184
interface CreateLangfuseDatasetItemResponse {
185
  /** Item ID */
186
  id: string;
187
  /** Status */
188
  status: ApiDatasetStatus;
189
  /** Input data */
190
  input: any;
191
  /** Expected output */
192
  expectedOutput?: any;
193
  /** Custom metadata */
194
  metadata?: any;
195
  /** Source trace ID */
196
  sourceTraceId?: string;
197
  /** Source observation ID */
198
  sourceObservationId?: string;
199
  /** Dataset ID */
200
  datasetId: string;
201
  /** Dataset name */
202
  datasetName: string;
203
  /** Creation timestamp */
204
  createdAt: string;
205
  /** Last update timestamp */
206
  updatedAt: string;
207
}
208
```
209

210
**Usage Example:**
211

212
```typescript
213
// Create a dataset item
214
const item = await langfuse.createDatasetItem({
215
  datasetName: 'qa-evaluation',
216
  input: {
217
    question: 'What is the capital of France?'
218
  },
219
  expectedOutput: {
220
    answer: 'Paris'
221
  },
222
  metadata: {
223
    difficulty: 'easy',
224
    category: 'geography'
225
  }
226
});
227

228
// Create item from existing trace
229
const traceItem = await langfuse.createDatasetItem({
230
  datasetName: 'production-samples',
231
  input: { query: 'user question' },
232
  expectedOutput: { response: 'correct answer' },
233
  sourceTraceId: 'trace-123',
234
  sourceObservationId: 'obs-456'
235
});
236
```
237

238
### Fetching Dataset Items
239

240
Retrieve a specific dataset item by ID.
241

242
```typescript { .api }
243
/**
244
 * Fetches a specific dataset item
245
 * @param id - Dataset item ID
246
 * @returns Dataset item response
247
 */
248
getDatasetItem(id: string): Promise<CreateLangfuseDatasetItemResponse>;
249
```
250

251
**Usage Example:**
252

253
```typescript
254
const item = await langfuse.getDatasetItem('item-123');
255

256
console.log(item.input);
257
console.log(item.expectedOutput);
258
console.log(item.metadata);
259
```
260

261
### Dataset Runs
262

263
Dataset runs track executions of your system against dataset items, enabling evaluation and comparison.
264

265
```typescript { .api }
266
/**
267
 * Fetches a dataset run
268
 * @param params - Run identifier parameters
269
 * @returns Dataset run response
270
 */
271
getDatasetRun(params: GetLangfuseDatasetRunParams): Promise<GetLangfuseDatasetRunResponse>;
272

273
/**
274
 * Fetches dataset runs for a dataset
275
 * @param datasetName - Dataset name
276
 * @param query - Optional filtering and pagination
277
 * @returns Dataset runs response
278
 */
279
getDatasetRuns(
280
  datasetName: string,
281
  query?: GetLangfuseDatasetRunsQuery
282
): Promise<GetLangfuseDatasetRunsResponse>;
283

284
interface GetLangfuseDatasetRunParams {
285
  /** Dataset name */
286
  datasetName: string;
287
  /** Run name */
288
  runName: string;
289
}
290

291
interface GetLangfuseDatasetRunResponse {
292
  /** Run ID */
293
  id: string;
294
  /** Run name */
295
  name: string;
296
  /** Optional description */
297
  description?: string;
298
  /** Custom metadata */
299
  metadata?: any;
300
  /** Dataset ID */
301
  datasetId: string;
302
  /** Dataset name */
303
  datasetName: string;
304
  /** Creation timestamp */
305
  createdAt: string;
306
  /** Last update timestamp */
307
  updatedAt: string;
308
}
309

310
interface GetLangfuseDatasetRunsQuery {
311
  /** Page number */
312
  page?: number;
313
  /** Page size */
314
  limit?: number;
315
}
316

317
interface GetLangfuseDatasetRunsResponse {
318
  /** Array of runs */
319
  data: ApiDatasetRun[];
320
  /** Pagination metadata */
321
  meta: {
322
    page: number;
323
    limit: number;
324
    totalItems: number;
325
    totalPages: number;
326
  };
327
}
328

329
interface ApiDatasetRun {
330
  id: string;
331
  name: string;
332
  description?: string;
333
  metadata?: any;
334
  datasetId: string;
335
  datasetName: string;
336
  createdAt: string;
337
  updatedAt: string;
338
}
339
```
340

341
**Usage Example:**
342

343
```typescript
344
// Fetch a specific run
345
const run = await langfuse.getDatasetRun({
346
  datasetName: 'qa-evaluation',
347
  runName: 'gpt4-run-1'
348
});
349

350
// Fetch all runs for a dataset
351
const runs = await langfuse.getDatasetRuns('qa-evaluation', {
352
  page: 1,
353
  limit: 50
354
});
355

356
for (const run of runs.data) {
357
  console.log(run.name);
358
  console.log(run.metadata);
359
}
360
```
361

362
### Creating Dataset Run Items
363

364
Link traces or observations to dataset items to track execution runs.
365

366
```typescript { .api }
367
/**
368
 * Creates a dataset run item linking an observation to a dataset item
369
 * @param body - Run item configuration
370
 * @returns Run item response
371
 */
372
createDatasetRunItem(body: CreateLangfuseDatasetRunItemBody): Promise<CreateLangfuseDatasetRunItemResponse>;
373

374
interface CreateLangfuseDatasetRunItemBody {
375
  /** Run name */
376
  runName: string;
377
  /** Dataset item ID */
378
  datasetItemId: string;
379
  /** Trace ID to link */
380
  traceId?: string;
381
  /** Observation ID to link */
382
  observationId?: string;
383
  /** Optional run description */
384
  runDescription?: string;
385
  /** Custom metadata */
386
  metadata?: any;
387
}
388

389
interface CreateLangfuseDatasetRunItemResponse {
390
  /** Run item ID */
391
  id: string;
392
}
393
```
394

395
**Usage Example:**
396

397
```typescript
398
// Create run item manually
399
const runItem = await langfuse.createDatasetRunItem({
400
  runName: 'experiment-1',
401
  datasetItemId: 'item-123',
402
  traceId: 'trace-456',
403
  observationId: 'obs-789',
404
  runDescription: 'GPT-4 evaluation run',
405
  metadata: {
406
    model: 'gpt-4',
407
    temperature: 0.7
408
  }
409
});
410
```
411

412
### Linking Dataset Items to Runs
413

414
Use the `link` method on dataset items for convenient run tracking.
415

416
```typescript { .api }
417
interface DatasetItem {
418
  /**
419
   * Links this dataset item to an observation for run tracking
420
   * @param obj - Trace, span, generation, or event client
421
   * @param runName - Name of the run
422
   * @param runArgs - Optional run configuration
423
   * @returns Run item response
424
   */
425
  link: (
426
    obj: LangfuseObjectClient,
427
    runName: string,
428
    runArgs?: {
429
      description?: string;
430
      metadata?: any;
431
    }
432
  ) => Promise<CreateLangfuseDatasetRunItemResponse>;
433
}
434
```
435

436
**Usage Example:**
437

438
```typescript
439
const dataset = await langfuse.getDataset('qa-evaluation');
440

441
for (const item of dataset.items) {
442
  // Create a trace for this item
443
  const trace = langfuse.trace({
444
    name: 'eval-trace',
445
    input: item.input
446
  });
447

448
  // Execute your system with the input
449
  const generation = trace.generation({
450
    name: 'qa-generation',
451
    model: 'gpt-4',
452
    input: item.input
453
  });
454

455
  // Simulate processing
456
  const output = await processQuestion(item.input.question);
457

458
  generation.end({
459
    output: { answer: output }
460
  });
461

462
  // Link this execution to the dataset item
463
  await item.link(trace, 'gpt4-evaluation', {
464
    description: 'GPT-4 evaluation run',
465
    metadata: {
466
      temperature: 0.7,
467
      model: 'gpt-4'
468
    }
469
  });
470
}
471

472
await langfuse.flushAsync();
473
```
474

475
## Complete Dataset Evaluation Example
476

477
```typescript
478
import { Langfuse } from 'langfuse';
479

480
const langfuse = new Langfuse();
481

482
// Step 1: Create a dataset
483
const dataset = await langfuse.createDataset({
484
  name: 'customer-support-qa',
485
  description: 'Customer support Q&A evaluation dataset',
486
  metadata: { version: '1.0' }
487
});
488

489
// Step 2: Add items to the dataset
490
const items = [
491
  {
492
    question: 'How do I reset my password?',
493
    expectedAnswer: 'You can reset your password by clicking the "Forgot Password" link on the login page.'
494
  },
495
  {
496
    question: 'What are your business hours?',
497
    expectedAnswer: 'We are open Monday-Friday, 9 AM to 5 PM EST.'
498
  },
499
  {
500
    question: 'How do I cancel my subscription?',
501
    expectedAnswer: 'You can cancel your subscription in the billing section of your account settings.'
502
  }
503
];
504

505
for (const item of items) {
506
  await langfuse.createDatasetItem({
507
    datasetName: 'customer-support-qa',
508
    input: { question: item.question },
509
    expectedOutput: { answer: item.expectedAnswer },
510
    metadata: { category: 'support' }
511
  });
512
}
513

514
// Step 3: Run evaluation
515
const fetchedDataset = await langfuse.getDataset('customer-support-qa');
516
const runName = `eval-run-${Date.now()}`;
517

518
for (const item of fetchedDataset.items) {
519
  // Create trace for this evaluation
520
  const trace = langfuse.trace({
521
    name: 'qa-evaluation',
522
    input: item.input,
523
    metadata: { runName }
524
  });
525

526
  // Get prompt
527
  const prompt = await langfuse.getPrompt('support-qa-prompt', undefined, {
528
    type: 'chat'
529
  });
530

531
  // Create generation
532
  const messages = prompt.compile(
533
    { question: item.input.question },
534
    { history: [] }
535
  );
536

537
  const generation = trace.generation({
538
    name: 'answer-generation',
539
    prompt: prompt,
540
    model: 'gpt-4',
541
    input: messages
542
  });
543

544
  // Simulate LLM call
545
  const response = await callLLM(messages);
546

547
  generation.end({
548
    output: { answer: response },
549
    usage: { input: 50, output: 100, total: 150 }
550
  });
551

552
  // Update trace with output
553
  trace.update({
554
    output: { answer: response }
555
  });
556

557
  // Link to dataset run
558
  await item.link(trace, runName, {
559
    description: 'GPT-4 evaluation with support prompt',
560
    metadata: {
561
      model: 'gpt-4',
562
      promptVersion: prompt.version
563
    }
564
  });
565

566
  // Score the generation
567
  const score = calculateSimilarity(response, item.expectedOutput.answer);
568
  trace.score({
569
    name: 'similarity',
570
    value: score,
571
    dataType: 'NUMERIC',
572
    comment: 'Semantic similarity to expected output'
573
  });
574
}
575

576
// Flush all events
577
await langfuse.flushAsync();
578

579
// Step 4: Analyze results
580
const runs = await langfuse.getDatasetRuns('customer-support-qa');
581
console.log(`Total runs: ${runs.data.length}`);
582

583
const latestRun = await langfuse.getDatasetRun({
584
  datasetName: 'customer-support-qa',
585
  runName: runName
586
});
587

588
console.log('Latest run:', latestRun);
589
```
590

591
## Best Practices
592

593
### Dataset Organization
594

595
```typescript
596
// Organize datasets by use case
597
await langfuse.createDataset({
598
  name: 'prod-samples-2024-01',
599
  description: 'Production samples from January 2024',
600
  metadata: {
601
    source: 'production',
602
    month: '2024-01',
603
    sample_rate: 0.1
604
  }
605
});
606

607
// Use metadata for categorization
608
await langfuse.createDatasetItem({
609
  datasetName: 'prod-samples-2024-01',
610
  input: { query: 'user question' },
611
  expectedOutput: { response: 'expected response' },
612
  metadata: {
613
    category: 'technical',
614
    difficulty: 'medium',
615
    language: 'en'
616
  }
617
});
618
```
619

620
### Evaluation Workflow
621

622
```typescript
623
// 1. Add production traces as items of 'prod-golden-set' (this snippet does not create the dataset itself)
624
const productionTraces = await langfuse.fetchTraces({
625
  tags: ['production'],
626
  fromTimestamp: '2024-01-01',
627
  limit: 100
628
});
629

630
for (const trace of productionTraces.data) {
631
  await langfuse.createDatasetItem({
632
    datasetName: 'prod-golden-set',
633
    input: trace.input,
634
    expectedOutput: trace.output,
635
    sourceTraceId: trace.id,
636
    metadata: { userId: trace.userId }
637
  });
638
}
639

640
// 2. Run experiments with different models
641
const models = ['gpt-4', 'gpt-3.5-turbo', 'claude-3-opus'];
642
const dataset = await langfuse.getDataset('prod-golden-set');
643

644
for (const model of models) {
645
  for (const item of dataset.items) {
646
    const trace = langfuse.trace({
647
      name: 'model-comparison',
648
      input: item.input,
649
      metadata: { model }
650
    });
651

652
    const generation = trace.generation({
653
      name: 'completion',
654
      model: model,
655
      input: item.input
656
    });
657

658
    const output = await generateWithModel(model, item.input);
659

660
    generation.end({ output });
661
    trace.update({ output });
662

663
    await item.link(trace, `${model}-comparison`, {
664
      metadata: { model }
665
    });
666
  }
667
}
668

669
await langfuse.flushAsync();
670

671
// 3. Compare results in Langfuse UI or via API
672
const gpt4Results = await langfuse.getDatasetRun({
673
  datasetName: 'prod-golden-set',
674
  runName: 'gpt-4-comparison'
675
});
676

677
const gpt35Results = await langfuse.getDatasetRun({
678
  datasetName: 'prod-golden-set',
679
  runName: 'gpt-3.5-turbo-comparison'
680
});
681
```
682

Version

Tile

Files

datasets.md docs/

datasets.md docs/