or run

npx @tessl/cli init
Log in

Version

Tile

Overview

Evals

Files

Files

docs

configuration.md datasets.md index.md media.md openai-integration.md prompts.md public-api.md tracing.md

datasets.md docs/

0

# Dataset Operations

1

2

Comprehensive dataset management for evaluations, experiments, and testing workflows. Datasets contain items with input/output pairs that can be linked to observations for run tracking and analysis.

3

4

## Capabilities

5

6

### Fetching Datasets

7

8

Retrieve a dataset by name, including all of its items.

9

10

```typescript { .api }

11

/**

12

* Fetches a dataset with all its items

13

* @param name - Dataset name

14

* @param options - Optional pagination settings

15

* @returns Dataset with items

16

*/

17

getDataset(

18

name: string,

19

options?: { fetchItemsPageSize: number }

20

): Promise<Dataset>;

21

22

interface Dataset {

23

/** Dataset ID */

24

id: string;

25

/** Dataset name */

26

name: string;

27

/** Optional description */

28

description?: string;

29

/** Custom metadata */

30

metadata?: any;

31

/** Project ID */

32

projectId: string;

33

/** Dataset items */

34

items: DatasetItem[];

35

}

36

37

interface DatasetItem {

38

/** Item ID */

39

id: string;

40

/** Status: ACTIVE or ARCHIVED */

41

status: ApiDatasetStatus;

42

/** Input data for the item */

43

input: any;

44

/** Expected output (ground truth) */

45

expectedOutput?: any;

46

/** Custom metadata */

47

metadata?: any;

48

/** Source trace ID if created from a trace */

49

sourceTraceId?: string;

50

/** Source observation ID if created from an observation */

51

sourceObservationId?: string;

52

/** Method to link this item to a run */

53

link: LinkDatasetItem;

54

}

55

56

type ApiDatasetStatus = "ACTIVE" | "ARCHIVED";

57

58

type LinkDatasetItem = (

59

obj: LangfuseObjectClient,

60

runName: string,

61

runArgs?: {

62

description?: string;

63

metadata?: any;

64

}

65

) => Promise<CreateLangfuseDatasetRunItemResponse>;

66

67

type LangfuseObjectClient =

68

| LangfuseTraceClient

69

| LangfuseSpanClient

70

| LangfuseGenerationClient

71

| LangfuseEventClient;

72

```

73

74

**Usage Example:**

75

76

```typescript

77

import { Langfuse } from 'langfuse';

78

79

const langfuse = new Langfuse();

80

81

// Fetch dataset with default page size

82

const dataset = await langfuse.getDataset('eval-dataset');

83

84

console.log(dataset.name); // "eval-dataset"

85

console.log(dataset.items.length); // Number of items

86

87

// Fetch with custom page size

88

const largeDataset = await langfuse.getDataset('large-dataset', {

89

fetchItemsPageSize: 100

90

});

91

92

// Access dataset items

93

for (const item of dataset.items) {

94

console.log(item.input);

95

console.log(item.expectedOutput);

96

}

97

```

98

99

### Creating Datasets

100

101

Create new datasets for organizing test cases and evaluations.

102

103

```typescript { .api }

104

/**

105

* Creates a new dataset

106

* @param dataset - Dataset name as string or configuration object

107

* @returns Dataset creation response

108

*/

109

createDataset(dataset: string): Promise<CreateLangfuseDatasetResponse>;

110

createDataset(dataset: CreateLangfuseDatasetBody): Promise<CreateLangfuseDatasetResponse>;

111

112

interface CreateLangfuseDatasetBody {

113

/** Dataset name (must be unique) */

114

name: string;

115

/** Optional description */

116

description?: string;

117

/** Custom metadata */

118

metadata?: any;

119

}

120

121

interface CreateLangfuseDatasetResponse {

122

/** Dataset ID */

123

id: string;

124

/** Dataset name */

125

name: string;

126

/** Optional description */

127

description?: string;

128

/** Custom metadata */

129

metadata?: any;

130

/** Creation timestamp */

131

createdAt: string;

132

/** Last update timestamp */

133

updatedAt: string;

134

}

135

```

136

137

**Usage Example:**

138

139

```typescript

140

// Create a dataset with just a name

141

const simpleDataset = await langfuse.createDataset('qa-evaluation');

142

143

// Or create with full configuration

144

const dataset = await langfuse.createDataset({

145

name: 'qa-evaluation',

146

description: 'Question-answering evaluation dataset',

147

metadata: {

148

version: '1.0',

149

created_by: 'eval-team'

150

}

151

});

152

153

console.log(dataset.id); // Dataset ID

154

console.log(dataset.name); // "qa-evaluation"

155

```

156

157

### Creating Dataset Items

158

159

Add items to datasets with input data and expected outputs.

160

161

```typescript { .api }

162

/**

163

* Creates a dataset item

164

* @param body - Dataset item configuration

165

* @returns Dataset item response

166

*/

167

createDatasetItem(body: CreateLangfuseDatasetItemBody): Promise<CreateLangfuseDatasetItemResponse>;

168

169

interface CreateLangfuseDatasetItemBody {

170

/** Dataset name to add item to */

171

datasetName: string;

172

/** Input data for the item */

173

input: any;

174

/** Expected output (ground truth) */

175

expectedOutput?: any;

176

/** Custom metadata */

177

metadata?: any;

178

/** Source trace ID if creating from a trace */

179

sourceTraceId?: string;

180

/** Source observation ID if creating from an observation */

181

sourceObservationId?: string;

182

}

183

184

interface CreateLangfuseDatasetItemResponse {

185

/** Item ID */

186

id: string;

187

/** Status */

188

status: ApiDatasetStatus;

189

/** Input data */

190

input: any;

191

/** Expected output */

192

expectedOutput?: any;

193

/** Custom metadata */

194

metadata?: any;

195

/** Source trace ID */

196

sourceTraceId?: string;

197

/** Source observation ID */

198

sourceObservationId?: string;

199

/** Dataset ID */

200

datasetId: string;

201

/** Dataset name */

202

datasetName: string;

203

/** Creation timestamp */

204

createdAt: string;

205

/** Last update timestamp */

206

updatedAt: string;

207

}

208

```

209

210

**Usage Example:**

211

212

```typescript

213

// Create a dataset item

214

const item = await langfuse.createDatasetItem({

215

datasetName: 'qa-evaluation',

216

input: {

217

question: 'What is the capital of France?'

218

},

219

expectedOutput: {

220

answer: 'Paris'

221

},

222

metadata: {

223

difficulty: 'easy',

224

category: 'geography'

225

}

226

});

227

228

// Create item from existing trace

229

const traceItem = await langfuse.createDatasetItem({

230

datasetName: 'production-samples',

231

input: { query: 'user question' },

232

expectedOutput: { response: 'correct answer' },

233

sourceTraceId: 'trace-123',

234

sourceObservationId: 'obs-456'

235

});

236

```

237

238

### Fetching Dataset Items

239

240

Retrieve a specific dataset item by ID.

241

242

```typescript { .api }

243

/**

244

* Fetches a specific dataset item

245

* @param id - Dataset item ID

246

* @returns Dataset item response

247

*/

248

getDatasetItem(id: string): Promise<CreateLangfuseDatasetItemResponse>;

249

```

250

251

**Usage Example:**

252

253

```typescript

254

const item = await langfuse.getDatasetItem('item-123');

255

256

console.log(item.input);

257

console.log(item.expectedOutput);

258

console.log(item.metadata);

259

```

260

261

### Dataset Runs

262

263

Dataset runs track executions of your system against dataset items, enabling evaluation and comparison.

264

265

```typescript { .api }

266

/**

267

* Fetches a dataset run

268

* @param params - Run identifier parameters

269

* @returns Dataset run response

270

*/

271

getDatasetRun(params: GetLangfuseDatasetRunParams): Promise<GetLangfuseDatasetRunResponse>;

272

273

/**

274

* Fetches dataset runs for a dataset

275

* @param datasetName - Dataset name

276

* @param query - Optional filtering and pagination

277

* @returns Dataset runs response

278

*/

279

getDatasetRuns(

280

datasetName: string,

281

query?: GetLangfuseDatasetRunsQuery

282

): Promise<GetLangfuseDatasetRunsResponse>;

283

284

interface GetLangfuseDatasetRunParams {

285

/** Dataset name */

286

datasetName: string;

287

/** Run name */

288

runName: string;

289

}

290

291

interface GetLangfuseDatasetRunResponse {

292

/** Run ID */

293

id: string;

294

/** Run name */

295

name: string;

296

/** Optional description */

297

description?: string;

298

/** Custom metadata */

299

metadata?: any;

300

/** Dataset ID */

301

datasetId: string;

302

/** Dataset name */

303

datasetName: string;

304

/** Creation timestamp */

305

createdAt: string;

306

/** Last update timestamp */

307

updatedAt: string;

308

}

309

310

interface GetLangfuseDatasetRunsQuery {

311

/** Page number */

312

page?: number;

313

/** Page size */

314

limit?: number;

315

}

316

317

interface GetLangfuseDatasetRunsResponse {

318

/** Array of runs */

319

data: ApiDatasetRun[];

320

/** Pagination metadata */

321

meta: {

322

page: number;

323

limit: number;

324

totalItems: number;

325

totalPages: number;

326

};

327

}

328

329

interface ApiDatasetRun {

330

id: string;

331

name: string;

332

description?: string;

333

metadata?: any;

334

datasetId: string;

335

datasetName: string;

336

createdAt: string;

337

updatedAt: string;

338

}

339

```

340

341

**Usage Example:**

342

343

```typescript

344

// Fetch a specific run

345

const run = await langfuse.getDatasetRun({

346

datasetName: 'qa-evaluation',

347

runName: 'gpt4-run-1'

348

});

349

350

// Fetch all runs for a dataset

351

const runs = await langfuse.getDatasetRuns('qa-evaluation', {

352

page: 1,

353

limit: 50

354

});

355

356

for (const run of runs.data) {

357

console.log(run.name);

358

console.log(run.metadata);

359

}

360

```

361

362

### Creating Dataset Run Items

363

364

Link observations to dataset items to track execution runs.

365

366

```typescript { .api }

367

/**

368

* Creates a dataset run item linking an observation to a dataset item

369

* @param body - Run item configuration

370

* @returns Run item response

371

*/

372

createDatasetRunItem(body: CreateLangfuseDatasetRunItemBody): Promise<CreateLangfuseDatasetRunItemResponse>;

373

374

interface CreateLangfuseDatasetRunItemBody {

375

/** Run name */

376

runName: string;

377

/** Dataset item ID */

378

datasetItemId: string;

379

/** Trace ID to link */

380

traceId?: string;

381

/** Observation ID to link */

382

observationId?: string;

383

/** Optional run description */

384

runDescription?: string;

385

/** Custom metadata */

386

metadata?: any;

387

}

388

389

interface CreateLangfuseDatasetRunItemResponse {

390

/** Run item ID */

391

id: string;

392

}

393

```

394

395

**Usage Example:**

396

397

```typescript

398

// Create run item manually

399

const runItem = await langfuse.createDatasetRunItem({

400

runName: 'experiment-1',

401

datasetItemId: 'item-123',

402

traceId: 'trace-456',

403

observationId: 'obs-789',

404

runDescription: 'GPT-4 evaluation run',

405

metadata: {

406

model: 'gpt-4',

407

temperature: 0.7

408

}

409

});

410

```

411

412

### Linking Dataset Items to Runs

413

414

Use the `link` method on dataset items for convenient run tracking.

415

416

```typescript { .api }

417

interface DatasetItem {

418

/**

419

* Links this dataset item to an observation for run tracking

420

* @param obj - Trace, span, generation, or event client

421

* @param runName - Name of the run

422

* @param runArgs - Optional run configuration

423

* @returns Run item response

424

*/

425

link: (

426

obj: LangfuseObjectClient,

427

runName: string,

428

runArgs?: {

429

description?: string;

430

metadata?: any;

431

}

432

) => Promise<CreateLangfuseDatasetRunItemResponse>;

433

}

434

```

435

436

**Usage Example:**

437

438

```typescript

439

const dataset = await langfuse.getDataset('qa-evaluation');

440

441

for (const item of dataset.items) {

442

// Create a trace for this item

443

const trace = langfuse.trace({

444

name: 'eval-trace',

445

input: item.input

446

});

447

448

// Execute your system with the input

449

const generation = trace.generation({

450

name: 'qa-generation',

451

model: 'gpt-4',

452

input: item.input

453

});

454

455

// Simulate processing

456

const output = await processQuestion(item.input.question);

457

458

generation.end({

459

output: { answer: output }

460

});

461

462

// Link this execution to the dataset item

463

await item.link(trace, 'gpt4-evaluation', {

464

description: 'GPT-4 evaluation run',

465

metadata: {

466

temperature: 0.7,

467

model: 'gpt-4'

468

}

469

});

470

}

471

472

await langfuse.flushAsync();

473

```

474

475

## Complete Dataset Evaluation Example

476

477

```typescript

478

import { Langfuse } from 'langfuse';

479

480

const langfuse = new Langfuse();

481

482

// Step 1: Create a dataset

483

const dataset = await langfuse.createDataset({

484

name: 'customer-support-qa',

485

description: 'Customer support Q&A evaluation dataset',

486

metadata: { version: '1.0' }

487

});

488

489

// Step 2: Add items to the dataset

490

const items = [

491

{

492

question: 'How do I reset my password?',

493

expectedAnswer: 'You can reset your password by clicking the "Forgot Password" link on the login page.'

494

},

495

{

496

question: 'What are your business hours?',

497

expectedAnswer: 'We are open Monday-Friday, 9 AM to 5 PM EST.'

498

},

499

{

500

question: 'How do I cancel my subscription?',

501

expectedAnswer: 'You can cancel your subscription in the billing section of your account settings.'

502

}

503

];

504

505

for (const item of items) {

506

await langfuse.createDatasetItem({

507

datasetName: 'customer-support-qa',

508

input: { question: item.question },

509

expectedOutput: { answer: item.expectedAnswer },

510

metadata: { category: 'support' }

511

});

512

}

513

514

// Step 3: Run evaluation

515

const fetchedDataset = await langfuse.getDataset('customer-support-qa');

516

const runName = `eval-run-${Date.now()}`;

517

518

for (const item of fetchedDataset.items) {

519

// Create trace for this evaluation

520

const trace = langfuse.trace({

521

name: 'qa-evaluation',

522

input: item.input,

523

metadata: { runName }

524

});

525

526

// Get prompt

527

const prompt = await langfuse.getPrompt('support-qa-prompt', undefined, {

528

type: 'chat'

529

});

530

531

// Create generation

532

const messages = prompt.compile(

533

{ question: item.input.question },

534

{ history: [] }

535

);

536

537

const generation = trace.generation({

538

name: 'answer-generation',

539

prompt: prompt,

540

model: 'gpt-4',

541

input: messages

542

});

543

544

// Simulate LLM call

545

const response = await callLLM(messages);

546

547

generation.end({

548

output: { answer: response },

549

usage: { input: 50, output: 100, total: 150 }

550

});

551

552

// Update trace with output

553

trace.update({

554

output: { answer: response }

555

});

556

557

// Link to dataset run

558

await item.link(trace, runName, {

559

description: 'GPT-4 evaluation with support prompt',

560

metadata: {

561

model: 'gpt-4',

562

promptVersion: prompt.version

563

}

564

});

565

566

// Score the response against the expected output and attach the score to the trace

567

const score = calculateSimilarity(response, item.expectedOutput.answer);

568

trace.score({

569

name: 'similarity',

570

value: score,

571

dataType: 'NUMERIC',

572

comment: 'Semantic similarity to expected output'

573

});

574

}

575

576

// Flush all events

577

await langfuse.flushAsync();

578

579

// Step 4: Analyze results

580

const runs = await langfuse.getDatasetRuns('customer-support-qa');

581

console.log(`Total runs: ${runs.data.length}`);

582

583

const latestRun = await langfuse.getDatasetRun({

584

datasetName: 'customer-support-qa',

585

runName: runName

586

});

587

588

console.log('Latest run:', latestRun);

589

```

590

591

## Best Practices

592

593

### Dataset Organization

594

595

```typescript

596

// Organize datasets by use case

597

await langfuse.createDataset({

598

name: 'prod-samples-2024-01',

599

description: 'Production samples from January 2024',

600

metadata: {

601

source: 'production',

602

month: '2024-01',

603

sample_rate: 0.1

604

}

605

});

606

607

// Use metadata for categorization

608

await langfuse.createDatasetItem({

609

datasetName: 'prod-samples-2024-01',

610

input: { query: 'user question' },

611

expectedOutput: { response: 'expected response' },

612

metadata: {

613

category: 'technical',

614

difficulty: 'medium',

615

language: 'en'

616

}

617

});

618

```

619

620

### Evaluation Workflow

621

622

```typescript

623

// 1. Populate the dataset with items from production traces

624

const productionTraces = await langfuse.fetchTraces({

625

tags: ['production'],

626

fromTimestamp: '2024-01-01',

627

limit: 100

628

});

629

630

for (const trace of productionTraces.data) {

631

await langfuse.createDatasetItem({

632

datasetName: 'prod-golden-set',

633

input: trace.input,

634

expectedOutput: trace.output,

635

sourceTraceId: trace.id,

636

metadata: { userId: trace.userId }

637

});

638

}

639

640

// 2. Run experiments with different models

641

const models = ['gpt-4', 'gpt-3.5-turbo', 'claude-3-opus'];

642

const dataset = await langfuse.getDataset('prod-golden-set');

643

644

for (const model of models) {

645

for (const item of dataset.items) {

646

const trace = langfuse.trace({

647

name: 'model-comparison',

648

input: item.input,

649

metadata: { model }

650

});

651

652

const generation = trace.generation({

653

name: 'completion',

654

model: model,

655

input: item.input

656

});

657

658

const output = await generateWithModel(model, item.input);

659

660

generation.end({ output });

661

trace.update({ output });

662

663

await item.link(trace, `${model}-comparison`, {

664

metadata: { model }

665

});

666

}

667

}

668

669

await langfuse.flushAsync();

670

671

// 3. Compare results in the Langfuse UI or via the API

672

const gpt4Results = await langfuse.getDatasetRun({

673

datasetName: 'prod-golden-set',

674

runName: 'gpt-4-comparison'

675

});

676

677

const gpt35Results = await langfuse.getDatasetRun({

678

datasetName: 'prod-golden-set',

679

runName: 'gpt-3.5-turbo-comparison'

680

});

681

```

682