or run

npx @tessl/cli init
Log in

Version

Tile

Overview

Evals

Files

Files

docs

autoevals-adapter.md client.md datasets.md experiments.md index.md media.md prompts.md scores.md

scores.md docs/

0

# Score Management

1

2

The Score Management system provides comprehensive capabilities for creating, batching, and managing evaluation scores for traces and observations. It supports automatic batching, multiple data types, OpenTelemetry integration, and flexible scoring strategies with efficient queue management.

3

4

## Capabilities

5

6

### Create Score

7

8

Create a score event and add it to the processing queue for batched submission.

9

10

```typescript { .api }

11

/**

12

* Creates a new score event and adds it to the processing queue

13

*

14

* Scores are queued and sent in batches for efficiency. The score will be

15

* automatically sent when the queue reaches the flush threshold or after

16

* the flush interval expires.

17

*

18

* Batching behavior:

19

* - Automatic flush when queue reaches flushAtCount (default: 10, configurable via LANGFUSE_FLUSH_AT)

20

* - Time-based flush after flushIntervalSeconds (default: 1s, configurable via LANGFUSE_FLUSH_INTERVAL)

21

* - Maximum batch size: 100 scores per API call

22

* - Maximum queue size: 100,000 scores (prevents memory leaks)

23

*

24

* @param data - The score data to create

25

*/

26

create(data: ScoreBody): void;

27

28

interface ScoreBody {

29

/** Optional unique identifier for the score (auto-generated if not provided) */

30

id?: string;

31

32

/** Trace ID to associate the score with */

33

traceId?: string;

34

35

/** Session ID to associate the score with */

36

sessionId?: string;

37

38

/** Observation/span ID to associate the score with */

39

observationId?: string;

40

41

/** Dataset run ID for experiment scoring */

42

datasetRunId?: string;

43

44

/** Name of the score (e.g., "quality", "accuracy", "relevance") */

45

name: string;

46

47

/** Environment tag (defaults to LANGFUSE_TRACING_ENVIRONMENT) */

48

environment?: string;

49

50

/**

51

* The value of the score

52

* - Numeric scores: number (e.g., 0.85, 4.5)

53

* - Boolean scores: 1 or 0 (true or false)

54

* - Categorical scores: string (e.g., "excellent", "good", "poor")

55

*/

56

value: number | string;

57

58

/** Optional comment explaining the score */

59

comment?: string;

60

61

/** Optional metadata object with additional context */

62

metadata?: unknown;

63

64

/**

65

* Data type of the score

66

* When set, must match the score value's type

67

* If not set, will be inferred from the score value or config

68

*/

69

dataType?: "NUMERIC" | "BOOLEAN" | "CATEGORICAL";

70

71

/**

72

* Reference to a score config

73

* When set, the score name must equal the config name

74

* Scores must comply with the config's range and data type

75

* For categorical scores, the value must map to a config category

76

* Numeric scores might be constrained by the config's max and min values

77

*/

78

configId?: string;

79

}

80

```

81

82

**Usage Examples:**

83

84

```typescript

85

import { LangfuseClient } from '@langfuse/client';

86

87

const langfuse = new LangfuseClient();

88

89

// Basic numeric score

90

langfuse.score.create({

91

name: "quality",

92

value: 0.85,

93

traceId: "trace-123",

94

comment: "High quality response"

95

});

96

97

// Numeric score with explicit data type

98

langfuse.score.create({

99

name: "accuracy",

100

value: 0.92,

101

dataType: "NUMERIC",

102

traceId: "trace-456",

103

metadata: {

104

model: "gpt-4",

105

version: "1.0"

106

}

107

});

108

109

// Boolean score (1 = true, 0 = false)

110

langfuse.score.create({

111

name: "hallucination",

112

value: 0,

113

dataType: "BOOLEAN",

114

traceId: "trace-789",

115

comment: "No hallucinations detected"

116

});

117

118

// Categorical score

119

langfuse.score.create({

120

name: "sentiment",

121

value: "positive",

122

dataType: "CATEGORICAL",

123

traceId: "trace-abc",

124

observationId: "span-xyz"

125

});

126

127

// Score with custom ID and environment

128

langfuse.score.create({

129

id: "custom-score-id",

130

name: "user_satisfaction",

131

value: 4,

132

traceId: "trace-def",

133

environment: "production",

134

metadata: {

135

userId: "user-123",

136

timestamp: new Date().toISOString()

137

}

138

});

139

140

// Score with config reference

141

langfuse.score.create({

142

name: "correctness",

143

value: "partially correct",

144

dataType: "CATEGORICAL",

145

configId: "config-123",

146

traceId: "trace-ghi",

147

comment: "Answer was mostly correct but lacked details"

148

});

149

150

// Session-level score

151

langfuse.score.create({

152

name: "session_quality",

153

value: 0.78,

154

sessionId: "session-456"

155

});

156

157

// Dataset run score (for experiments)

158

langfuse.score.create({

159

name: "test_accuracy",

160

value: 0.95,

161

datasetRunId: "run-789"

162

});

163

164

// Complex metadata example

165

langfuse.score.create({

166

name: "performance",

167

value: 0.88,

168

traceId: "trace-jkl",

169

comment: "Strong performance across all metrics",

170

metadata: {

171

model: "gpt-4-turbo",

172

latency_ms: 1250,

173

token_count: 450,

174

cost_usd: 0.025,

175

evaluation_method: "llm-as-judge",

176

criteria: ["accuracy", "completeness", "clarity"]

177

}

178

});

179

```

180

181

### Score Observation

182

183

Create a score for a specific observation using its OpenTelemetry span.

184

185

```typescript { .api }

186

/**

187

* Creates a score for a specific observation using its OpenTelemetry span

188

*

189

* This method automatically extracts the trace ID and observation ID from

190

* the provided span context, eliminating the need to manually track IDs.

191

*

192

* @param observation - Object containing the OpenTelemetry span

193

* @param data - Score data (traceId and observationId are derived from the span context; sessionId and datasetRunId are omitted from the accepted type)

194

*/

195

observation(

196

observation: { otelSpan: Span },

197

data: Omit<ScoreBody, "traceId" | "sessionId" | "observationId" | "datasetRunId">

198

): void;

199

```

200

201

**Usage Examples:**

202

203

```typescript

204

import { LangfuseClient } from '@langfuse/client';

205

import { startObservation } from '@langfuse/tracing';

206

207

const langfuse = new LangfuseClient();

208

209

// Score an observation with OpenTelemetry integration

210

const span = startObservation({ name: "llm-call" });

211

212

// Perform operation

213

const result = await callLLM();

214

215

// Score the observation

216

langfuse.score.observation(

217

{ otelSpan: span },

218

{

219

name: "response_quality",

220

value: 0.92,

221

comment: "Excellent response quality"

222

}

223

);

224

225

span.end();

226

227

// Score with metadata

228

const analysisSpan = startObservation({ name: "document-analysis" });

229

230

const analysis = await analyzeDocument(document);

231

232

langfuse.score.observation(

233

{ otelSpan: analysisSpan },

234

{

235

name: "accuracy",

236

value: 0.87,

237

dataType: "NUMERIC",

238

metadata: {

239

documentLength: document.length,

240

processingTime: Date.now() - startTime,

241

model: "gpt-4"

242

}

243

}

244

);

245

246

analysisSpan.end();

247

248

// Boolean score for observation

249

const validationSpan = startObservation({ name: "validation" });

250

251

const isValid = await validateOutput(output);

252

253

langfuse.score.observation(

254

{ otelSpan: validationSpan },

255

{

256

name: "validation_passed",

257

value: isValid ? 1 : 0,

258

dataType: "BOOLEAN"

259

}

260

);

261

262

validationSpan.end();

263

264

// Categorical score for observation

265

const classificationSpan = startObservation({ name: "classify-intent" });

266

267

const intent = await classifyIntent(userMessage);

268

269

langfuse.score.observation(

270

{ otelSpan: classificationSpan },

271

{

272

name: "intent_category",

273

value: intent,

274

dataType: "CATEGORICAL",

275

metadata: {

276

confidence: 0.95,

277

alternatives: ["support", "sales"]

278

}

279

}

280

);

281

282

classificationSpan.end();

283

```

284

285

### Score Trace

286

287

Create a score for a trace using an OpenTelemetry span.

288

289

```typescript { .api }

290

/**

291

* Creates a score for a trace using an OpenTelemetry span

292

*

293

* This method automatically extracts the trace ID from the provided

294

* span context and creates a trace-level score (not observation-specific).

295

*

296

* @param observation - Object containing the OpenTelemetry span

297

* @param data - Score data (traceId is derived from the span context; observationId is left unset so the score applies to the whole trace)

298

*/

299

trace(

300

observation: { otelSpan: Span },

301

data: Omit<ScoreBody, "traceId" | "sessionId" | "observationId" | "datasetRunId">

302

): void;

303

```

304

305

**Usage Examples:**

306

307

```typescript

308

import { LangfuseClient } from '@langfuse/client';

309

import { startObservation } from '@langfuse/tracing';

310

311

const langfuse = new LangfuseClient();

312

313

// Score a trace (trace-level evaluation)

314

const span = startObservation({ name: "user-query-pipeline" });

315

316

// Execute multi-step pipeline

317

const retrieval = await retrieveContext(query);

318

const generation = await generateResponse(query, retrieval);

319

const validation = await validateResponse(generation);

320

321

// Score the entire trace (not just one observation)

322

langfuse.score.trace(

323

{ otelSpan: span },

324

{

325

name: "overall_quality",

326

value: 0.88,

327

comment: "Good overall quality with minor improvements needed"

328

}

329

);

330

331

span.end();

332

333

// Multiple trace-level scores

334

const workflowSpan = startObservation({ name: "customer-support-workflow" });

335

336

const conversation = await handleConversation(user);

337

338

// Score different aspects of the trace

339

langfuse.score.trace(

340

{ otelSpan: workflowSpan },

341

{

342

name: "user_satisfaction",

343

value: 4,

344

dataType: "NUMERIC",

345

comment: "User rated 4 out of 5 stars"

346

}

347

);

348

349

langfuse.score.trace(

350

{ otelSpan: workflowSpan },

351

{

352

name: "issue_resolved",

353

value: 1,

354

dataType: "BOOLEAN"

355

}

356

);

357

358

langfuse.score.trace(

359

{ otelSpan: workflowSpan },

360

{

361

name: "conversation_tone",

362

value: "professional",

363

dataType: "CATEGORICAL"

364

}

365

);

366

367

workflowSpan.end();

368

369

// Trace score with experiment metadata

370

const experimentSpan = startObservation({ name: "prompt-variant-test" });

371

372

const response = await testPromptVariant(input, variantConfig);

373

374

langfuse.score.trace(

375

{ otelSpan: experimentSpan },

376

{

377

name: "variant_performance",

378

value: 0.91,

379

metadata: {

380

variantId: "v2",

381

temperature: 0.7,

382

model: "gpt-4",

383

comparison_baseline: 0.85

384

}

385

}

386

);

387

388

experimentSpan.end();

389

```

390

391

### Score Active Observation

392

393

Create a score for the currently active observation in the OpenTelemetry context.

394

395

```typescript { .api }

396

/**

397

* Creates a score for the currently active observation

398

*

399

* This method automatically detects the active OpenTelemetry span and

400

* creates an observation-level score. If no active span is found,

401

* a warning is logged and the operation is skipped.

402

*

403

* This is useful when you don't have direct access to the span object

404

* but are within an active span context.

405

*

406

* @param data - Score data (traceId and observationId are derived from the active span; sessionId and datasetRunId are omitted from the accepted type)

407

*/

408

activeObservation(

409

data: Omit<ScoreBody, "traceId" | "sessionId" | "observationId" | "datasetRunId">

410

): void;

411

```

412

413

**Usage Examples:**

414

415

```typescript

416

import { LangfuseClient } from '@langfuse/client';

417

import { startActiveSpan } from '@langfuse/tracing';

418

419

const langfuse = new LangfuseClient();

420

421

// Score within an active span context

422

startActiveSpan({ name: "process-document" }, async (span) => {

423

const result = await processDocument(document);

424

425

// Score the active observation (no need to pass span)

426

langfuse.score.activeObservation({

427

name: "processing_quality",

428

value: 0.9,

429

comment: "High quality processing"

430

});

431

432

span.end();

433

});

434

435

// Nested spans with active scoring

436

startActiveSpan({ name: "parent-operation" }, async (parentSpan) => {

437

438

startActiveSpan({ name: "child-operation" }, async (childSpan) => {

439

// This scores the child-operation (currently active span)

440

langfuse.score.activeObservation({

441

name: "child_accuracy",

442

value: 0.95

443

});

444

445

childSpan.end();

446

});

447

448

// This scores the parent-operation (now active again)

449

langfuse.score.activeObservation({

450

name: "parent_completeness",

451

value: 0.88

452

});

453

454

parentSpan.end();

455

});

456

457

// Use in middleware or callbacks

458

async function evaluateResponse(response: string) {

459

// Assumes this is called within an active span context

460

const quality = await assessQuality(response);

461

462

langfuse.score.activeObservation({

463

name: "response_quality",

464

value: quality.score,

465

metadata: {

466

metrics: quality.metrics,

467

evaluator: "llm-judge"

468

}

469

});

470

}

471

472

// In an async context manager

473

async function withScoring<T>(

474

operation: () => Promise<T>,

475

scoreName: string

476

): Promise<T> {

477

return startActiveSpan({ name: "scored-operation" }, async (span) => {

478

try {

479

const result = await operation();

480

481

langfuse.score.activeObservation({

482

name: scoreName,

483

value: 1,

484

dataType: "BOOLEAN",

485

comment: "Operation completed successfully"

486

});

487

488

return result;

489

} catch (error) {

490

langfuse.score.activeObservation({

491

name: scoreName,

492

value: 0,

493

dataType: "BOOLEAN",

494

comment: `Operation failed: ${error.message}`

495

});

496

497

throw error;

498

} finally {

499

span.end();

500

}

501

});

502

}

503

504

// Usage with error handling

505

startActiveSpan({ name: "risky-operation" }, async (span) => {

506

try {

507

await performRiskyOperation();

508

509

langfuse.score.activeObservation({

510

name: "success",

511

value: 1,

512

dataType: "BOOLEAN"

513

});

514

} catch (error) {

515

langfuse.score.activeObservation({

516

name: "success",

517

value: 0,

518

dataType: "BOOLEAN",

519

comment: error.message

520

});

521

} finally {

522

span.end();

523

}

524

});

525

```

526

527

### Score Active Trace

528

529

Create a score for the currently active trace in the OpenTelemetry context.

530

531

```typescript { .api }

532

/**

533

* Creates a score for the currently active trace

534

*

535

* This method automatically detects the active OpenTelemetry span and

536

* creates a trace-level score. If no active span is found,

537

* a warning is logged and the operation is skipped.

538

*

539

* This is useful for scoring the entire trace from within any nested

540

* span context without needing to track the root span.

541

*

542

* @param data - Score data (traceId is derived from the active span; observationId is left unset so the score applies to the whole trace)

543

*/

544

activeTrace(

545

data: Omit<ScoreBody, "traceId" | "sessionId" | "observationId" | "datasetRunId">

546

): void;

547

```

548

549

**Usage Examples:**

550

551

```typescript

552

import { LangfuseClient } from '@langfuse/client';

553

import { startActiveSpan } from '@langfuse/tracing';

554

555

const langfuse = new LangfuseClient();

556

557

// Score trace from within any span

558

startActiveSpan({ name: "main-workflow" }, async (span) => {

559

await step1();

560

await step2();

561

await step3();

562

563

// Score the entire trace (not just this span)

564

langfuse.score.activeTrace({

565

name: "workflow_success",

566

value: 1,

567

dataType: "BOOLEAN"

568

});

569

570

span.end();

571

});

572

573

// Score trace from nested operations

574

startActiveSpan({ name: "parent" }, async (parentSpan) => {

575

576

startActiveSpan({ name: "child" }, async (childSpan) => {

577

// Score the entire trace from within child span

578

langfuse.score.activeTrace({

579

name: "overall_quality",

580

value: 0.92,

581

comment: "Excellent overall execution"

582

});

583

584

childSpan.end();

585

});

586

587

parentSpan.end();

588

});

589

590

// User feedback collection

591

async function collectUserFeedback(userId: string, rating: number) {

592

// Assumes called within an active trace context

593

langfuse.score.activeTrace({

594

name: "user_satisfaction",

595

value: rating,

596

dataType: "NUMERIC",

597

metadata: {

598

userId,

599

timestamp: new Date().toISOString(),

600

source: "in-app-feedback"

601

}

602

});

603

}

604

605

// Post-execution trace evaluation

606

startActiveSpan({ name: "ai-assistant-conversation" }, async (span) => {

607

const conversation = await handleUserConversation(user);

608

609

// Evaluate entire conversation

610

const evaluation = await evaluateConversation(conversation);

611

612

// Score the trace based on evaluation

613

langfuse.score.activeTrace({

614

name: "conversation_quality",

615

value: evaluation.overallScore,

616

comment: evaluation.feedback,

617

metadata: {

618

metrics: evaluation.metrics,

619

duration: conversation.duration,

620

turns: conversation.turns.length

621

}

622

});

623

624

span.end();

625

});

626

627

// Multi-criteria trace scoring

628

startActiveSpan({ name: "document-processing-pipeline" }, async (span) => {

629

const result = await processPipeline(document);

630

631

// Score multiple aspects of the trace

632

langfuse.score.activeTrace({

633

name: "accuracy",

634

value: result.accuracy,

635

dataType: "NUMERIC"

636

});

637

638

langfuse.score.activeTrace({

639

name: "completeness",

640

value: result.isComplete ? 1 : 0,

641

dataType: "BOOLEAN"

642

});

643

644

langfuse.score.activeTrace({

645

name: "quality_tier",

646

value: result.qualityTier,

647

dataType: "CATEGORICAL"

648

});

649

650

span.end();

651

});

652

```

653

654

### Flush

655

656

Flush all pending score events to the Langfuse API immediately.

657

658

```typescript { .api }

659

/**

660

* Flushes all pending score events to the Langfuse API

661

*

662

* This method ensures all queued scores are sent immediately rather than

663

* waiting for the automatic flush interval or batch size threshold.

664

*

665

* Batching behavior during flush:

666

* - Scores are sent in batches of up to 100

667

* - Multiple batches are sent concurrently

668

* - All batches must complete before flush resolves

669

*

670

* @returns Promise that resolves when all pending scores have been sent

671

*/

672

flush(): Promise<void>;

673

```

674

675

**Usage Examples:**

676

677

```typescript

678

import { LangfuseClient } from '@langfuse/client';

679

680

const langfuse = new LangfuseClient();

681

682

// Manual flush after creating scores

683

langfuse.score.create({

684

name: "quality",

685

value: 0.8,

686

traceId: "trace-123"

687

});

688

689

langfuse.score.create({

690

name: "accuracy",

691

value: 0.9,

692

traceId: "trace-123"

693

});

694

695

// Ensure scores are sent immediately

696

await langfuse.score.flush();

697

698

// Flush before critical operations

699

async function processWithScoring(data: any) {

700

langfuse.score.create({

701

name: "preprocessing",

702

value: 1,

703

traceId: data.traceId

704

});

705

706

// Ensure score is sent before proceeding

707

await langfuse.score.flush();

708

709

return await criticalOperation(data);

710

}

711

712

// Flush in testing

713

describe("scoring tests", () => {

714

afterEach(async () => {

715

// Ensure all scores are sent after each test

716

await langfuse.score.flush();

717

});

718

719

it("should score correctly", async () => {

720

langfuse.score.create({ name: "test", value: 1 });

721

await langfuse.score.flush();

722

// Verify score was sent

723

});

724

});

725

726

// Flush with error handling

727

async function safeFlush() {

728

try {

729

await langfuse.score.flush();

730

console.log("Scores flushed successfully");

731

} catch (error) {

732

console.error("Failed to flush scores:", error);

733

// Scores remain in queue and will retry on next flush

734

}

735

}

736

737

// Periodic flushing in long-running processes

738

setInterval(async () => {

739

await langfuse.score.flush();

740

}, 60000); // Flush every minute

741

742

// Flush before application exit

743

process.on("SIGTERM", async () => {

744

console.log("Flushing scores before shutdown...");

745

await langfuse.score.flush();

746

process.exit(0);

747

});

748

749

// Flush in batch processing

750

async function processBatch(items: any[]) {

751

for (const item of items) {

752

await processItem(item);

753

754

langfuse.score.create({

755

name: "item_processed",

756

value: 1,

757

metadata: { itemId: item.id }

758

});

759

}

760

761

// Flush after batch completion

762

await langfuse.score.flush();

763

}

764

```

765

766

### Shutdown

767

768

Gracefully shutdown the score manager by flushing all pending scores.

769

770

```typescript { .api }

771

/**

772

* Gracefully shuts down the score manager by flushing all pending scores

773

*

774

* This method should be called before your application exits to ensure

775

* all score data is sent to Langfuse. It internally calls flush() and

776

* waits for completion.

777

*

778

* @returns Promise that resolves when shutdown is complete

779

*/

780

shutdown(): Promise<void>;

781

```

782

783

**Usage Examples:**

784

785

```typescript

786

import { LangfuseClient } from '@langfuse/client';

787

788

const langfuse = new LangfuseClient();

789

790

// Graceful shutdown before exit

791

async function gracefulShutdown() {

792

console.log("Shutting down...");

793

794

// Flush all pending scores

795

await langfuse.score.shutdown();

796

797

console.log("Shutdown complete");

798

process.exit(0);

799

}

800

801

// Handle process signals

802

process.on("SIGTERM", gracefulShutdown);

803

process.on("SIGINT", gracefulShutdown);

804

805

// Shutdown in application cleanup

806

async function cleanupApplication() {

807

// Close database connections

808

await db.close();

809

810

// Flush scores before exit

811

await langfuse.score.shutdown();

812

813

// Close other resources

814

await cache.disconnect();

815

}

816

817

// Shutdown with timeout

818

async function shutdownWithTimeout(timeoutMs: number = 5000) {

819

const timeout = new Promise((_, reject) =>

820

setTimeout(() => reject(new Error("Shutdown timeout")), timeoutMs)

821

);

822

823

try {

824

await Promise.race([

825

langfuse.score.shutdown(),

826

timeout

827

]);

828

console.log("Score manager shutdown successfully");

829

} catch (error) {

830

console.error("Shutdown error:", error);

831

// Force exit if timeout

832

}

833

}

834

835

// Shutdown in tests

836

afterAll(async () => {

837

await langfuse.score.shutdown();

838

});

839

840

// Shutdown in serverless functions

841

export async function handler(event: any) {

842

try {

843

// Process request and create scores

844

langfuse.score.create({

845

name: "request_handled",

846

value: 1

847

});

848

849

return { statusCode: 200, body: "Success" };

850

} finally {

851

// Ensure scores are sent before function terminates

852

await langfuse.score.shutdown();

853

}

854

}

855

856

// Shutdown with error handling

857

async function safeShutdown() {

858

try {

859

await langfuse.score.shutdown();

860

console.log("Scores flushed successfully");

861

} catch (error) {

862

console.error("Error during shutdown:", error);

863

// Log error but continue shutdown

864

}

865

}

866

867

// Shutdown in Docker container

868

process.on("SIGTERM", async () => {

869

console.log("SIGTERM received, starting graceful shutdown");

870

871

// Stop accepting new requests

872

server.close();

873

874

// Flush pending scores

875

await langfuse.score.shutdown();

876

877

console.log("Graceful shutdown complete");

878

process.exit(0);

879

});

880

```

881

882

## Type Definitions

883

884

### ScoreDataType

885

886

Enumeration of supported score data types.

887

888

```typescript { .api }

889

/**

890

* Score data types supported by Langfuse

891

*/

892

type ScoreDataType = "NUMERIC" | "BOOLEAN" | "CATEGORICAL";

893

894

// Constants for convenience

895

const ScoreDataType = {

896

Numeric: "NUMERIC",

897

Boolean: "BOOLEAN",

898

Categorical: "CATEGORICAL",

899

} as const;

900

```

901

902

**Data Type Details:**

903

904

- **NUMERIC**: Numerical values (integers or floats)

905

- Examples: 0.85, 4.5, -0.3, 100

906

- Use cases: Quality scores, ratings, metrics, percentages

907

- Can be constrained by config min/max values

908

909

- **BOOLEAN**: Binary values represented as 1 (true) or 0 (false)

910

- Values: 1 or 0 only

911

- Use cases: Pass/fail checks, validation results, binary classifications

912

- Useful for yes/no evaluations

913

914

- **CATEGORICAL**: String labels for classification

915

- Examples: "excellent", "good", "poor", "positive", "neutral", "negative"

916

- Use cases: Quality tiers, sentiment labels, classification results

917

- Must map to config categories when using configId

918

919

**Usage Examples:**

920

921

```typescript

922

import { ScoreDataType } from '@langfuse/core';

923

924

// Numeric score

925

langfuse.score.create({

926

name: "quality_score",

927

value: 0.87,

928

dataType: ScoreDataType.Numeric,

929

traceId: "trace-123"

930

});

931

932

// Boolean score

933

langfuse.score.create({

934

name: "validation_passed",

935

value: 1,

936

dataType: ScoreDataType.Boolean,

937

traceId: "trace-456"

938

});

939

940

// Categorical score

941

langfuse.score.create({

942

name: "sentiment",

943

value: "positive",

944

dataType: ScoreDataType.Categorical,

945

traceId: "trace-789"

946

});

947

948

// Type inference (dataType can be omitted)

949

langfuse.score.create({

950

name: "auto_numeric",

951

value: 0.5, // Inferred as NUMERIC

952

traceId: "trace-abc"

953

});

954

955

langfuse.score.create({

956

name: "auto_categorical",

957

value: "excellent", // Inferred as CATEGORICAL

958

traceId: "trace-def"

959

});

960

961

// Use with constants

962

const SCORE_TYPES = {

963

QUALITY: { name: "quality", dataType: ScoreDataType.Numeric },

964

VALID: { name: "is_valid", dataType: ScoreDataType.Boolean },

965

TIER: { name: "quality_tier", dataType: ScoreDataType.Categorical }

966

};

967

968

langfuse.score.create({

969

...SCORE_TYPES.QUALITY,

970

value: 0.92,

971

traceId: "trace-ghi"

972

});

973

```

974

975

### CreateScoreValue

976

977

Union type for score values supporting both numeric and string types.

978

979

```typescript { .api }

980

/**

981

* The value of the score

982

* - Numeric for NUMERIC and BOOLEAN data types

983

* - String for CATEGORICAL data type

984

*/

985

type CreateScoreValue = number | string;

986

```

987

988

**Usage Examples:**

989

990

```typescript

991

// Numeric values

992

const numericValue: CreateScoreValue = 0.85;

993

const integerValue: CreateScoreValue = 5;

994

const negativeValue: CreateScoreValue = -0.2;

995

996

// String values

997

const categoricalValue: CreateScoreValue = "excellent";

998

const sentimentValue: CreateScoreValue = "positive";

999

1000

// Type-safe score creation

1001

function createTypedScore(

1002

name: string,

1003

value: CreateScoreValue,

1004

traceId: string

1005

) {

1006

langfuse.score.create({ name, value, traceId });

1007

}

1008

1009

createTypedScore("quality", 0.9, "trace-123");

1010

createTypedScore("sentiment", "positive", "trace-456");

1011

```

1012

1013

## Batching and Flush Behavior

1014

1015

The Score Manager implements efficient batching to optimize API usage and performance.

1016

1017

### Batch Configuration

1018

1019

Configure batching behavior via environment variables or use defaults:

1020

1021

```typescript

1022

// Environment variables

1023

LANGFUSE_FLUSH_AT=10 // Flush after this many scores (default: 10)

1024

LANGFUSE_FLUSH_INTERVAL=1 // Flush after this many seconds (default: 1)

1025

```

1026

1027

**Configuration Examples:**

1028

1029

```bash

1030

# Development: Frequent flushing for immediate feedback

1031

LANGFUSE_FLUSH_AT=5

1032

LANGFUSE_FLUSH_INTERVAL=0.5

1033

1034

# Production: Larger batches for efficiency

1035

LANGFUSE_FLUSH_AT=50

1036

LANGFUSE_FLUSH_INTERVAL=5

1037

1038

# Testing: Immediate flushing

1039

LANGFUSE_FLUSH_AT=1

1040

LANGFUSE_FLUSH_INTERVAL=0.1

1041

1042

# High-throughput: Maximum batching

1043

LANGFUSE_FLUSH_AT=100

1044

LANGFUSE_FLUSH_INTERVAL=10

1045

```

1046

1047

### Batch Constants

1048

1049

```typescript

1050

const MAX_BATCH_SIZE = 100; // Maximum scores per API call

1051

const MAX_QUEUE_SIZE = 100_000; // Maximum queue size (prevents memory leaks)

1052

```

1053

1054

### Automatic Flushing

Scores are automatically flushed when:

1. **Count threshold reached**: Queue contains `flushAtCount` scores
2. **Time interval elapsed**: `flushIntervalSeconds` have passed since first queued score
3. **Manual flush**: `flush()` or `shutdown()` is called

**Batching Examples:**

```typescript
// Automatic flush by count (LANGFUSE_FLUSH_AT=10)
for (let i = 0; i < 15; i++) {
  langfuse.score.create({
    name: `score-${i}`,
    value: i * 0.1
  });
}
// First 10 scores flushed automatically
// Remaining 5 scores wait for timer or manual flush

// Automatic flush by timer (LANGFUSE_FLUSH_INTERVAL=1)
langfuse.score.create({ name: "score1", value: 0.8 });
// Score queued, timer starts
// After 1 second, score is automatically flushed

// Large batch handling (150 scores)
for (let i = 0; i < 150; i++) {
  langfuse.score.create({
    name: `batch-score-${i}`,
    value: i * 0.01
  });
}
await langfuse.score.flush();
// Sent as 2 batches: 100 + 50 (respects MAX_BATCH_SIZE)

// Queue overflow protection
for (let i = 0; i < 100_001; i++) {
  langfuse.score.create({
    name: `overflow-${i}`,
    value: 1
  });
}
// Score #100,001 is dropped with error log
// Prevents memory exhaustion
```

### Flush Timer Management

```typescript
// Timer is created when first score is added to empty queue
langfuse.score.create({ name: "first", value: 1 });
// Timer starts

// Subsequent scores don't create new timers
langfuse.score.create({ name: "second", value: 2 });
langfuse.score.create({ name: "third", value: 3 });
// Same timer continues

// Timer is cleared when flush completes
await langfuse.score.flush();
// Timer cleared, queue empty

// New score starts new timer
langfuse.score.create({ name: "fourth", value: 4 });
// New timer starts
```

### Concurrent Flush Handling

```typescript
// Multiple concurrent flush calls are deduplicated
langfuse.score.create({ name: "test", value: 1 });

const flush1 = langfuse.score.flush();
const flush2 = langfuse.score.flush();
const flush3 = langfuse.score.flush();

await Promise.all([flush1, flush2, flush3]);
// Only one actual API call is made
// All promises resolve when flush completes
```
## Advanced Usage

### Experiment Integration

Use scores within experiments for automated evaluation.

```typescript
import { LangfuseClient } from '@langfuse/client';
import { startObservation } from '@langfuse/tracing';

const langfuse = new LangfuseClient();

// Define experiment with scoring
const result = await langfuse.experiment.run({
  name: "prompt-optimization",
  data: dataset.items,
  task: async (item) => {
    const span = startObservation({ name: "task" });

    const output = await runModel(item.input);

    // Score the observation
    langfuse.score.observation(
      { otelSpan: span },
      {
        name: "task_quality",
        value: await evaluateQuality(output),
        dataType: "NUMERIC"
      }
    );

    span.end();
    return output;
  },
  evaluators: [
    async ({ output, expectedOutput }) => {
      // Return evaluation scores
      return {
        name: "accuracy",
        value: calculateAccuracy(output, expectedOutput),
        dataType: "NUMERIC"
      };
    }
  ]
});

// Scores are automatically associated with dataset run
await langfuse.score.flush();
```
### Multi-Criteria Scoring

Score multiple aspects of a single operation.

```typescript
import { startObservation } from '@langfuse/tracing';

const span = startObservation({ name: "llm-generation" });

const response = await generateResponse(prompt);

// Score multiple criteria
langfuse.score.observation(
  { otelSpan: span },
  {
    name: "accuracy",
    value: 0.92,
    dataType: "NUMERIC"
  }
);

langfuse.score.observation(
  { otelSpan: span },
  {
    name: "relevance",
    value: 0.88,
    dataType: "NUMERIC"
  }
);

langfuse.score.observation(
  { otelSpan: span },
  {
    name: "completeness",
    value: 1,
    dataType: "BOOLEAN"
  }
);

langfuse.score.observation(
  { otelSpan: span },
  {
    name: "tone",
    value: "professional",
    dataType: "CATEGORICAL"
  }
);

span.end();
```
### Conditional Scoring

Apply scores based on runtime conditions.

```typescript
import { startObservation } from '@langfuse/tracing';

const span = startObservation({ name: "conditional-scoring" });

const result = await processRequest(request);

// Conditional scoring based on result
if (result.needsReview) {
  langfuse.score.observation(
    { otelSpan: span },
    {
      name: "requires_human_review",
      value: 1,
      dataType: "BOOLEAN",
      comment: "Flagged for manual review"
    }
  );
}

if (result.confidence < 0.7) {
  langfuse.score.observation(
    { otelSpan: span },
    {
      name: "low_confidence",
      value: result.confidence,
      dataType: "NUMERIC",
      comment: `Confidence below threshold: ${result.confidence}`
    }
  );
}

// Quality tier scoring
const tier = result.score > 0.9 ? "excellent" :
             result.score > 0.7 ? "good" :
             result.score > 0.5 ? "fair" : "poor";

langfuse.score.observation(
  { otelSpan: span },
  {
    name: "quality_tier",
    value: tier,
    dataType: "CATEGORICAL",
    metadata: { rawScore: result.score }
  }
);

span.end();
```
### Score Config Integration

Use score configs to enforce constraints and standards.

```typescript
// Create score with config reference
langfuse.score.create({
  name: "quality",
  value: 0.85,
  dataType: "NUMERIC",
  configId: "quality-config-v1",
  traceId: "trace-123"
});
// Score must comply with config's min/max values

// Categorical score with config
langfuse.score.create({
  name: "sentiment",
  value: "positive",
  dataType: "CATEGORICAL",
  configId: "sentiment-config",
  traceId: "trace-456"
});
// Value must match one of the config's categories

// Boolean score with config
langfuse.score.create({
  name: "passes_safety_check",
  value: 1,
  dataType: "BOOLEAN",
  configId: "safety-check-config",
  traceId: "trace-789"
});
// Ensures consistent naming and interpretation
```
### Async Scoring Patterns

Handle scoring in asynchronous workflows.

```typescript
// Deferred scoring after async evaluation
async function scoreAfterEvaluation(traceId: string, output: string) {
  // Trigger async evaluation (doesn't block)
  const evaluationPromise = evaluateWithExternalService(output);

  // Continue processing
  await continueWorkflow();

  // Wait for evaluation and score
  const evaluation = await evaluationPromise;

  langfuse.score.create({
    name: "external_evaluation",
    value: evaluation.score,
    traceId,
    metadata: { evaluator: "external-service" }
  });
}

// Background scoring worker
const scoringQueue: Array<() => Promise<void>> = [];

function queueScoring(fn: () => Promise<void>) {
  scoringQueue.push(fn);
}

async function processScoringQueue() {
  while (scoringQueue.length > 0) {
    const scoreFn = scoringQueue.shift();
    try {
      await scoreFn?.();
    } catch (error) {
      console.error("Scoring error:", error);
    }
  }
}

// Queue scores for later processing
queueScoring(async () => {
  langfuse.score.create({
    name: "delayed_score",
    value: 0.9,
    traceId: "trace-123"
  });
});

// Process queue periodically
setInterval(processScoringQueue, 5000);
```
### Error Handling

Handle errors gracefully during scoring operations.

```typescript
// Safe scoring wrapper
function safeScore(scoreData: ScoreBody) {
  try {
    langfuse.score.create(scoreData);
  } catch (error) {
    console.error("Failed to create score:", error);
    // Log to error tracking service
    errorTracker.capture(error, { context: "scoring" });
  }
}

// Retry logic for critical scores
async function scoreWithRetry(
  scoreData: ScoreBody,
  maxRetries: number = 3
) {
  for (let attempt = 1; attempt <= maxRetries; attempt++) {
    try {
      langfuse.score.create(scoreData);
      await langfuse.score.flush();
      return; // Success
    } catch (error) {
      console.error(`Score attempt ${attempt} failed:`, error);

      if (attempt === maxRetries) {
        // Final failure - log and continue
        console.error("Score permanently failed after retries");
      } else {
        // Wait before retry
        await new Promise(resolve =>
          setTimeout(resolve, 1000 * attempt)
        );
      }
    }
  }
}

// Graceful degradation
async function scoreWithFallback(
  primary: ScoreBody,
  fallback: ScoreBody
) {
  try {
    langfuse.score.create(primary);
    await langfuse.score.flush();
  } catch (error) {
    console.warn("Primary score failed, using fallback");
    langfuse.score.create(fallback);
  }
}
```
## Best Practices

### 1. Choose Appropriate Data Types

```typescript
// Use NUMERIC for continuous values
langfuse.score.create({
  name: "confidence",
  value: 0.87,
  dataType: "NUMERIC"
});

// Use BOOLEAN for binary decisions
langfuse.score.create({
  name: "approved",
  value: 1,
  dataType: "BOOLEAN"
});

// Use CATEGORICAL for discrete labels
langfuse.score.create({
  name: "quality_tier",
  value: "premium",
  dataType: "CATEGORICAL"
});
```

### 2. Provide Meaningful Comments

```typescript
// Add context to scores
langfuse.score.create({
  name: "quality",
  value: 0.65,
  comment: "Below target due to missing context in retrieval",
  metadata: {
    target: 0.8,
    reason: "insufficient_context"
  }
});
```

### 3. Use Metadata Effectively

```typescript
// Rich metadata for debugging and analysis
langfuse.score.create({
  name: "response_quality",
  value: 0.9,
  metadata: {
    model: "gpt-4",
    temperature: 0.7,
    promptVersion: "v2.1",
    tokenCount: 450,
    latency: 1250,
    evaluator: "llm-as-judge",
    criteria: ["accuracy", "completeness", "clarity"]
  }
});
```

### 4. Flush Appropriately

```typescript
// Flush before critical operations
await langfuse.score.flush();

// Always flush on shutdown
process.on("SIGTERM", async () => {
  await langfuse.score.shutdown();
});

// Don't flush after every score (defeats batching)
// ❌ Bad
langfuse.score.create({ name: "score", value: 1 });
await langfuse.score.flush(); // Too frequent

// ✅ Good
langfuse.score.create({ name: "score1", value: 1 });
langfuse.score.create({ name: "score2", value: 2 });
langfuse.score.create({ name: "score3", value: 3 });
await langfuse.score.flush(); // Batch flush
```

### 5. Use Active Context Methods

```typescript
// Prefer active context methods when possible
startActiveSpan({ name: "operation" }, async (span) => {
  // Cleaner than passing span around
  langfuse.score.activeObservation({
    name: "quality",
    value: 0.9
  });

  span.end();
});
```

### 6. Configure for Your Environment

```bash
# Development
LANGFUSE_FLUSH_AT=5
LANGFUSE_FLUSH_INTERVAL=0.5

# Production
LANGFUSE_FLUSH_AT=50
LANGFUSE_FLUSH_INTERVAL=5

# Testing
LANGFUSE_FLUSH_AT=1
LANGFUSE_FLUSH_INTERVAL=0.1
```

### 7. Handle Missing Context

```typescript
// Check for active span before scoring
import { trace } from "@opentelemetry/api";

const activeSpan = trace.getActiveSpan();
if (activeSpan) {
  langfuse.score.activeObservation({
    name: "quality",
    value: 0.9
  });
} else {
  console.warn("No active span, skipping score");
}
```
## Performance Considerations

### Batching Efficiency

- Default batch size of 10 balances latency and efficiency
- Increase `LANGFUSE_FLUSH_AT` for high-throughput scenarios
- Decrease for real-time feedback requirements

### Memory Management

- Queue capped at 100,000 scores to prevent memory leaks
- Scores are removed from queue after successful flush
- Consider manual flushing in long-running processes

### Network Optimization

- Batching reduces API calls by up to 100x
- Concurrent batch uploads for large flushes
- Failed batches don't block other batches

### Best Performance Configuration

```bash
# High-throughput production
LANGFUSE_FLUSH_AT=100
LANGFUSE_FLUSH_INTERVAL=10

# Real-time feedback
LANGFUSE_FLUSH_AT=5
LANGFUSE_FLUSH_INTERVAL=1

# Balanced (default)
LANGFUSE_FLUSH_AT=10
LANGFUSE_FLUSH_INTERVAL=1
```
## Migration Examples

### From Manual Score Tracking

**Before:**

```typescript
const scores = [];

function recordScore(name: string, value: number) {
  scores.push({ name, value, timestamp: Date.now() });
}

async function sendScores() {
  await fetch("/api/scores", {
    method: "POST",
    body: JSON.stringify(scores)
  });
  scores.length = 0;
}
```

**After:**

```typescript
langfuse.score.create({
  name: "quality",
  value: 0.9,
  traceId: "trace-123"
});

// Automatic batching and flushing
await langfuse.score.flush();
```
### From Synchronous Scoring

**Before:**

```typescript
async function scoreOperation(result: any) {
  const score = calculateScore(result);

  // Blocking API call
  await sendScoreToAPI({
    name: "quality",
    value: score
  });
}
```

**After:**

```typescript
function scoreOperation(result: any) {
  const score = calculateScore(result);

  // Non-blocking, queued for batch send
  langfuse.score.create({
    name: "quality",
    value: score,
    traceId: result.traceId
  });
}
```
## TypeScript Support

Full type safety for all scoring operations.

```typescript
import type { ScoreBody, ScoreDataType } from '@langfuse/core';

// Type-safe score creation
const scoreData: ScoreBody = {
  name: "quality",
  value: 0.85,
  dataType: "NUMERIC",
  traceId: "trace-123"
};

langfuse.score.create(scoreData);

// Generic scoring function
function createTypedScore<T extends ScoreBody>(data: T): void {
  langfuse.score.create(data);
}

// Type guards
function isNumericScore(value: number | string): value is number {
  return typeof value === "number";
}

function createScore(name: string, value: number | string) {
  langfuse.score.create({
    name,
    value,
    dataType: isNumericScore(value) ? "NUMERIC" : "CATEGORICAL"
  });
}
```