or run

npx @tessl/cli init
Log in

Version

Tile

Overview

Evals

Files

Files

docs

autoevals-adapter.md client.md datasets.md experiments.md index.md media.md prompts.md scores.md

scores.md docs/

0

# Score Management

1

2

The Score Management system provides comprehensive capabilities for creating, batching, and managing evaluation scores for traces and observations. It supports automatic batching, multiple data types, OpenTelemetry integration, and flexible scoring strategies with efficient queue management.

3

4

## Capabilities

5

6

### Create Score

7

8

Create a score event and add it to the processing queue for batched submission.

9

10

```typescript { .api }

11

/**

12

* Creates a new score event and adds it to the processing queue

13

*

14

* Scores are queued and sent in batches for efficiency. The score will be

15

* automatically sent when the queue reaches the flush threshold or after

16

* the flush interval expires.

17

*

18

* Batching behavior:

19

* - Automatic flush when queue reaches flushAtCount (default: 10, configurable via LANGFUSE_FLUSH_AT)

20

* - Time-based flush after flushIntervalSeconds (default: 1s, configurable via LANGFUSE_FLUSH_INTERVAL)

21

* - Maximum batch size: 100 scores per API call

22

* - Maximum queue size: 100,000 scores (prevents memory leaks)

23

*

24

* @param data - The score data to create

25

*/

26

create(data: ScoreBody): void;

27

28

interface ScoreBody {

29

/** Optional unique identifier for the score (auto-generated if not provided) */

30

id?: string;

31

32

/** Trace ID to associate the score with */

33

traceId?: string;

34

35

/** Session ID to associate the score with */

36

sessionId?: string;

37

38

/** Observation/span ID to associate the score with */

39

observationId?: string;

40

41

/** Dataset run ID for experiment scoring */

42

datasetRunId?: string;

43

44

/** Name of the score (e.g., "quality", "accuracy", "relevance") */

45

name: string;

46

47

/** Environment tag (defaults to LANGFUSE_TRACING_ENVIRONMENT) */

48

environment?: string;

49

50

/**

51

* The value of the score

52

* - Numeric scores: number (e.g., 0.85, 4.5)

53

* - Boolean scores: 1 or 0 (true or false)

54

* - Categorical scores: string (e.g., "excellent", "good", "poor")

55

*/

56

value: number | string;

57

58

/** Optional comment explaining the score */

59

comment?: string;

60

61

/** Optional metadata object with additional context */

62

metadata?: unknown;

63

64

/**

65

* Data type of the score

66

* When set, must match the score value's type

67

* If not set, will be inferred from the score value or config

68

*/

69

dataType?: "NUMERIC" | "BOOLEAN" | "CATEGORICAL";

70

71

/**

72

* Reference to a score config

73

* When set, the score name must equal the config name

74

* Scores must comply with the config's range and data type

75

* For categorical scores, the value must map to a config category

76

* Numeric scores might be constrained by the config's max and min values

77

*/

78

configId?: string;

79

}

80

```

81

82

**Usage Examples:**

83

84

```typescript

85

import { LangfuseClient } from '@langfuse/client';

86

87

const langfuse = new LangfuseClient();

88

89

// Basic numeric score

90

langfuse.score.create({

91

name: "quality",

92

value: 0.85,

93

traceId: "trace-123",

94

comment: "High quality response"

95

});

96

97

// Numeric score with explicit data type

98

langfuse.score.create({

99

name: "accuracy",

100

value: 0.92,

101

dataType: "NUMERIC",

102

traceId: "trace-456",

103

metadata: {

104

model: "gpt-4",

105

version: "1.0"

106

}

107

});

108

109

// Boolean score (1 = true, 0 = false)

110

langfuse.score.create({

111

name: "hallucination",

112

value: 0,

113

dataType: "BOOLEAN",

114

traceId: "trace-789",

115

comment: "No hallucinations detected"

116

});

117

118

// Categorical score

119

langfuse.score.create({

120

name: "sentiment",

121

value: "positive",

122

dataType: "CATEGORICAL",

123

traceId: "trace-abc",

124

observationId: "span-xyz"

125

});

126

127

// Score with custom ID and environment

128

langfuse.score.create({

129

id: "custom-score-id",

130

name: "user_satisfaction",

131

value: 4,

132

traceId: "trace-def",

133

environment: "production",

134

metadata: {

135

userId: "user-123",

136

timestamp: new Date().toISOString()

137

}

138

});

139

140

// Score with config reference

141

langfuse.score.create({

142

name: "correctness",

143

value: "partially correct",

144

dataType: "CATEGORICAL",

145

configId: "config-123",

146

traceId: "trace-ghi",

147

comment: "Answer was mostly correct but lacked details"

148

});

149

150

// Session-level score

151

langfuse.score.create({

152

name: "session_quality",

153

value: 0.78,

154

sessionId: "session-456"

155

});

156

157

// Dataset run score (for experiments)

158

langfuse.score.create({

159

name: "test_accuracy",

160

value: 0.95,

161

datasetRunId: "run-789"

162

});

163

164

// Complex metadata example

165

langfuse.score.create({

166

name: "performance",

167

value: 0.88,

168

traceId: "trace-jkl",

169

comment: "Strong performance across all metrics",

170

metadata: {

171

model: "gpt-4-turbo",

172

latency_ms: 1250,

173

token_count: 450,

174

cost_usd: 0.025,

175

evaluation_method: "llm-as-judge",

176

criteria: ["accuracy", "completeness", "clarity"]

177

}

178

});

179

```

180

181

### Score Observation

182

183

Create a score for a specific observation using its OpenTelemetry span.

184

185

```typescript { .api }

186

/**

187

* Creates a score for a specific observation using its OpenTelemetry span

188

*

189

* This method automatically extracts the trace ID and observation ID from

190

* the provided span context, eliminating the need to manually track IDs.

191

*

192

* @param observation - Object containing the OpenTelemetry span

193

* @param data - Score data (traceId and observationId are derived from the span context; sessionId and datasetRunId are omitted from the accepted type)

194

*/

195

observation(

196

observation: { otelSpan: Span },

197

data: Omit<ScoreBody, "traceId" | "sessionId" | "observationId" | "datasetRunId">

198

): void;

199

```

200

201

**Usage Examples:**

202

203

```typescript

204

import { LangfuseClient } from '@langfuse/client';

205

import { startObservation } from '@langfuse/tracing';

206

207

const langfuse = new LangfuseClient();

208

209

// Score an observation with OpenTelemetry integration

210

const span = startObservation({ name: "llm-call" });

211

212

// Perform operation

213

const result = await callLLM();

214

215

// Score the observation

216

langfuse.score.observation(

217

{ otelSpan: span },

218

{

219

name: "response_quality",

220

value: 0.92,

221

comment: "Excellent response quality"

222

}

223

);

224

225

span.end();

226

227

// Score with metadata

228

const analysisSpan = startObservation({ name: "document-analysis" });

229

230

const analysis = await analyzeDocument(document);

231

232

langfuse.score.observation(

233

{ otelSpan: analysisSpan },

234

{

235

name: "accuracy",

236

value: 0.87,

237

dataType: "NUMERIC",

238

metadata: {

239

documentLength: document.length,

240

processingTime: Date.now() - startTime,

241

model: "gpt-4"

242

}

243

}

244

);

245

246

analysisSpan.end();

247

248

// Boolean score for observation

249

const validationSpan = startObservation({ name: "validation" });

250

251

const isValid = await validateOutput(output);

252

253

langfuse.score.observation(

254

{ otelSpan: validationSpan },

255

{

256

name: "validation_passed",

257

value: isValid ? 1 : 0,

258

dataType: "BOOLEAN"

259

}

260

);

261

262

validationSpan.end();

263

264

// Categorical score for observation

265

const classificationSpan = startObservation({ name: "classify-intent" });

266

267

const intent = await classifyIntent(userMessage);

268

269

langfuse.score.observation(

270

{ otelSpan: classificationSpan },

271

{

272

name: "intent_category",

273

value: intent,

274

dataType: "CATEGORICAL",

275

metadata: {

276

confidence: 0.95,

277

alternatives: ["support", "sales"]

278

}

279

}

280

);

281

282

classificationSpan.end();

283

```

284

285

### Score Trace

286

287

Create a score for a trace using an OpenTelemetry span.

288

289

```typescript { .api }

290

/**

291

* Creates a score for a trace using an OpenTelemetry span

292

*

293

* This method automatically extracts the trace ID from the provided

294

* span context and creates a trace-level score (not observation-specific).

295

*

296

* @param observation - Object containing the OpenTelemetry span

297

* @param data - Score data (traceId is derived from the span context; observationId is left unset so the score applies to the whole trace)

298

*/

299

trace(

300

observation: { otelSpan: Span },

301

data: Omit<ScoreBody, "traceId" | "sessionId" | "observationId" | "datasetRunId">

302

): void;

303

```

304

305

**Usage Examples:**

306

307

```typescript

308

import { LangfuseClient } from '@langfuse/client';

309

import { startObservation } from '@langfuse/tracing';

310

311

const langfuse = new LangfuseClient();

312

313

// Score a trace (trace-level evaluation)

314

const span = startObservation({ name: "user-query-pipeline" });

315

316

// Execute multi-step pipeline

317

const retrieval = await retrieveContext(query);

318

const generation = await generateResponse(query, retrieval);

319

const validation = await validateResponse(generation);

320

321

// Score the entire trace (not just one observation)

322

langfuse.score.trace(

323

{ otelSpan: span },

324

{

325

name: "overall_quality",

326

value: 0.88,

327

comment: "Good overall quality with minor improvements needed"

328

}

329

);

330

331

span.end();

332

333

// Multiple trace-level scores

334

const workflowSpan = startObservation({ name: "customer-support-workflow" });

335

336

const conversation = await handleConversation(user);

337

338

// Score different aspects of the trace

339

langfuse.score.trace(

340

{ otelSpan: workflowSpan },

341

{

342

name: "user_satisfaction",

343

value: 4,

344

dataType: "NUMERIC",

345

comment: "User rated 4 out of 5 stars"

346

}

347

);

348

349

langfuse.score.trace(

350

{ otelSpan: workflowSpan },

351

{

352

name: "issue_resolved",

353

value: 1,

354

dataType: "BOOLEAN"

355

}

356

);

357

358

langfuse.score.trace(

359

{ otelSpan: workflowSpan },

360

{

361

name: "conversation_tone",

362

value: "professional",

363

dataType: "CATEGORICAL"

364

}

365

);

366

367

workflowSpan.end();

368

369

// Trace score with experiment metadata

370

const experimentSpan = startObservation({ name: "prompt-variant-test" });

371

372

const response = await testPromptVariant(input, variantConfig);

373

374

langfuse.score.trace(

375

{ otelSpan: experimentSpan },

376

{

377

name: "variant_performance",

378

value: 0.91,

379

metadata: {

380

variantId: "v2",

381

temperature: 0.7,

382

model: "gpt-4",

383

comparison_baseline: 0.85

384

}

385

}

386

);

387

388

experimentSpan.end();

389

```

390

391

### Score Active Observation

392

393

Create a score for the currently active observation in the OpenTelemetry context.

394

395

```typescript { .api }

396

/**

397

* Creates a score for the currently active observation

398

*

399

* This method automatically detects the active OpenTelemetry span and

400

* creates an observation-level score. If no active span is found,

401

* a warning is logged and the operation is skipped.

402

*

403

* This is useful when you don't have direct access to the span object

404

* but are within an active span context.

405

*

406

* @param data - Score data (traceId and observationId are derived from the active span; sessionId and datasetRunId are omitted from the accepted type)

407

*/

408

activeObservation(

409

data: Omit<ScoreBody, "traceId" | "sessionId" | "observationId" | "datasetRunId">

410

): void;

411

```

412

413

**Usage Examples:**

414

415

```typescript

416

import { LangfuseClient } from '@langfuse/client';

417

import { startActiveSpan } from '@langfuse/tracing';

418

419

const langfuse = new LangfuseClient();

420

421

// Score within an active span context

422

startActiveSpan({ name: "process-document" }, async (span) => {

423

const result = await processDocument(document);

424

425

// Score the active observation (no need to pass span)

426

langfuse.score.activeObservation({

427

name: "processing_quality",

428

value: 0.9,

429

comment: "High quality processing"

430

});

431

432

span.end();

433

});

434

435

// Nested spans with active scoring

436

startActiveSpan({ name: "parent-operation" }, async (parentSpan) => {

437

438

startActiveSpan({ name: "child-operation" }, async (childSpan) => {

439

// This scores the child-operation (currently active span)

440

langfuse.score.activeObservation({

441

name: "child_accuracy",

442

value: 0.95

443

});

444

445

childSpan.end();

446

});

447

448

// This scores the parent-operation (now active again)

449

langfuse.score.activeObservation({

450

name: "parent_completeness",

451

value: 0.88

452

});

453

454

parentSpan.end();

455

});

456

457

// Use in middleware or callbacks

458

async function evaluateResponse(response: string) {

459

// Assumes this is called within an active span context

460

const quality = await assessQuality(response);

461

462

langfuse.score.activeObservation({

463

name: "response_quality",

464

value: quality.score,

465

metadata: {

466

metrics: quality.metrics,

467

evaluator: "llm-judge"

468

}

469

});

470

}

471

472

// In an async context manager

473

async function withScoring<T>(

474

operation: () => Promise<T>,

475

scoreName: string

476

): Promise<T> {

477

return startActiveSpan({ name: "scored-operation" }, async (span) => {

478

try {

479

const result = await operation();

480

481

langfuse.score.activeObservation({

482

name: scoreName,

483

value: 1,

484

dataType: "BOOLEAN",

485

comment: "Operation completed successfully"

486

});

487

488

return result;

489

} catch (error) {

490

langfuse.score.activeObservation({

491

name: scoreName,

492

value: 0,

493

dataType: "BOOLEAN",

494

comment: `Operation failed: ${error.message}`

495

});

496

497

throw error;

498

} finally {

499

span.end();

500

}

501

});

502

}

503

504

// Usage with error handling

505

startActiveSpan({ name: "risky-operation" }, async (span) => {

506

try {

507

await performRiskyOperation();

508

509

langfuse.score.activeObservation({

510

name: "success",

511

value: 1,

512

dataType: "BOOLEAN"

513

});

514

} catch (error) {

515

langfuse.score.activeObservation({

516

name: "success",

517

value: 0,

518

dataType: "BOOLEAN",

519

comment: error.message

520

});

521

} finally {

522

span.end();

523

}

524

});

525

```

526

527

### Score Active Trace

528

529

Create a score for the currently active trace in the OpenTelemetry context.

530

531

```typescript { .api }

532

/**

533

* Creates a score for the currently active trace

534

*

535

* This method automatically detects the active OpenTelemetry span and

536

* creates a trace-level score. If no active span is found,

537

* a warning is logged and the operation is skipped.

538

*

539

* This is useful for scoring the entire trace from within any nested

540

* span context without needing to track the root span.

541

*

542

* @param data - Score data (traceId is derived from the active span; observationId is left unset so the score applies to the whole trace)

543

*/

544

activeTrace(

545

data: Omit<ScoreBody, "traceId" | "sessionId" | "observationId" | "datasetRunId">

546

): void;

547

```

548

549

**Usage Examples:**

550

551

```typescript

552

import { LangfuseClient } from '@langfuse/client';

553

import { startActiveSpan } from '@langfuse/tracing';

554

555

const langfuse = new LangfuseClient();

556

557

// Score trace from within any span

558

startActiveSpan({ name: "main-workflow" }, async (span) => {

559

await step1();

560

await step2();

561

await step3();

562

563

// Score the entire trace (not just this span)

564

langfuse.score.activeTrace({

565

name: "workflow_success",

566

value: 1,

567

dataType: "BOOLEAN"

568

});

569

570

span.end();

571

});

572

573

// Score trace from nested operations

574

startActiveSpan({ name: "parent" }, async (parentSpan) => {

575

576

startActiveSpan({ name: "child" }, async (childSpan) => {

577

// Score the entire trace from within child span

578

langfuse.score.activeTrace({

579

name: "overall_quality",

580

value: 0.92,

581

comment: "Excellent overall execution"

582

});

583

584

childSpan.end();

585

});

586

587

parentSpan.end();

588

});

589

590

// User feedback collection

591

async function collectUserFeedback(userId: string, rating: number) {

592

// Assumes called within an active trace context

593

langfuse.score.activeTrace({

594

name: "user_satisfaction",

595

value: rating,

596

dataType: "NUMERIC",

597

metadata: {

598

userId,

599

timestamp: new Date().toISOString(),

600

source: "in-app-feedback"

601

}

602

});

603

}

604

605

// Post-execution trace evaluation

606

startActiveSpan({ name: "ai-assistant-conversation" }, async (span) => {

607

const conversation = await handleUserConversation(user);

608

609

// Evaluate entire conversation

610

const evaluation = await evaluateConversation(conversation);

611

612

// Score the trace based on evaluation

613

langfuse.score.activeTrace({

614

name: "conversation_quality",

615

value: evaluation.overallScore,

616

comment: evaluation.feedback,

617

metadata: {

618

metrics: evaluation.metrics,

619

duration: conversation.duration,

620

turns: conversation.turns.length

621

}

622

});

623

624

span.end();

625

});

626

627

// Multi-criteria trace scoring

628

startActiveSpan({ name: "document-processing-pipeline" }, async (span) => {

629

const result = await processPipeline(document);

630

631

// Score multiple aspects of the trace

632

langfuse.score.activeTrace({

633

name: "accuracy",

634

value: result.accuracy,

635

dataType: "NUMERIC"

636

});

637

638

langfuse.score.activeTrace({

639

name: "completeness",

640

value: result.isComplete ? 1 : 0,

641

dataType: "BOOLEAN"

642

});

643

644

langfuse.score.activeTrace({

645

name: "quality_tier",

646

value: result.qualityTier,

647

dataType: "CATEGORICAL"

648

});

649

650

span.end();

651

});

652

```

653

654

### Flush

655

656

Flush all pending score events to the Langfuse API immediately.

657

658

```typescript { .api }

659

/**

660

* Flushes all pending score events to the Langfuse API

661

*

662

* This method ensures all queued scores are sent immediately rather than

663

* waiting for the automatic flush interval or batch size threshold.

664

*

665

* Batching behavior during flush:

666

* - Scores are sent in batches of up to 100

667

* - Multiple batches are sent concurrently

668

* - All batches must complete before flush resolves

669

*

670

* @returns Promise that resolves when all pending scores have been sent

671

*/

672

flush(): Promise<void>;

673

```

674

675

**Usage Examples:**

676

677

```typescript

678

import { LangfuseClient } from '@langfuse/client';

679

680

const langfuse = new LangfuseClient();

681

682

// Manual flush after creating scores

683

langfuse.score.create({

684

name: "quality",

685

value: 0.8,

686

traceId: "trace-123"

687

});

688

689

langfuse.score.create({

690

name: "accuracy",

691

value: 0.9,

692

traceId: "trace-123"

693

});

694

695

// Ensure scores are sent immediately

696

await langfuse.score.flush();

697

698

// Flush before critical operations

699

async function processWithScoring(data: any) {

700

langfuse.score.create({

701

name: "preprocessing",

702

value: 1,

703

traceId: data.traceId

704

});

705

706

// Ensure score is sent before proceeding

707

await langfuse.score.flush();

708

709

return await criticalOperation(data);

710

}

711

712

// Flush in testing

713

describe("scoring tests", () => {

714

afterEach(async () => {

715

// Ensure all scores are sent after each test

716

await langfuse.score.flush();

717

});

718

719

it("should score correctly", async () => {

720

langfuse.score.create({ name: "test", value: 1 });

721

await langfuse.score.flush();

722

// Verify score was sent

723

});

724

});

725

726

// Flush with error handling

727

async function safeFlush() {

728

try {

729

await langfuse.score.flush();

730

console.log("Scores flushed successfully");

731

} catch (error) {

732

console.error("Failed to flush scores:", error);

733

// Scores remain in queue and will retry on next flush

734

}

735

}

736

737

// Periodic flushing in long-running processes

738

setInterval(async () => {

739

await langfuse.score.flush();

740

}, 60000); // Flush every minute

741

742

// Flush before application exit

743

process.on("SIGTERM", async () => {

744

console.log("Flushing scores before shutdown...");

745

await langfuse.score.flush();

746

process.exit(0);

747

});

748

749

// Flush in batch processing

750

async function processBatch(items: any[]) {

751

for (const item of items) {

752

await processItem(item);

753

754

langfuse.score.create({

755

name: "item_processed",

756

value: 1,

757

metadata: { itemId: item.id }

758

});

759

}

760

761

// Flush after batch completion

762

await langfuse.score.flush();

763

}

764

```

765

766

### Shutdown

767

768

Gracefully shutdown the score manager by flushing all pending scores.

769

770

```typescript { .api }

771

/**

772

* Gracefully shuts down the score manager by flushing all pending scores

773

*

774

* This method should be called before your application exits to ensure

775

* all score data is sent to Langfuse. It internally calls flush() and

776

* waits for completion.

777

*

778

* @returns Promise that resolves when shutdown is complete

779

*/

780

shutdown(): Promise<void>;

781

```

782

783

**Usage Examples:**

784

785

```typescript

786

import { LangfuseClient } from '@langfuse/client';

787

788

const langfuse = new LangfuseClient();

789

790

// Graceful shutdown before exit

791

async function gracefulShutdown() {

792

console.log("Shutting down...");

793

794

// Flush all pending scores

795

await langfuse.score.shutdown();

796

797

console.log("Shutdown complete");

798

process.exit(0);

799

}

800

801

// Handle process signals

802

process.on("SIGTERM", gracefulShutdown);

803

process.on("SIGINT", gracefulShutdown);

804

805

// Shutdown in application cleanup

806

async function cleanupApplication() {

807

// Close database connections

808

await db.close();

809

810

// Flush scores before exit

811

await langfuse.score.shutdown();

812

813

// Close other resources

814

await cache.disconnect();

815

}

816

817

// Shutdown with timeout

818

async function shutdownWithTimeout(timeoutMs: number = 5000) {

819

const timeout = new Promise((_, reject) =>

820

setTimeout(() => reject(new Error("Shutdown timeout")), timeoutMs)

821

);

822

823

try {

824

await Promise.race([

825

langfuse.score.shutdown(),

826

timeout

827

]);

828

console.log("Score manager shutdown successfully");

829

} catch (error) {

830

console.error("Shutdown error:", error);

831

// Force exit if timeout

832

}

833

}

834

835

// Shutdown in tests

836

afterAll(async () => {

837

await langfuse.score.shutdown();

838

});

839

840

// Shutdown in serverless functions

841

export async function handler(event: any) {

842

try {

843

// Process request and create scores

844

langfuse.score.create({

845

name: "request_handled",

846

value: 1

847

});

848

849

return { statusCode: 200, body: "Success" };

850

} finally {

851

// Ensure scores are sent before function terminates

852

await langfuse.score.shutdown();

853

}

854

}

855

856

// Shutdown with error handling

857

async function safeShutdown() {

858

try {

859

await langfuse.score.shutdown();

860

console.log("Scores flushed successfully");

861

} catch (error) {

862

console.error("Error during shutdown:", error);

863

// Log error but continue shutdown

864

}

865

}

866

867

// Shutdown in Docker container

868

process.on("SIGTERM", async () => {

869

console.log("SIGTERM received, starting graceful shutdown");

870

871

// Stop accepting new requests

872

server.close();

873

874

// Flush pending scores

875

await langfuse.score.shutdown();

876

877

console.log("Graceful shutdown complete");

878

process.exit(0);

879

});

880

```

881

882

## Type Definitions

883

884

### ScoreDataType

885

886

Enumeration of supported score data types.

887

888

```typescript { .api }

889

/**

890

* Score data types supported by Langfuse

891

*/

892

type ScoreDataType = "NUMERIC" | "BOOLEAN" | "CATEGORICAL";

893

894

// Constants for convenience

895

const ScoreDataType = {

896

Numeric: "NUMERIC",

897

Boolean: "BOOLEAN",

898

Categorical: "CATEGORICAL",

899

} as const;

900

```

901

902

**Data Type Details:**

903

904

- **NUMERIC**: Numerical values (integers or floats)

905

- Examples: 0.85, 4.5, -0.3, 100

906

- Use cases: Quality scores, ratings, metrics, percentages

907

- Can be constrained by config min/max values

908

909

- **BOOLEAN**: Binary values represented as 1 (true) or 0 (false)

910

- Values: 1 or 0 only

911

- Use cases: Pass/fail checks, validation results, binary classifications

912

- Useful for yes/no evaluations

913

914

- **CATEGORICAL**: String labels for classification

915

- Examples: "excellent", "good", "poor", "positive", "neutral", "negative"

916

- Use cases: Quality tiers, sentiment labels, classification results

917

- Must map to config categories when using configId

918

919

**Usage Examples:**

920

921

```typescript

922

import { ScoreDataType } from '@langfuse/core';

923

924

// Numeric score

925

langfuse.score.create({

926

name: "quality_score",

927

value: 0.87,

928

dataType: ScoreDataType.Numeric,

929

traceId: "trace-123"

930

});

931

932

// Boolean score

933

langfuse.score.create({

934

name: "validation_passed",

935

value: 1,

936

dataType: ScoreDataType.Boolean,

937

traceId: "trace-456"

938

});

939

940

// Categorical score

941

langfuse.score.create({

942

name: "sentiment",

943

value: "positive",

944

dataType: ScoreDataType.Categorical,

945

traceId: "trace-789"

946

});

947

948

// Type inference (dataType can be omitted)

949

langfuse.score.create({

950

name: "auto_numeric",

951

value: 0.5, // Inferred as NUMERIC

952

traceId: "trace-abc"

953

});

954

955

langfuse.score.create({

956

name: "auto_categorical",

957

value: "excellent", // Inferred as CATEGORICAL

958

traceId: "trace-def"

959

});

960

961

// Use with constants

962

const SCORE_TYPES = {

963

QUALITY: { name: "quality", dataType: ScoreDataType.Numeric },

964

VALID: { name: "is_valid", dataType: ScoreDataType.Boolean },

965

TIER: { name: "quality_tier", dataType: ScoreDataType.Categorical }

966

};

967

968

langfuse.score.create({

969

...SCORE_TYPES.QUALITY,

970

value: 0.92,

971

traceId: "trace-ghi"

972

});

973

```

974

975

### CreateScoreValue

976

977

Union type for score values supporting both numeric and string types.

978

979

```typescript { .api }

980

/**

981

* The value of the score

982

* - Numeric for NUMERIC and BOOLEAN data types

983

* - String for CATEGORICAL data type

984

*/

985

type CreateScoreValue = number | string;

986

```

987

988

**Usage Examples:**

989

990

```typescript

991

// Numeric values

992

const numericValue: CreateScoreValue = 0.85;

993

const integerValue: CreateScoreValue = 5;

994

const negativeValue: CreateScoreValue = -0.2;

995

996

// String values

997

const categoricalValue: CreateScoreValue = "excellent";

998

const sentimentValue: CreateScoreValue = "positive";

999

1000

// Type-safe score creation

1001

function createTypedScore(

1002

name: string,

1003

value: CreateScoreValue,

1004

traceId: string

1005

) {

1006

langfuse.score.create({ name, value, traceId });

1007

}

1008

1009

createTypedScore("quality", 0.9, "trace-123");

1010

createTypedScore("sentiment", "positive", "trace-456");

1011

```

1012

1013

## Batching and Flush Behavior

1014

1015

The Score Manager implements efficient batching to optimize API usage and performance.

1016

1017

### Batch Configuration

1018

1019

Configure batching behavior via environment variables or use defaults:

1020

1021

```typescript

1022

// Environment variables

1023

LANGFUSE_FLUSH_AT=10 // Flush after this many scores (default: 10)

1024

LANGFUSE_FLUSH_INTERVAL=1 // Flush after this many seconds (default: 1)

1025

```

1026

1027

**Configuration Examples:**

1028

1029

```bash

1030

# Development: Frequent flushing for immediate feedback

1031

LANGFUSE_FLUSH_AT=5

1032

LANGFUSE_FLUSH_INTERVAL=0.5

1033

1034

# Production: Larger batches for efficiency

1035

LANGFUSE_FLUSH_AT=50

1036

LANGFUSE_FLUSH_INTERVAL=5

1037

1038

# Testing: Immediate flushing

1039

LANGFUSE_FLUSH_AT=1

1040

LANGFUSE_FLUSH_INTERVAL=0.1

1041

1042

# High-throughput: Maximum batching

1043

LANGFUSE_FLUSH_AT=100

1044

LANGFUSE_FLUSH_INTERVAL=10

1045

```

1046

1047

### Batch Constants

1048

1049

```typescript

1050

const MAX_BATCH_SIZE = 100; // Maximum scores per API call

1051

const MAX_QUEUE_SIZE = 100_000; // Maximum queue size (prevents memory leaks)

1052

```

1053

1054

### Automatic Flushing

Scores are automatically flushed when:

1. **Count threshold reached**: Queue contains `flushAtCount` scores
2. **Time interval elapsed**: `flushIntervalSeconds` have passed since first queued score
3. **Manual flush**: `flush()` or `shutdown()` is called

**Batching Examples:**

```typescript
// Automatic flush by count (LANGFUSE_FLUSH_AT=10)
for (let i = 0; i < 15; i++) {
  langfuse.score.create({
    name: `score-${i}`,
    value: i * 0.1
  });
}
// First 10 scores flushed automatically
// Remaining 5 scores wait for timer or manual flush

// Automatic flush by timer (LANGFUSE_FLUSH_INTERVAL=1)
langfuse.score.create({ name: "score1", value: 0.8 });
// Score queued, timer starts
// After 1 second, score is automatically flushed

// Large batch handling (150 scores)
for (let i = 0; i < 150; i++) {
  langfuse.score.create({
    name: `batch-score-${i}`,
    value: i * 0.01
  });
}
await langfuse.score.flush();
// Sent as 2 batches: 100 + 50 (respects MAX_BATCH_SIZE)

// Queue overflow protection
for (let i = 0; i < 100_001; i++) {
  langfuse.score.create({
    name: `overflow-${i}`,
    value: 1
  });
}
// Score #100,001 is dropped with error log
// Prevents memory exhaustion
```

### Flush Timer Management

```typescript
// Timer is created when first score is added to empty queue
langfuse.score.create({ name: "first", value: 1 });
// Timer starts

// Subsequent scores don't create new timers
langfuse.score.create({ name: "second", value: 2 });
langfuse.score.create({ name: "third", value: 3 });
// Same timer continues

// Timer is cleared when flush completes
await langfuse.score.flush();
// Timer cleared, queue empty

// New score starts new timer
langfuse.score.create({ name: "fourth", value: 4 });
// New timer starts
```

### Concurrent Flush Handling

```typescript
// Multiple concurrent flush calls are deduplicated
langfuse.score.create({ name: "test", value: 1 });

const flush1 = langfuse.score.flush();
const flush2 = langfuse.score.flush();
const flush3 = langfuse.score.flush();

await Promise.all([flush1, flush2, flush3]);
// Only one actual API call is made
// All promises resolve when flush completes
```
## Advanced Usage

### Experiment Integration

Use scores within experiments for automated evaluation.

```typescript
import { LangfuseClient } from '@langfuse/client';
import { startObservation } from '@langfuse/tracing';

const langfuse = new LangfuseClient();

// Define experiment with scoring
const result = await langfuse.experiment.run({
  name: "prompt-optimization",
  data: dataset.items,
  task: async (item) => {
    const span = startObservation({ name: "task" });

    const output = await runModel(item.input);

    // Score the observation
    langfuse.score.observation(
      { otelSpan: span },
      {
        name: "task_quality",
        value: await evaluateQuality(output),
        dataType: "NUMERIC"
      }
    );

    span.end();
    return output;
  },
  evaluators: [
    async ({ output, expectedOutput }) => {
      // Return evaluation scores
      return {
        name: "accuracy",
        value: calculateAccuracy(output, expectedOutput),
        dataType: "NUMERIC"
      };
    }
  ]
});

// Scores are automatically associated with dataset run
await langfuse.score.flush();
```
### Multi-Criteria Scoring

Score multiple aspects of a single operation.

```typescript
import { startObservation } from '@langfuse/tracing';

const span = startObservation({ name: "llm-generation" });

const response = await generateResponse(prompt);

// Score multiple criteria
langfuse.score.observation(
  { otelSpan: span },
  {
    name: "accuracy",
    value: 0.92,
    dataType: "NUMERIC"
  }
);

langfuse.score.observation(
  { otelSpan: span },
  {
    name: "relevance",
    value: 0.88,
    dataType: "NUMERIC"
  }
);

langfuse.score.observation(
  { otelSpan: span },
  {
    name: "completeness",
    value: 1,
    dataType: "BOOLEAN"
  }
);

langfuse.score.observation(
  { otelSpan: span },
  {
    name: "tone",
    value: "professional",
    dataType: "CATEGORICAL"
  }
);

span.end();
```
### Conditional Scoring

Apply scores based on runtime conditions.

```typescript
import { startObservation } from '@langfuse/tracing';

const span = startObservation({ name: "conditional-scoring" });

const result = await processRequest(request);

// Conditional scoring based on result
if (result.needsReview) {
  langfuse.score.observation(
    { otelSpan: span },
    {
      name: "requires_human_review",
      value: 1,
      dataType: "BOOLEAN",
      comment: "Flagged for manual review"
    }
  );
}

if (result.confidence < 0.7) {
  langfuse.score.observation(
    { otelSpan: span },
    {
      name: "low_confidence",
      value: result.confidence,
      dataType: "NUMERIC",
      comment: `Confidence below threshold: ${result.confidence}`
    }
  );
}

// Quality tier scoring
const tier = result.score > 0.9 ? "excellent" :
             result.score > 0.7 ? "good" :
             result.score > 0.5 ? "fair" : "poor";

langfuse.score.observation(
  { otelSpan: span },
  {
    name: "quality_tier",
    value: tier,
    dataType: "CATEGORICAL",
    metadata: { rawScore: result.score }
  }
);

span.end();
```
### Score Config Integration

Use score configs to enforce constraints and standards.

```typescript
// Create score with config reference
langfuse.score.create({
  name: "quality",
  value: 0.85,
  dataType: "NUMERIC",
  configId: "quality-config-v1",
  traceId: "trace-123"
});
// Score must comply with config's min/max values

// Categorical score with config
langfuse.score.create({
  name: "sentiment",
  value: "positive",
  dataType: "CATEGORICAL",
  configId: "sentiment-config",
  traceId: "trace-456"
});
// Value must match one of the config's categories

// Boolean score with config
langfuse.score.create({
  name: "passes_safety_check",
  value: 1,
  dataType: "BOOLEAN",
  configId: "safety-check-config",
  traceId: "trace-789"
});
// Ensures consistent naming and interpretation
```
### Async Scoring Patterns

Handle scoring in asynchronous workflows.

```typescript
// Deferred scoring after async evaluation
async function scoreAfterEvaluation(traceId: string, output: string) {
  // Trigger async evaluation (doesn't block)
  const evaluationPromise = evaluateWithExternalService(output);

  // Continue processing
  await continueWorkflow();

  // Wait for evaluation and score
  const evaluation = await evaluationPromise;

  langfuse.score.create({
    name: "external_evaluation",
    value: evaluation.score,
    traceId,
    metadata: { evaluator: "external-service" }
  });
}

// Background scoring worker
const scoringQueue: Array<() => Promise<void>> = [];

function queueScoring(fn: () => Promise<void>) {
  scoringQueue.push(fn);
}

async function processScoringQueue() {
  while (scoringQueue.length > 0) {
    const scoreFn = scoringQueue.shift();
    try {
      await scoreFn?.();
    } catch (error) {
      console.error("Scoring error:", error);
    }
  }
}

// Queue scores for later processing
queueScoring(async () => {
  langfuse.score.create({
    name: "delayed_score",
    value: 0.9,
    traceId: "trace-123"
  });
});

// Process queue periodically
setInterval(processScoringQueue, 5000);
```
### Error Handling

Handle errors gracefully during scoring operations.

```typescript
// Safe scoring wrapper
function safeScore(scoreData: ScoreBody) {
  try {
    langfuse.score.create(scoreData);
  } catch (error) {
    console.error("Failed to create score:", error);
    // Log to error tracking service
    errorTracker.capture(error, { context: "scoring" });
  }
}

// Retry logic for critical scores
async function scoreWithRetry(
  scoreData: ScoreBody,
  maxRetries: number = 3
) {
  for (let attempt = 1; attempt <= maxRetries; attempt++) {
    try {
      langfuse.score.create(scoreData);
      await langfuse.score.flush();
      return; // Success
    } catch (error) {
      console.error(`Score attempt ${attempt} failed:`, error);

      if (attempt === maxRetries) {
        // Final failure - log and continue
        console.error("Score permanently failed after retries");
      } else {
        // Wait before retry
        await new Promise(resolve =>
          setTimeout(resolve, 1000 * attempt)
        );
      }
    }
  }
}

// Graceful degradation
async function scoreWithFallback(
  primary: ScoreBody,
  fallback: ScoreBody
) {
  try {
    langfuse.score.create(primary);
    await langfuse.score.flush();
  } catch (error) {
    console.warn("Primary score failed, using fallback");
    langfuse.score.create(fallback);
  }
}
```
## Best Practices

### 1. Choose Appropriate Data Types

```typescript
// Use NUMERIC for continuous values
langfuse.score.create({
  name: "confidence",
  value: 0.87,
  dataType: "NUMERIC"
});

// Use BOOLEAN for binary decisions
langfuse.score.create({
  name: "approved",
  value: 1,
  dataType: "BOOLEAN"
});

// Use CATEGORICAL for discrete labels
langfuse.score.create({
  name: "quality_tier",
  value: "premium",
  dataType: "CATEGORICAL"
});
```

### 2. Provide Meaningful Comments

```typescript
// Add context to scores
langfuse.score.create({
  name: "quality",
  value: 0.65,
  comment: "Below target due to missing context in retrieval",
  metadata: {
    target: 0.8,
    reason: "insufficient_context"
  }
});
```

### 3. Use Metadata Effectively

```typescript
// Rich metadata for debugging and analysis
langfuse.score.create({
  name: "response_quality",
  value: 0.9,
  metadata: {
    model: "gpt-4",
    temperature: 0.7,
    promptVersion: "v2.1",
    tokenCount: 450,
    latency: 1250,
    evaluator: "llm-as-judge",
    criteria: ["accuracy", "completeness", "clarity"]
  }
});
```

### 4. Flush Appropriately

```typescript
// Flush before critical operations
await langfuse.score.flush();

// Always flush on shutdown
process.on("SIGTERM", async () => {
  await langfuse.score.shutdown();
});

// Don't flush after every score (defeats batching)
// ❌ Bad
langfuse.score.create({ name: "score", value: 1 });
await langfuse.score.flush(); // Too frequent

// ✅ Good
langfuse.score.create({ name: "score1", value: 1 });
langfuse.score.create({ name: "score2", value: 2 });
langfuse.score.create({ name: "score3", value: 3 });
await langfuse.score.flush(); // Batch flush
```

### 5. Use Active Context Methods

```typescript
// Prefer active context methods when possible
startActiveSpan({ name: "operation" }, async (span) => {
  // Cleaner than passing span around
  langfuse.score.activeObservation({
    name: "quality",
    value: 0.9
  });

  span.end();
});
```

### 6. Configure for Your Environment

```bash
# Development
LANGFUSE_FLUSH_AT=5
LANGFUSE_FLUSH_INTERVAL=0.5

# Production
LANGFUSE_FLUSH_AT=50
LANGFUSE_FLUSH_INTERVAL=5

# Testing
LANGFUSE_FLUSH_AT=1
LANGFUSE_FLUSH_INTERVAL=0.1
```

### 7. Handle Missing Context

```typescript
// Check for active span before scoring
import { trace } from "@opentelemetry/api";

const activeSpan = trace.getActiveSpan();
if (activeSpan) {
  langfuse.score.activeObservation({
    name: "quality",
    value: 0.9
  });
} else {
  console.warn("No active span, skipping score");
}
```
## Performance Considerations

### Batching Efficiency

- Default batch size of 10 balances latency and efficiency
- Increase `LANGFUSE_FLUSH_AT` for high-throughput scenarios
- Decrease for real-time feedback requirements

### Memory Management

- Queue capped at 100,000 scores to prevent memory leaks
- Scores are removed from queue after successful flush
- Consider manual flushing in long-running processes

### Network Optimization

- Batching reduces API calls by up to 100x
- Concurrent batch uploads for large flushes
- Failed batches don't block other batches

### Best Performance Configuration

```bash
# High-throughput production
LANGFUSE_FLUSH_AT=100
LANGFUSE_FLUSH_INTERVAL=10

# Real-time feedback
LANGFUSE_FLUSH_AT=5
LANGFUSE_FLUSH_INTERVAL=1

# Balanced (default)
LANGFUSE_FLUSH_AT=10
LANGFUSE_FLUSH_INTERVAL=1
```
## Migration Examples

### From Manual Score Tracking

**Before:**

```typescript
const scores = [];

function recordScore(name: string, value: number) {
  scores.push({ name, value, timestamp: Date.now() });
}

async function sendScores() {
  await fetch("/api/scores", {
    method: "POST",
    body: JSON.stringify(scores)
  });
  scores.length = 0;
}
```

**After:**

```typescript
langfuse.score.create({
  name: "quality",
  value: 0.9,
  traceId: "trace-123"
});

// Automatic batching and flushing
await langfuse.score.flush();
```
### From Synchronous Scoring

**Before:**

```typescript
async function scoreOperation(result: any) {
  const score = calculateScore(result);

  // Blocking API call
  await sendScoreToAPI({
    name: "quality",
    value: score
  });
}
```

**After:**

```typescript
function scoreOperation(result: any) {
  const score = calculateScore(result);

  // Non-blocking, queued for batch send
  langfuse.score.create({
    name: "quality",
    value: score,
    traceId: result.traceId
  });
}
```
## TypeScript Support

Full type safety for all scoring operations.

```typescript
import type { ScoreBody, ScoreDataType } from '@langfuse/core';

// Type-safe score creation
const scoreData: ScoreBody = {
  name: "quality",
  value: 0.85,
  dataType: "NUMERIC",
  traceId: "trace-123"
};

langfuse.score.create(scoreData);

// Generic scoring function
function createTypedScore<T extends ScoreBody>(data: T): void {
  langfuse.score.create(data);
}

// Type guards
function isNumericScore(value: number | string): value is number {
  return typeof value === "number";
}

function createScore(name: string, value: number | string) {
  langfuse.score.create({
    name,
    value,
    dataType: isNumericScore(value) ? "NUMERIC" : "CATEGORICAL"
  });
}
```