# Score Management

The Score Management system provides comprehensive capabilities for creating, batching, and managing evaluation scores for traces and observations. It supports automatic batching, multiple data types, OpenTelemetry integration, and flexible scoring strategies with efficient queue management.

## Capabilities

### Create Score

Create a score event and add it to the processing queue for batched submission.

```typescript { .api }
/**
 * Creates a new score event and adds it to the processing queue
 *
 * Scores are queued and sent in batches for efficiency. The score will be
 * automatically sent when the queue reaches the flush threshold or after
 * the flush interval expires.
 *
 * Batching behavior:
 * - Automatic flush when queue reaches flushAtCount (default: 10, configurable via LANGFUSE_FLUSH_AT)
 * - Time-based flush after flushIntervalSeconds (default: 1s, configurable via LANGFUSE_FLUSH_INTERVAL)
 * - Maximum batch size: 100 scores per API call
 * - Maximum queue size: 100,000 scores (prevents memory leaks)
 *
 * @param data - The score data to create
 */
create(data: ScoreBody): void;

interface ScoreBody {
  /** Optional unique identifier for the score (auto-generated if not provided) */
  id?: string;

  /** Trace ID to associate the score with */
  traceId?: string;

  /** Session ID to associate the score with */
  sessionId?: string;

  /** Observation/span ID to associate the score with */
  observationId?: string;

  /** Dataset run ID for experiment scoring */
  datasetRunId?: string;

  /** Name of the score (e.g., "quality", "accuracy", "relevance") */
  name: string;

  /** Environment tag (defaults to LANGFUSE_TRACING_ENVIRONMENT) */
  environment?: string;

  /**
   * The value of the score
   * - Numeric scores: number (e.g., 0.85, 4.5)
   * - Boolean scores: 1 or 0 (true or false)
   * - Categorical scores: string (e.g., "excellent", "good", "poor")
   */
  value: number | string;

  /** Optional comment explaining the score */
  comment?: string;

  /** Optional metadata object with additional context */
  metadata?: unknown;

  /**
   * Data type of the score
   * When set, must match the score value's type
   * If not set, will be inferred from the score value or config
   */
  dataType?: "NUMERIC" | "BOOLEAN" | "CATEGORICAL";

  /**
   * Reference to a score config
   * When set, the score name must equal the config name
   * Scores must comply with the config's range and data type
   * For categorical scores, the value must map to a config category
   * Numeric scores might be constrained by the config's max and min values
   */
  configId?: string;
}
```

**Usage Examples:**

```typescript
import { LangfuseClient } from '@langfuse/client';

const langfuse = new LangfuseClient();

// Basic numeric score
langfuse.score.create({
  name: "quality",
  value: 0.85,
  traceId: "trace-123",
  comment: "High quality response"
});

// Numeric score with explicit data type
langfuse.score.create({
  name: "accuracy",
  value: 0.92,
  dataType: "NUMERIC",
  traceId: "trace-456",
  metadata: {
    model: "gpt-4",
    version: "1.0"
  }
});

// Boolean score (1 = true, 0 = false)
langfuse.score.create({
  name: "hallucination",
  value: 0,
  dataType: "BOOLEAN",
  traceId: "trace-789",
  comment: "No hallucinations detected"
});

// Categorical score
langfuse.score.create({
  name: "sentiment",
  value: "positive",
  dataType: "CATEGORICAL",
  traceId: "trace-abc",
  observationId: "span-xyz"
});

// Score with custom ID and environment
langfuse.score.create({
  id: "custom-score-id",
  name: "user_satisfaction",
  value: 4,
  traceId: "trace-def",
  environment: "production",
  metadata: {
    userId: "user-123",
    timestamp: new Date().toISOString()
  }
});

// Score with config reference
langfuse.score.create({
  name: "correctness",
  value: "partially correct",
  dataType: "CATEGORICAL",
  configId: "config-123",
  traceId: "trace-ghi",
  comment: "Answer was mostly correct but lacked details"
});

// Session-level score
langfuse.score.create({
  name: "session_quality",
  value: 0.78,
  sessionId: "session-456"
});

// Dataset run score (for experiments)
langfuse.score.create({
  name: "test_accuracy",
  value: 0.95,
  datasetRunId: "run-789"
});

// Complex metadata example
langfuse.score.create({
  name: "performance",
  value: 0.88,
  traceId: "trace-jkl",
  comment: "Strong performance across all metrics",
  metadata: {
    model: "gpt-4-turbo",
    latency_ms: 1250,
    token_count: 450,
    cost_usd: 0.025,
    evaluation_method: "llm-as-judge",
    criteria: ["accuracy", "completeness", "clarity"]
  }
});
```

### Score Observation

Create a score for a specific observation using its OpenTelemetry span.

```typescript { .api }
/**
 * Creates a score for a specific observation using its OpenTelemetry span
 *
 * This method automatically extracts the trace ID and observation ID from
 * the provided span context, eliminating the need to manually track IDs.
 *
 * @param observation - Object containing the OpenTelemetry span
 * @param data - Score data (traceId, observationId, sessionId, and datasetRunId are auto-populated)
 */
observation(
  observation: { otelSpan: Span },
  data: Omit<ScoreBody, "traceId" | "sessionId" | "observationId" | "datasetRunId">
): void;
```

**Usage Examples:**

```typescript
import { LangfuseClient } from '@langfuse/client';
import { startObservation } from '@langfuse/tracing';

const langfuse = new LangfuseClient();

// Score an observation with OpenTelemetry integration
const span = startObservation({ name: "llm-call" });

// Perform operation
const result = await callLLM();

// Score the observation
langfuse.score.observation(
  { otelSpan: span },
  {
    name: "response_quality",
    value: 0.92,
    comment: "Excellent response quality"
  }
);

span.end();

// Score with metadata
const analysisSpan = startObservation({ name: "document-analysis" });

const analysis = await analyzeDocument(document);

langfuse.score.observation(
  { otelSpan: analysisSpan },
  {
    name: "accuracy",
    value: 0.87,
    dataType: "NUMERIC",
    metadata: {
      documentLength: document.length,
      processingTime: Date.now() - startTime,
      model: "gpt-4"
    }
  }
);

analysisSpan.end();

// Boolean score for observation
const validationSpan = startObservation({ name: "validation" });

const isValid = await validateOutput(output);

langfuse.score.observation(
  { otelSpan: validationSpan },
  {
    name: "validation_passed",
    value: isValid ? 1 : 0,
    dataType: "BOOLEAN"
  }
);

validationSpan.end();

// Categorical score for observation
const classificationSpan = startObservation({ name: "classify-intent" });

const intent = await classifyIntent(userMessage);

langfuse.score.observation(
  { otelSpan: classificationSpan },
  {
    name: "intent_category",
    value: intent,
    dataType: "CATEGORICAL",
    metadata: {
      confidence: 0.95,
      alternatives: ["support", "sales"]
    }
  }
);

classificationSpan.end();
```
284
285
### Score Trace
286
287
Create a score for a trace using an OpenTelemetry span.
288
289
```typescript { .api }
290
/**
291
* Creates a score for a trace using an OpenTelemetry span
292
*
293
* This method automatically extracts the trace ID from the provided
294
* span context and creates a trace-level score (not observation-specific).
295
*
296
* @param observation - Object containing the OpenTelemetry span
297
* @param data - Score data (traceId, sessionId, observationId, and datasetRunId are auto-populated)
298
*/
299
trace(
300
observation: { otelSpan: Span },
301
data: Omit<ScoreBody, "traceId" | "sessionId" | "observationId" | "datasetRunId">
302
): void;
303
```
304
305
**Usage Examples:**
306
307
```typescript
308
import { LangfuseClient } from '@langfuse/client';
309
import { startObservation } from '@langfuse/tracing';
310
311
const langfuse = new LangfuseClient();
312
313
// Score a trace (trace-level evaluation)
314
const span = startObservation({ name: "user-query-pipeline" });
315
316
// Execute multi-step pipeline
317
const retrieval = await retrieveContext(query);
318
const generation = await generateResponse(query, retrieval);
319
const validation = await validateResponse(generation);
320
321
// Score the entire trace (not just one observation)
322
langfuse.score.trace(
323
{ otelSpan: span },
324
{
325
name: "overall_quality",
326
value: 0.88,
327
comment: "Good overall quality with minor improvements needed"
328
}
329
);
330
331
span.end();
332
333
// Multiple trace-level scores
334
const workflowSpan = startObservation({ name: "customer-support-workflow" });
335
336
const conversation = await handleConversation(user);
337
338
// Score different aspects of the trace
339
langfuse.score.trace(
340
{ otelSpan: workflowSpan },
341
{
342
name: "user_satisfaction",
343
value: 4,
344
dataType: "NUMERIC",
345
comment: "User rated 4 out of 5 stars"
346
}
347
);
348
349
langfuse.score.trace(
350
{ otelSpan: workflowSpan },
351
{
352
name: "issue_resolved",
353
value: 1,
354
dataType: "BOOLEAN"
355
}
356
);
357
358
langfuse.score.trace(
359
{ otelSpan: workflowSpan },
360
{
361
name: "conversation_tone",
362
value: "professional",
363
dataType: "CATEGORICAL"
364
}
365
);
366
367
workflowSpan.end();
368
369
// Trace score with experiment metadata
370
const experimentSpan = startObservation({ name: "prompt-variant-test" });
371
372
const response = await testPromptVariant(input, variantConfig);
373
374
langfuse.score.trace(
375
{ otelSpan: experimentSpan },
376
{
377
name: "variant_performance",
378
value: 0.91,
379
metadata: {
380
variantId: "v2",
381
temperature: 0.7,
382
model: "gpt-4",
383
comparison_baseline: 0.85
384
}
385
}
386
);
387
388
experimentSpan.end();
389
```

### Score Active Observation

Create a score for the currently active observation in the OpenTelemetry context.

```typescript { .api }
/**
 * Creates a score for the currently active observation
 *
 * This method automatically detects the active OpenTelemetry span and
 * creates an observation-level score. If no active span is found,
 * a warning is logged and the operation is skipped.
 *
 * This is useful when you don't have direct access to the span object
 * but are within an active span context.
 *
 * @param data - Score data (traceId, observationId, sessionId, and datasetRunId are auto-populated)
 */
activeObservation(
  data: Omit<ScoreBody, "traceId" | "sessionId" | "observationId" | "datasetRunId">
): void;
```

**Usage Examples:**

```typescript
import { LangfuseClient } from '@langfuse/client';
import { startActiveSpan } from '@langfuse/tracing';

const langfuse = new LangfuseClient();

// Score within an active span context
startActiveSpan({ name: "process-document" }, async (span) => {
  const result = await processDocument(document);

  // Score the active observation (no need to pass span)
  langfuse.score.activeObservation({
    name: "processing_quality",
    value: 0.9,
    comment: "High quality processing"
  });

  span.end();
});

// Nested spans with active scoring
startActiveSpan({ name: "parent-operation" }, async (parentSpan) => {

  startActiveSpan({ name: "child-operation" }, async (childSpan) => {
    // This scores the child-operation (currently active span)
    langfuse.score.activeObservation({
      name: "child_accuracy",
      value: 0.95
    });

    childSpan.end();
  });

  // This scores the parent-operation (now active again)
  langfuse.score.activeObservation({
    name: "parent_completeness",
    value: 0.88
  });

  parentSpan.end();
});

// Use in middleware or callbacks
async function evaluateResponse(response: string) {
  // Assumes this is called within an active span context
  const quality = await assessQuality(response);

  langfuse.score.activeObservation({
    name: "response_quality",
    value: quality.score,
    metadata: {
      metrics: quality.metrics,
      evaluator: "llm-judge"
    }
  });
}

// In an async context manager
async function withScoring<T>(
  operation: () => Promise<T>,
  scoreName: string
): Promise<T> {
  return startActiveSpan({ name: "scored-operation" }, async (span) => {
    try {
      const result = await operation();

      langfuse.score.activeObservation({
        name: scoreName,
        value: 1,
        dataType: "BOOLEAN",
        comment: "Operation completed successfully"
      });

      return result;
    } catch (error) {
      langfuse.score.activeObservation({
        name: scoreName,
        value: 0,
        dataType: "BOOLEAN",
        comment: `Operation failed: ${error.message}`
      });

      throw error;
    } finally {
      span.end();
    }
  });
}

// Usage with error handling
startActiveSpan({ name: "risky-operation" }, async (span) => {
  try {
    await performRiskyOperation();

    langfuse.score.activeObservation({
      name: "success",
      value: 1,
      dataType: "BOOLEAN"
    });
  } catch (error) {
    langfuse.score.activeObservation({
      name: "success",
      value: 0,
      dataType: "BOOLEAN",
      comment: error.message
    });
  } finally {
    span.end();
  }
});
```

### Score Active Trace

Create a score for the currently active trace in the OpenTelemetry context.

```typescript { .api }
/**
 * Creates a score for the currently active trace
 *
 * This method automatically detects the active OpenTelemetry span and
 * creates a trace-level score. If no active span is found,
 * a warning is logged and the operation is skipped.
 *
 * This is useful for scoring the entire trace from within any nested
 * span context without needing to track the root span.
 *
 * @param data - Score data (traceId, sessionId, observationId, and datasetRunId are auto-populated)
 */
activeTrace(
  data: Omit<ScoreBody, "traceId" | "sessionId" | "observationId" | "datasetRunId">
): void;
```

**Usage Examples:**

```typescript
import { LangfuseClient } from '@langfuse/client';
import { startActiveSpan } from '@langfuse/tracing';

const langfuse = new LangfuseClient();

// Score trace from within any span
startActiveSpan({ name: "main-workflow" }, async (span) => {
  await step1();
  await step2();
  await step3();

  // Score the entire trace (not just this span)
  langfuse.score.activeTrace({
    name: "workflow_success",
    value: 1,
    dataType: "BOOLEAN"
  });

  span.end();
});

// Score trace from nested operations
startActiveSpan({ name: "parent" }, async (parentSpan) => {

  startActiveSpan({ name: "child" }, async (childSpan) => {
    // Score the entire trace from within child span
    langfuse.score.activeTrace({
      name: "overall_quality",
      value: 0.92,
      comment: "Excellent overall execution"
    });

    childSpan.end();
  });

  parentSpan.end();
});

// User feedback collection
async function collectUserFeedback(userId: string, rating: number) {
  // Assumes called within an active trace context
  langfuse.score.activeTrace({
    name: "user_satisfaction",
    value: rating,
    dataType: "NUMERIC",
    metadata: {
      userId,
      timestamp: new Date().toISOString(),
      source: "in-app-feedback"
    }
  });
}

// Post-execution trace evaluation
startActiveSpan({ name: "ai-assistant-conversation" }, async (span) => {
  const conversation = await handleUserConversation(user);

  // Evaluate entire conversation
  const evaluation = await evaluateConversation(conversation);

  // Score the trace based on evaluation
  langfuse.score.activeTrace({
    name: "conversation_quality",
    value: evaluation.overallScore,
    comment: evaluation.feedback,
    metadata: {
      metrics: evaluation.metrics,
      duration: conversation.duration,
      turns: conversation.turns.length
    }
  });

  span.end();
});

// Multi-criteria trace scoring
startActiveSpan({ name: "document-processing-pipeline" }, async (span) => {
  const result = await processPipeline(document);

  // Score multiple aspects of the trace
  langfuse.score.activeTrace({
    name: "accuracy",
    value: result.accuracy,
    dataType: "NUMERIC"
  });

  langfuse.score.activeTrace({
    name: "completeness",
    value: result.isComplete ? 1 : 0,
    dataType: "BOOLEAN"
  });

  langfuse.score.activeTrace({
    name: "quality_tier",
    value: result.qualityTier,
    dataType: "CATEGORICAL"
  });

  span.end();
});
```

### Flush

Flush all pending score events to the Langfuse API immediately.

```typescript { .api }
/**
 * Flushes all pending score events to the Langfuse API
 *
 * This method ensures all queued scores are sent immediately rather than
 * waiting for the automatic flush interval or batch size threshold.
 *
 * Batching behavior during flush:
 * - Scores are sent in batches of up to 100
 * - Multiple batches are sent concurrently
 * - All batches must complete before flush resolves
 *
 * @returns Promise that resolves when all pending scores have been sent
 */
flush(): Promise<void>;
```

**Usage Examples:**

```typescript
import { LangfuseClient } from '@langfuse/client';

const langfuse = new LangfuseClient();

// Manual flush after creating scores
langfuse.score.create({
  name: "quality",
  value: 0.8,
  traceId: "trace-123"
});

langfuse.score.create({
  name: "accuracy",
  value: 0.9,
  traceId: "trace-123"
});

// Ensure scores are sent immediately
await langfuse.score.flush();

// Flush before critical operations
async function processWithScoring(data: any) {
  langfuse.score.create({
    name: "preprocessing",
    value: 1,
    traceId: data.traceId
  });

  // Ensure score is sent before proceeding
  await langfuse.score.flush();

  return await criticalOperation(data);
}

// Flush in testing
describe("scoring tests", () => {
  afterEach(async () => {
    // Ensure all scores are sent after each test
    await langfuse.score.flush();
  });

  it("should score correctly", async () => {
    langfuse.score.create({ name: "test", value: 1 });
    await langfuse.score.flush();
    // Verify score was sent
  });
});

// Flush with error handling
async function safeFlush() {
  try {
    await langfuse.score.flush();
    console.log("Scores flushed successfully");
  } catch (error) {
    console.error("Failed to flush scores:", error);
    // Scores remain in queue and will retry on next flush
  }
}

// Periodic flushing in long-running processes
setInterval(async () => {
  await langfuse.score.flush();
}, 60000); // Flush every minute

// Flush before application exit
process.on("SIGTERM", async () => {
  console.log("Flushing scores before shutdown...");
  await langfuse.score.flush();
  process.exit(0);
});

// Flush in batch processing
async function processBatch(items: any[]) {
  for (const item of items) {
    await processItem(item);

    langfuse.score.create({
      name: "item_processed",
      value: 1,
      metadata: { itemId: item.id }
    });
  }

  // Flush after batch completion
  await langfuse.score.flush();
}
```

### Shutdown

Gracefully shutdown the score manager by flushing all pending scores.

```typescript { .api }
/**
 * Gracefully shuts down the score manager by flushing all pending scores
 *
 * This method should be called before your application exits to ensure
 * all score data is sent to Langfuse. It internally calls flush() and
 * waits for completion.
 *
 * @returns Promise that resolves when shutdown is complete
 */
shutdown(): Promise<void>;
```

**Usage Examples:**

```typescript
import { LangfuseClient } from '@langfuse/client';

const langfuse = new LangfuseClient();

// Graceful shutdown before exit
async function gracefulShutdown() {
  console.log("Shutting down...");

  // Flush all pending scores
  await langfuse.score.shutdown();

  console.log("Shutdown complete");
  process.exit(0);
}

// Handle process signals
process.on("SIGTERM", gracefulShutdown);
process.on("SIGINT", gracefulShutdown);

// Shutdown in application cleanup
async function cleanupApplication() {
  // Close database connections
  await db.close();

  // Flush scores before exit
  await langfuse.score.shutdown();

  // Close other resources
  await cache.disconnect();
}

// Shutdown with timeout
async function shutdownWithTimeout(timeoutMs: number = 5000) {
  const timeout = new Promise((_, reject) =>
    setTimeout(() => reject(new Error("Shutdown timeout")), timeoutMs)
  );

  try {
    await Promise.race([
      langfuse.score.shutdown(),
      timeout
    ]);
    console.log("Score manager shutdown successfully");
  } catch (error) {
    console.error("Shutdown error:", error);
    // Force exit if timeout
  }
}

// Shutdown in tests
afterAll(async () => {
  await langfuse.score.shutdown();
});

// Shutdown in serverless functions
export async function handler(event: any) {
  try {
    // Process request and create scores
    langfuse.score.create({
      name: "request_handled",
      value: 1
    });

    return { statusCode: 200, body: "Success" };
  } finally {
    // Ensure scores are sent before function terminates
    await langfuse.score.shutdown();
  }
}

// Shutdown with error handling
async function safeShutdown() {
  try {
    await langfuse.score.shutdown();
    console.log("Scores flushed successfully");
  } catch (error) {
    console.error("Error during shutdown:", error);
    // Log error but continue shutdown
  }
}

// Shutdown in Docker container
process.on("SIGTERM", async () => {
  console.log("SIGTERM received, starting graceful shutdown");

  // Stop accepting new requests
  server.close();

  // Flush pending scores
  await langfuse.score.shutdown();

  console.log("Graceful shutdown complete");
  process.exit(0);
});
```
881
882
## Type Definitions
883
884
### ScoreDataType
885
886
Enumeration of supported score data types.
887
888
```typescript { .api }
889
/**
890
* Score data types supported by Langfuse
891
*/
892
type ScoreDataType = "NUMERIC" | "BOOLEAN" | "CATEGORICAL";
893
894
// Constants for convenience
895
const ScoreDataType = {
896
Numeric: "NUMERIC",
897
Boolean: "BOOLEAN",
898
Categorical: "CATEGORICAL",
899
} as const;
900
```
901
902
**Data Type Details:**
903
904
- **NUMERIC**: Numerical values (integers or floats)
905
- Examples: 0.85, 4.5, -0.3, 100
906
- Use cases: Quality scores, ratings, metrics, percentages
907
- Can be constrained by config min/max values
908
909
- **BOOLEAN**: Binary values represented as 1 (true) or 0 (false)
910
- Values: 1 or 0 only
911
- Use cases: Pass/fail checks, validation results, binary classifications
912
- Useful for yes/no evaluations
913
914
- **CATEGORICAL**: String labels for classification
915
- Examples: "excellent", "good", "poor", "positive", "neutral", "negative"
916
- Use cases: Quality tiers, sentiment labels, classification results
917
- Must map to config categories when using configId
918
919
**Usage Examples:**
920
921
```typescript
922
import { ScoreDataType } from '@langfuse/core';
923
924
// Numeric score
925
langfuse.score.create({
926
name: "quality_score",
927
value: 0.87,
928
dataType: ScoreDataType.Numeric,
929
traceId: "trace-123"
930
});
931
932
// Boolean score
933
langfuse.score.create({
934
name: "validation_passed",
935
value: 1,
936
dataType: ScoreDataType.Boolean,
937
traceId: "trace-456"
938
});
939
940
// Categorical score
941
langfuse.score.create({
942
name: "sentiment",
943
value: "positive",
944
dataType: ScoreDataType.Categorical,
945
traceId: "trace-789"
946
});
947
948
// Type inference (dataType can be omitted)
949
langfuse.score.create({
950
name: "auto_numeric",
951
value: 0.5, // Inferred as NUMERIC
952
traceId: "trace-abc"
953
});
954
955
langfuse.score.create({
956
name: "auto_categorical",
957
value: "excellent", // Inferred as CATEGORICAL
958
traceId: "trace-def"
959
});
960
961
// Use with constants
962
const SCORE_TYPES = {
963
QUALITY: { name: "quality", dataType: ScoreDataType.Numeric },
964
VALID: { name: "is_valid", dataType: ScoreDataType.Boolean },
965
TIER: { name: "quality_tier", dataType: ScoreDataType.Categorical }
966
};
967
968
langfuse.score.create({
969
...SCORE_TYPES.QUALITY,
970
value: 0.92,
971
traceId: "trace-ghi"
972
});
973
```
974
975
### CreateScoreValue
976
977
Union type for score values supporting both numeric and string types.
978
979
```typescript { .api }
980
/**
981
* The value of the score
982
* - Numeric for NUMERIC and BOOLEAN data types
983
* - String for CATEGORICAL data type
984
*/
985
type CreateScoreValue = number | string;
986
```
987
988
**Usage Examples:**
989
990
```typescript
991
// Numeric values
992
const numericValue: CreateScoreValue = 0.85;
993
const integerValue: CreateScoreValue = 5;
994
const negativeValue: CreateScoreValue = -0.2;
995
996
// String values
997
const categoricalValue: CreateScoreValue = "excellent";
998
const sentimentValue: CreateScoreValue = "positive";
999
1000
// Type-safe score creation
1001
function createTypedScore(
1002
name: string,
1003
value: CreateScoreValue,
1004
traceId: string
1005
) {
1006
langfuse.score.create({ name, value, traceId });
1007
}
1008
1009
createTypedScore("quality", 0.9, "trace-123");
1010
createTypedScore("sentiment", "positive", "trace-456");
1011
```

## Batching and Flush Behavior

The Score Manager implements efficient batching to optimize API usage and performance.

### Batch Configuration

Configure batching behavior via environment variables or use defaults:

```typescript
// Environment variables
LANGFUSE_FLUSH_AT=10        // Flush after this many scores (default: 10)
LANGFUSE_FLUSH_INTERVAL=1   // Flush after this many seconds (default: 1)
```

**Configuration Examples:**

```bash
# Development: Frequent flushing for immediate feedback
LANGFUSE_FLUSH_AT=5
LANGFUSE_FLUSH_INTERVAL=0.5

# Production: Larger batches for efficiency
LANGFUSE_FLUSH_AT=50
LANGFUSE_FLUSH_INTERVAL=5

# Testing: Immediate flushing
LANGFUSE_FLUSH_AT=1
LANGFUSE_FLUSH_INTERVAL=0.1

# High-throughput: Maximum batching
LANGFUSE_FLUSH_AT=100
LANGFUSE_FLUSH_INTERVAL=10
```

### Batch Constants

```typescript
const MAX_BATCH_SIZE = 100;     // Maximum scores per API call
const MAX_QUEUE_SIZE = 100_000; // Maximum queue size (prevents memory leaks)
```

### Automatic Flushing

Scores are automatically flushed when:

1. **Count threshold reached**: Queue contains `flushAtCount` scores
2. **Time interval elapsed**: `flushIntervalSeconds` have passed since first queued score
3. **Manual flush**: `flush()` or `shutdown()` is called

**Batching Examples:**

```typescript
// Automatic flush by count (LANGFUSE_FLUSH_AT=10)
for (let i = 0; i < 15; i++) {
  langfuse.score.create({
    name: `score-${i}`,
    value: i * 0.1
  });
}
// First 10 scores flushed automatically
// Remaining 5 scores wait for timer or manual flush

// Automatic flush by timer (LANGFUSE_FLUSH_INTERVAL=1)
langfuse.score.create({ name: "score1", value: 0.8 });
// Score queued, timer starts
// After 1 second, score is automatically flushed

// Large batch handling (150 scores)
for (let i = 0; i < 150; i++) {
  langfuse.score.create({
    name: `batch-score-${i}`,
    value: i * 0.01
  });
}
await langfuse.score.flush();
// Sent as 2 batches: 100 + 50 (respects MAX_BATCH_SIZE)

// Queue overflow protection
for (let i = 0; i < 100_001; i++) {
  langfuse.score.create({
    name: `overflow-${i}`,
    value: 1
  });
}
// Score #100,001 is dropped with error log
// Prevents memory exhaustion
```

### Flush Timer Management

```typescript
// Timer is created when first score is added to empty queue
langfuse.score.create({ name: "first", value: 1 });
// Timer starts

// Subsequent scores don't create new timers
langfuse.score.create({ name: "second", value: 2 });
langfuse.score.create({ name: "third", value: 3 });
// Same timer continues

// Timer is cleared when flush completes
await langfuse.score.flush();
// Timer cleared, queue empty

// New score starts new timer
langfuse.score.create({ name: "fourth", value: 4 });
// New timer starts
```

### Concurrent Flush Handling

```typescript
// Multiple concurrent flush calls are deduplicated
langfuse.score.create({ name: "test", value: 1 });

const flush1 = langfuse.score.flush();
const flush2 = langfuse.score.flush();
const flush3 = langfuse.score.flush();

await Promise.all([flush1, flush2, flush3]);
// Only one actual API call is made
// All promises resolve when flush completes
```

## Advanced Usage

### Experiment Integration

Use scores within experiments for automated evaluation.

```typescript
import { LangfuseClient } from '@langfuse/client';
import { startObservation } from '@langfuse/tracing';

const langfuse = new LangfuseClient();

// Define experiment with scoring
const result = await langfuse.experiment.run({
  name: "prompt-optimization",
  data: dataset.items,
  task: async (item) => {
    const span = startObservation({ name: "task" });

    const output = await runModel(item.input);

    // Score the observation
    langfuse.score.observation(
      { otelSpan: span },
      {
        name: "task_quality",
        value: await evaluateQuality(output),
        dataType: "NUMERIC"
      }
    );

    span.end();
    return output;
  },
  evaluators: [
    async ({ output, expectedOutput }) => {
      // Return evaluation scores
      return {
        name: "accuracy",
        value: calculateAccuracy(output, expectedOutput),
        dataType: "NUMERIC"
      };
    }
  ]
});

// Scores are automatically associated with dataset run
await langfuse.score.flush();
```

### Multi-Criteria Scoring

Score multiple aspects of a single operation.

```typescript
import { startObservation } from '@langfuse/tracing';

const span = startObservation({ name: "llm-generation" });

const response = await generateResponse(prompt);

// Score multiple criteria
langfuse.score.observation(
  { otelSpan: span },
  {
    name: "accuracy",
    value: 0.92,
    dataType: "NUMERIC"
  }
);

langfuse.score.observation(
  { otelSpan: span },
  {
    name: "relevance",
    value: 0.88,
    dataType: "NUMERIC"
  }
);

langfuse.score.observation(
  { otelSpan: span },
  {
    name: "completeness",
    value: 1,
    dataType: "BOOLEAN"
  }
);

langfuse.score.observation(
  { otelSpan: span },
  {
    name: "tone",
    value: "professional",
    dataType: "CATEGORICAL"
  }
);

span.end();
```

### Conditional Scoring

Apply scores based on runtime conditions.

```typescript
import { startObservation } from '@langfuse/tracing';

const span = startObservation({ name: "conditional-scoring" });

const result = await processRequest(request);

// Conditional scoring based on result
if (result.needsReview) {
  langfuse.score.observation(
    { otelSpan: span },
    {
      name: "requires_human_review",
      value: 1,
      dataType: "BOOLEAN",
      comment: "Flagged for manual review"
    }
  );
}

if (result.confidence < 0.7) {
  langfuse.score.observation(
    { otelSpan: span },
    {
      name: "low_confidence",
      value: result.confidence,
      dataType: "NUMERIC",
      comment: `Confidence below threshold: ${result.confidence}`
    }
  );
}

// Quality tier scoring
const tier = result.score > 0.9 ? "excellent" :
             result.score > 0.7 ? "good" :
             result.score > 0.5 ? "fair" : "poor";

langfuse.score.observation(
  { otelSpan: span },
  {
    name: "quality_tier",
    value: tier,
    dataType: "CATEGORICAL",
    metadata: { rawScore: result.score }
  }
);

span.end();
```

### Score Config Integration

Use score configs to enforce constraints and standards.
1295
1296
```typescript
1297
// Create score with config reference
1298
langfuse.score.create({
1299
name: "quality",
1300
value: 0.85,
1301
dataType: "NUMERIC",
1302
configId: "quality-config-v1",
1303
traceId: "trace-123"
1304
});
1305
// Score must comply with config's min/max values
1306
1307
// Categorical score with config
1308
langfuse.score.create({
1309
name: "sentiment",
1310
value: "positive",
1311
dataType: "CATEGORICAL",
1312
configId: "sentiment-config",
1313
traceId: "trace-456"
1314
});
1315
// Value must match one of the config's categories
1316
1317
// Boolean score with config
1318
langfuse.score.create({
1319
name: "passes_safety_check",
1320
value: 1,
1321
dataType: "BOOLEAN",
1322
configId: "safety-check-config",
1323
traceId: "trace-789"
1324
});
1325
// Ensures consistent naming and interpretation
1326
```
1327
1328
### Async Scoring Patterns
1329
1330
Handle scoring in asynchronous workflows.
1331
1332
```typescript
1333
// Deferred scoring after async evaluation
1334
async function scoreAfterEvaluation(traceId: string, output: string) {
1335
// Trigger async evaluation (doesn't block)
1336
const evaluationPromise = evaluateWithExternalService(output);
1337
1338
// Continue processing
1339
await continueWorkflow();
1340
1341
// Wait for evaluation and score
1342
const evaluation = await evaluationPromise;
1343
1344
langfuse.score.create({
1345
name: "external_evaluation",
1346
value: evaluation.score,
1347
traceId,
1348
metadata: { evaluator: "external-service" }
1349
});
1350
}
1351
1352
// Background scoring worker
1353
const scoringQueue: Array<() => Promise<void>> = [];
1354
1355
function queueScoring(fn: () => Promise<void>) {
1356
scoringQueue.push(fn);
1357
}
1358
1359
async function processScoringQueue() {
1360
while (scoringQueue.length > 0) {
1361
const scoreFn = scoringQueue.shift();
1362
try {
1363
await scoreFn?.();
1364
} catch (error) {
1365
console.error("Scoring error:", error);
1366
}
1367
}
1368
}
1369
1370
// Queue scores for later processing
1371
queueScoring(async () => {
1372
langfuse.score.create({
1373
name: "delayed_score",
1374
value: 0.9,
1375
traceId: "trace-123"
1376
});
1377
});
1378
1379
// Process queue periodically
1380
setInterval(processScoringQueue, 5000);
1381
```
1382
1383
### Error Handling
1384
1385
Handle errors gracefully during scoring operations.
1386
1387
```typescript
1388
// Safe scoring wrapper
1389
function safeScore(scoreData: ScoreBody) {
1390
try {
1391
langfuse.score.create(scoreData);
1392
} catch (error) {
1393
console.error("Failed to create score:", error);
1394
// Log to error tracking service
1395
errorTracker.capture(error, { context: "scoring" });
1396
}
1397
}
1398
1399
// Retry logic for critical scores
1400
async function scoreWithRetry(
1401
scoreData: ScoreBody,
1402
maxRetries: number = 3
1403
) {
1404
for (let attempt = 1; attempt <= maxRetries; attempt++) {
1405
try {
1406
langfuse.score.create(scoreData);
1407
await langfuse.score.flush();
1408
return; // Success
1409
} catch (error) {
1410
console.error(`Score attempt ${attempt} failed:`, error);
1411
1412
if (attempt === maxRetries) {
1413
// Final failure - log and continue
1414
console.error("Score permanently failed after retries");
1415
} else {
1416
// Wait before retry
1417
await new Promise(resolve =>
1418
setTimeout(resolve, 1000 * attempt)
1419
);
1420
}
1421
}
1422
}
1423
}
1424
1425
// Graceful degradation
1426
async function scoreWithFallback(
1427
primary: ScoreBody,
1428
fallback: ScoreBody
1429
) {
1430
try {
1431
langfuse.score.create(primary);
1432
await langfuse.score.flush();
1433
} catch (error) {
1434
console.warn("Primary score failed, using fallback");
1435
langfuse.score.create(fallback);
1436
}
1437
}
1438
```
1439
1440
## Best Practices
1441
1442
### 1. Choose Appropriate Data Types
1443
1444
```typescript
1445
// Use NUMERIC for continuous values
1446
langfuse.score.create({
1447
name: "confidence",
1448
value: 0.87,
1449
dataType: "NUMERIC"
1450
});
1451
1452
// Use BOOLEAN for binary decisions
1453
langfuse.score.create({
1454
name: "approved",
1455
value: 1,
1456
dataType: "BOOLEAN"
1457
});
1458
1459
// Use CATEGORICAL for discrete labels
1460
langfuse.score.create({
1461
name: "quality_tier",
1462
value: "premium",
1463
dataType: "CATEGORICAL"
1464
});
1465
```
1466
1467
### 2. Provide Meaningful Comments
1468
1469
```typescript
1470
// Add context to scores
1471
langfuse.score.create({
1472
name: "quality",
1473
value: 0.65,
1474
comment: "Below target due to missing context in retrieval",
1475
metadata: {
1476
target: 0.8,
1477
reason: "insufficient_context"
1478
}
1479
});
1480
```
1481
1482
### 3. Use Metadata Effectively
1483
1484
```typescript
1485
// Rich metadata for debugging and analysis
1486
langfuse.score.create({
1487
name: "response_quality",
1488
value: 0.9,
1489
metadata: {
1490
model: "gpt-4",
1491
temperature: 0.7,
1492
promptVersion: "v2.1",
1493
tokenCount: 450,
1494
latency: 1250,
1495
evaluator: "llm-as-judge",
1496
criteria: ["accuracy", "completeness", "clarity"]
1497
}
1498
});
1499
```
1500
1501
### 4. Flush Appropriately
1502
1503
```typescript
1504
// Flush before critical operations
1505
await langfuse.score.flush();
1506
1507
// Always flush on shutdown
1508
process.on("SIGTERM", async () => {
1509
await langfuse.score.shutdown();
1510
});
1511
1512
// Don't flush after every score (defeats batching)
1513
// ❌ Bad
1514
langfuse.score.create({ name: "score", value: 1 });
1515
await langfuse.score.flush(); // Too frequent
1516
1517
// ✅ Good
1518
langfuse.score.create({ name: "score1", value: 1 });
1519
langfuse.score.create({ name: "score2", value: 2 });
1520
langfuse.score.create({ name: "score3", value: 3 });
1521
await langfuse.score.flush(); // Batch flush
1522
```
1523
1524
### 5. Use Active Context Methods
1525
1526
```typescript
1527
// Prefer active context methods when possible
1528
startActiveSpan({ name: "operation" }, async (span) => {
1529
// Cleaner than passing span around
1530
langfuse.score.activeObservation({
1531
name: "quality",
1532
value: 0.9
1533
});
1534
1535
span.end();
1536
});
1537
```
1538
1539
### 6. Configure for Your Environment
1540
1541
```bash
1542
# Development
1543
LANGFUSE_FLUSH_AT=5
1544
LANGFUSE_FLUSH_INTERVAL=0.5
1545
1546
# Production
1547
LANGFUSE_FLUSH_AT=50
1548
LANGFUSE_FLUSH_INTERVAL=5
1549
1550
# Testing
1551
LANGFUSE_FLUSH_AT=1
1552
LANGFUSE_FLUSH_INTERVAL=0.1
1553
```
1554
1555
### 7. Handle Missing Context
1556
1557
```typescript
1558
// Check for active span before scoring
1559
import { trace } from "@opentelemetry/api";
1560
1561
const activeSpan = trace.getActiveSpan();
1562
if (activeSpan) {
1563
langfuse.score.activeObservation({
1564
name: "quality",
1565
value: 0.9
1566
});
1567
} else {
1568
console.warn("No active span, skipping score");
1569
}
1570
```
1571
1572
## Performance Considerations
1573
1574
### Batching Efficiency
1575
1576
- Default batch size of 10 balances latency and efficiency
1577
- Increase `LANGFUSE_FLUSH_AT` for high-throughput scenarios
1578
- Decrease for real-time feedback requirements
1579
1580
### Memory Management
1581
1582
- Queue capped at 100,000 scores to prevent memory leaks
1583
- Scores are removed from queue after successful flush
1584
- Consider manual flushing in long-running processes
1585
1586
### Network Optimization
1587
1588
- Batching reduces API calls by up to 100x
1589
- Concurrent batch uploads for large flushes
1590
- Failed batches don't block other batches
1591
1592
### Best Performance Configuration
1593
1594
```bash
1595
# High-throughput production
1596
LANGFUSE_FLUSH_AT=100
1597
LANGFUSE_FLUSH_INTERVAL=10
1598
1599
# Real-time feedback
1600
LANGFUSE_FLUSH_AT=5
1601
LANGFUSE_FLUSH_INTERVAL=1
1602
1603
# Balanced (default)
1604
LANGFUSE_FLUSH_AT=10
1605
LANGFUSE_FLUSH_INTERVAL=1
1606
```
1607
1608
## Migration Examples
1609
1610
### From Manual Score Tracking
1611
1612
**Before:**
1613
1614
```typescript
1615
const scores = [];
1616
1617
function recordScore(name: string, value: number) {
1618
scores.push({ name, value, timestamp: Date.now() });
1619
}
1620
1621
async function sendScores() {
1622
await fetch("/api/scores", {
1623
method: "POST",
1624
body: JSON.stringify(scores)
1625
});
1626
scores.length = 0;
1627
}
1628
```
1629
1630
**After:**
1631
1632
```typescript
1633
langfuse.score.create({
1634
name: "quality",
1635
value: 0.9,
1636
traceId: "trace-123"
1637
});
1638
1639
// Automatic batching and flushing
1640
await langfuse.score.flush();
1641
```
1642
1643
### From Synchronous Scoring
1644
1645
**Before:**
1646
1647
```typescript
1648
function scoreOperation(result: any) {
1649
const score = calculateScore(result);
1650
1651
// Blocking API call
1652
await sendScoreToAPI({
1653
name: "quality",
1654
value: score
1655
});
1656
}
1657
```
1658
1659
**After:**
1660
1661
```typescript
1662
function scoreOperation(result: any) {
1663
const score = calculateScore(result);
1664
1665
// Non-blocking, queued for batch send
1666
langfuse.score.create({
1667
name: "quality",
1668
value: score,
1669
traceId: result.traceId
1670
});
1671
}
1672
```
1673
1674
## TypeScript Support
1675
1676
Full type safety for all scoring operations.
1677
1678
```typescript
1679
import type { ScoreBody, ScoreDataType } from '@langfuse/core';
1680
1681
// Type-safe score creation
1682
const scoreData: ScoreBody = {
1683
name: "quality",
1684
value: 0.85,
1685
dataType: "NUMERIC",
1686
traceId: "trace-123"
1687
};
1688
1689
langfuse.score.create(scoreData);
1690
1691
// Generic scoring function
1692
function createTypedScore<T extends ScoreBody>(data: T): void {
1693
langfuse.score.create(data);
1694
}
1695
1696
// Type guards
1697
function isNumericScore(value: number | string): value is number {
1698
return typeof value === "number";
1699
}
1700
1701
function createScore(name: string, value: number | string) {
1702
langfuse.score.create({
1703
name,
1704
value,
1705
dataType: isNumericScore(value) ? "NUMERIC" : "CATEGORICAL"
1706
});
1707
}
1708
```
1709