Langfuse API client for universal JavaScript environments providing observability, prompt management, datasets, experiments, and scoring capabilities
The Score Management system provides comprehensive capabilities for creating, batching, and managing evaluation scores for traces and observations. It supports automatic batching, multiple data types, OpenTelemetry integration, and flexible scoring strategies with efficient queue management.
Create a score event and add it to the processing queue for batched submission.
/**
* Creates a new score event and adds it to the processing queue
*
* Scores are queued and sent in batches for efficiency. The score will be
* automatically sent when the queue reaches the flush threshold or after
* the flush interval expires.
*
* Batching behavior:
* - Automatic flush when queue reaches flushAtCount (default: 10, configurable via LANGFUSE_FLUSH_AT)
* - Time-based flush after flushIntervalSeconds (default: 1s, configurable via LANGFUSE_FLUSH_INTERVAL)
* - Maximum batch size: 100 scores per API call
* - Maximum queue size: 100,000 scores (prevents memory leaks)
*
* @param data - The score data to create
*/
create(data: ScoreBody): void;
interface ScoreBody {
/** Optional unique identifier for the score (auto-generated if not provided) */
id?: string;
/** Trace ID to associate the score with */
traceId?: string;
/** Session ID to associate the score with */
sessionId?: string;
/** Observation/span ID to associate the score with */
observationId?: string;
/** Dataset run ID for experiment scoring */
datasetRunId?: string;
/** Name of the score (e.g., "quality", "accuracy", "relevance") */
name: string;
/** Environment tag (defaults to LANGFUSE_TRACING_ENVIRONMENT) */
environment?: string;
/**
* The value of the score
* - Numeric scores: number (e.g., 0.85, 4.5)
* - Boolean scores: 1 or 0 (true or false)
* - Categorical scores: string (e.g., "excellent", "good", "poor")
*/
value: number | string;
/** Optional comment explaining the score */
comment?: string;
/** Optional metadata object with additional context */
metadata?: unknown;
/**
* Data type of the score
* When set, must match the score value's type
* If not set, will be inferred from the score value or config
*/
dataType?: "NUMERIC" | "BOOLEAN" | "CATEGORICAL";
/**
* Reference to a score config
* When set, the score name must equal the config name
* Scores must comply with the config's range and data type
* For categorical scores, the value must map to a config category
* Numeric scores might be constrained by the config's max and min values
*/
configId?: string;
}

Usage Examples:
import { LangfuseClient } from '@langfuse/client';
const langfuse = new LangfuseClient();
// Basic numeric score
langfuse.score.create({
name: "quality",
value: 0.85,
traceId: "trace-123",
comment: "High quality response"
});
// Numeric score with explicit data type
langfuse.score.create({
name: "accuracy",
value: 0.92,
dataType: "NUMERIC",
traceId: "trace-456",
metadata: {
model: "gpt-4",
version: "1.0"
}
});
// Boolean score (1 = true, 0 = false)
langfuse.score.create({
name: "hallucination",
value: 0,
dataType: "BOOLEAN",
traceId: "trace-789",
comment: "No hallucinations detected"
});
// Categorical score
langfuse.score.create({
name: "sentiment",
value: "positive",
dataType: "CATEGORICAL",
traceId: "trace-abc",
observationId: "span-xyz"
});
// Score with custom ID and environment
langfuse.score.create({
id: "custom-score-id",
name: "user_satisfaction",
value: 4,
traceId: "trace-def",
environment: "production",
metadata: {
userId: "user-123",
timestamp: new Date().toISOString()
}
});
// Score with config reference
langfuse.score.create({
name: "correctness",
value: "partially correct",
dataType: "CATEGORICAL",
configId: "config-123",
traceId: "trace-ghi",
comment: "Answer was mostly correct but lacked details"
});
// Session-level score
langfuse.score.create({
name: "session_quality",
value: 0.78,
sessionId: "session-456"
});
// Dataset run score (for experiments)
langfuse.score.create({
name: "test_accuracy",
value: 0.95,
datasetRunId: "run-789"
});
// Complex metadata example
langfuse.score.create({
name: "performance",
value: 0.88,
traceId: "trace-jkl",
comment: "Strong performance across all metrics",
metadata: {
model: "gpt-4-turbo",
latency_ms: 1250,
token_count: 450,
cost_usd: 0.025,
evaluation_method: "llm-as-judge",
criteria: ["accuracy", "completeness", "clarity"]
}
});

Create a score for a specific observation using its OpenTelemetry span.
/**
* Creates a score for a specific observation using its OpenTelemetry span
*
* This method automatically extracts the trace ID and observation ID from
* the provided span context, eliminating the need to manually track IDs.
*
* @param observation - Object containing the OpenTelemetry span
* @param data - Score data (traceId, observationId, sessionId, and datasetRunId are auto-populated)
*/
observation(
observation: { otelSpan: Span },
data: Omit<ScoreBody, "traceId" | "sessionId" | "observationId" | "datasetRunId">
): void;

Usage Examples:
import { LangfuseClient } from '@langfuse/client';
import { startObservation } from '@langfuse/tracing';
const langfuse = new LangfuseClient();
// Score an observation with OpenTelemetry integration
const span = startObservation({ name: "llm-call" });
// Perform operation
const result = await callLLM();
// Score the observation
langfuse.score.observation(
{ otelSpan: span },
{
name: "response_quality",
value: 0.92,
comment: "Excellent response quality"
}
);
span.end();
// Score with metadata
const analysisSpan = startObservation({ name: "document-analysis" });
const analysis = await analyzeDocument(document);
langfuse.score.observation(
{ otelSpan: analysisSpan },
{
name: "accuracy",
value: 0.87,
dataType: "NUMERIC",
metadata: {
documentLength: document.length,
processingTime: Date.now() - startTime,
model: "gpt-4"
}
}
);
analysisSpan.end();
// Boolean score for observation
const validationSpan = startObservation({ name: "validation" });
const isValid = await validateOutput(output);
langfuse.score.observation(
{ otelSpan: validationSpan },
{
name: "validation_passed",
value: isValid ? 1 : 0,
dataType: "BOOLEAN"
}
);
validationSpan.end();
// Categorical score for observation
const classificationSpan = startObservation({ name: "classify-intent" });
const intent = await classifyIntent(userMessage);
langfuse.score.observation(
{ otelSpan: classificationSpan },
{
name: "intent_category",
value: intent,
dataType: "CATEGORICAL",
metadata: {
confidence: 0.95,
alternatives: ["support", "sales"]
}
}
);
classificationSpan.end();

Create a score for a trace using an OpenTelemetry span.
/**
* Creates a score for a trace using an OpenTelemetry span
*
* This method automatically extracts the trace ID from the provided
* span context and creates a trace-level score (not observation-specific).
*
* @param observation - Object containing the OpenTelemetry span
* @param data - Score data (traceId, sessionId, observationId, and datasetRunId are auto-populated)
*/
trace(
observation: { otelSpan: Span },
data: Omit<ScoreBody, "traceId" | "sessionId" | "observationId" | "datasetRunId">
): void;

Usage Examples:
import { LangfuseClient } from '@langfuse/client';
import { startObservation } from '@langfuse/tracing';
const langfuse = new LangfuseClient();
// Score a trace (trace-level evaluation)
const span = startObservation({ name: "user-query-pipeline" });
// Execute multi-step pipeline
const retrieval = await retrieveContext(query);
const generation = await generateResponse(query, retrieval);
const validation = await validateResponse(generation);
// Score the entire trace (not just one observation)
langfuse.score.trace(
{ otelSpan: span },
{
name: "overall_quality",
value: 0.88,
comment: "Good overall quality with minor improvements needed"
}
);
span.end();
// Multiple trace-level scores
const workflowSpan = startObservation({ name: "customer-support-workflow" });
const conversation = await handleConversation(user);
// Score different aspects of the trace
langfuse.score.trace(
{ otelSpan: workflowSpan },
{
name: "user_satisfaction",
value: 4,
dataType: "NUMERIC",
comment: "User rated 4 out of 5 stars"
}
);
langfuse.score.trace(
{ otelSpan: workflowSpan },
{
name: "issue_resolved",
value: 1,
dataType: "BOOLEAN"
}
);
langfuse.score.trace(
{ otelSpan: workflowSpan },
{
name: "conversation_tone",
value: "professional",
dataType: "CATEGORICAL"
}
);
workflowSpan.end();
// Trace score with experiment metadata
const experimentSpan = startObservation({ name: "prompt-variant-test" });
const response = await testPromptVariant(input, variantConfig);
langfuse.score.trace(
{ otelSpan: experimentSpan },
{
name: "variant_performance",
value: 0.91,
metadata: {
variantId: "v2",
temperature: 0.7,
model: "gpt-4",
comparison_baseline: 0.85
}
}
);
experimentSpan.end();

Create a score for the currently active observation in the OpenTelemetry context.
/**
* Creates a score for the currently active observation
*
* This method automatically detects the active OpenTelemetry span and
* creates an observation-level score. If no active span is found,
* a warning is logged and the operation is skipped.
*
* This is useful when you don't have direct access to the span object
* but are within an active span context.
*
* @param data - Score data (traceId, observationId, sessionId, and datasetRunId are auto-populated)
*/
activeObservation(
data: Omit<ScoreBody, "traceId" | "sessionId" | "observationId" | "datasetRunId">
): void;

Usage Examples:
import { LangfuseClient } from '@langfuse/client';
import { startActiveSpan } from '@langfuse/tracing';
const langfuse = new LangfuseClient();
// Score within an active span context
startActiveSpan({ name: "process-document" }, async (span) => {
const result = await processDocument(document);
// Score the active observation (no need to pass span)
langfuse.score.activeObservation({
name: "processing_quality",
value: 0.9,
comment: "High quality processing"
});
span.end();
});
// Nested spans with active scoring
startActiveSpan({ name: "parent-operation" }, async (parentSpan) => {
startActiveSpan({ name: "child-operation" }, async (childSpan) => {
// This scores the child-operation (currently active span)
langfuse.score.activeObservation({
name: "child_accuracy",
value: 0.95
});
childSpan.end();
});
// This scores the parent-operation (now active again)
langfuse.score.activeObservation({
name: "parent_completeness",
value: 0.88
});
parentSpan.end();
});
// Use in middleware or callbacks
async function evaluateResponse(response: string) {
// Assumes this is called within an active span context
const quality = await assessQuality(response);
langfuse.score.activeObservation({
name: "response_quality",
value: quality.score,
metadata: {
metrics: quality.metrics,
evaluator: "llm-judge"
}
});
}
// In an async context manager
async function withScoring<T>(
operation: () => Promise<T>,
scoreName: string
): Promise<T> {
return startActiveSpan({ name: "scored-operation" }, async (span) => {
try {
const result = await operation();
langfuse.score.activeObservation({
name: scoreName,
value: 1,
dataType: "BOOLEAN",
comment: "Operation completed successfully"
});
return result;
} catch (error) {
langfuse.score.activeObservation({
name: scoreName,
value: 0,
dataType: "BOOLEAN",
comment: `Operation failed: ${error.message}`
});
throw error;
} finally {
span.end();
}
});
}
// Usage with error handling
startActiveSpan({ name: "risky-operation" }, async (span) => {
try {
await performRiskyOperation();
langfuse.score.activeObservation({
name: "success",
value: 1,
dataType: "BOOLEAN"
});
} catch (error) {
langfuse.score.activeObservation({
name: "success",
value: 0,
dataType: "BOOLEAN",
comment: error.message
});
} finally {
span.end();
}
});

Create a score for the currently active trace in the OpenTelemetry context.
/**
* Creates a score for the currently active trace
*
* This method automatically detects the active OpenTelemetry span and
* creates a trace-level score. If no active span is found,
* a warning is logged and the operation is skipped.
*
* This is useful for scoring the entire trace from within any nested
* span context without needing to track the root span.
*
* @param data - Score data (traceId, sessionId, observationId, and datasetRunId are auto-populated)
*/
activeTrace(
data: Omit<ScoreBody, "traceId" | "sessionId" | "observationId" | "datasetRunId">
): void;

Usage Examples:
import { LangfuseClient } from '@langfuse/client';
import { startActiveSpan } from '@langfuse/tracing';
const langfuse = new LangfuseClient();
// Score trace from within any span
startActiveSpan({ name: "main-workflow" }, async (span) => {
await step1();
await step2();
await step3();
// Score the entire trace (not just this span)
langfuse.score.activeTrace({
name: "workflow_success",
value: 1,
dataType: "BOOLEAN"
});
span.end();
});
// Score trace from nested operations
startActiveSpan({ name: "parent" }, async (parentSpan) => {
startActiveSpan({ name: "child" }, async (childSpan) => {
// Score the entire trace from within child span
langfuse.score.activeTrace({
name: "overall_quality",
value: 0.92,
comment: "Excellent overall execution"
});
childSpan.end();
});
parentSpan.end();
});
// User feedback collection
async function collectUserFeedback(userId: string, rating: number) {
// Assumes called within an active trace context
langfuse.score.activeTrace({
name: "user_satisfaction",
value: rating,
dataType: "NUMERIC",
metadata: {
userId,
timestamp: new Date().toISOString(),
source: "in-app-feedback"
}
});
}
// Post-execution trace evaluation
startActiveSpan({ name: "ai-assistant-conversation" }, async (span) => {
const conversation = await handleUserConversation(user);
// Evaluate entire conversation
const evaluation = await evaluateConversation(conversation);
// Score the trace based on evaluation
langfuse.score.activeTrace({
name: "conversation_quality",
value: evaluation.overallScore,
comment: evaluation.feedback,
metadata: {
metrics: evaluation.metrics,
duration: conversation.duration,
turns: conversation.turns.length
}
});
span.end();
});
// Multi-criteria trace scoring
startActiveSpan({ name: "document-processing-pipeline" }, async (span) => {
const result = await processPipeline(document);
// Score multiple aspects of the trace
langfuse.score.activeTrace({
name: "accuracy",
value: result.accuracy,
dataType: "NUMERIC"
});
langfuse.score.activeTrace({
name: "completeness",
value: result.isComplete ? 1 : 0,
dataType: "BOOLEAN"
});
langfuse.score.activeTrace({
name: "quality_tier",
value: result.qualityTier,
dataType: "CATEGORICAL"
});
span.end();
});

Flush all pending score events to the Langfuse API immediately.
/**
* Flushes all pending score events to the Langfuse API
*
* This method ensures all queued scores are sent immediately rather than
* waiting for the automatic flush interval or batch size threshold.
*
* Batching behavior during flush:
* - Scores are sent in batches of up to 100
* - Multiple batches are sent concurrently
* - All batches must complete before flush resolves
*
* @returns Promise that resolves when all pending scores have been sent
*/
flush(): Promise<void>;

Usage Examples:
import { LangfuseClient } from '@langfuse/client';
const langfuse = new LangfuseClient();
// Manual flush after creating scores
langfuse.score.create({
name: "quality",
value: 0.8,
traceId: "trace-123"
});
langfuse.score.create({
name: "accuracy",
value: 0.9,
traceId: "trace-123"
});
// Ensure scores are sent immediately
await langfuse.score.flush();
// Flush before critical operations
async function processWithScoring(data: any) {
langfuse.score.create({
name: "preprocessing",
value: 1,
traceId: data.traceId
});
// Ensure score is sent before proceeding
await langfuse.score.flush();
return await criticalOperation(data);
}
// Flush in testing
describe("scoring tests", () => {
afterEach(async () => {
// Ensure all scores are sent after each test
await langfuse.score.flush();
});
it("should score correctly", async () => {
langfuse.score.create({ name: "test", value: 1 });
await langfuse.score.flush();
// Verify score was sent
});
});
// Flush with error handling
async function safeFlush() {
try {
await langfuse.score.flush();
console.log("Scores flushed successfully");
} catch (error) {
console.error("Failed to flush scores:", error);
// Scores remain in queue and will retry on next flush
}
}
// Periodic flushing in long-running processes
setInterval(async () => {
await langfuse.score.flush();
}, 60000); // Flush every minute
// Flush before application exit
process.on("SIGTERM", async () => {
console.log("Flushing scores before shutdown...");
await langfuse.score.flush();
process.exit(0);
});
// Flush in batch processing
async function processBatch(items: any[]) {
for (const item of items) {
await processItem(item);
langfuse.score.create({
name: "item_processed",
value: 1,
metadata: { itemId: item.id }
});
}
// Flush after batch completion
await langfuse.score.flush();
}

Gracefully shut down the score manager by flushing all pending scores.
/**
* Gracefully shuts down the score manager by flushing all pending scores
*
* This method should be called before your application exits to ensure
* all score data is sent to Langfuse. It internally calls flush() and
* waits for completion.
*
* @returns Promise that resolves when shutdown is complete
*/
shutdown(): Promise<void>;

Usage Examples:
import { LangfuseClient } from '@langfuse/client';
const langfuse = new LangfuseClient();
// Graceful shutdown before exit
async function gracefulShutdown() {
console.log("Shutting down...");
// Flush all pending scores
await langfuse.score.shutdown();
console.log("Shutdown complete");
process.exit(0);
}
// Handle process signals
process.on("SIGTERM", gracefulShutdown);
process.on("SIGINT", gracefulShutdown);
// Shutdown in application cleanup
async function cleanupApplication() {
// Close database connections
await db.close();
// Flush scores before exit
await langfuse.score.shutdown();
// Close other resources
await cache.disconnect();
}
// Shutdown with timeout
async function shutdownWithTimeout(timeoutMs: number = 5000) {
const timeout = new Promise((_, reject) =>
setTimeout(() => reject(new Error("Shutdown timeout")), timeoutMs)
);
try {
await Promise.race([
langfuse.score.shutdown(),
timeout
]);
console.log("Score manager shutdown successfully");
} catch (error) {
console.error("Shutdown error:", error);
// Force exit if timeout
}
}
// Shutdown in tests
afterAll(async () => {
await langfuse.score.shutdown();
});
// Shutdown in serverless functions
export async function handler(event: any) {
try {
// Process request and create scores
langfuse.score.create({
name: "request_handled",
value: 1
});
return { statusCode: 200, body: "Success" };
} finally {
// Ensure scores are sent before function terminates
await langfuse.score.shutdown();
}
}
// Shutdown with error handling
async function safeShutdown() {
try {
await langfuse.score.shutdown();
console.log("Scores flushed successfully");
} catch (error) {
console.error("Error during shutdown:", error);
// Log error but continue shutdown
}
}
// Shutdown in Docker container
process.on("SIGTERM", async () => {
console.log("SIGTERM received, starting graceful shutdown");
// Stop accepting new requests
server.close();
// Flush pending scores
await langfuse.score.shutdown();
console.log("Graceful shutdown complete");
process.exit(0);
});

Enumeration of supported score data types.
/**
* Score data types supported by Langfuse
*/
type ScoreDataType = "NUMERIC" | "BOOLEAN" | "CATEGORICAL";
// Constants for convenience
const ScoreDataType = {
Numeric: "NUMERIC",
Boolean: "BOOLEAN",
Categorical: "CATEGORICAL",
} as const;

Data Type Details:
NUMERIC: Numerical values (integers or floats)
BOOLEAN: Binary values represented as 1 (true) or 0 (false)
CATEGORICAL: String labels for classification
Usage Examples:
import { ScoreDataType } from '@langfuse/core';
// Numeric score
langfuse.score.create({
name: "quality_score",
value: 0.87,
dataType: ScoreDataType.Numeric,
traceId: "trace-123"
});
// Boolean score
langfuse.score.create({
name: "validation_passed",
value: 1,
dataType: ScoreDataType.Boolean,
traceId: "trace-456"
});
// Categorical score
langfuse.score.create({
name: "sentiment",
value: "positive",
dataType: ScoreDataType.Categorical,
traceId: "trace-789"
});
// Type inference (dataType can be omitted)
langfuse.score.create({
name: "auto_numeric",
value: 0.5, // Inferred as NUMERIC
traceId: "trace-abc"
});
langfuse.score.create({
name: "auto_categorical",
value: "excellent", // Inferred as CATEGORICAL
traceId: "trace-def"
});
// Use with constants
const SCORE_TYPES = {
QUALITY: { name: "quality", dataType: ScoreDataType.Numeric },
VALID: { name: "is_valid", dataType: ScoreDataType.Boolean },
TIER: { name: "quality_tier", dataType: ScoreDataType.Categorical }
};
langfuse.score.create({
...SCORE_TYPES.QUALITY,
value: 0.92,
traceId: "trace-ghi"
});

Union type for score values supporting both numeric and string types.
/**
* The value of the score
* - Numeric for NUMERIC and BOOLEAN data types
* - String for CATEGORICAL data type
*/
type CreateScoreValue = number | string;

Usage Examples:
// Numeric values
const numericValue: CreateScoreValue = 0.85;
const integerValue: CreateScoreValue = 5;
const negativeValue: CreateScoreValue = -0.2;
// String values
const categoricalValue: CreateScoreValue = "excellent";
const sentimentValue: CreateScoreValue = "positive";
// Type-safe score creation
function createTypedScore(
name: string,
value: CreateScoreValue,
traceId: string
) {
langfuse.score.create({ name, value, traceId });
}
createTypedScore("quality", 0.9, "trace-123");
createTypedScore("sentiment", "positive", "trace-456");

The Score Manager implements efficient batching to optimize API usage and performance.
Configure batching behavior via environment variables or use defaults:
// Environment variables
LANGFUSE_FLUSH_AT=10 // Flush after this many scores (default: 10)
LANGFUSE_FLUSH_INTERVAL=1 // Flush after this many seconds (default: 1)

Configuration Examples:
# Development: Frequent flushing for immediate feedback
LANGFUSE_FLUSH_AT=5
LANGFUSE_FLUSH_INTERVAL=0.5
# Production: Larger batches for efficiency
LANGFUSE_FLUSH_AT=50
LANGFUSE_FLUSH_INTERVAL=5
# Testing: Immediate flushing
LANGFUSE_FLUSH_AT=1
LANGFUSE_FLUSH_INTERVAL=0.1
# High-throughput: Maximum batching
LANGFUSE_FLUSH_AT=100
LANGFUSE_FLUSH_INTERVAL=10

const MAX_BATCH_SIZE = 100; // Maximum scores per API call
const MAX_QUEUE_SIZE = 100_000; // Maximum queue size (prevents memory leaks)

Scores are automatically flushed when:
- The queue reaches flushAtCount scores
- flushIntervalSeconds have passed since the first queued score
- flush() or shutdown() is called

Batching Examples:
// Automatic flush by count (LANGFUSE_FLUSH_AT=10)
for (let i = 0; i < 15; i++) {
langfuse.score.create({
name: `score-${i}`,
value: i * 0.1
});
}
// First 10 scores flushed automatically
// Remaining 5 scores wait for timer or manual flush
// Automatic flush by timer (LANGFUSE_FLUSH_INTERVAL=1)
langfuse.score.create({ name: "score1", value: 0.8 });
// Score queued, timer starts
// After 1 second, score is automatically flushed
// Large batch handling (150 scores)
for (let i = 0; i < 150; i++) {
langfuse.score.create({
name: `batch-score-${i}`,
value: i * 0.01
});
}
await langfuse.score.flush();
// Sent as 2 batches: 100 + 50 (respects MAX_BATCH_SIZE)
// Queue overflow protection
for (let i = 0; i < 100_001; i++) {
langfuse.score.create({
name: `overflow-${i}`,
value: 1
});
}
// Score #100,001 is dropped with error log
// Prevents memory exhaustion

// Timer is created when first score is added to empty queue
langfuse.score.create({ name: "first", value: 1 });
// Timer starts
// Subsequent scores don't create new timers
langfuse.score.create({ name: "second", value: 2 });
langfuse.score.create({ name: "third", value: 3 });
// Same timer continues
// Timer is cleared when flush completes
await langfuse.score.flush();
// Timer cleared, queue empty
// New score starts new timer
langfuse.score.create({ name: "fourth", value: 4 });
// New timer starts

// Multiple concurrent flush calls are deduplicated
langfuse.score.create({ name: "test", value: 1 });
const flush1 = langfuse.score.flush();
const flush2 = langfuse.score.flush();
const flush3 = langfuse.score.flush();
await Promise.all([flush1, flush2, flush3]);
// Only one actual API call is made
// All promises resolve when flush completes

Use scores within experiments for automated evaluation.
import { LangfuseClient } from '@langfuse/client';
import { startObservation } from '@langfuse/tracing';
const langfuse = new LangfuseClient();
// Define experiment with scoring
const result = await langfuse.experiment.run({
name: "prompt-optimization",
data: dataset.items,
task: async (item) => {
const span = startObservation({ name: "task" });
const output = await runModel(item.input);
// Score the observation
langfuse.score.observation(
{ otelSpan: span },
{
name: "task_quality",
value: await evaluateQuality(output),
dataType: "NUMERIC"
}
);
span.end();
return output;
},
evaluators: [
async ({ output, expectedOutput }) => {
// Return evaluation scores
return {
name: "accuracy",
value: calculateAccuracy(output, expectedOutput),
dataType: "NUMERIC"
};
}
]
});
// Scores are automatically associated with dataset run
await langfuse.score.flush();

Score multiple aspects of a single operation.
import { startObservation } from '@langfuse/tracing';
const span = startObservation({ name: "llm-generation" });
const response = await generateResponse(prompt);
// Score multiple criteria
langfuse.score.observation(
{ otelSpan: span },
{
name: "accuracy",
value: 0.92,
dataType: "NUMERIC"
}
);
langfuse.score.observation(
{ otelSpan: span },
{
name: "relevance",
value: 0.88,
dataType: "NUMERIC"
}
);
langfuse.score.observation(
{ otelSpan: span },
{
name: "completeness",
value: 1,
dataType: "BOOLEAN"
}
);
langfuse.score.observation(
{ otelSpan: span },
{
name: "tone",
value: "professional",
dataType: "CATEGORICAL"
}
);
span.end();

Apply scores based on runtime conditions.
import { startObservation } from '@langfuse/tracing';
const span = startObservation({ name: "conditional-scoring" });
const result = await processRequest(request);
// Conditional scoring based on result
if (result.needsReview) {
langfuse.score.observation(
{ otelSpan: span },
{
name: "requires_human_review",
value: 1,
dataType: "BOOLEAN",
comment: "Flagged for manual review"
}
);
}
if (result.confidence < 0.7) {
langfuse.score.observation(
{ otelSpan: span },
{
name: "low_confidence",
value: result.confidence,
dataType: "NUMERIC",
comment: `Confidence below threshold: ${result.confidence}`
}
);
}
// Quality tier scoring
const tier = result.score > 0.9 ? "excellent" :
result.score > 0.7 ? "good" :
result.score > 0.5 ? "fair" : "poor";
langfuse.score.observation(
{ otelSpan: span },
{
name: "quality_tier",
value: tier,
dataType: "CATEGORICAL",
metadata: { rawScore: result.score }
}
);
span.end();

Use score configs to enforce constraints and standards.
// Create score with config reference
langfuse.score.create({
name: "quality",
value: 0.85,
dataType: "NUMERIC",
configId: "quality-config-v1",
traceId: "trace-123"
});
// Score must comply with config's min/max values
// Categorical score with config
langfuse.score.create({
name: "sentiment",
value: "positive",
dataType: "CATEGORICAL",
configId: "sentiment-config",
traceId: "trace-456"
});
// Value must match one of the config's categories
// Boolean score with config
langfuse.score.create({
name: "passes_safety_check",
value: 1,
dataType: "BOOLEAN",
configId: "safety-check-config",
traceId: "trace-789"
});
// Ensures consistent naming and interpretation

Handle scoring in asynchronous workflows.
// Deferred scoring after async evaluation
async function scoreAfterEvaluation(traceId: string, output: string) {
// Trigger async evaluation (doesn't block)
const evaluationPromise = evaluateWithExternalService(output);
// Continue processing
await continueWorkflow();
// Wait for evaluation and score
const evaluation = await evaluationPromise;
langfuse.score.create({
name: "external_evaluation",
value: evaluation.score,
traceId,
metadata: { evaluator: "external-service" }
});
}
// Background scoring worker
const scoringQueue: Array<() => Promise<void>> = [];
function queueScoring(fn: () => Promise<void>) {
scoringQueue.push(fn);
}
async function processScoringQueue() {
while (scoringQueue.length > 0) {
const scoreFn = scoringQueue.shift();
try {
await scoreFn?.();
} catch (error) {
console.error("Scoring error:", error);
}
}
}
// Queue scores for later processing
queueScoring(async () => {
langfuse.score.create({
name: "delayed_score",
value: 0.9,
traceId: "trace-123"
});
});
// Process queue periodically
setInterval(processScoringQueue, 5000);

Handle errors gracefully during scoring operations.
// Safe scoring wrapper
function safeScore(scoreData: ScoreBody) {
try {
langfuse.score.create(scoreData);
} catch (error) {
console.error("Failed to create score:", error);
// Log to error tracking service
errorTracker.capture(error, { context: "scoring" });
}
}
// Retry logic for critical scores
async function scoreWithRetry(
scoreData: ScoreBody,
maxRetries: number = 3
) {
for (let attempt = 1; attempt <= maxRetries; attempt++) {
try {
langfuse.score.create(scoreData);
await langfuse.score.flush();
return; // Success
} catch (error) {
console.error(`Score attempt ${attempt} failed:`, error);
if (attempt === maxRetries) {
// Final failure - log and continue
console.error("Score permanently failed after retries");
} else {
// Wait before retry
await new Promise(resolve =>
setTimeout(resolve, 1000 * attempt)
);
}
}
}
}
// Graceful degradation
async function scoreWithFallback(
primary: ScoreBody,
fallback: ScoreBody
) {
try {
langfuse.score.create(primary);
await langfuse.score.flush();
} catch (error) {
console.warn("Primary score failed, using fallback");
langfuse.score.create(fallback);
}
}

// Use NUMERIC for continuous values
langfuse.score.create({
name: "confidence",
value: 0.87,
dataType: "NUMERIC"
});
// Use BOOLEAN for binary decisions
langfuse.score.create({
name: "approved",
value: 1,
dataType: "BOOLEAN"
});
// Use CATEGORICAL for discrete labels
langfuse.score.create({
name: "quality_tier",
value: "premium",
dataType: "CATEGORICAL"
});

// Add context to scores
langfuse.score.create({
name: "quality",
value: 0.65,
comment: "Below target due to missing context in retrieval",
metadata: {
target: 0.8,
reason: "insufficient_context"
}
});

// Rich metadata for debugging and analysis
langfuse.score.create({
name: "response_quality",
value: 0.9,
metadata: {
model: "gpt-4",
temperature: 0.7,
promptVersion: "v2.1",
tokenCount: 450,
latency: 1250,
evaluator: "llm-as-judge",
criteria: ["accuracy", "completeness", "clarity"]
}
});

// Flush before critical operations
await langfuse.score.flush();
// Always flush on shutdown
process.on("SIGTERM", async () => {
await langfuse.score.shutdown();
});
// Don't flush after every score (defeats batching)
// ❌ Bad
langfuse.score.create({ name: "score", value: 1 });
await langfuse.score.flush(); // Too frequent
// ✅ Good
langfuse.score.create({ name: "score1", value: 1 });
langfuse.score.create({ name: "score2", value: 2 });
langfuse.score.create({ name: "score3", value: 3 });
await langfuse.score.flush(); // Batch flush

// Prefer active context methods when possible
startActiveSpan({ name: "operation" }, async (span) => {
// Cleaner than passing span around
langfuse.score.activeObservation({
name: "quality",
value: 0.9
});
span.end();
});

# Development
LANGFUSE_FLUSH_AT=5
LANGFUSE_FLUSH_INTERVAL=0.5
# Production
LANGFUSE_FLUSH_AT=50
LANGFUSE_FLUSH_INTERVAL=5
# Testing
LANGFUSE_FLUSH_AT=1
LANGFUSE_FLUSH_INTERVAL=0.1

// Check for active span before scoring
import { trace } from "@opentelemetry/api";
const activeSpan = trace.getActiveSpan();
if (activeSpan) {
langfuse.score.activeObservation({
name: "quality",
value: 0.9
});
} else {
console.warn("No active span, skipping score");
}

Tune LANGFUSE_FLUSH_AT for high-throughput scenarios:

# High-throughput production
LANGFUSE_FLUSH_AT=100
LANGFUSE_FLUSH_INTERVAL=10
# Real-time feedback
LANGFUSE_FLUSH_AT=5
LANGFUSE_FLUSH_INTERVAL=1
# Balanced (default)
LANGFUSE_FLUSH_AT=10
LANGFUSE_FLUSH_INTERVAL=1

Before:
const scores = [];
function recordScore(name: string, value: number) {
scores.push({ name, value, timestamp: Date.now() });
}
async function sendScores() {
await fetch("/api/scores", {
method: "POST",
body: JSON.stringify(scores)
});
scores.length = 0;
}

After:
langfuse.score.create({
name: "quality",
value: 0.9,
traceId: "trace-123"
});
// Automatic batching and flushing
await langfuse.score.flush();

Before:
async function scoreOperation(result: any) {
const score = calculateScore(result);
// Blocking API call
await sendScoreToAPI({
name: "quality",
value: score
});
}

After:
function scoreOperation(result: any) {
const score = calculateScore(result);
// Non-blocking, queued for batch send
langfuse.score.create({
name: "quality",
value: score,
traceId: result.traceId
});
}

Full type safety for all scoring operations.
import type { ScoreBody, ScoreDataType } from '@langfuse/core';
// Type-safe score creation
const scoreData: ScoreBody = {
name: "quality",
value: 0.85,
dataType: "NUMERIC",
traceId: "trace-123"
};
langfuse.score.create(scoreData);
// Generic scoring function
function createTypedScore<T extends ScoreBody>(data: T): void {
langfuse.score.create(data);
}
// Type guards
function isNumericScore(value: number | string): value is number {
return typeof value === "number";
}
function createScore(name: string, value: number | string) {
langfuse.score.create({
name,
value,
dataType: isNumericScore(value) ? "NUMERIC" : "CATEGORICAL"
});
}

Install with Tessl CLI
npx tessl i tessl/npm-langfuse--client