# Scoring and Evaluation

System for adding scores and evaluations to traces and observations, supporting numeric, categorical, and boolean score types with flexible data structures and UI integration.

## Capabilities

### Observation-Level Scoring

Add scores to specific observations (spans) for detailed evaluation tracking.

```python { .api }
class LangfuseObservationWrapper:
    def score(self, *, name: str, value: Union[float, str], score_id: str = None,
              data_type: ScoreDataType = None, comment: str = None,
              config_id: str = None) -> None:
        """Create score for this specific observation.

        Args:
            name: Score name/metric identifier (e.g., "accuracy", "relevance")
            value: Score value (numeric for NUMERIC/BOOLEAN, string for CATEGORICAL)
            score_id: Optional custom ID for the score (auto-generated if not provided)
            data_type: Score data type (NUMERIC, CATEGORICAL, or BOOLEAN)
            comment: Optional comment or explanation for the score
            config_id: Optional ID of score config defined in Langfuse

        Example:
            span.score(
                name="relevance",
                value=0.85,
                data_type="NUMERIC",
                comment="High relevance to user query"
            )
        """
```

### Trace-Level Scoring

Add scores to entire traces for overall evaluation and quality assessment.

```python { .api }
class LangfuseObservationWrapper:
    def score_trace(self, *, name: str, value: Union[float, str], score_id: str = None,
                    data_type: ScoreDataType = None, comment: str = None,
                    config_id: str = None) -> None:
        """Create score for the entire trace this observation belongs to.

        Args:
            name: Score name for trace-level evaluation
            value: Score value (numeric for NUMERIC/BOOLEAN, string for CATEGORICAL)
            score_id: Optional custom ID for the score
            data_type: Score data type (NUMERIC, CATEGORICAL, or BOOLEAN)
            comment: Optional comment explaining the trace-level score
            config_id: Optional score config ID from Langfuse

        Example:
            span.score_trace(
                name="overall_quality",
                value=0.9,
                data_type="NUMERIC",
                comment="Excellent overall response quality"
            )
        """
```

### Direct Score Creation

Create scores directly through the client without needing span references.

```python { .api }
class Langfuse:
    def create_score(self, *, name: str, value: str, trace_id: str = None,
                     observation_id: str = None, score_id: str = None,
                     data_type: ScoreDataType = None, comment: str = None,
                     config_id: str = None) -> None:
        """Create score for trace or observation by ID.

        Args:
            name: Score name/metric identifier
            value: Score value (stored as string regardless of type)
            trace_id: Target trace ID (for trace-level scores)
            observation_id: Target observation ID (for observation-level scores)
            score_id: Optional custom score ID
            data_type: Score data type (NUMERIC, CATEGORICAL, or BOOLEAN)
            comment: Optional comment or explanation
            config_id: Optional score config ID

        Note:
            Provide either trace_id for trace-level scores or observation_id for observation-level scores.
        """
```

### Score Data Types

Supported score types with proper type annotations and validation.

```python { .api }
# Score data type enumeration
ScoreDataType = Literal["NUMERIC", "CATEGORICAL", "BOOLEAN"]

# Type-specific overloads for better type safety
@overload
def score(*, name: str, value: float, data_type: Literal["NUMERIC", "BOOLEAN"] = None) -> None: ...
@overload
def score(*, name: str, value: str, data_type: Literal["CATEGORICAL"] = "CATEGORICAL") -> None: ...
```

## Usage Examples

### Basic Scoring

```python
from langfuse import Langfuse

langfuse = Langfuse()

# Score during span execution
with langfuse.start_as_current_span(name="process-query") as span:
    result = process_user_query()

    # Add observation-level scores
    span.score(
        name="accuracy",
        value=0.95,
        data_type="NUMERIC",
        comment="High accuracy based on ground truth comparison"
    )

    span.score(
        name="response_category",
        value="informative",
        data_type="CATEGORICAL",
        comment="Response provides comprehensive information"
    )

    # Add trace-level score
    span.score_trace(
        name="user_satisfaction",
        value=1.0,
        data_type="BOOLEAN",
        comment="User indicated satisfaction with response"
    )
```

### Automated Scoring with Evaluators

```python
@langfuse.observe(as_type="generation")
def generate_response(prompt):
    response = llm.generate(prompt)

    # Automatic scoring within the observed function
    current_span = langfuse.get_current_observation()
    if current_span:
        # Calculate relevance score
        relevance = calculate_relevance(prompt, response)
        current_span.score(
            name="relevance",
            value=relevance,
            comment=f"Relevance score: {relevance:.2f}"
        )

        # Add categorical quality assessment
        quality_category = assess_quality(response)
        current_span.score(
            name="quality_tier",
            value=quality_category,  # "excellent", "good", "fair", "poor"
            data_type="CATEGORICAL"
        )

    return response
```

### Multiple Score Types

```python
def comprehensive_scoring(span, input_text, output_text, expected_output=None):
    """Add multiple types of scores to a span."""

    # Numeric scores
    span.score(
        name="response_length",
        value=len(output_text),
        comment=f"Response contains {len(output_text)} characters"
    )

    span.score(
        name="confidence",
        value=0.87,
        comment="Model confidence score"
    )

    # Boolean scores
    contains_answer = "answer" in output_text.lower()
    span.score(
        name="contains_answer",
        value=contains_answer,
        data_type="BOOLEAN",
        comment="Response contains the word 'answer'"
    )

    # Categorical scores
    sentiment = analyze_sentiment(output_text)
    span.score(
        name="sentiment",
        value=sentiment,  # "positive", "neutral", "negative"
        data_type="CATEGORICAL",
        comment=f"Response sentiment: {sentiment}"
    )

    # Accuracy if expected output available
    if expected_output:
        is_accurate = output_text.strip().lower() == expected_output.strip().lower()
        span.score(
            name="exact_match",
            value=is_accurate,
            data_type="BOOLEAN",
            comment="Exact match with expected output" if is_accurate else "Does not match expected output"
        )

# Usage
with langfuse.start_as_current_span(name="qa-task") as span:
    response = generate_answer(question)
    comprehensive_scoring(span, question, response, expected_answer)
```

### Direct Score Creation

```python
# Create scores after execution using IDs
trace_id = langfuse.create_trace_id()

with langfuse.start_as_current_span(name="main-process", trace_id=trace_id) as span:
    observation_id = span.id
    result = perform_task()

# Later, add scores using IDs
langfuse.create_score(
    name="post_processing_quality",
    value="0.92",  # All values stored as strings
    trace_id=trace_id,
    comment="Quality assessment after post-processing"
)

langfuse.create_score(
    name="observation_specific_metric",
    value="high",
    observation_id=observation_id,
    data_type="CATEGORICAL",
    comment="Observation-specific categorical assessment"
)
```

### Human Feedback Integration

```python
class FeedbackCollector:
    """Collect and apply human feedback as scores."""

    def __init__(self, langfuse_client):
        self.langfuse = langfuse_client

    def apply_user_feedback(self, trace_id, feedback_data):
        """Apply user feedback as scores to a trace."""

        # Thumbs up/down feedback
        if "rating" in feedback_data:
            self.langfuse.create_score(
                name="user_rating",
                value=str(feedback_data["rating"]),  # 1 for thumbs up, 0 for thumbs down
                trace_id=trace_id,
                data_type="BOOLEAN",
                comment="User thumbs up/down rating"
            )

        # Detailed rating (1-5 scale)
        if "detailed_rating" in feedback_data:
            self.langfuse.create_score(
                name="detailed_rating",
                value=str(feedback_data["detailed_rating"]),
                trace_id=trace_id,
                data_type="NUMERIC",
                comment=f"User detailed rating: {feedback_data['detailed_rating']}/5"
            )

        # Categorical feedback
        if "feedback_category" in feedback_data:
            self.langfuse.create_score(
                name="feedback_category",
                value=feedback_data["feedback_category"],  # "helpful", "irrelevant", "incorrect", etc.
                trace_id=trace_id,
                data_type="CATEGORICAL",
                comment="User-provided feedback category"
            )

        # Free-form comments (stored as comment, not score value)
        if "comment" in feedback_data:
            self.langfuse.create_score(
                name="user_comment",
                value="provided",  # Categorical indicator that comment exists
                trace_id=trace_id,
                data_type="CATEGORICAL",
                comment=feedback_data["comment"]
            )

# Usage
feedback_collector = FeedbackCollector(langfuse)

# Simulate user feedback
user_feedback = {
    "rating": 1,  # Thumbs up
    "detailed_rating": 4,
    "feedback_category": "helpful",
    "comment": "Great response, very informative!"
}

feedback_collector.apply_user_feedback(trace_id, user_feedback)
```

### A/B Test Scoring

```python
def score_ab_test(span, variant, response, metrics):
    """Score responses from A/B tests with variant tracking."""

    # Track which variant was used
    span.score(
        name="ab_variant",
        value=variant,  # "A", "B", "control", etc.
        data_type="CATEGORICAL",
        comment=f"A/B test variant: {variant}"
    )

    # Apply variant-specific scoring
    for metric_name, metric_value in metrics.items():
        span.score(
            name=f"{metric_name}_{variant}",
            value=metric_value,
            comment=f"{metric_name} for variant {variant}"
        )

        # Performance comparison against the baseline for this metric
        # (inside the loop so every metric is compared, not just the last one)
        baseline_score = get_baseline_score(metric_name)
        improvement = metric_value - baseline_score
        span.score(
            name=f"{metric_name}_improvement_over_baseline",
            value=improvement,
            comment=f"Improvement over baseline: {improvement:+.3f}"
        )

# Usage in A/B test
@langfuse.observe(as_type="generation")
def ab_test_response(prompt, variant="A"):
    if variant == "A":
        response = model_a.generate(prompt)
    else:
        response = model_b.generate(prompt)

    # Calculate metrics
    metrics = {
        "relevance": calculate_relevance(prompt, response),
        "coherence": calculate_coherence(response),
        "engagement": calculate_engagement(response)
    }

    # Score with variant tracking
    current_span = langfuse.get_current_observation()
    if current_span:
        score_ab_test(current_span, variant, response, metrics)

    return response
```

### Batch Scoring

```python
def batch_score_traces(trace_ids, evaluations):
    """Apply scores to multiple traces in batch."""

    for trace_id in trace_ids:
        # Get trace data for evaluation
        trace_data = get_trace_data(trace_id)  # Your method to get trace data

        for eval_func in evaluations:
            try:
                scores = eval_func(trace_data)

                # Handle single score or multiple scores
                if not isinstance(scores, list):
                    scores = [scores]

                for score_data in scores:
                    langfuse.create_score(
                        name=score_data["name"],
                        value=str(score_data["value"]),
                        trace_id=trace_id,
                        data_type=score_data.get("data_type", "NUMERIC"),
                        comment=score_data.get("comment"),
                        config_id=score_data.get("config_id")
                    )

            except Exception as e:
                print(f"Failed to evaluate trace {trace_id}: {e}")

# Example evaluations
def relevance_evaluator(trace_data):
    score = calculate_relevance(trace_data["input"], trace_data["output"])
    return {
        "name": "relevance",
        "value": score,
        "comment": f"Calculated relevance: {score:.3f}"
    }

def quality_evaluator(trace_data):
    quality_scores = assess_multiple_quality_dimensions(trace_data["output"])
    return [
        {"name": "clarity", "value": quality_scores["clarity"]},
        {"name": "accuracy", "value": quality_scores["accuracy"]},
        {"name": "completeness", "value": quality_scores["completeness"]}
    ]

# Batch process traces
recent_trace_ids = get_recent_traces()  # Your method to get trace IDs
batch_score_traces(recent_trace_ids, [relevance_evaluator, quality_evaluator])
```

### Custom Score Configurations

```python
def setup_score_configs():
    """Set up reusable score configurations in Langfuse UI, then reference them."""

    # Reference pre-configured scores by config_id
    # These would be set up in the Langfuse UI with specific ranges, thresholds, etc.

    def score_with_config(span, score_name, value, config_name):
        # In practice, you'd store config_ids somewhere accessible
        config_ids = {
            "quality_1_to_5": "config_123",
            "relevance_0_to_1": "config_456",
            "satisfaction_boolean": "config_789"
        }

        config_id = config_ids.get(config_name)

        span.score(
            name=score_name,
            value=value,
            config_id=config_id,
            comment=f"Score using {config_name} configuration"
        )

    return score_with_config

# Usage
score_with_config = setup_score_configs()

with langfuse.start_as_current_span(name="configured-scoring") as span:
    result = process_request()

    score_with_config(span, "response_quality", 4, "quality_1_to_5")
    score_with_config(span, "relevance", 0.85, "relevance_0_to_1")
    score_with_config(span, "user_satisfied", True, "satisfaction_boolean")
```

### Score Analysis and Reporting

```python
def analyze_scores_from_experiment(experiment_result):
    """Analyze scores from experiment results."""

    all_scores = {}

    # Collect all scores from experiment
    for item_result in experiment_result.item_results:
        if item_result.trace_id:
            # In practice, you'd fetch scores via API or have them in the result
            trace_scores = get_trace_scores(item_result.trace_id)  # Your method

            for score in trace_scores:
                if score["name"] not in all_scores:
                    all_scores[score["name"]] = []
                all_scores[score["name"]].append(score["value"])

    # Generate summary statistics
    for score_name, values in all_scores.items():
        if all(isinstance(v, (int, float)) for v in values):
            avg_score = sum(values) / len(values)
            min_score = min(values)
            max_score = max(values)

            print(f"{score_name}:")
            print(f"  Average: {avg_score:.3f}")
            print(f"  Range: {min_score:.3f} - {max_score:.3f}")
            print(f"  Samples: {len(values)}")
        else:
            # Categorical data
            from collections import Counter
            distribution = Counter(values)
            print(f"{score_name} distribution:")
            for category, count in distribution.items():
                percentage = count / len(values) * 100
                print(f"  {category}: {count} ({percentage:.1f}%)")

# Usage
experiment_result = langfuse.run_experiment(...)
analyze_scores_from_experiment(experiment_result)
```