# Scoring and Evaluation

System for adding scores and evaluations to traces and observations, supporting numeric, categorical, and boolean score types with flexible data structures and UI integration.

## Capabilities

### Observation-Level Scoring

Add scores to specific observations (spans) for detailed evaluation tracking.

```python { .api }
class LangfuseObservationWrapper:
    def score(self, *, name: str, value: Union[float, str], score_id: str = None,
              data_type: ScoreDataType = None, comment: str = None,
              config_id: str = None) -> None:
        """Create score for this specific observation.

        Args:
            name: Score name/metric identifier (e.g., "accuracy", "relevance")
            value: Score value (numeric for NUMERIC/BOOLEAN, string for CATEGORICAL)
            score_id: Optional custom ID for the score (auto-generated if not provided)
            data_type: Score data type (NUMERIC, CATEGORICAL, or BOOLEAN)
            comment: Optional comment or explanation for the score
            config_id: Optional ID of score config defined in Langfuse

        Example:
            span.score(
                name="relevance",
                value=0.85,
                data_type="NUMERIC",
                comment="High relevance to user query"
            )
        """
```

### Trace-Level Scoring

Add scores to entire traces for overall evaluation and quality assessment.

```python { .api }
class LangfuseObservationWrapper:
    def score_trace(self, *, name: str, value: Union[float, str], score_id: str = None,
                    data_type: ScoreDataType = None, comment: str = None,
                    config_id: str = None) -> None:
        """Create score for the entire trace this observation belongs to.

        Args:
            name: Score name for trace-level evaluation
            value: Score value (numeric for NUMERIC/BOOLEAN, string for CATEGORICAL)
            score_id: Optional custom ID for the score
            data_type: Score data type (NUMERIC, CATEGORICAL, or BOOLEAN)
            comment: Optional comment explaining the trace-level score
            config_id: Optional score config ID from Langfuse

        Example:
            span.score_trace(
                name="overall_quality",
                value=0.9,
                data_type="NUMERIC",
                comment="Excellent overall response quality"
            )
        """
```

### Direct Score Creation

Create scores directly through the client without needing span references.

```python { .api }
class Langfuse:
    def create_score(self, *, name: str, value: str, trace_id: str = None,
                     observation_id: str = None, score_id: str = None,
                     data_type: ScoreDataType = None, comment: str = None,
                     config_id: str = None) -> None:
        """Create score for trace or observation by ID.

        Args:
            name: Score name/metric identifier
            value: Score value (stored as string regardless of type)
            trace_id: Target trace ID (for trace-level scores)
            observation_id: Target observation ID (for observation-level scores)
            score_id: Optional custom score ID
            data_type: Score data type (NUMERIC, CATEGORICAL, or BOOLEAN)
            comment: Optional comment or explanation
            config_id: Optional score config ID

        Note:
            Provide either trace_id for trace-level scores or observation_id for observation-level scores.
        """
```

### Score Data Types

Supported score types with proper type annotations and validation.

```python { .api }
# Score data type enumeration
ScoreDataType = Literal["NUMERIC", "CATEGORICAL", "BOOLEAN"]

# Type-specific overloads for better type safety
@overload
def score(*, name: str, value: float, data_type: Literal["NUMERIC", "BOOLEAN"] = None) -> None: ...
@overload
def score(*, name: str, value: str, data_type: Literal["CATEGORICAL"] = "CATEGORICAL") -> None: ...
```

## Usage Examples

### Basic Scoring

```python
from langfuse import Langfuse

langfuse = Langfuse()

# Score during span execution
with langfuse.start_as_current_span(name="process-query") as span:
    result = process_user_query()

    # Add observation-level scores
    span.score(
        name="accuracy",
        value=0.95,
        data_type="NUMERIC",
        comment="High accuracy based on ground truth comparison"
    )

    span.score(
        name="response_category",
        value="informative",
        data_type="CATEGORICAL",
        comment="Response provides comprehensive information"
    )

    # Add trace-level score
    span.score_trace(
        name="user_satisfaction",
        value=1.0,
        data_type="BOOLEAN",
        comment="User indicated satisfaction with response"
    )
```

### Automated Scoring with Evaluators

```python
@langfuse.observe(as_type="generation")
def generate_response(prompt):
    response = llm.generate(prompt)

    # Automatic scoring within the observed function
    current_span = langfuse.get_current_observation()
    if current_span:
        # Calculate relevance score
        relevance = calculate_relevance(prompt, response)
        current_span.score(
            name="relevance",
            value=relevance,
            comment=f"Relevance score: {relevance:.2f}"
        )

        # Add categorical quality assessment
        quality_category = assess_quality(response)
        current_span.score(
            name="quality_tier",
            value=quality_category,  # "excellent", "good", "fair", "poor"
            data_type="CATEGORICAL"
        )

    return response
```

### Multiple Score Types

```python
def comprehensive_scoring(span, input_text, output_text, expected_output=None):
    """Add multiple types of scores to a span."""

    # Numeric scores
    span.score(
        name="response_length",
        value=len(output_text),
        comment=f"Response contains {len(output_text)} characters"
    )

    span.score(
        name="confidence",
        value=0.87,
        comment="Model confidence score"
    )

    # Boolean scores
    contains_answer = "answer" in output_text.lower()
    span.score(
        name="contains_answer",
        value=contains_answer,
        data_type="BOOLEAN",
        comment="Response contains the word 'answer'"
    )

    # Categorical scores
    sentiment = analyze_sentiment(output_text)
    span.score(
        name="sentiment",
        value=sentiment,  # "positive", "neutral", "negative"
        data_type="CATEGORICAL",
        comment=f"Response sentiment: {sentiment}"
    )

    # Accuracy if expected output available
    if expected_output:
        is_accurate = output_text.strip().lower() == expected_output.strip().lower()
        span.score(
            name="exact_match",
            value=is_accurate,
            data_type="BOOLEAN",
            comment="Exact match with expected output" if is_accurate else "Does not match expected output"
        )

# Usage
with langfuse.start_as_current_span(name="qa-task") as span:
    response = generate_answer(question)
    comprehensive_scoring(span, question, response, expected_answer)
```

### Direct Score Creation

```python
# Create scores after execution using IDs
trace_id = langfuse.create_trace_id()

with langfuse.start_as_current_span(name="main-process", trace_id=trace_id) as span:
    observation_id = span.id
    result = perform_task()

# Later, add scores using IDs
langfuse.create_score(
    name="post_processing_quality",
    value="0.92",  # All values stored as strings
    trace_id=trace_id,
    comment="Quality assessment after post-processing"
)

langfuse.create_score(
    name="observation_specific_metric",
    value="high",
    observation_id=observation_id,
    data_type="CATEGORICAL",
    comment="Observation-specific categorical assessment"
)
```

### Human Feedback Integration

```python
class FeedbackCollector:
    """Collect and apply human feedback as scores."""

    def __init__(self, langfuse_client):
        self.langfuse = langfuse_client

    def apply_user_feedback(self, trace_id, feedback_data):
        """Apply user feedback as scores to a trace."""

        # Thumbs up/down feedback
        if "rating" in feedback_data:
            self.langfuse.create_score(
                name="user_rating",
                value=str(feedback_data["rating"]),  # 1 for thumbs up, 0 for thumbs down
                trace_id=trace_id,
                data_type="BOOLEAN",
                comment="User thumbs up/down rating"
            )

        # Detailed rating (1-5 scale)
        if "detailed_rating" in feedback_data:
            self.langfuse.create_score(
                name="detailed_rating",
                value=str(feedback_data["detailed_rating"]),
                trace_id=trace_id,
                data_type="NUMERIC",
                comment=f"User detailed rating: {feedback_data['detailed_rating']}/5"
            )

        # Categorical feedback
        if "feedback_category" in feedback_data:
            self.langfuse.create_score(
                name="feedback_category",
                value=feedback_data["feedback_category"],  # "helpful", "irrelevant", "incorrect", etc.
                trace_id=trace_id,
                data_type="CATEGORICAL",
                comment="User-provided feedback category"
            )

        # Free-form comments (stored as comment, not score value)
        if "comment" in feedback_data:
            self.langfuse.create_score(
                name="user_comment",
                value="provided",  # Categorical indicator that comment exists
                trace_id=trace_id,
                data_type="CATEGORICAL",
                comment=feedback_data["comment"]
            )

# Usage
feedback_collector = FeedbackCollector(langfuse)

# Simulate user feedback
user_feedback = {
    "rating": 1,  # Thumbs up
    "detailed_rating": 4,
    "feedback_category": "helpful",
    "comment": "Great response, very informative!"
}

feedback_collector.apply_user_feedback(trace_id, user_feedback)
```

### A/B Test Scoring

```python
def score_ab_test(span, variant, response, metrics):
    """Score responses from A/B tests with variant tracking."""

    # Track which variant was used
    span.score(
        name="ab_variant",
        value=variant,  # "A", "B", "control", etc.
        data_type="CATEGORICAL",
        comment=f"A/B test variant: {variant}"
    )

    # Apply variant-specific scoring
    for metric_name, metric_value in metrics.items():
        span.score(
            name=f"{metric_name}_{variant}",
            value=metric_value,
            comment=f"{metric_name} for variant {variant}"
        )

        # Performance comparison against the baseline for this metric
        # (inside the loop so every metric is compared, not just the last one)
        baseline_score = get_baseline_score(metric_name)
        improvement = metric_value - baseline_score
        span.score(
            name=f"{metric_name}_improvement_over_baseline",
            value=improvement,
            comment=f"Improvement over baseline: {improvement:+.3f}"
        )

# Usage in A/B test
@langfuse.observe(as_type="generation")
def ab_test_response(prompt, variant="A"):
    if variant == "A":
        response = model_a.generate(prompt)
    else:
        response = model_b.generate(prompt)

    # Calculate metrics
    metrics = {
        "relevance": calculate_relevance(prompt, response),
        "coherence": calculate_coherence(response),
        "engagement": calculate_engagement(response)
    }

    # Score with variant tracking
    current_span = langfuse.get_current_observation()
    if current_span:
        score_ab_test(current_span, variant, response, metrics)

    return response
```

### Batch Scoring

```python
def batch_score_traces(trace_ids, evaluations):
    """Apply scores to multiple traces in batch."""

    for trace_id in trace_ids:
        # Get trace data for evaluation
        trace_data = get_trace_data(trace_id)  # Your method to get trace data

        for eval_func in evaluations:
            try:
                scores = eval_func(trace_data)

                # Handle single score or multiple scores
                if not isinstance(scores, list):
                    scores = [scores]

                for score_data in scores:
                    langfuse.create_score(
                        name=score_data["name"],
                        value=str(score_data["value"]),
                        trace_id=trace_id,
                        data_type=score_data.get("data_type", "NUMERIC"),
                        comment=score_data.get("comment"),
                        config_id=score_data.get("config_id")
                    )

            except Exception as e:
                print(f"Failed to evaluate trace {trace_id}: {e}")

# Example evaluations
def relevance_evaluator(trace_data):
    score = calculate_relevance(trace_data["input"], trace_data["output"])
    return {
        "name": "relevance",
        "value": score,
        "comment": f"Calculated relevance: {score:.3f}"
    }

def quality_evaluator(trace_data):
    quality_scores = assess_multiple_quality_dimensions(trace_data["output"])
    return [
        {"name": "clarity", "value": quality_scores["clarity"]},
        {"name": "accuracy", "value": quality_scores["accuracy"]},
        {"name": "completeness", "value": quality_scores["completeness"]}
    ]

# Batch process traces
recent_trace_ids = get_recent_traces()  # Your method to get trace IDs
batch_score_traces(recent_trace_ids, [relevance_evaluator, quality_evaluator])
```

### Custom Score Configurations

```python
def setup_score_configs():
    """Set up reusable score configurations in Langfuse UI, then reference them."""

    # Reference pre-configured scores by config_id
    # These would be set up in the Langfuse UI with specific ranges, thresholds, etc.

    def score_with_config(span, score_name, value, config_name):
        # In practice, you'd store config_ids somewhere accessible
        config_ids = {
            "quality_1_to_5": "config_123",
            "relevance_0_to_1": "config_456",
            "satisfaction_boolean": "config_789"
        }

        config_id = config_ids.get(config_name)

        span.score(
            name=score_name,
            value=value,
            config_id=config_id,
            comment=f"Score using {config_name} configuration"
        )

    return score_with_config

# Usage
score_with_config = setup_score_configs()

with langfuse.start_as_current_span(name="configured-scoring") as span:
    result = process_request()

    score_with_config(span, "response_quality", 4, "quality_1_to_5")
    score_with_config(span, "relevance", 0.85, "relevance_0_to_1")
    score_with_config(span, "user_satisfied", True, "satisfaction_boolean")
```

### Score Analysis and Reporting

```python
def analyze_scores_from_experiment(experiment_result):
    """Analyze scores from experiment results."""

    all_scores = {}

    # Collect all scores from experiment
    for item_result in experiment_result.item_results:
        if item_result.trace_id:
            # In practice, you'd fetch scores via API or have them in the result
            trace_scores = get_trace_scores(item_result.trace_id)  # Your method

            for score in trace_scores:
                if score["name"] not in all_scores:
                    all_scores[score["name"]] = []
                all_scores[score["name"]].append(score["value"])

    # Generate summary statistics
    for score_name, values in all_scores.items():
        if all(isinstance(v, (int, float)) for v in values):
            avg_score = sum(values) / len(values)
            min_score = min(values)
            max_score = max(values)

            print(f"{score_name}:")
            print(f"  Average: {avg_score:.3f}")
            print(f"  Range: {min_score:.3f} - {max_score:.3f}")
            print(f"  Samples: {len(values)}")
        else:
            # Categorical data
            from collections import Counter
            distribution = Counter(values)
            print(f"{score_name} distribution:")
            for category, count in distribution.items():
                percentage = count / len(values) * 100
                print(f"  {category}: {count} ({percentage:.1f}%)")

# Usage
experiment_result = langfuse.run_experiment(...)
analyze_scores_from_experiment(experiment_result)
```