docs/scoring.md

# Scoring and Evaluation

System for adding scores and evaluations to traces and observations, supporting numeric, categorical, and boolean score types with flexible data structures and UI integration.

## Capabilities

### Observation-Level Scoring

Add scores to specific observations (spans) for detailed evaluation tracking.

```python { .api }
class LangfuseObservationWrapper:
    def score(self, *, name: str, value: Union[float, str], score_id: str = None,
              data_type: ScoreDataType = None, comment: str = None,
              config_id: str = None) -> None:
        """Create score for this specific observation.

        Args:
            name: Score name/metric identifier (e.g., "accuracy", "relevance")
            value: Score value (numeric for NUMERIC/BOOLEAN, string for CATEGORICAL)
            score_id: Optional custom ID for the score (auto-generated if not provided)
            data_type: Score data type (NUMERIC, CATEGORICAL, or BOOLEAN)
            comment: Optional comment or explanation for the score
            config_id: Optional ID of score config defined in Langfuse

        Example:
            span.score(
                name="relevance",
                value=0.85,
                data_type="NUMERIC",
                comment="High relevance to user query"
            )
        """
```

### Trace-Level Scoring

Add scores to entire traces for overall evaluation and quality assessment.

```python { .api }
class LangfuseObservationWrapper:
    def score_trace(self, *, name: str, value: Union[float, str], score_id: str = None,
                    data_type: ScoreDataType = None, comment: str = None,
                    config_id: str = None) -> None:
        """Create score for the entire trace this observation belongs to.

        Args:
            name: Score name for trace-level evaluation
            value: Score value (numeric for NUMERIC/BOOLEAN, string for CATEGORICAL)
            score_id: Optional custom ID for the score
            data_type: Score data type (NUMERIC, CATEGORICAL, or BOOLEAN)
            comment: Optional comment explaining the trace-level score
            config_id: Optional score config ID from Langfuse

        Example:
            span.score_trace(
                name="overall_quality",
                value=0.9,
                data_type="NUMERIC",
                comment="Excellent overall response quality"
            )
        """
```

### Direct Score Creation

Create scores directly through the client without needing span references.

```python { .api }
class Langfuse:
    def create_score(self, *, name: str, value: str, trace_id: str = None,
                     observation_id: str = None, score_id: str = None,
                     data_type: Literal["CATEGORICAL"] = None, comment: str = None,
                     config_id: str = None) -> None:
        """Create score for trace or observation by ID.

        Args:
            name: Score name/metric identifier
            value: Score value (stored as string regardless of type)
            trace_id: Target trace ID (for trace-level scores)
            observation_id: Target observation ID (for observation-level scores)
            score_id: Optional custom score ID
            data_type: Score data type
            comment: Optional comment or explanation
            config_id: Optional score config ID

        Note:
            Provide either trace_id for trace-level scores or observation_id for observation-level scores
        """
```

### Score Data Types

Supported score types with proper type annotations and validation.

```python { .api }
# Score data type enumeration
ScoreDataType = Literal["NUMERIC", "CATEGORICAL", "BOOLEAN"]

# Type-specific overloads for better type safety
def score(*, name: str, value: float, data_type: Literal["NUMERIC", "BOOLEAN"] = None) -> None: ...
def score(*, name: str, value: str, data_type: Literal["CATEGORICAL"] = "CATEGORICAL") -> None: ...
```

## Usage Examples

### Basic Scoring

```python
from langfuse import Langfuse

langfuse = Langfuse()

# Score during span execution
with langfuse.start_as_current_span(name="process-query") as span:
    result = process_user_query()

    # Add observation-level scores
    span.score(
        name="accuracy",
        value=0.95,
        data_type="NUMERIC",
        comment="High accuracy based on ground truth comparison"
    )

    span.score(
        name="response_category",
        value="informative",
        data_type="CATEGORICAL",
        comment="Response provides comprehensive information"
    )

    # Add trace-level score
    span.score_trace(
        name="user_satisfaction",
        value=1.0,
        data_type="BOOLEAN",
        comment="User indicated satisfaction with response"
    )
```

### Automated Scoring with Evaluators

```python
@langfuse.observe(as_type="generation")
def generate_response(prompt):
    response = llm.generate(prompt)

    # Automatic scoring within the observed function
    current_span = langfuse.get_current_observation()
    if current_span:
        # Calculate relevance score
        relevance = calculate_relevance(prompt, response)
        current_span.score(
            name="relevance",
            value=relevance,
            comment=f"Relevance score: {relevance:.2f}"
        )

        # Add categorical quality assessment
        quality_category = assess_quality(response)
        current_span.score(
            name="quality_tier",
            value=quality_category,  # "excellent", "good", "fair", "poor"
            data_type="CATEGORICAL"
        )

    return response
```

### Multiple Score Types

```python
def comprehensive_scoring(span, input_text, output_text, expected_output=None):
    """Add multiple types of scores to a span."""

    # Numeric scores
    span.score(
        name="response_length",
        value=len(output_text),
        comment=f"Response contains {len(output_text)} characters"
    )

    span.score(
        name="confidence",
        value=0.87,
        comment="Model confidence score"
    )

    # Boolean scores
    contains_answer = "answer" in output_text.lower()
    span.score(
        name="contains_answer",
        value=contains_answer,
        data_type="BOOLEAN",
        comment="Response contains the word 'answer'"
    )

    # Categorical scores
    sentiment = analyze_sentiment(output_text)
    span.score(
        name="sentiment",
        value=sentiment,  # "positive", "neutral", "negative"
        data_type="CATEGORICAL",
        comment=f"Response sentiment: {sentiment}"
    )

    # Accuracy if expected output available
    if expected_output:
        is_accurate = output_text.strip().lower() == expected_output.strip().lower()
        span.score(
            name="exact_match",
            value=is_accurate,
            data_type="BOOLEAN",
            comment="Exact match with expected output" if is_accurate else "Does not match expected output"
        )

# Usage
with langfuse.start_as_current_span(name="qa-task") as span:
    response = generate_answer(question)
    comprehensive_scoring(span, question, response, expected_answer)
```

### Direct Score Creation

```python
# Create scores after execution using IDs
trace_id = langfuse.create_trace_id()

with langfuse.start_as_current_span(name="main-process", trace_id=trace_id) as span:
    observation_id = span.id
    result = perform_task()

# Later, add scores using IDs
langfuse.create_score(
    name="post_processing_quality",
    value="0.92",  # All values stored as strings
    trace_id=trace_id,
    comment="Quality assessment after post-processing"
)

langfuse.create_score(
    name="observation_specific_metric",
    value="high",
    observation_id=observation_id,
    data_type="CATEGORICAL",
    comment="Observation-specific categorical assessment"
)
```

### Human Feedback Integration

```python
class FeedbackCollector:
    """Collect and apply human feedback as scores."""

    def __init__(self, langfuse_client):
        self.langfuse = langfuse_client

    def apply_user_feedback(self, trace_id, feedback_data):
        """Apply user feedback as scores to a trace."""

        # Thumbs up/down feedback
        if "rating" in feedback_data:
            self.langfuse.create_score(
                name="user_rating",
                value=str(feedback_data["rating"]),  # 1 for thumbs up, 0 for thumbs down
                trace_id=trace_id,
                data_type="BOOLEAN",
                comment="User thumbs up/down rating"
            )

        # Detailed rating (1-5 scale)
        if "detailed_rating" in feedback_data:
            self.langfuse.create_score(
                name="detailed_rating",
                value=str(feedback_data["detailed_rating"]),
                trace_id=trace_id,
                data_type="NUMERIC",
                comment=f"User detailed rating: {feedback_data['detailed_rating']}/5"
            )

        # Categorical feedback
        if "feedback_category" in feedback_data:
            self.langfuse.create_score(
                name="feedback_category",
                value=feedback_data["feedback_category"],  # "helpful", "irrelevant", "incorrect", etc.
                trace_id=trace_id,
                data_type="CATEGORICAL",
                comment="User-provided feedback category"
            )

        # Free-form comments (stored as comment, not score value)
        if "comment" in feedback_data:
            self.langfuse.create_score(
                name="user_comment",
                value="provided",  # Categorical indicator that comment exists
                trace_id=trace_id,
                data_type="CATEGORICAL",
                comment=feedback_data["comment"]
            )

# Usage
feedback_collector = FeedbackCollector(langfuse)

# Simulate user feedback
user_feedback = {
    "rating": 1,  # Thumbs up
    "detailed_rating": 4,
    "feedback_category": "helpful",
    "comment": "Great response, very informative!"
}

feedback_collector.apply_user_feedback(trace_id, user_feedback)
```

### A/B Test Scoring

```python
def score_ab_test(span, variant, response, metrics):
    """Score responses from A/B tests with variant tracking."""

    # Track which variant was used
    span.score(
        name="ab_variant",
        value=variant,  # "A", "B", "control", etc.
        data_type="CATEGORICAL",
        comment=f"A/B test variant: {variant}"
    )

    # Apply variant-specific scoring
    for metric_name, metric_value in metrics.items():
        span.score(
            name=f"{metric_name}_{variant}",
            value=metric_value,
            comment=f"{metric_name} for variant {variant}"
        )

    # Overall performance comparison
    baseline_score = get_baseline_score(metric_name)
    improvement = metric_value - baseline_score
    span.score(
        name="improvement_over_baseline",
        value=improvement,
        comment=f"Improvement over baseline: {improvement:+.3f}"
    )

# Usage in A/B test
@langfuse.observe(as_type="generation")
def ab_test_response(prompt, variant="A"):
    if variant == "A":
        response = model_a.generate(prompt)
    else:
        response = model_b.generate(prompt)

    # Calculate metrics
    metrics = {
        "relevance": calculate_relevance(prompt, response),
        "coherence": calculate_coherence(response),
        "engagement": calculate_engagement(response)
    }

    # Score with variant tracking
    current_span = langfuse.get_current_observation()
    if current_span:
        score_ab_test(current_span, variant, response, metrics)

    return response
```

### Batch Scoring

```python
def batch_score_traces(trace_ids, evaluations):
    """Apply scores to multiple traces in batch."""

    for trace_id in trace_ids:
        # Get trace data for evaluation
        trace_data = get_trace_data(trace_id)  # Your method to get trace data

        for eval_func in evaluations:
            try:
                scores = eval_func(trace_data)

                # Handle single score or multiple scores
                if not isinstance(scores, list):
                    scores = [scores]

                for score_data in scores:
                    langfuse.create_score(
                        name=score_data["name"],
                        value=str(score_data["value"]),
                        trace_id=trace_id,
                        data_type=score_data.get("data_type", "NUMERIC"),
                        comment=score_data.get("comment"),
                        config_id=score_data.get("config_id")
                    )

            except Exception as e:
                print(f"Failed to evaluate trace {trace_id}: {e}")

# Example evaluations
def relevance_evaluator(trace_data):
    score = calculate_relevance(trace_data["input"], trace_data["output"])
    return {
        "name": "relevance",
        "value": score,
        "comment": f"Calculated relevance: {score:.3f}"
    }

def quality_evaluator(trace_data):
    quality_scores = assess_multiple_quality_dimensions(trace_data["output"])
    return [
        {"name": "clarity", "value": quality_scores["clarity"]},
        {"name": "accuracy", "value": quality_scores["accuracy"]},
        {"name": "completeness", "value": quality_scores["completeness"]}
    ]

# Batch process traces
recent_trace_ids = get_recent_traces()  # Your method to get trace IDs
batch_score_traces(recent_trace_ids, [relevance_evaluator, quality_evaluator])
```

### Custom Score Configurations

```python
def setup_score_configs():
    """Set up reusable score configurations in Langfuse UI, then reference them."""

    # Reference pre-configured scores by config_id
    # These would be set up in the Langfuse UI with specific ranges, thresholds, etc.

    def score_with_config(span, score_name, value, config_name):
        # In practice, you'd store config_ids somewhere accessible
        config_ids = {
            "quality_1_to_5": "config_123",
            "relevance_0_to_1": "config_456",
            "satisfaction_boolean": "config_789"
        }

        config_id = config_ids.get(config_name)

        span.score(
            name=score_name,
            value=value,
            config_id=config_id,
            comment=f"Score using {config_name} configuration"
        )

    return score_with_config

# Usage
score_with_config = setup_score_configs()

with langfuse.start_as_current_span(name="configured-scoring") as span:
    result = process_request()

    score_with_config(span, "response_quality", 4, "quality_1_to_5")
    score_with_config(span, "relevance", 0.85, "relevance_0_to_1")
    score_with_config(span, "user_satisfied", True, "satisfaction_boolean")
```

### Score Analysis and Reporting

```python
def analyze_scores_from_experiment(experiment_result):
    """Analyze scores from experiment results."""

    all_scores = {}

    # Collect all scores from experiment
    for item_result in experiment_result.item_results:
        if item_result.trace_id:
            # In practice, you'd fetch scores via API or have them in the result
            trace_scores = get_trace_scores(item_result.trace_id)  # Your method

            for score in trace_scores:
                if score["name"] not in all_scores:
                    all_scores[score["name"]] = []
                all_scores[score["name"]].append(score["value"])

    # Generate summary statistics
    for score_name, values in all_scores.items():
        if all(isinstance(v, (int, float)) for v in values):
            avg_score = sum(values) / len(values)
            min_score = min(values)
            max_score = max(values)

            print(f"{score_name}:")
            print(f"  Average: {avg_score:.3f}")
            print(f"  Range: {min_score:.3f} - {max_score:.3f}")
            print(f"  Samples: {len(values)}")
        else:
            # Categorical data
            from collections import Counter
            distribution = Counter(values)
            print(f"{score_name} distribution:")
            for category, count in distribution.items():
                percentage = count / len(values) * 100
                print(f"  {category}: {count} ({percentage:.1f}%)")

# Usage
experiment_result = langfuse.run_experiment(...)
analyze_scores_from_experiment(experiment_result)
```