or run

npx @tessl/cli init
Log in

Version

Tile

Overview

Evals

Files

Files

docs

advanced.md core-tracing.md datasets.md experiments.md index.md integrations.md observation-types.md prompts.md scoring.md

docs/experiments.md

0

# Experiment Management

1

2

Comprehensive system for running experiments on datasets with automatic evaluation, result aggregation, and detailed reporting capabilities. Enables systematic testing and evaluation of AI applications.

3

4

## Capabilities

5

6

### Evaluation Results

7

8

Core data structures for representing evaluation outcomes from experiments.

9

10

```python { .api }

11

class Evaluation:

12

def __init__(self, *, name: str, value: Union[int, float, str, bool, None],

13

comment: str = None, metadata: Dict[str, Any] = None,

14

data_type: ScoreDataType = None, config_id: str = None):

15

"""Initialize evaluation result.

16

17

Args:

18

name: Unique identifier for the evaluation metric

19

value: The evaluation score or result

20

comment: Human-readable explanation of the result

21

metadata: Additional structured metadata about evaluation

22

data_type: Score data type (NUMERIC, CATEGORICAL, BOOLEAN)

23

config_id: Langfuse score config ID

24

"""

25

26

# Attributes

27

name: str

28

value: Union[int, float, str, bool, None]

29

comment: Optional[str]

30

metadata: Optional[Dict[str, Any]]

31

data_type: Optional[ScoreDataType]

32

config_id: Optional[str]

33

```

34

35

### Experiment Item Results

36

37

Results from processing individual items in an experiment.

38

39

```python { .api }

40

class ExperimentItemResult:

41

def __init__(self, *, item: ExperimentItem, output: Any,

42

evaluations: List[Evaluation], trace_id: str = None,

43

dataset_run_id: str = None):

44

"""Initialize experiment item result.

45

46

Args:

47

item: Original experiment item processed

48

output: Task function output for this item

49

evaluations: List of evaluation results

50

trace_id: Langfuse trace ID for execution

51

dataset_run_id: Dataset run ID if using Langfuse datasets

52

"""

53

54

# Attributes

55

item: ExperimentItem

56

output: Any

57

evaluations: List[Evaluation]

58

trace_id: Optional[str]

59

dataset_run_id: Optional[str]

60

```

61

62

### Complete Experiment Results

63

64

Results from running an entire experiment with formatting and analysis capabilities.

65

66

```python { .api }

67

class ExperimentResult:

68

def __init__(self, *, name: str, run_name: str, description: str = None,

69

item_results: List[ExperimentItemResult],

70

run_evaluations: List[Evaluation], dataset_run_id: str = None,

71

dataset_run_url: str = None):

72

"""Initialize complete experiment result.

73

74

Args:

75

name: Experiment name

76

run_name: Current experiment run name

77

description: Optional experiment description

78

item_results: Results from individual dataset items

79

run_evaluations: Aggregate evaluation results for entire run

80

dataset_run_id: Dataset run ID (for Langfuse datasets)

81

dataset_run_url: URL to view results in Langfuse UI

82

"""

83

84

def format(self, *, include_item_results: bool = False) -> str:

85

"""Format results for human-readable display.

86

87

Args:

88

include_item_results: Whether to include detailed results for each item

89

90

Returns:

91

Formatted multi-line string with experiment overview and results

92

"""

93

94

# Attributes

95

name: str

96

run_name: str

97

description: Optional[str]

98

item_results: List[ExperimentItemResult]

99

run_evaluations: List[Evaluation]

100

dataset_run_id: Optional[str]

101

dataset_run_url: Optional[str]

102

```

103

104

### Running Experiments

105

106

Main method for executing experiments on data with automatic tracing and evaluation.

107

108

```python { .api }

109

class Langfuse:

110

def run_experiment(self, *, name: str, data: ExperimentData,

111

task: TaskFunction, evaluators: List[EvaluatorFunction] = None,

112

run_evaluators: List[RunEvaluatorFunction] = None,

113

run_name: str = None, run_description: str = None,

114

experiment_config: Dict[str, Any] = None) -> ExperimentResult:

115

"""Run experiment on dataset with automatic evaluation.

116

117

Args:

118

name: Experiment name

119

data: List of experiment items to process

120

task: Function to execute on each item

121

evaluators: List of item-level evaluator functions

122

run_evaluators: List of run-level evaluator functions

123

run_name: Name for this specific run

124

run_description: Description of this experiment run

125

experiment_config: Configuration metadata for experiment

126

127

Returns:

128

ExperimentResult with complete results and evaluations

129

"""

130

```

131

132

### Data Types and Protocols

133

134

Type definitions for experiment data structures and function interfaces.

135

136

```python { .api }

137

# Data Types

138

LocalExperimentItem = TypedDict('LocalExperimentItem', {

139

'input': Any,

140

'expected_output': Any,

141

'metadata': Optional[Dict[str, Any]]

142

}, total=False)

143

144

ExperimentItem = Union[LocalExperimentItem, DatasetItemClient]

145

ExperimentData = Union[List[LocalExperimentItem], List[DatasetItemClient]]

146

147

# Function Protocols

148

class TaskFunction(Protocol):

149

def __call__(self, *, item: ExperimentItem, **kwargs) -> Union[Any, Awaitable[Any]]:

150

"""Execute task on experiment item.

151

152

Args:

153

item: Experiment item to process

154

**kwargs: Additional arguments

155

156

Returns:

157

Task output (can be async)

158

"""

159

160

class EvaluatorFunction(Protocol):

161

def __call__(self, *, input: Any, output: Any, expected_output: Any = None,

162

metadata: Dict[str, Any] = None, **kwargs) -> Union[Evaluation, List[Evaluation], Awaitable[Union[Evaluation, List[Evaluation]]]]:

163

"""Evaluate task output for individual items.

164

165

Args:

166

input: Original input to task function

167

output: Task function output

168

expected_output: Expected output for comparison

169

metadata: Item metadata

170

**kwargs: Additional arguments

171

172

Returns:

173

Single evaluation or list of evaluations (can be async)

174

"""

175

176

class RunEvaluatorFunction(Protocol):

177

def __call__(self, *, item_results: List[ExperimentItemResult],

178

**kwargs) -> Union[Evaluation, List[Evaluation], Awaitable[Union[Evaluation, List[Evaluation]]]]:

179

"""Evaluate entire experiment run with aggregate metrics.

180

181

Args:

182

item_results: Results from all processed items

183

**kwargs: Additional arguments

184

185

Returns:

186

Aggregate evaluation results (can be async)

187

"""

188

```

189

190

### Utility Functions

191

192

Helper functions for working with evaluators and experiment frameworks.

193

194

```python { .api }

195

def create_evaluator_from_autoevals(autoevals_evaluator: Any,

196

**kwargs: Dict[str, Any]) -> EvaluatorFunction:

197

"""Create Langfuse evaluator from autoevals evaluator.

198

199

Args:

200

autoevals_evaluator: An autoevals evaluator instance

201

**kwargs: Additional arguments passed to evaluator

202

203

Returns:

204

Langfuse-compatible evaluator function

205

"""

206

```

207

208

## Usage Examples

209

210

### Basic Experiment

211

212

```python

213

from langfuse import Langfuse, Evaluation

214

215

langfuse = Langfuse()

216

217

# Define task function

218

def generate_answer(*, item, **kwargs):

219

question = item["input"] if isinstance(item, dict) else item.input

220

# Your AI model call

221

answer = my_llm.generate(question)

222

return answer

223

224

# Define evaluator

225

def accuracy_evaluator(*, input, output, expected_output=None, **kwargs):

226

if not expected_output:

227

return Evaluation(name="accuracy", value=None, comment="No expected output")

228

229

is_correct = output.strip().lower() == expected_output.strip().lower()

230

return Evaluation(

231

name="accuracy",

232

value=1.0 if is_correct else 0.0,

233

comment="Exact match" if is_correct else "Different answer"

234

)

235

236

# Experiment data

237

experiment_data = [

238

{"input": "What is the capital of France?", "expected_output": "Paris"},

239

{"input": "What is the capital of Germany?", "expected_output": "Berlin"},

240

{"input": "What is the capital of Italy?", "expected_output": "Rome"}

241

]

242

243

# Run experiment

244

result = langfuse.run_experiment(

245

name="Capital Cities Test",

246

data=experiment_data,

247

task=generate_answer,

248

evaluators=[accuracy_evaluator]

249

)

250

251

# View results

252

print(result.format())

253

print(f"Average accuracy: {sum(eval.value for item in result.item_results for eval in item.evaluations if eval.name == 'accuracy') / len(result.item_results)}")

254

```

255

256

### Multi-Metric Evaluation

257

258

```python

259

def comprehensive_evaluator(*, input, output, expected_output=None, metadata=None, **kwargs):

260

"""Multiple evaluation metrics for a single item."""

261

evaluations = []

262

263

# Length check

264

evaluations.append(Evaluation(

265

name="output_length",

266

value=len(output),

267

comment=f"Output contains {len(output)} characters"

268

))

269

270

# Accuracy check

271

if expected_output:

272

is_correct = output.strip().lower() == expected_output.strip().lower()

273

evaluations.append(Evaluation(

274

name="accuracy",

275

value=is_correct,

276

data_type="BOOLEAN",

277

comment="Exact match" if is_correct else "Different answer"

278

))

279

280

# Custom scoring based on metadata

281

if metadata and "difficulty" in metadata:

282

difficulty_bonus = {"easy": 0, "medium": 0.1, "hard": 0.2}[metadata["difficulty"]]

283

evaluations.append(Evaluation(

284

name="difficulty_adjusted_score",

285

value=0.8 + difficulty_bonus,

286

comment=f"Base score with {metadata['difficulty']} difficulty bonus"

287

))

288

289

return evaluations

290

```

291

292

### Async Task and Evaluators

293

294

```python

295

import asyncio

296

297

async def async_task(*, item, **kwargs):

298

"""Async task function."""

299

question = item["input"] if isinstance(item, dict) else item.input

300

response = await async_llm_client.generate(question)

301

return response

302

303

async def async_evaluator(*, input, output, expected_output=None, **kwargs):

304

"""Async evaluator using external API."""

305

try:

306

# Call external evaluation service

307

evaluation_result = await external_eval_api.evaluate(

308

question=input,

309

answer=output,

310

expected=expected_output

311

)

312

313

return Evaluation(

314

name="external_quality",

315

value=evaluation_result.score,

316

comment=evaluation_result.explanation,

317

metadata={"confidence": evaluation_result.confidence}

318

)

319

except Exception as e:

320

return Evaluation(

321

name="external_quality",

322

value=None,

323

comment=f"Evaluation failed: {str(e)}"

324

)

325

326

# Run with async functions

327

result = langfuse.run_experiment(

328

name="Async Experiment",

329

data=experiment_data,

330

task=async_task,

331

evaluators=[async_evaluator]

332

)

333

```

334

335

### Run-Level Evaluators

336

337

```python

338

def statistical_run_evaluator(*, item_results, **kwargs):

339

"""Aggregate statistics across all experiment items."""

340

evaluations = []

341

342

# Calculate average scores for each metric

343

metric_scores = {}

344

for item_result in item_results:

345

for evaluation in item_result.evaluations:

346

if isinstance(evaluation.value, (int, float)):

347

if evaluation.name not in metric_scores:

348

metric_scores[evaluation.name] = []

349

metric_scores[evaluation.name].append(evaluation.value)

350

351

# Generate aggregate evaluations

352

for metric_name, scores in metric_scores.items():

353

if scores:

354

avg_score = sum(scores) / len(scores)

355

evaluations.append(Evaluation(

356

name=f"avg_{metric_name}",

357

value=avg_score,

358

comment=f"Average {metric_name} across {len(scores)} items: {avg_score:.3f}"

359

))

360

361

# Standard deviation

362

if len(scores) > 1:

363

variance = sum((x - avg_score) ** 2 for x in scores) / len(scores)

364

std_dev = variance ** 0.5

365

evaluations.append(Evaluation(

366

name=f"std_{metric_name}",

367

value=std_dev,

368

comment=f"Standard deviation of {metric_name}: {std_dev:.3f}"

369

))

370

371

return evaluations

372

373

# Use run evaluator

374

result = langfuse.run_experiment(

375

name="Statistical Analysis",

376

data=experiment_data,

377

task=generate_answer,

378

evaluators=[accuracy_evaluator],

379

run_evaluators=[statistical_run_evaluator]

380

)

381

```

382

383

### Working with Experiment Results

384

385

```python

386

# Run experiment

387

result = langfuse.run_experiment(

388

name="Quality Assessment",

389

data=experiment_data,

390

task=my_task,

391

evaluators=[accuracy_evaluator, quality_evaluator]

392

)

393

394

# Basic summary

395

print(result.format())

396

397

# Detailed report with individual items

398

detailed_report = result.format(include_item_results=True)

399

with open("experiment_report.txt", "w") as f:

400

f.write(detailed_report)

401

402

# Access individual results programmatically

403

for i, item_result in enumerate(result.item_results):

404

print(f"Item {i+1}:")

405

print(f" Input: {item_result.item}")

406

print(f" Output: {item_result.output}")

407

408

for evaluation in item_result.evaluations:

409

print(f" {evaluation.name}: {evaluation.value}")

410

if evaluation.comment:

411

print(f" Comment: {evaluation.comment}")

412

413

# Calculate custom metrics

414

accuracy_scores = []

415

for item_result in result.item_results:

416

for evaluation in item_result.evaluations:

417

if evaluation.name == "accuracy" and evaluation.value is not None:

418

accuracy_scores.append(evaluation.value)

419

420

if accuracy_scores:

421

avg_accuracy = sum(accuracy_scores) / len(accuracy_scores)

422

print(f"Overall accuracy: {avg_accuracy:.2%}")

423

```

424

425

### Integration with autoevals

426

427

```python

428

from langfuse.experiment import create_evaluator_from_autoevals

429

# Assuming you have autoevals installed

430

431

# Convert autoevals evaluator to Langfuse format

432

autoevals_evaluator = some_autoevals.Evaluator()

433

langfuse_evaluator = create_evaluator_from_autoevals(

434

autoevals_evaluator,

435

model="gpt-4" # Additional parameters for the evaluator

436

)

437

438

# Use in experiment

439

result = langfuse.run_experiment(

440

name="Autoevals Integration",

441

data=experiment_data,

442

task=my_task,

443

evaluators=[langfuse_evaluator]

444

)

445

```