or run

npx @tessl/cli init
Log in

Version

Tile

Overview

Evals

Files

Files

docs

configuration.md · datamodel.md · evaluation.md · fine-tuning.md · index.md · models.md · prompts.md · rag-embeddings.md · task-execution.md · tools.md

docs/evaluation.md

# Evaluation

Evaluation systems for assessing task output quality, including G-Eval and custom evaluators. Evaluations can be run on individual task runs or in batches to measure performance systematically.

## Capabilities

### Evaluation Runner

Execute evaluations on task runs.

```python { .api }
from kiln_ai.adapters.eval import EvalRunner, EvalJob

class EvalRunner:
    """
    Execute evaluations on task runs.

    Methods:
    - run(): Execute single evaluation
    - run_batch(): Execute batch evaluations
    """

    def __init__(self, eval_config):
        """
        Initialize evaluation runner.

        Parameters:
        - eval_config (Eval): Evaluation configuration
        """

    async def run(self, task_run) -> 'EvalRun':
        """
        Execute evaluation on single task run.

        Parameters:
        - task_run (TaskRun): Task run to evaluate

        Returns:
        EvalRun: Evaluation results
        """

    async def run_batch(self, task_runs: list) -> list:
        """
        Execute batch evaluations.

        Parameters:
        - task_runs (list[TaskRun]): Task runs to evaluate

        Returns:
        list[EvalRun]: Batch evaluation results
        """

class EvalJob:
    """
    Evaluation job configuration.

    Properties:
    - eval_id (str): Evaluation identifier
    - task_runs (list): Task runs to evaluate
    - config (dict): Job-specific configuration
    """
```

### Base Evaluator

Abstract base class for all evaluators.

```python { .api }
from kiln_ai.adapters.eval import BaseEval

class BaseEval:
    """
    Abstract evaluation interface.

    Methods:
    - evaluate(): Evaluate single output
    - batch_evaluate(): Evaluate multiple outputs
    """

    async def evaluate(self, task_run) -> dict:
        """
        Evaluate single task run.

        Parameters:
        - task_run: TaskRun instance to evaluate

        Returns:
        dict: Evaluation score and metadata
        """

    async def batch_evaluate(self, task_runs: list) -> list:
        """
        Evaluate multiple task runs.

        Parameters:
        - task_runs (list): TaskRun instances to evaluate

        Returns:
        list[dict]: Evaluation results
        """
```

### G-Eval

G-Eval implementation for LLM-based evaluation.

```python { .api }
from kiln_ai.adapters.eval import GEval, GEvalTask

class GEval(BaseEval):
    """
    G-Eval implementation for LLM-based evaluation.

    Uses language models to evaluate outputs based on criteria.
    Effective for assessing quality, coherence, and task-specific metrics.
    """

    def __init__(self, config: 'GEvalTask'):
        """
        Initialize G-Eval evaluator.

        Parameters:
        - config (GEvalTask): G-Eval configuration
        """

    async def evaluate(self, task_run) -> dict:
        """
        Evaluate task run with G-Eval.

        Parameters:
        - task_run: TaskRun to evaluate

        Returns:
        dict: Score, reasoning, and metadata
        """

class GEvalTask:
    """
    G-Eval task configuration.

    Properties:
    - criteria (str): Evaluation criteria description
    - scoring_rubric (dict): Scoring guidelines and thresholds
    """
```

### Evaluation Registry

Get evaluator adapters by type.

```python { .api }
from kiln_ai.adapters.eval.registry import eval_adapter_from_type

def eval_adapter_from_type(eval_type: str, config: dict):
    """
    Get evaluation adapter from type.

    Parameters:
    - eval_type (str): Type of evaluator (e.g., "g_eval", "custom")
    - config (dict): Evaluator configuration

    Returns:
    BaseEval: Evaluator instance
    """
```

### Evaluation Data Models

Core data models for evaluations (from datamodel module).

```python { .api }
from kiln_ai.datamodel import Eval, EvalRun, EvalOutputScore, EvalConfig

class Eval:
    """
    Evaluation configuration.

    Properties:
    - id (str): Unique identifier
    - name (str): Evaluation name
    - eval_type (str): Type of evaluation
    - config (EvalConfig): Evaluation configuration
    - parent (Task): Parent task
    """

    @staticmethod
    def load_from_file(path: str) -> 'Eval':
        """
        Load evaluation from .kiln file.

        Parameters:
        - path (str): Path to eval.kiln file

        Returns:
        Eval instance
        """

    def save_to_file(self) -> None:
        """Save evaluation to .kiln file."""

class EvalConfig:
    """
    Configuration for specific evaluation type.

    Properties:
    - type (EvalConfigType): Type of evaluation configuration
    - parameters (dict): Evaluation-specific parameters
    """

class EvalRun:
    """
    Single evaluation run result.

    Properties:
    - eval_id (str): Evaluation identifier
    - task_run_id (str): Task run being evaluated
    - score (EvalOutputScore): Evaluation score
    - id (str): Unique run identifier
    - created_at (str): Timestamp
    """

    @staticmethod
    def load_from_file(path: str) -> 'EvalRun':
        """
        Load evaluation run from .kiln file.

        Parameters:
        - path (str): Path to eval_run.kiln file

        Returns:
        EvalRun instance
        """

    def save_to_file(self) -> None:
        """Save evaluation run to .kiln file."""

class EvalOutputScore:
    """
    Score from evaluation.

    Properties:
    - value (float | int | bool): Score value
    - reasoning (str | None): Explanation for the score
    """

class EvalTemplateId:
    """
    Built-in evaluation templates.

    Values:
    - g_eval: G-Eval assessment
    - llm_as_judge: LLM-based evaluation
    """
    g_eval = "g_eval"
    llm_as_judge = "llm_as_judge"

class EvalConfigType:
    """
    Types of evaluation configs.

    Values:
    - g_eval: G-Eval configuration
    - custom: Custom evaluation configuration
    """
    g_eval = "g_eval"
    custom = "custom"
```

## Usage Examples

### Basic Evaluation

```python
from kiln_ai.datamodel import Task, Eval, EvalConfig, EvalConfigType
from kiln_ai.adapters.eval import EvalRunner

# Load task
task = Task.load_from_file("path/to/task.kiln")

# Create evaluation configuration
eval_config = Eval(
    parent=task,
    name="quality_assessment",
    eval_type=EvalConfigType.g_eval,
    config=EvalConfig(
        type=EvalConfigType.g_eval,
        parameters={
            "criteria": "Assess the quality and accuracy of the output",
            "scoring_rubric": {
                "1": "Poor quality, inaccurate",
                "2": "Below average",
                "3": "Average quality",
                "4": "Good quality",
                "5": "Excellent, highly accurate"
            }
        }
    )
)
eval_config.save_to_file()

# Run evaluation on task runs
runner = EvalRunner(eval_config)

for task_run in task.runs():
    eval_result = await runner.run(task_run)
    print(f"Run {task_run.id}: Score {eval_result.score.value}")
    if eval_result.score.reasoning:
        print(f"Reasoning: {eval_result.score.reasoning}")
```

### G-Eval Assessment

```python
from kiln_ai.datamodel import Task, TaskRun
from kiln_ai.adapters.eval import GEval, GEvalTask

# Create G-Eval configuration
g_eval_config = GEvalTask(
    criteria="""Evaluate the summary on three dimensions:
1. Accuracy: Does it capture key points?
2. Conciseness: Is it appropriately brief?
3. Coherence: Is it well-structured?""",
    scoring_rubric={
        "1": "Fails on multiple dimensions",
        "2": "Poor on most dimensions",
        "3": "Adequate on most dimensions",
        "4": "Good on all dimensions",
        "5": "Excellent on all dimensions"
    }
)

# Create evaluator
evaluator = GEval(g_eval_config)

# Evaluate task run
task = Task.load_from_file("path/to/task.kiln")
task_run = task.runs()[0]

result = await evaluator.evaluate(task_run)
print(f"Score: {result['score']}")
print(f"Reasoning: {result['reasoning']}")
```

### Batch Evaluation

```python
from kiln_ai.datamodel import Task, Eval
from kiln_ai.adapters.eval import EvalRunner

# Load task and evaluation
task = Task.load_from_file("path/to/task.kiln")
eval_config = Eval.load_from_file("path/to/eval.kiln")

# Get all task runs
task_runs = task.runs()
print(f"Evaluating {len(task_runs)} task runs...")

# Run batch evaluation
runner = EvalRunner(eval_config)
results = await runner.run_batch(task_runs)

# Analyze results
scores = [r.score.value for r in results]
avg_score = sum(scores) / len(scores)
print(f"Average score: {avg_score:.2f}")
print(f"Min score: {min(scores)}")
print(f"Max score: {max(scores)}")

# Find low-scoring runs
low_scores = [r for r in results if r.score.value < 3]
print(f"\nLow-scoring runs: {len(low_scores)}")
for eval_run in low_scores:
    print(f"  Run {eval_run.task_run_id}: {eval_run.score.value}")
    print(f"  Reason: {eval_run.score.reasoning}")
```

### Custom Evaluation Criteria

```python
from kiln_ai.datamodel import Task, Eval, EvalConfig, EvalConfigType
from kiln_ai.adapters.eval import EvalRunner

# Create evaluation with custom criteria
task = Task.load_from_file("path/to/task.kiln")

eval_config = Eval(
    parent=task,
    name="code_quality",
    eval_type=EvalConfigType.g_eval,
    config=EvalConfig(
        type=EvalConfigType.g_eval,
        parameters={
            "criteria": """Evaluate code quality:
- Correctness: Does it solve the problem?
- Efficiency: Is it optimized?
- Readability: Is it clear and well-structured?
- Best practices: Does it follow conventions?""",
            "scoring_rubric": {
                "1": "Major issues in multiple areas",
                "2": "Significant problems in some areas",
                "3": "Acceptable but room for improvement",
                "4": "Good quality with minor issues",
                "5": "Excellent quality across all criteria"
            }
        }
    )
)
eval_config.save_to_file()

# Run evaluation
runner = EvalRunner(eval_config)
results = await runner.run_batch(task.runs())
```

### Filtering by Evaluation Score

```python
from kiln_ai.datamodel import Task, Eval
from kiln_ai.adapters.eval import EvalRunner

# Load task and evaluation
task = Task.load_from_file("path/to/task.kiln")
eval_config = Eval.load_from_file("path/to/eval.kiln")

# Run evaluation
runner = EvalRunner(eval_config)
eval_results = await runner.run_batch(task.runs())

# Create mapping of task_run_id to score
score_map = {er.task_run_id: er.score.value for er in eval_results}

# Filter high-quality runs (score >= 4)
high_quality_runs = [
    run for run in task.runs()
    if score_map.get(run.id, 0) >= 4
]

print(f"High quality runs: {len(high_quality_runs)}")

# Use for few-shot examples
from kiln_ai.adapters.prompt_builders import FewShotPromptBuilder

# Temporarily filter task runs to high quality
original_runs = task.runs()
# Use high_quality_runs for few-shot examples
```

### Multiple Evaluation Metrics

```python
from kiln_ai.datamodel import Task, Eval, EvalConfig, EvalConfigType
from kiln_ai.adapters.eval import EvalRunner

task = Task.load_from_file("path/to/task.kiln")

# Create multiple evaluations for different aspects
evaluations = [
    {
        "name": "accuracy",
        "criteria": "Evaluate factual accuracy and correctness"
    },
    {
        "name": "fluency",
        "criteria": "Evaluate language fluency and naturalness"
    },
    {
        "name": "completeness",
        "criteria": "Evaluate whether all required information is present"
    }
]

results_by_metric = {}

for eval_def in evaluations:
    # Create evaluation
    eval_config = Eval(
        parent=task,
        name=eval_def["name"],
        eval_type=EvalConfigType.g_eval,
        config=EvalConfig(
            type=EvalConfigType.g_eval,
            parameters={
                "criteria": eval_def["criteria"],
                "scoring_rubric": {str(i): f"Score {i}" for i in range(1, 6)}
            }
        )
    )
    eval_config.save_to_file()

    # Run evaluation
    runner = EvalRunner(eval_config)
    results = await runner.run_batch(task.runs())
    results_by_metric[eval_def["name"]] = results

# Analyze across metrics
for task_run in task.runs():
    print(f"\nTask Run {task_run.id}:")
    for metric_name, results in results_by_metric.items():
        result = next(r for r in results if r.task_run_id == task_run.id)
        print(f"  {metric_name}: {result.score.value}")
```

### Comparing Models with Evaluation

```python
from kiln_ai.datamodel import Task, TaskRun, Eval
from kiln_ai.adapters import adapter_for_task
from kiln_ai.adapters.eval import EvalRunner

task = Task.load_from_file("path/to/task.kiln")
eval_config = Eval.load_from_file("path/to/eval.kiln")

# Test multiple models
models = [
    ("gpt_4o", "openai"),
    ("claude_3_5_sonnet", "anthropic"),
    ("llama_3_1_8b", "groq")
]

test_inputs = ["input1", "input2", "input3"]
model_scores = {}

for model_name, provider in models:
    # Create adapter
    adapter = adapter_for_task(task, model_name=model_name, provider=provider)

    # Run on test inputs
    runs = []
    for input_data in test_inputs:
        result = await adapter.invoke(input_data)
        # result.output contains the task run
        runs.append(result.output)

    # Evaluate results
    runner = EvalRunner(eval_config)
    eval_results = await runner.run_batch(runs)

    # Calculate average score
    avg_score = sum(r.score.value for r in eval_results) / len(eval_results)
    model_scores[model_name] = avg_score

# Report
print("Model Comparison:")
for model_name, score in sorted(model_scores.items(), key=lambda x: -x[1]):
    print(f"  {model_name}: {score:.2f}")
```

### LLM-as-Judge Pattern

```python
from kiln_ai.datamodel import Task, Eval, EvalConfig, EvalTemplateId
from kiln_ai.adapters.eval import EvalRunner

task = Task.load_from_file("path/to/task.kiln")

# Create LLM-as-judge evaluation
eval_config = Eval(
    parent=task,
    name="llm_judge",
    eval_type=EvalTemplateId.llm_as_judge,
    config=EvalConfig(
        type=EvalTemplateId.llm_as_judge,
        parameters={
            "judge_instruction": """Compare the output against the task requirements.
Provide a pass/fail decision with detailed reasoning.""",
            "judge_model": "gpt_4o",
            "judge_provider": "openai"
        }
    )
)
eval_config.save_to_file()

# Run evaluation
runner = EvalRunner(eval_config)
results = await runner.run_batch(task.runs())

# Analyze pass/fail
passed = sum(1 for r in results if r.score.value)
total = len(results)
print(f"Pass rate: {passed}/{total} ({100*passed/total:.1f}%)")
```