# Evaluation

Evaluation systems for assessing task output quality, including G-Eval and custom evaluators. Evaluations can be run on individual task runs or in batches to measure performance systematically.

## Capabilities

### Evaluation Runner

Execute evaluations on task runs.

```python { .api }
from kiln_ai.adapters.eval import EvalRunner, EvalJob

class EvalRunner:
    """
    Execute evaluations on task runs.

    Methods:
    - run(): Execute single evaluation
    - run_batch(): Execute batch evaluations
    """

    def __init__(self, eval_config):
        """
        Initialize evaluation runner.

        Parameters:
        - eval_config (Eval): Evaluation configuration
        """

    async def run(self, task_run) -> 'EvalRun':
        """
        Execute evaluation on single task run.

        Parameters:
        - task_run (TaskRun): Task run to evaluate

        Returns:
        EvalRun: Evaluation results
        """

    async def run_batch(self, task_runs: list) -> list:
        """
        Execute batch evaluations.

        Parameters:
        - task_runs (list[TaskRun]): Task runs to evaluate

        Returns:
        list[EvalRun]: Batch evaluation results
        """

class EvalJob:
    """
    Evaluation job configuration.

    Properties:
    - eval_id (str): Evaluation identifier
    - task_runs (list): Task runs to evaluate
    - config (dict): Job-specific configuration
    """
```

### Base Evaluator

Abstract base class for all evaluators.

```python { .api }
from kiln_ai.adapters.eval import BaseEval

class BaseEval:
    """
    Abstract evaluation interface.

    Methods:
    - evaluate(): Evaluate single output
    - batch_evaluate(): Evaluate multiple outputs
    """

    async def evaluate(self, task_run) -> dict:
        """
        Evaluate single task run.

        Parameters:
        - task_run: TaskRun instance to evaluate

        Returns:
        dict: Evaluation score and metadata
        """

    async def batch_evaluate(self, task_runs: list) -> list:
        """
        Evaluate multiple task runs.

        Parameters:
        - task_runs (list): TaskRun instances to evaluate

        Returns:
        list[dict]: Evaluation results
        """
```

### G-Eval

G-Eval implementation for LLM-based evaluation.

```python { .api }
from kiln_ai.adapters.eval import GEval, GEvalTask

class GEval(BaseEval):
    """
    G-Eval implementation for LLM-based evaluation.

    Uses language models to evaluate outputs based on criteria.
    Effective for assessing quality, coherence, and task-specific metrics.
    """

    def __init__(self, config: 'GEvalTask'):
        """
        Initialize G-Eval evaluator.

        Parameters:
        - config (GEvalTask): G-Eval configuration
        """

    async def evaluate(self, task_run) -> dict:
        """
        Evaluate task run with G-Eval.

        Parameters:
        - task_run: TaskRun to evaluate

        Returns:
        dict: Score, reasoning, and metadata
        """

class GEvalTask:
    """
    G-Eval task configuration.

    Properties:
    - criteria (str): Evaluation criteria description
    - scoring_rubric (dict): Scoring guidelines and thresholds
    """
```

### Evaluation Registry

Get evaluator adapters by type.

```python { .api }
from kiln_ai.adapters.eval.registry import eval_adapter_from_type

def eval_adapter_from_type(eval_type: str, config: dict):
    """
    Get evaluation adapter from type.

    Parameters:
    - eval_type (str): Type of evaluator (e.g., "g_eval", "custom")
    - config (dict): Evaluator configuration

    Returns:
    BaseEval: Evaluator instance
    """
```

### Evaluation Data Models

Core data models for evaluations (from datamodel module).

```python { .api }
from kiln_ai.datamodel import (
    Eval,
    EvalRun,
    EvalOutputScore,
    EvalConfig,
    EvalConfigType,
    EvalTemplateId,
)

class Eval:
    """
    Evaluation configuration.

    Properties:
    - id (str): Unique identifier
    - name (str): Evaluation name
    - eval_type (str): Type of evaluation
    - config (EvalConfig): Evaluation configuration
    - parent (Task): Parent task
    """

    @staticmethod
    def load_from_file(path: str) -> 'Eval':
        """
        Load evaluation from .kiln file.

        Parameters:
        - path (str): Path to eval.kiln file

        Returns:
        Eval instance
        """

    def save_to_file(self) -> None:
        """Save evaluation to .kiln file."""

class EvalConfig:
    """
    Configuration for specific evaluation type.

    Properties:
    - type (EvalConfigType): Type of evaluation configuration
    - parameters (dict): Evaluation-specific parameters
    """

class EvalRun:
    """
    Single evaluation run result.

    Properties:
    - eval_id (str): Evaluation identifier
    - task_run_id (str): Task run being evaluated
    - score (EvalOutputScore): Evaluation score
    - id (str): Unique run identifier
    - created_at (str): Timestamp
    """

    @staticmethod
    def load_from_file(path: str) -> 'EvalRun':
        """
        Load evaluation run from .kiln file.

        Parameters:
        - path (str): Path to eval_run.kiln file

        Returns:
        EvalRun instance
        """

    def save_to_file(self) -> None:
        """Save evaluation run to .kiln file."""

class EvalOutputScore:
    """
    Score from evaluation.

    Properties:
    - value (float | int | bool): Score value
    - reasoning (str | None): Explanation for the score
    """

class EvalTemplateId:
    """
    Built-in evaluation templates.

    Values:
    - g_eval: G-Eval assessment
    - llm_as_judge: LLM-based evaluation
    """
    g_eval = "g_eval"
    llm_as_judge = "llm_as_judge"

class EvalConfigType:
    """
    Types of evaluation configs.

    Values:
    - g_eval: G-Eval configuration
    - custom: Custom evaluation configuration
    """
    g_eval = "g_eval"
    custom = "custom"
```

## Usage Examples

### Basic Evaluation

```python
from kiln_ai.datamodel import Task, Eval, EvalConfig, EvalConfigType
from kiln_ai.adapters.eval import EvalRunner

# Load task
task = Task.load_from_file("path/to/task.kiln")

# Create evaluation configuration
eval_config = Eval(
    parent=task,
    name="quality_assessment",
    eval_type=EvalConfigType.g_eval,
    config=EvalConfig(
        type=EvalConfigType.g_eval,
        parameters={
            "criteria": "Assess the quality and accuracy of the output",
            "scoring_rubric": {
                "1": "Poor quality, inaccurate",
                "2": "Below average",
                "3": "Average quality",
                "4": "Good quality",
                "5": "Excellent, highly accurate"
            }
        }
    )
)
eval_config.save_to_file()

# Run evaluation on task runs
runner = EvalRunner(eval_config)

for task_run in task.runs():
    eval_result = await runner.run(task_run)
    print(f"Run {task_run.id}: Score {eval_result.score.value}")
    if eval_result.score.reasoning:
        print(f"Reasoning: {eval_result.score.reasoning}")
```

### G-Eval Assessment

```python
from kiln_ai.datamodel import Task, TaskRun
from kiln_ai.adapters.eval import GEval, GEvalTask

# Create G-Eval configuration
g_eval_config = GEvalTask(
    criteria="""Evaluate the summary on three dimensions:
1. Accuracy: Does it capture key points?
2. Conciseness: Is it appropriately brief?
3. Coherence: Is it well-structured?""",
    scoring_rubric={
        "1": "Fails on multiple dimensions",
        "2": "Poor on most dimensions",
        "3": "Adequate on most dimensions",
        "4": "Good on all dimensions",
        "5": "Excellent on all dimensions"
    }
)

# Create evaluator
evaluator = GEval(g_eval_config)

# Evaluate task run
task = Task.load_from_file("path/to/task.kiln")
task_run = task.runs()[0]

result = await evaluator.evaluate(task_run)
print(f"Score: {result['score']}")
print(f"Reasoning: {result['reasoning']}")
```

### Batch Evaluation

```python
from kiln_ai.datamodel import Task, Eval
from kiln_ai.adapters.eval import EvalRunner

# Load task and evaluation
task = Task.load_from_file("path/to/task.kiln")
eval_config = Eval.load_from_file("path/to/eval.kiln")

# Get all task runs
task_runs = task.runs()
print(f"Evaluating {len(task_runs)} task runs...")

# Run batch evaluation
runner = EvalRunner(eval_config)
results = await runner.run_batch(task_runs)

# Analyze results
scores = [r.score.value for r in results]
avg_score = sum(scores) / len(scores)
print(f"Average score: {avg_score:.2f}")
print(f"Min score: {min(scores)}")
print(f"Max score: {max(scores)}")

# Find low-scoring runs
low_scores = [r for r in results if r.score.value < 3]
print(f"\nLow-scoring runs: {len(low_scores)}")
for eval_run in low_scores:
    print(f"  Run {eval_run.task_run_id}: {eval_run.score.value}")
    print(f"  Reason: {eval_run.score.reasoning}")
```

### Custom Evaluation Criteria

```python
from kiln_ai.datamodel import Task, Eval, EvalConfig, EvalConfigType
from kiln_ai.adapters.eval import EvalRunner

# Create evaluation with custom criteria
task = Task.load_from_file("path/to/task.kiln")

eval_config = Eval(
    parent=task,
    name="code_quality",
    eval_type=EvalConfigType.g_eval,
    config=EvalConfig(
        type=EvalConfigType.g_eval,
        parameters={
            "criteria": """Evaluate code quality:
- Correctness: Does it solve the problem?
- Efficiency: Is it optimized?
- Readability: Is it clear and well-structured?
- Best practices: Does it follow conventions?""",
            "scoring_rubric": {
                "1": "Major issues in multiple areas",
                "2": "Significant problems in some areas",
                "3": "Acceptable but room for improvement",
                "4": "Good quality with minor issues",
                "5": "Excellent quality across all criteria"
            }
        }
    )
)
eval_config.save_to_file()

# Run evaluation
runner = EvalRunner(eval_config)
results = await runner.run_batch(task.runs())
```

### Filtering by Evaluation Score

```python
from kiln_ai.datamodel import Task, Eval
from kiln_ai.adapters.eval import EvalRunner

# Load task and evaluation
task = Task.load_from_file("path/to/task.kiln")
eval_config = Eval.load_from_file("path/to/eval.kiln")

# Run evaluation
runner = EvalRunner(eval_config)
eval_results = await runner.run_batch(task.runs())

# Create mapping of task_run_id to score
score_map = {er.task_run_id: er.score.value for er in eval_results}

# Filter high-quality runs (score >= 4)
high_quality_runs = [
    run for run in task.runs()
    if score_map.get(run.id, 0) >= 4
]

print(f"High quality runs: {len(high_quality_runs)}")

# Use for few-shot examples
from kiln_ai.adapters.prompt_builders import FewShotPromptBuilder

# Temporarily filter task runs to high quality
original_runs = task.runs()
# Use high_quality_runs for few-shot examples
```

### Multiple Evaluation Metrics

```python
from kiln_ai.datamodel import Task, Eval, EvalConfig, EvalConfigType
from kiln_ai.adapters.eval import EvalRunner

task = Task.load_from_file("path/to/task.kiln")

# Create multiple evaluations for different aspects
evaluations = [
    {
        "name": "accuracy",
        "criteria": "Evaluate factual accuracy and correctness"
    },
    {
        "name": "fluency",
        "criteria": "Evaluate language fluency and naturalness"
    },
    {
        "name": "completeness",
        "criteria": "Evaluate whether all required information is present"
    }
]

results_by_metric = {}

for eval_def in evaluations:
    # Create evaluation
    eval_config = Eval(
        parent=task,
        name=eval_def["name"],
        eval_type=EvalConfigType.g_eval,
        config=EvalConfig(
            type=EvalConfigType.g_eval,
            parameters={
                "criteria": eval_def["criteria"],
                "scoring_rubric": {str(i): f"Score {i}" for i in range(1, 6)}
            }
        )
    )
    eval_config.save_to_file()

    # Run evaluation
    runner = EvalRunner(eval_config)
    results = await runner.run_batch(task.runs())
    results_by_metric[eval_def["name"]] = results

# Analyze across metrics
for task_run in task.runs():
    print(f"\nTask Run {task_run.id}:")
    for metric_name, results in results_by_metric.items():
        result = next(r for r in results if r.task_run_id == task_run.id)
        print(f"  {metric_name}: {result.score.value}")
```

### Comparing Models with Evaluation

```python
from kiln_ai.datamodel import Task, TaskRun, Eval
from kiln_ai.adapters import adapter_for_task
from kiln_ai.adapters.eval import EvalRunner

task = Task.load_from_file("path/to/task.kiln")
eval_config = Eval.load_from_file("path/to/eval.kiln")

# Test multiple models
models = [
    ("gpt_4o", "openai"),
    ("claude_3_5_sonnet", "anthropic"),
    ("llama_3_1_8b", "groq")
]

test_inputs = ["input1", "input2", "input3"]
model_scores = {}

for model_name, provider in models:
    # Create adapter
    adapter = adapter_for_task(task, model_name=model_name, provider=provider)

    # Run on test inputs
    runs = []
    for input_data in test_inputs:
        result = await adapter.invoke(input_data)
        # result.output contains the task run
        runs.append(result.output)

    # Evaluate results
    runner = EvalRunner(eval_config)
    eval_results = await runner.run_batch(runs)

    # Calculate average score
    avg_score = sum(r.score.value for r in eval_results) / len(eval_results)
    model_scores[model_name] = avg_score

# Report
print("Model Comparison:")
for model_name, score in sorted(model_scores.items(), key=lambda x: -x[1]):
    print(f"  {model_name}: {score:.2f}")
```

### LLM-as-Judge Pattern

```python
from kiln_ai.datamodel import Task, Eval, EvalConfig, EvalTemplateId
from kiln_ai.adapters.eval import EvalRunner

task = Task.load_from_file("path/to/task.kiln")

# Create LLM-as-judge evaluation
eval_config = Eval(
    parent=task,
    name="llm_judge",
    eval_type=EvalTemplateId.llm_as_judge,
    config=EvalConfig(
        type=EvalTemplateId.llm_as_judge,
        parameters={
            "judge_instruction": """Compare the output against the task requirements.
Provide a pass/fail decision with detailed reasoning.""",
            "judge_model": "gpt_4o",
            "judge_provider": "openai"
        }
    )
)
eval_config.save_to_file()

# Run evaluation
runner = EvalRunner(eval_config)
results = await runner.run_batch(task.runs())

# Analyze pass/fail
passed = sum(1 for r in results if r.score.value)
total = len(results)
print(f"Pass rate: {passed}/{total} ({100*passed/total:.1f}%)")
```