# Evaluation

Evaluation systems for assessing task output quality, including G-Eval and custom evaluators. Evaluations can be run on individual task runs or in batches to measure performance systematically.

## Capabilities

### Evaluation Runner

Execute evaluations on task runs.

```python { .api }
from kiln_ai.adapters.eval import EvalRunner, EvalJob

class EvalRunner:
    """
    Execute evaluations on task runs.

    Methods:
    - run(): Execute single evaluation
    - run_batch(): Execute batch evaluations
    """

    def __init__(self, eval_config):
        """
        Initialize evaluation runner.

        Parameters:
        - eval_config (Eval): Evaluation configuration
        """

    async def run(self, task_run) -> 'EvalRun':
        """
        Execute evaluation on single task run.

        Parameters:
        - task_run (TaskRun): Task run to evaluate

        Returns:
        EvalRun: Evaluation results
        """

    async def run_batch(self, task_runs: list) -> list:
        """
        Execute batch evaluations.

        Parameters:
        - task_runs (list[TaskRun]): Task runs to evaluate

        Returns:
        list[EvalRun]: Batch evaluation results
        """

class EvalJob:
    """
    Evaluation job configuration.

    Properties:
    - eval_id (str): Evaluation identifier
    - task_runs (list): Task runs to evaluate
    - config (dict): Job-specific configuration
    """
```

### Base Evaluator

Abstract base class for all evaluators.

```python { .api }
from kiln_ai.adapters.eval import BaseEval

class BaseEval:
    """
    Abstract evaluation interface.

    Methods:
    - evaluate(): Evaluate single output
    - batch_evaluate(): Evaluate multiple outputs
    """

    async def evaluate(self, task_run) -> dict:
        """
        Evaluate single task run.

        Parameters:
        - task_run: TaskRun instance to evaluate

        Returns:
        dict: Evaluation score and metadata
        """

    async def batch_evaluate(self, task_runs: list) -> list:
        """
        Evaluate multiple task runs.

        Parameters:
        - task_runs (list): TaskRun instances to evaluate

        Returns:
        list[dict]: Evaluation results
        """
```

### G-Eval

G-Eval implementation for LLM-based evaluation.

```python { .api }
from kiln_ai.adapters.eval import GEval, GEvalTask

class GEval(BaseEval):
    """
    G-Eval implementation for LLM-based evaluation.

    Uses language models to evaluate outputs based on criteria.
    Effective for assessing quality, coherence, and task-specific metrics.
    """

    def __init__(self, config: 'GEvalTask'):
        """
        Initialize G-Eval evaluator.

        Parameters:
        - config (GEvalTask): G-Eval configuration
        """

    async def evaluate(self, task_run) -> dict:
        """
        Evaluate task run with G-Eval.

        Parameters:
        - task_run: TaskRun to evaluate

        Returns:
        dict: Score, reasoning, and metadata
        """

class GEvalTask:
    """
    G-Eval task configuration.

    Properties:
    - criteria (str): Evaluation criteria description
    - scoring_rubric (dict): Scoring guidelines and thresholds
    """
```

### Evaluation Registry

Get evaluator adapters by type.

```python { .api }
from kiln_ai.adapters.eval.registry import eval_adapter_from_type

def eval_adapter_from_type(eval_type: str, config: dict):
    """
    Get evaluation adapter from type.

    Parameters:
    - eval_type (str): Type of evaluator (e.g., "g_eval", "custom")
    - config (dict): Evaluator configuration

    Returns:
    BaseEval: Evaluator instance
    """
```

### Evaluation Data Models

Core data models for evaluations (from datamodel module).

```python { .api }
from kiln_ai.datamodel import (
    Eval,
    EvalRun,
    EvalOutputScore,
    EvalConfig,
    EvalConfigType,
    EvalTemplateId,
)

class Eval:
    """
    Evaluation configuration.

    Properties:
    - id (str): Unique identifier
    - name (str): Evaluation name
    - eval_type (str): Type of evaluation
    - config (EvalConfig): Evaluation configuration
    - parent (Task): Parent task
    """

    @staticmethod
    def load_from_file(path: str) -> 'Eval':
        """
        Load evaluation from .kiln file.

        Parameters:
        - path (str): Path to eval.kiln file

        Returns:
        Eval instance
        """

    def save_to_file(self) -> None:
        """Save evaluation to .kiln file."""

class EvalConfig:
    """
    Configuration for specific evaluation type.

    Properties:
    - type (EvalConfigType): Type of evaluation configuration
    - parameters (dict): Evaluation-specific parameters
    """

class EvalRun:
    """
    Single evaluation run result.

    Properties:
    - eval_id (str): Evaluation identifier
    - task_run_id (str): Task run being evaluated
    - score (EvalOutputScore): Evaluation score
    - id (str): Unique run identifier
    - created_at (str): Timestamp
    """

    @staticmethod
    def load_from_file(path: str) -> 'EvalRun':
        """
        Load evaluation run from .kiln file.

        Parameters:
        - path (str): Path to eval_run.kiln file

        Returns:
        EvalRun instance
        """

    def save_to_file(self) -> None:
        """Save evaluation run to .kiln file."""

class EvalOutputScore:
    """
    Score from evaluation.

    Properties:
    - value (float | int | bool): Score value
    - reasoning (str | None): Explanation for the score
    """

class EvalTemplateId:
    """
    Built-in evaluation templates.

    Values:
    - g_eval: G-Eval assessment
    - llm_as_judge: LLM-based evaluation
    """
    g_eval = "g_eval"
    llm_as_judge = "llm_as_judge"

class EvalConfigType:
    """
    Types of evaluation configs.

    Values:
    - g_eval: G-Eval configuration
    - custom: Custom evaluation configuration
    """
    g_eval = "g_eval"
    custom = "custom"
```

## Usage Examples

### Basic Evaluation

```python
from kiln_ai.datamodel import Task, Eval, EvalConfig, EvalConfigType
from kiln_ai.adapters.eval import EvalRunner

# Load task
task = Task.load_from_file("path/to/task.kiln")

# Create evaluation configuration
eval_config = Eval(
    parent=task,
    name="quality_assessment",
    eval_type=EvalConfigType.g_eval,
    config=EvalConfig(
        type=EvalConfigType.g_eval,
        parameters={
            "criteria": "Assess the quality and accuracy of the output",
            "scoring_rubric": {
                "1": "Poor quality, inaccurate",
                "2": "Below average",
                "3": "Average quality",
                "4": "Good quality",
                "5": "Excellent, highly accurate"
            }
        }
    )
)
eval_config.save_to_file()

# Run evaluation on task runs
runner = EvalRunner(eval_config)

for task_run in task.runs():
    eval_result = await runner.run(task_run)
    print(f"Run {task_run.id}: Score {eval_result.score.value}")
    if eval_result.score.reasoning:
        print(f"Reasoning: {eval_result.score.reasoning}")
```

### G-Eval Assessment

```python
from kiln_ai.datamodel import Task, TaskRun
from kiln_ai.adapters.eval import GEval, GEvalTask

# Create G-Eval configuration
g_eval_config = GEvalTask(
    criteria="""Evaluate the summary on three dimensions:
1. Accuracy: Does it capture key points?
2. Conciseness: Is it appropriately brief?
3. Coherence: Is it well-structured?""",
    scoring_rubric={
        "1": "Fails on multiple dimensions",
        "2": "Poor on most dimensions",
        "3": "Adequate on most dimensions",
        "4": "Good on all dimensions",
        "5": "Excellent on all dimensions"
    }
)

# Create evaluator
evaluator = GEval(g_eval_config)

# Evaluate task run
task = Task.load_from_file("path/to/task.kiln")
task_run = task.runs()[0]

result = await evaluator.evaluate(task_run)
print(f"Score: {result['score']}")
print(f"Reasoning: {result['reasoning']}")
```

### Batch Evaluation

```python
from kiln_ai.datamodel import Task, Eval
from kiln_ai.adapters.eval import EvalRunner

# Load task and evaluation
task = Task.load_from_file("path/to/task.kiln")
eval_config = Eval.load_from_file("path/to/eval.kiln")

# Get all task runs
task_runs = task.runs()
print(f"Evaluating {len(task_runs)} task runs...")

# Run batch evaluation
runner = EvalRunner(eval_config)
results = await runner.run_batch(task_runs)

# Analyze results
scores = [r.score.value for r in results]
avg_score = sum(scores) / len(scores)
print(f"Average score: {avg_score:.2f}")
print(f"Min score: {min(scores)}")
print(f"Max score: {max(scores)}")

# Find low-scoring runs
low_scores = [r for r in results if r.score.value < 3]
print(f"\nLow-scoring runs: {len(low_scores)}")
for eval_run in low_scores:
    print(f"  Run {eval_run.task_run_id}: {eval_run.score.value}")
    print(f"  Reason: {eval_run.score.reasoning}")
```

### Custom Evaluation Criteria

```python
from kiln_ai.datamodel import Task, Eval, EvalConfig, EvalConfigType
from kiln_ai.adapters.eval import EvalRunner

# Create evaluation with custom criteria
task = Task.load_from_file("path/to/task.kiln")

eval_config = Eval(
    parent=task,
    name="code_quality",
    eval_type=EvalConfigType.g_eval,
    config=EvalConfig(
        type=EvalConfigType.g_eval,
        parameters={
            "criteria": """Evaluate code quality:
- Correctness: Does it solve the problem?
- Efficiency: Is it optimized?
- Readability: Is it clear and well-structured?
- Best practices: Does it follow conventions?""",
            "scoring_rubric": {
                "1": "Major issues in multiple areas",
                "2": "Significant problems in some areas",
                "3": "Acceptable but room for improvement",
                "4": "Good quality with minor issues",
                "5": "Excellent quality across all criteria"
            }
        }
    )
)
eval_config.save_to_file()

# Run evaluation
runner = EvalRunner(eval_config)
results = await runner.run_batch(task.runs())
```

### Filtering by Evaluation Score

```python
from kiln_ai.datamodel import Task, Eval
from kiln_ai.adapters.eval import EvalRunner

# Load task and evaluation
task = Task.load_from_file("path/to/task.kiln")
eval_config = Eval.load_from_file("path/to/eval.kiln")

# Run evaluation
runner = EvalRunner(eval_config)
eval_results = await runner.run_batch(task.runs())

# Create mapping of task_run_id to score
score_map = {er.task_run_id: er.score.value for er in eval_results}

# Filter high-quality runs (score >= 4)
high_quality_runs = [
    run for run in task.runs()
    if score_map.get(run.id, 0) >= 4
]

print(f"High quality runs: {len(high_quality_runs)}")

# Use for few-shot examples
from kiln_ai.adapters.prompt_builders import FewShotPromptBuilder

# Temporarily filter task runs to high quality
original_runs = task.runs()
# Use high_quality_runs for few-shot examples
```

### Multiple Evaluation Metrics

```python
from kiln_ai.datamodel import Task, Eval, EvalConfig, EvalConfigType
from kiln_ai.adapters.eval import EvalRunner

task = Task.load_from_file("path/to/task.kiln")

# Create multiple evaluations for different aspects
evaluations = [
    {
        "name": "accuracy",
        "criteria": "Evaluate factual accuracy and correctness"
    },
    {
        "name": "fluency",
        "criteria": "Evaluate language fluency and naturalness"
    },
    {
        "name": "completeness",
        "criteria": "Evaluate whether all required information is present"
    }
]

results_by_metric = {}

for eval_def in evaluations:
    # Create evaluation
    eval_config = Eval(
        parent=task,
        name=eval_def["name"],
        eval_type=EvalConfigType.g_eval,
        config=EvalConfig(
            type=EvalConfigType.g_eval,
            parameters={
                "criteria": eval_def["criteria"],
                "scoring_rubric": {str(i): f"Score {i}" for i in range(1, 6)}
            }
        )
    )
    eval_config.save_to_file()

    # Run evaluation
    runner = EvalRunner(eval_config)
    results = await runner.run_batch(task.runs())
    results_by_metric[eval_def["name"]] = results

# Analyze across metrics
for task_run in task.runs():
    print(f"\nTask Run {task_run.id}:")
    for metric_name, results in results_by_metric.items():
        result = next(r for r in results if r.task_run_id == task_run.id)
        print(f"  {metric_name}: {result.score.value}")
```

### Comparing Models with Evaluation

```python
from kiln_ai.datamodel import Task, TaskRun, Eval
from kiln_ai.adapters import adapter_for_task
from kiln_ai.adapters.eval import EvalRunner

task = Task.load_from_file("path/to/task.kiln")
eval_config = Eval.load_from_file("path/to/eval.kiln")

# Test multiple models
models = [
    ("gpt_4o", "openai"),
    ("claude_3_5_sonnet", "anthropic"),
    ("llama_3_1_8b", "groq")
]

test_inputs = ["input1", "input2", "input3"]
model_scores = {}

for model_name, provider in models:
    # Create adapter
    adapter = adapter_for_task(task, model_name=model_name, provider=provider)

    # Run on test inputs
    runs = []
    for input_data in test_inputs:
        result = await adapter.invoke(input_data)
        # result.output contains the task run
        runs.append(result.output)

    # Evaluate results
    runner = EvalRunner(eval_config)
    eval_results = await runner.run_batch(runs)

    # Calculate average score
    avg_score = sum(r.score.value for r in eval_results) / len(eval_results)
    model_scores[model_name] = avg_score

# Report
print("Model Comparison:")
for model_name, score in sorted(model_scores.items(), key=lambda x: -x[1]):
    print(f"  {model_name}: {score:.2f}")
```

### LLM-as-Judge Pattern

```python
from kiln_ai.datamodel import Task, Eval, EvalConfig, EvalTemplateId
from kiln_ai.adapters.eval import EvalRunner

task = Task.load_from_file("path/to/task.kiln")

# Create LLM-as-judge evaluation
eval_config = Eval(
    parent=task,
    name="llm_judge",
    eval_type=EvalTemplateId.llm_as_judge,
    config=EvalConfig(
        type=EvalTemplateId.llm_as_judge,
        parameters={
            "judge_instruction": """Compare the output against the task requirements.
Provide a pass/fail decision with detailed reasoning.""",
            "judge_model": "gpt_4o",
            "judge_provider": "openai"
        }
    )
)
eval_config.save_to_file()

# Run evaluation
runner = EvalRunner(eval_config)
results = await runner.run_batch(task.runs())

# Analyze pass/fail
passed = sum(1 for r in results if r.score.value)
total = len(results)
print(f"Pass rate: {passed}/{total} ({100*passed/total:.1f}%)")
```