Kiln AI is a comprehensive platform for building, evaluating, and deploying AI systems, with capabilities for dataset management, model fine-tuning, RAG, and evaluation.
This section covers the evaluation system for assessing task output quality, including G-Eval and custom evaluators. Evaluations can be run on individual task runs or in batches to measure performance systematically.
Execute evaluations on task runs.
from kiln_ai.adapters.eval import EvalRunner, EvalJob
class EvalRunner:
    """
    Execute evaluations on task runs.

    Methods:
    - run(): Execute single evaluation
    - run_batch(): Execute batch evaluations
    """

    def __init__(self, eval_config):
        """
        Initialize evaluation runner.

        Parameters:
        - eval_config (Eval): Evaluation configuration
        """

    async def run(self, task_run) -> 'EvalRun':
        """
        Execute evaluation on a single task run.

        Parameters:
        - task_run (TaskRun): Task run to evaluate

        Returns:
        EvalRun: Evaluation results
        """

    async def run_batch(self, task_runs: list) -> list:
        """
        Execute batch evaluations.

        Parameters:
        - task_runs (list[TaskRun]): Task runs to evaluate

        Returns:
        list[EvalRun]: Batch evaluation results
        """
class EvalJob:
    """
    Evaluation job configuration.

    Properties:
    - eval_id (str): Evaluation identifier
    - task_runs (list): Task runs to evaluate
    - config (dict): Job-specific configuration
    """
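A minimal usage sketch based on the interface above, assuming a task with saved runs and an evaluation configuration already exist on disk (paths are placeholders):

from kiln_ai.datamodel import Task, Eval
from kiln_ai.adapters.eval import EvalRunner

# Load an existing task and evaluation configuration
task = Task.load_from_file("path/to/task.kiln")
eval_config = Eval.load_from_file("path/to/eval.kiln")
runner = EvalRunner(eval_config)

# Evaluate one run, then the full set
single_result = await runner.run(task.runs()[0])
batch_results = await runner.run_batch(task.runs())
print(single_result.score.value, len(batch_results))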
"""Abstract base class for all evaluators.
from kiln_ai.adapters.eval import BaseEval
class BaseEval:
    """
    Abstract evaluation interface.

    Methods:
    - evaluate(): Evaluate single output
    - batch_evaluate(): Evaluate multiple outputs
    """

    async def evaluate(self, task_run) -> dict:
        """
        Evaluate a single task run.

        Parameters:
        - task_run: TaskRun instance to evaluate

        Returns:
        dict: Evaluation score and metadata
        """

    async def batch_evaluate(self, task_runs: list) -> list:
        """
        Evaluate multiple task runs.

        Parameters:
        - task_runs (list): TaskRun instances to evaluate

        Returns:
        list[dict]: Evaluation results
        """
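Custom evaluators subclass this interface. A minimal sketch, assuming task_run.output holds the produced text to check; the ExactMatchEval name and the returned dict keys are illustrative, not part of the library:

from kiln_ai.adapters.eval import BaseEval

class ExactMatchEval(BaseEval):
    # Hypothetical evaluator: scores 1 when the output matches an
    # expected string exactly, 0 otherwise.
    def __init__(self, expected: str):
        self.expected = expected

    async def evaluate(self, task_run) -> dict:
        # task_run.output is assumed to hold the produced text
        matched = task_run.output == self.expected
        return {
            "score": 1 if matched else 0,
            "reasoning": "Exact match" if matched else "Output differs from expected",
        }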
"""G-Eval implementation for LLM-based evaluation.
from kiln_ai.adapters.eval import GEval, GEvalTask
class GEval(BaseEval):
    """
    G-Eval implementation for LLM-based evaluation.

    Uses language models to evaluate outputs based on criteria.
    Effective for assessing quality, coherence, and task-specific metrics.
    """

    def __init__(self, config: 'GEvalTask'):
        """
        Initialize G-Eval evaluator.

        Parameters:
        - config (GEvalTask): G-Eval configuration
        """

    async def evaluate(self, task_run) -> dict:
        """
        Evaluate task run with G-Eval.

        Parameters:
        - task_run: TaskRun to evaluate

        Returns:
        dict: Score, reasoning, and metadata
        """
class GEvalTask:
    """
    G-Eval task configuration.

    Properties:
    - criteria (str): Evaluation criteria description
    - scoring_rubric (dict): Scoring guidelines and thresholds
    """
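A brief construction sketch; the criteria and rubric values here are placeholders, and a fuller G-Eval walkthrough appears in the examples below:

from kiln_ai.adapters.eval import GEval, GEvalTask

# Configure criteria and rubric, then build the evaluator
config = GEvalTask(
    criteria="Is the answer factually accurate?",
    scoring_rubric={"1": "Inaccurate", "3": "Partially accurate", "5": "Fully accurate"},
)
evaluator = GEval(config)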
"""Get evaluator adapters by type.
from kiln_ai.adapters.eval.registry import eval_adapter_from_type
def eval_adapter_from_type(eval_type: str, config: dict):
    """
    Get evaluation adapter from type.

    Parameters:
    - eval_type (str): Type of evaluator (e.g., "g_eval", "custom")
    - config (dict): Evaluator configuration

    Returns:
    BaseEval: Evaluator instance
    """
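A sketch of resolving an evaluator through the registry instead of constructing it directly; the config keys mirror the G-Eval parameters used in the examples below, and task_run is assumed to be loaded elsewhere:

from kiln_ai.adapters.eval.registry import eval_adapter_from_type

config = {
    "criteria": "Assess the accuracy of the output",
    "scoring_rubric": {str(i): f"Score {i}" for i in range(1, 6)},
}
evaluator = eval_adapter_from_type("g_eval", config)  # returns a BaseEval
result = await evaluator.evaluate(task_run)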
"""Core data models for evaluations (from datamodel module).
from kiln_ai.datamodel import Eval, EvalRun, EvalOutputScore, EvalConfig
class Eval:
    """
    Evaluation configuration.

    Properties:
    - id (str): Unique identifier
    - name (str): Evaluation name
    - eval_type (str): Type of evaluation
    - config (EvalConfig): Evaluation configuration
    - parent (Task): Parent task
    """

    @staticmethod
    def load_from_file(path: str) -> 'Eval':
        """
        Load evaluation from .kiln file.

        Parameters:
        - path (str): Path to eval.kiln file

        Returns:
        Eval instance
        """

    def save_to_file(self) -> None:
        """Save evaluation to .kiln file."""
class EvalConfig:
    """
    Configuration for a specific evaluation type.

    Properties:
    - type (EvalConfigType): Type of evaluation configuration
    - parameters (dict): Evaluation-specific parameters
    """
class EvalRun:
    """
    Single evaluation run result.

    Properties:
    - eval_id (str): Evaluation identifier
    - task_run_id (str): Task run being evaluated
    - score (EvalOutputScore): Evaluation score
    - id (str): Unique run identifier
    - created_at (str): Timestamp
    """

    @staticmethod
    def load_from_file(path: str) -> 'EvalRun':
        """
        Load evaluation run from .kiln file.

        Parameters:
        - path (str): Path to eval_run.kiln file

        Returns:
        EvalRun instance
        """

    def save_to_file(self) -> None:
        """Save evaluation run to .kiln file."""
class EvalOutputScore:
    """
    Score from evaluation.

    Properties:
    - value (float | int | bool): Score value
    - reasoning (str | None): Explanation for the score
    """
class EvalTemplateId:
    """
    Built-in evaluation templates.

    Values:
    - g_eval: G-Eval assessment
    - llm_as_judge: LLM-based evaluation
    """
    g_eval = "g_eval"
    llm_as_judge = "llm_as_judge"
class EvalConfigType:
    """
    Types of evaluation configs.

    Values:
    - g_eval: G-Eval configuration
    - custom: Custom evaluation configuration
    """
    g_eval = "g_eval"
    custom = "custom"
custom = "custom"from kiln_ai.datamodel import Task, Eval, EvalConfig, EvalConfigType
from kiln_ai.adapters.eval import EvalRunner
# Load task
task = Task.load_from_file("path/to/task.kiln")
# Create evaluation configuration
eval_config = Eval(
    parent=task,
    name="quality_assessment",
    eval_type=EvalConfigType.g_eval,
    config=EvalConfig(
        type=EvalConfigType.g_eval,
        parameters={
            "criteria": "Assess the quality and accuracy of the output",
            "scoring_rubric": {
                "1": "Poor quality, inaccurate",
                "2": "Below average",
                "3": "Average quality",
                "4": "Good quality",
                "5": "Excellent, highly accurate"
            }
        }
    )
)
eval_config.save_to_file()
# Run evaluation on task runs
runner = EvalRunner(eval_config)
for task_run in task.runs():
    eval_result = await runner.run(task_run)
    print(f"Run {task_run.id}: Score {eval_result.score.value}")
    if eval_result.score.reasoning:
        print(f"Reasoning: {eval_result.score.reasoning}")

Example: evaluate a single task run directly with GEval.
from kiln_ai.datamodel import Task
from kiln_ai.adapters.eval import GEval, GEvalTask
# Create G-Eval configuration
g_eval_config = GEvalTask(
    criteria="""Evaluate the summary on three dimensions:
    1. Accuracy: Does it capture key points?
    2. Conciseness: Is it appropriately brief?
    3. Coherence: Is it well-structured?""",
    scoring_rubric={
        "1": "Fails on multiple dimensions",
        "2": "Poor on most dimensions",
        "3": "Adequate on most dimensions",
        "4": "Good on all dimensions",
        "5": "Excellent on all dimensions"
    }
)
# Create evaluator
evaluator = GEval(g_eval_config)
# Evaluate task run
task = Task.load_from_file("path/to/task.kiln")
task_run = task.runs()[0]
result = await evaluator.evaluate(task_run)
print(f"Score: {result['score']}")
print(f"Reasoning: {result['reasoning']}")from kiln_ai.datamodel import Task, Eval
from kiln_ai.adapters.eval import EvalRunner
# Load task and evaluation
task = Task.load_from_file("path/to/task.kiln")
eval_config = Eval.load_from_file("path/to/eval.kiln")
# Get all task runs
task_runs = task.runs()
print(f"Evaluating {len(task_runs)} task runs...")
# Run batch evaluation
runner = EvalRunner(eval_config)
results = await runner.run_batch(task_runs)
# Analyze results
scores = [r.score.value for r in results]
avg_score = sum(scores) / len(scores)
print(f"Average score: {avg_score:.2f}")
print(f"Min score: {min(scores)}")
print(f"Max score: {max(scores)}")
# Find low-scoring runs
low_scores = [r for r in results if r.score.value < 3]
print(f"\nLow-scoring runs: {len(low_scores)}")
for eval_run in low_scores:
    print(f"  Run {eval_run.task_run_id}: {eval_run.score.value}")
    print(f"  Reason: {eval_run.score.reasoning}")

Example: define custom evaluation criteria for a code-generation task.
from kiln_ai.datamodel import Task, Eval, EvalConfig, EvalConfigType
from kiln_ai.adapters.eval import EvalRunner
# Create evaluation with custom criteria
task = Task.load_from_file("path/to/task.kiln")
eval_config = Eval(
    parent=task,
    name="code_quality",
    eval_type=EvalConfigType.g_eval,
    config=EvalConfig(
        type=EvalConfigType.g_eval,
        parameters={
            "criteria": """Evaluate code quality:
            - Correctness: Does it solve the problem?
            - Efficiency: Is it optimized?
            - Readability: Is it clear and well-structured?
            - Best practices: Does it follow conventions?""",
            "scoring_rubric": {
                "1": "Major issues in multiple areas",
                "2": "Significant problems in some areas",
                "3": "Acceptable but room for improvement",
                "4": "Good quality with minor issues",
                "5": "Excellent quality across all criteria"
            }
        }
    )
)
eval_config.save_to_file()
# Run evaluation
runner = EvalRunner(eval_config)
results = await runner.run_batch(task.runs())

Example: filter high-quality runs by evaluation score for reuse as few-shot examples.
from kiln_ai.datamodel import Task, Eval
from kiln_ai.adapters.eval import EvalRunner
# Load task and evaluation
task = Task.load_from_file("path/to/task.kiln")
eval_config = Eval.load_from_file("path/to/eval.kiln")
# Run evaluation
runner = EvalRunner(eval_config)
eval_results = await runner.run_batch(task.runs())
# Create mapping of task_run_id to score
score_map = {er.task_run_id: er.score.value for er in eval_results}
# Filter high-quality runs (score >= 4)
high_quality_runs = [
    run for run in task.runs()
    if score_map.get(run.id, 0) >= 4
]
print(f"High quality runs: {len(high_quality_runs)}")
# Use for few-shot examples
from kiln_ai.adapters.prompt_builders import FewShotPromptBuilder
# Keep a reference to the full run set, then use high_quality_runs
# when building few-shot examples
original_runs = task.runs()

Example: score the same runs against multiple metrics.
from kiln_ai.datamodel import Task, Eval, EvalConfig, EvalConfigType
from kiln_ai.adapters.eval import EvalRunner
task = Task.load_from_file("path/to/task.kiln")
# Create multiple evaluations for different aspects
evaluations = [
    {
        "name": "accuracy",
        "criteria": "Evaluate factual accuracy and correctness"
    },
    {
        "name": "fluency",
        "criteria": "Evaluate language fluency and naturalness"
    },
    {
        "name": "completeness",
        "criteria": "Evaluate whether all required information is present"
    }
]
results_by_metric = {}
for eval_def in evaluations:
    # Create evaluation
    eval_config = Eval(
        parent=task,
        name=eval_def["name"],
        eval_type=EvalConfigType.g_eval,
        config=EvalConfig(
            type=EvalConfigType.g_eval,
            parameters={
                "criteria": eval_def["criteria"],
                "scoring_rubric": {str(i): f"Score {i}" for i in range(1, 6)}
            }
        )
    )
    eval_config.save_to_file()
    # Run evaluation
    runner = EvalRunner(eval_config)
    results = await runner.run_batch(task.runs())
    results_by_metric[eval_def["name"]] = results
# Analyze across metrics
for task_run in task.runs():
    print(f"\nTask Run {task_run.id}:")
    for metric_name, results in results_by_metric.items():
        result = next(r for r in results if r.task_run_id == task_run.id)
        print(f"  {metric_name}: {result.score.value}")

Example: compare models by evaluating their outputs on shared test inputs.
from kiln_ai.datamodel import Task, Eval
from kiln_ai.adapters import adapter_for_task
from kiln_ai.adapters.eval import EvalRunner
task = Task.load_from_file("path/to/task.kiln")
eval_config = Eval.load_from_file("path/to/eval.kiln")
# Test multiple models
models = [
    ("gpt_4o", "openai"),
    ("claude_3_5_sonnet", "anthropic"),
    ("llama_3_1_8b", "groq")
]
test_inputs = ["input1", "input2", "input3"]
model_scores = {}
for model_name, provider in models:
    # Create adapter
    adapter = adapter_for_task(task, model_name=model_name, provider=provider)
    # Run on test inputs
    runs = []
    for input_data in test_inputs:
        result = await adapter.invoke(input_data)
        # result.output contains the task run
        runs.append(result.output)
    # Evaluate results
    runner = EvalRunner(eval_config)
    eval_results = await runner.run_batch(runs)
    # Calculate average score
    avg_score = sum(r.score.value for r in eval_results) / len(eval_results)
    model_scores[model_name] = avg_score
# Report
print("Model Comparison:")
for model_name, score in sorted(model_scores.items(), key=lambda x: -x[1]):
    print(f"  {model_name}: {score:.2f}")

Example: configure an LLM-as-judge evaluation with pass/fail scoring.
from kiln_ai.datamodel import Task, Eval, EvalConfig, EvalTemplateId
from kiln_ai.adapters.eval import EvalRunner
task = Task.load_from_file("path/to/task.kiln")
# Create LLM-as-judge evaluation
eval_config = Eval(
    parent=task,
    name="llm_judge",
    eval_type=EvalTemplateId.llm_as_judge,
    config=EvalConfig(
        type=EvalTemplateId.llm_as_judge,
        parameters={
            "judge_instruction": """Compare the output against the task requirements.
            Provide a pass/fail decision with detailed reasoning.""",
            "judge_model": "gpt_4o",
            "judge_provider": "openai"
        }
    )
)
eval_config.save_to_file()
# Run evaluation
runner = EvalRunner(eval_config)
results = await runner.run_batch(task.runs())
# Analyze pass/fail
passed = sum(1 for r in results if r.score.value)
total = len(results)
print(f"Pass rate: {passed}/{total} ({100*passed/total:.1f}%)")Install with Tessl CLI
npx tessl i tessl/pypi-kiln-ai