# Evaluation & Testing

Evaluate model performance against expected outputs with detailed metrics and analysis. The evaluation endpoint provides a quantitative assessment of model predictions for quality assurance, benchmarking, and optimization.

## Capabilities

### Evaluation Requests

Configure model evaluation by comparing generated outputs against expected results.

```python { .api }
class EvaluationRequest:
    prompt: Prompt
    completion_expected: str
    contextual_control_threshold: Optional[float] = None
    control_log_additive: Optional[bool] = True
    """
    Request for model evaluation against expected output.

    Attributes:
    - prompt: Input prompt for model evaluation
    - completion_expected: Expected output text for comparison
    - contextual_control_threshold: Threshold for spreading attention controls to similar tokens; if None, controls apply only to the tokens explicitly listed in the request
    - control_log_additive: Whether attention control factors are added to attention scores in log space (True) or applied multiplicatively (False)
    """

    def to_json(self) -> Mapping[str, Any]:
        """Serialize the request to JSON format."""
```
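A request can be constructed directly from these fields and inspected before it is sent. A minimal sketch using only the fields listed above; the exact JSON keys produced by `to_json` depend on the client version:

```python
from aleph_alpha_client import EvaluationRequest, Prompt

# Build a request from a text prompt and the completion the model is expected to favor.
request = EvaluationRequest(
    prompt=Prompt.from_text("The capital of France is"),
    completion_expected=" Paris",
)

# Inspect the serialized payload before sending it with Client.evaluate().
payload = request.to_json()
print(payload)
```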
### Evaluation Responses

Structured response containing evaluation metrics and detailed analysis results.

```python { .api }
class EvaluationResponse:
    model_version: str
    message: Optional[str]
    result: Dict[str, Any]
    num_tokens_prompt_total: int
    """
    Response from model evaluation.

    Attributes:
    - model_version: Version of the model used for evaluation
    - message: Optional response message or status
    - result: Detailed evaluation metrics and scores
    - num_tokens_prompt_total: Total tokens processed in the prompt
    """

    @staticmethod
    def from_json(json: Dict[str, Any]) -> EvaluationResponse:
        """Create a response from JSON data."""
```
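Responses are normally obtained from `Client.evaluate`, but `from_json` can also rebuild one from stored API output, for example in tests. A minimal sketch; the payload and the metric names inside `result` are illustrative and vary by model and API version:

```python
from aleph_alpha_client import EvaluationResponse

# Illustrative payload only; a real payload comes from the evaluate endpoint.
raw = {
    "model_version": "2024-01",
    "message": None,
    "result": {"log_probability": -0.42, "log_perplexity": 0.42},
    "num_tokens_prompt_total": 7,
}

response = EvaluationResponse.from_json(raw)
print(response.model_version, response.result)
```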
### Model Evaluation

Generate evaluation metrics comparing model output against expected results.

```python { .api }
def evaluate(
    self,
    request: EvaluationRequest,
    model: str
) -> EvaluationResponse:
    """
    Evaluate model performance against expected output (synchronous, on Client).

    Parameters:
    - request: Evaluation configuration with prompt and expected output
    - model: Model name to evaluate

    Returns:
    EvaluationResponse with evaluation metrics
    """

async def evaluate(
    self,
    request: EvaluationRequest,
    model: str
) -> EvaluationResponse:
    """
    Evaluate model performance against expected output (asynchronous, on AsyncClient).

    Parameters:
    - request: Evaluation configuration with prompt and expected output
    - model: Model name to evaluate

    Returns:
    EvaluationResponse with evaluation metrics
    """
```
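The synchronous form is shown at length in the usage examples below. For the asynchronous form, a minimal sketch assuming the `AsyncClient` used in those examples; the token and model name are placeholders:

```python
import asyncio

from aleph_alpha_client import AsyncClient, EvaluationRequest, Prompt

async def main() -> None:
    # AsyncClient is used as an async context manager, as in the examples below.
    async with AsyncClient(token="your-api-token") as client:
        request = EvaluationRequest(
            prompt=Prompt.from_text("What is the capital of France?"),
            completion_expected="Paris",
        )
        response = await client.evaluate(request, model="luminous-extended")
        print(response.result)

asyncio.run(main())
```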
### Usage Examples

Comprehensive evaluation examples for quality assessment and benchmarking:

```python
from aleph_alpha_client import Client, EvaluationRequest, EvaluationResponse, Prompt

client = Client(token="your-api-token")

# Basic evaluation - compare model output to expected result
prompt = Prompt.from_text("What is the capital of France?")
expected_output = "Paris"

request = EvaluationRequest(
    prompt=prompt,
    completion_expected=expected_output
)

response = client.evaluate(request, model="luminous-extended")

print(f"Model version: {response.model_version}")
print(f"Evaluation results: {response.result}")
print(f"Tokens processed: {response.num_tokens_prompt_total}")

if response.message:
    print(f"Message: {response.message}")

# Extract specific metrics from results
def extract_metrics(eval_response: EvaluationResponse) -> dict:
    """Extract key metrics from an evaluation response."""
    results = eval_response.result

    # Common metrics that may be present in the result dictionary
    metrics = {}

    if 'log_probability' in results:
        metrics['log_probability'] = results['log_probability']

    if 'perplexity' in results:
        metrics['perplexity'] = results['perplexity']

    if 'likelihood' in results:
        metrics['likelihood'] = results['likelihood']

    return metrics

metrics = extract_metrics(response)
print(f"Extracted metrics: {metrics}")
# Batch evaluation for benchmarking
evaluation_cases = [
    {
        "prompt": "Translate to French: Hello",
        "expected": "Bonjour",
        "category": "translation"
    },
    {
        "prompt": "What is 2 + 2?",
        "expected": "4",
        "category": "math"
    },
    {
        "prompt": "Name the first president of the USA",
        "expected": "George Washington",
        "category": "history"
    },
    {
        "prompt": "What color is the sky?",
        "expected": "blue",
        "category": "general"
    }
]

def run_evaluation_suite(cases: list, model: str) -> dict:
    """Run the evaluation suite and collect results by category."""
    results_by_category = {}

    for case in cases:
        prompt = Prompt.from_text(case["prompt"])
        request = EvaluationRequest(
            prompt=prompt,
            completion_expected=case["expected"]
        )

        response = client.evaluate(request, model=model)

        category = case["category"]
        if category not in results_by_category:
            results_by_category[category] = []

        results_by_category[category].append({
            "prompt": case["prompt"],
            "expected": case["expected"],
            "metrics": extract_metrics(response),
            "raw_result": response.result
        })

    return results_by_category

# Run the evaluation suite
suite_results = run_evaluation_suite(evaluation_cases, "luminous-extended")

# Analyze results by category
for category, results in suite_results.items():
    print(f"\n{category.upper()} Category Results:")
    for result in results:
        print(f"  Prompt: '{result['prompt']}'")
        print(f"  Expected: '{result['expected']}'")
        print(f"  Metrics: {result['metrics']}")
# Multimodal evaluation
from aleph_alpha_client import Image, Text

# Evaluate an image description task
image = Image.from_file("landscape.jpg")
multimodal_prompt = Prompt([
    Text.from_text("Describe this image in one word:"),
    image
])

multimodal_request = EvaluationRequest(
    prompt=multimodal_prompt,
    completion_expected="landscape"
)

multimodal_response = client.evaluate(multimodal_request, model="luminous-extended")
print(f"Multimodal evaluation: {multimodal_response.result}")

# Evaluation with attention controls
from aleph_alpha_client import TextControl, ControlTokenOverlap

controlled_text = Text(
    text="The most important answer is Paris.",
    controls=[
        TextControl(
            start=29,  # Character offset of "Paris" in the text
            length=5,  # Length of "Paris"
            factor=2.0,
            token_overlap=ControlTokenOverlap.Complete
        )
    ]
)

controlled_prompt = Prompt([controlled_text])
controlled_request = EvaluationRequest(
    prompt=controlled_prompt,
    completion_expected="Paris",
    control_log_additive=True
)

controlled_response = client.evaluate(controlled_request, model="luminous-extended")
print(f"Controlled evaluation: {controlled_response.result}")
# Compare performance across models
models_to_test = ["luminous-base", "luminous-extended", "luminous-supreme"]

def compare_models(prompt_text: str, expected: str, models: list) -> dict:
    """Compare evaluation results across multiple models."""
    comparison = {}

    prompt = Prompt.from_text(prompt_text)
    request = EvaluationRequest(
        prompt=prompt,
        completion_expected=expected
    )

    for model in models:
        try:
            response = client.evaluate(request, model=model)
            comparison[model] = {
                "metrics": extract_metrics(response),
                "tokens": response.num_tokens_prompt_total
            }
        except Exception as e:
            comparison[model] = {"error": str(e)}

    return comparison

# Compare models on a factual question
model_comparison = compare_models(
    "What is the chemical symbol for gold?",
    "Au",
    models_to_test
)

print("\nModel Comparison Results:")
for model, result in model_comparison.items():
    print(f"{model}: {result}")
# Statistical analysis of evaluation results
import statistics

def analyze_evaluation_stats(results: list) -> dict:
    """Analyze statistics from multiple evaluation results."""
    metrics_list = [extract_metrics(r) for r in results]

    # Extract log probabilities if available
    log_probs = [m['log_probability'] for m in metrics_list if m.get('log_probability') is not None]

    if log_probs:
        return {
            "count": len(log_probs),
            "mean_log_prob": statistics.mean(log_probs),
            "median_log_prob": statistics.median(log_probs),
            "stdev_log_prob": statistics.stdev(log_probs) if len(log_probs) > 1 else 0
        }

    return {"count": len(results), "log_probs_available": False}

# Collect multiple evaluation results for analysis
multiple_prompts = [
    ("What is water made of?", "H2O"),
    ("Name the largest planet", "Jupiter"),
    ("What is 10 * 10?", "100"),
    ("Capital of Italy?", "Rome")
]

evaluation_results = []
for prompt_text, expected in multiple_prompts:
    request = EvaluationRequest(
        prompt=Prompt.from_text(prompt_text),
        completion_expected=expected
    )
    response = client.evaluate(request, model="luminous-extended")
    evaluation_results.append(response)

stats = analyze_evaluation_stats(evaluation_results)
print(f"\nEvaluation Statistics: {stats}")
# Async evaluation for large batches
import asyncio

from aleph_alpha_client import AsyncClient

async def async_evaluation_batch(cases: list, model: str):
    """Run an evaluation batch asynchronously."""
    async with AsyncClient(token="your-api-token") as async_client:
        tasks = []

        for case in cases:
            prompt = Prompt.from_text(case["prompt"])
            request = EvaluationRequest(
                prompt=prompt,
                completion_expected=case["expected"]
            )
            task = async_client.evaluate(request, model)
            tasks.append(task)

        results = await asyncio.gather(*tasks)
        return results

# Run async evaluation
# async_results = asyncio.run(async_evaluation_batch(evaluation_cases, "luminous-extended"))
# print(f"Async evaluation completed: {len(async_results)} results")
# Custom evaluation pipeline
class EvaluationPipeline:
    """Simple evaluation pipeline that runs test cases and groups results by category."""

    def __init__(self, client, model):
        self.client = client
        self.model = model
        self.results = []

    def add_test_case(self, prompt: str, expected: str, category: str = "general"):
        """Add a test case to the pipeline."""
        self.results.append({
            "prompt": prompt,
            "expected": expected,
            "category": category,
            "completed": False
        })

    def run_all(self):
        """Execute all pending test cases."""
        for test_case in self.results:
            if not test_case["completed"]:
                request = EvaluationRequest(
                    prompt=Prompt.from_text(test_case["prompt"]),
                    completion_expected=test_case["expected"]
                )

                response = self.client.evaluate(request, self.model)
                test_case["response"] = response
                test_case["metrics"] = extract_metrics(response)
                test_case["completed"] = True

    def get_summary(self):
        """Summarize completed test cases by category."""
        completed = [r for r in self.results if r["completed"]]
        categories = {}

        for result in completed:
            cat = result["category"]
            if cat not in categories:
                categories[cat] = []
            categories[cat].append(result["metrics"])

        return {
            "total_tests": len(completed),
            "categories": list(categories.keys()),
            "category_counts": {cat: len(results) for cat, results in categories.items()}
        }

# Use the custom pipeline
pipeline = EvaluationPipeline(client, "luminous-extended")
pipeline.add_test_case("What is AI?", "Artificial Intelligence", "tech")
pipeline.add_test_case("Color of grass?", "green", "nature")
pipeline.add_test_case("2 + 3 = ?", "5", "math")

pipeline.run_all()
summary = pipeline.get_summary()
print(f"Pipeline summary: {summary}")
```
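Because the response carries raw metrics rather than a pass/fail verdict, regression testing for quality assurance is typically a thin wrapper around `evaluate`. A hedged sketch of how this might look with pytest; the test cases and assertions are placeholders that a real project would replace with its own quality gates:

```python
import pytest

from aleph_alpha_client import Client, EvaluationRequest, Prompt

# Placeholder cases; a real suite would load these from a fixture file.
CASES = [
    ("What is the capital of France?", "Paris"),
    ("What is 2 + 2?", "4"),
]

@pytest.fixture(scope="module")
def client() -> Client:
    return Client(token="your-api-token")

@pytest.mark.parametrize("prompt_text,expected", CASES)
def test_evaluation_returns_metrics(client, prompt_text, expected):
    request = EvaluationRequest(
        prompt=Prompt.from_text(prompt_text),
        completion_expected=expected,
    )
    response = client.evaluate(request, model="luminous-extended")

    # The endpoint should always report token usage and at least one metric.
    assert response.num_tokens_prompt_total > 0
    assert response.result
    # Project-specific thresholds (e.g. a log-probability floor) would go here.
```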