
# Evaluation & Testing

Evaluate model performance against expected outputs with detailed metrics and analysis, providing a quantitative assessment of model predictions for quality assurance, benchmarking, and optimization.

## Capabilities

### Evaluation Requests

Configure model evaluation by comparing generated outputs against expected results.

```python { .api }
class EvaluationRequest:
    prompt: Prompt
    completion_expected: str
    contextual_control_threshold: Optional[float] = None
    control_log_additive: Optional[bool] = True
    """
    Request for model evaluation against expected output.

    Attributes:
    - prompt: Input prompt for model evaluation
    - completion_expected: Expected output text for comparison
    - contextual_control_threshold: Threshold for attention controls
    - control_log_additive: Method for applying attention controls
    """

    def to_json(self) -> Mapping[str, Any]:
        """Serialize request to JSON format."""
```
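
A minimal request can be built and serialized like this (a brief sketch assuming the imports shown in the usage examples below; `to_json` follows the signature above):

```python
from aleph_alpha_client import EvaluationRequest, Prompt

# Compare the model's continuation of the prompt against the expected text "Paris".
request = EvaluationRequest(
    prompt=Prompt.from_text("What is the capital of France?"),
    completion_expected="Paris",
)

# Inspect the JSON payload that would be sent to the API.
payload = request.to_json()
print(payload)
```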

### Evaluation Responses

Structured response containing evaluation metrics and detailed analysis results.

```python { .api }
class EvaluationResponse:
    model_version: str
    message: Optional[str]
    result: Dict[str, Any]
    num_tokens_prompt_total: int
    """
    Response from model evaluation.

    Attributes:
    - model_version: Version of model used for evaluation
    - message: Optional response message or status
    - result: Detailed evaluation metrics and scores
    - num_tokens_prompt_total: Total tokens processed in prompt
    """

    @staticmethod
    def from_json(json: Dict[str, Any]) -> EvaluationResponse:
        """Create response from JSON data."""
```
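
For example, the fields of a response can be read like this (a short sketch; the metric keys inside `result` are not fixed here, so they are iterated rather than hard-coded):

```python
from aleph_alpha_client import Client, EvaluationRequest, Prompt

client = Client(token="your-api-token")
response = client.evaluate(
    EvaluationRequest(
        prompt=Prompt.from_text("What is the capital of France?"),
        completion_expected="Paris",
    ),
    model="luminous-extended",
)

print(f"Model version: {response.model_version}")
print(f"Prompt tokens: {response.num_tokens_prompt_total}")
if response.message:
    print(f"Message: {response.message}")

# `result` is a plain dict mapping metric names to values.
for name, value in response.result.items():
    print(f"{name}: {value}")
```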

### Model Evaluation

Generate evaluation metrics comparing model output against expected results. Both synchronous (`Client`) and asynchronous (`AsyncClient`) variants are available.

```python { .api }
def evaluate(
    self,
    request: EvaluationRequest,
    model: str
) -> EvaluationResponse:
    """
    Evaluate model performance against expected output.

    Parameters:
    - request: Evaluation configuration with prompt and expected output
    - model: Model name to evaluate

    Returns:
    EvaluationResponse with evaluation metrics
    """

async def evaluate(
    self,
    request: EvaluationRequest,
    model: str
) -> EvaluationResponse:
    """
    Evaluate model performance against expected output (async).

    Parameters:
    - request: Evaluation configuration
    - model: Model name to evaluate

    Returns:
    EvaluationResponse with evaluation metrics
    """
```
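
A minimal sketch of both call styles, assuming a valid API token (the usage examples below cover batching, multimodal prompts, and attention controls in depth):

```python
import asyncio

from aleph_alpha_client import AsyncClient, Client, EvaluationRequest, Prompt

request = EvaluationRequest(
    prompt=Prompt.from_text("What is the capital of France?"),
    completion_expected="Paris",
)

# Synchronous client
client = Client(token="your-api-token")
response = client.evaluate(request, model="luminous-extended")
print(response.result)

# Asynchronous client
async def main() -> None:
    async with AsyncClient(token="your-api-token") as async_client:
        async_response = await async_client.evaluate(request, model="luminous-extended")
        print(async_response.result)

asyncio.run(main())
```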

### Usage Examples

Comprehensive evaluation examples for quality assessment and benchmarking:

```python
from aleph_alpha_client import Client, EvaluationRequest, EvaluationResponse, Prompt

client = Client(token="your-api-token")

# Basic evaluation - compare model output to expected result
prompt = Prompt.from_text("What is the capital of France?")
expected_output = "Paris"

request = EvaluationRequest(
    prompt=prompt,
    completion_expected=expected_output
)

response = client.evaluate(request, model="luminous-extended")

print(f"Model version: {response.model_version}")
print(f"Evaluation results: {response.result}")
print(f"Tokens processed: {response.num_tokens_prompt_total}")

if response.message:
    print(f"Message: {response.message}")

# Extract specific metrics from results
def extract_metrics(eval_response: EvaluationResponse) -> dict:
    """Extract key metrics from evaluation response."""
    results = eval_response.result

    # Common metrics that might be present
    metrics = {}

    if 'log_probability' in results:
        metrics['log_probability'] = results['log_probability']

    if 'perplexity' in results:
        metrics['perplexity'] = results['perplexity']

    if 'likelihood' in results:
        metrics['likelihood'] = results['likelihood']

    return metrics

metrics = extract_metrics(response)
print(f"Extracted metrics: {metrics}")

# Batch evaluation for benchmarking
evaluation_cases = [
    {
        "prompt": "Translate to French: Hello",
        "expected": "Bonjour",
        "category": "translation"
    },
    {
        "prompt": "What is 2 + 2?",
        "expected": "4",
        "category": "math"
    },
    {
        "prompt": "Name the first president of the USA",
        "expected": "George Washington",
        "category": "history"
    },
    {
        "prompt": "What color is the sky?",
        "expected": "blue",
        "category": "general"
    }
]

def run_evaluation_suite(cases: list, model: str) -> dict:
    """Run evaluation suite and collect results by category."""
    results_by_category = {}

    for case in cases:
        prompt = Prompt.from_text(case["prompt"])
        request = EvaluationRequest(
            prompt=prompt,
            completion_expected=case["expected"]
        )

        response = client.evaluate(request, model=model)

        category = case["category"]
        if category not in results_by_category:
            results_by_category[category] = []

        results_by_category[category].append({
            "prompt": case["prompt"],
            "expected": case["expected"],
            "metrics": extract_metrics(response),
            "raw_result": response.result
        })

    return results_by_category

# Run the evaluation suite
suite_results = run_evaluation_suite(evaluation_cases, "luminous-extended")

# Analyze results by category
for category, results in suite_results.items():
    print(f"\n{category.upper()} Category Results:")
    for result in results:
        print(f"  Prompt: '{result['prompt']}'")
        print(f"  Expected: '{result['expected']}'")
        print(f"  Metrics: {result['metrics']}")

# Multimodal evaluation
from aleph_alpha_client import Image, Text

# Evaluate image description task
image = Image.from_file("landscape.jpg")
multimodal_prompt = Prompt([
    Text.from_text("Describe this image in one word:"),
    image
])

multimodal_request = EvaluationRequest(
    prompt=multimodal_prompt,
    completion_expected="landscape"
)

multimodal_response = client.evaluate(multimodal_request, model="luminous-extended")
print(f"Multimodal evaluation: {multimodal_response.result}")

# Evaluation with attention controls
from aleph_alpha_client import TextControl, ControlTokenOverlap

controlled_text = Text(
    text="The most important answer is Paris.",
    controls=[
        TextControl(
            start=29,  # Character index where "Paris" starts
            length=5,  # Length of "Paris"
            factor=2.0,
            token_overlap=ControlTokenOverlap.Complete
        )
    ]
)

controlled_prompt = Prompt([controlled_text])
controlled_request = EvaluationRequest(
    prompt=controlled_prompt,
    completion_expected="Paris",
    control_log_additive=True
)

controlled_response = client.evaluate(controlled_request, model="luminous-extended")
print(f"Controlled evaluation: {controlled_response.result}")

# Compare performance across models
models_to_test = ["luminous-base", "luminous-extended", "luminous-supreme"]

def compare_models(prompt_text: str, expected: str, models: list) -> dict:
    """Compare evaluation results across multiple models."""
    comparison = {}

    prompt = Prompt.from_text(prompt_text)
    request = EvaluationRequest(
        prompt=prompt,
        completion_expected=expected
    )

    for model in models:
        try:
            response = client.evaluate(request, model=model)
            comparison[model] = {
                "metrics": extract_metrics(response),
                "tokens": response.num_tokens_prompt_total
            }
        except Exception as e:
            comparison[model] = {"error": str(e)}

    return comparison

# Compare models on a factual question
model_comparison = compare_models(
    "What is the chemical symbol for gold?",
    "Au",
    models_to_test
)

print("\nModel Comparison Results:")
for model, result in model_comparison.items():
    print(f"{model}: {result}")

# Statistical analysis of evaluation results
def analyze_evaluation_stats(results: list) -> dict:
    """Analyze statistics from multiple evaluation results."""
    metrics_list = [extract_metrics(r) for r in results]

    # Extract log probabilities if available
    log_probs = [m.get('log_probability') for m in metrics_list if m.get('log_probability')]

    if log_probs:
        import statistics
        return {
            "count": len(log_probs),
            "mean_log_prob": statistics.mean(log_probs),
            "median_log_prob": statistics.median(log_probs),
            "stdev_log_prob": statistics.stdev(log_probs) if len(log_probs) > 1 else 0
        }

    return {"count": len(results), "log_probs_available": False}

# Collect multiple evaluation results for analysis
multiple_prompts = [
    ("What is water made of?", "H2O"),
    ("Name the largest planet", "Jupiter"),
    ("What is 10 * 10?", "100"),
    ("Capital of Italy?", "Rome")
]

evaluation_results = []
for prompt_text, expected in multiple_prompts:
    request = EvaluationRequest(
        prompt=Prompt.from_text(prompt_text),
        completion_expected=expected
    )
    response = client.evaluate(request, model="luminous-extended")
    evaluation_results.append(response)

stats = analyze_evaluation_stats(evaluation_results)
print(f"\nEvaluation Statistics: {stats}")

# Async evaluation for large batches
import asyncio

from aleph_alpha_client import AsyncClient

async def async_evaluation_batch(cases: list, model: str):
    """Run evaluation batch asynchronously."""
    async with AsyncClient(token="your-api-token") as async_client:
        tasks = []

        for case in cases:
            prompt = Prompt.from_text(case["prompt"])
            request = EvaluationRequest(
                prompt=prompt,
                completion_expected=case["expected"]
            )
            task = async_client.evaluate(request, model)
            tasks.append(task)

        results = await asyncio.gather(*tasks)
        return results

# Run async evaluation
# async_results = asyncio.run(async_evaluation_batch(evaluation_cases, "luminous-extended"))
# print(f"Async evaluation completed: {len(async_results)} results")

# Custom evaluation pipeline
class EvaluationPipeline:
    """Custom evaluation pipeline with configurable metrics."""

    def __init__(self, client, model):
        self.client = client
        self.model = model
        self.results = []

    def add_test_case(self, prompt: str, expected: str, category: str = "general"):
        """Add test case to pipeline."""
        self.results.append({
            "prompt": prompt,
            "expected": expected,
            "category": category,
            "completed": False
        })

    def run_all(self):
        """Execute all test cases."""
        for test_case in self.results:
            if not test_case["completed"]:
                request = EvaluationRequest(
                    prompt=Prompt.from_text(test_case["prompt"]),
                    completion_expected=test_case["expected"]
                )

                response = self.client.evaluate(request, self.model)
                test_case["response"] = response
                test_case["metrics"] = extract_metrics(response)
                test_case["completed"] = True

    def get_summary(self):
        """Get evaluation summary."""
        completed = [r for r in self.results if r["completed"]]
        categories = {}

        for result in completed:
            cat = result["category"]
            if cat not in categories:
                categories[cat] = []
            categories[cat].append(result["metrics"])

        return {
            "total_tests": len(completed),
            "categories": list(categories.keys()),
            "category_counts": {cat: len(results) for cat, results in categories.items()}
        }

# Use custom pipeline
pipeline = EvaluationPipeline(client, "luminous-extended")
pipeline.add_test_case("What is AI?", "Artificial Intelligence", "tech")
pipeline.add_test_case("Color of grass?", "green", "nature")
pipeline.add_test_case("2 + 3 = ?", "5", "math")

pipeline.run_all()
summary = pipeline.get_summary()
print(f"Pipeline summary: {summary}")
```