
# Evaluation

Model performance evaluation with standardized metrics and comparison capabilities for assessing AI model quality, capabilities, and behavioral characteristics across various tasks and benchmarks.

## Capabilities

### Create Evaluation Job

Start an evaluation job to assess model performance on standardized tasks.

```python { .api }
def create(
    model: str,
    evaluation_type: str,
    dataset: str,
    **kwargs
) -> EvaluationCreateResponse:
    """
    Create an evaluation job.

    Args:
        model: Model identifier to evaluate
        evaluation_type: Type of evaluation (classify, score, compare)
        dataset: Dataset identifier for evaluation

    Returns:
        EvaluationCreateResponse with job information
    """
```

29

30

### Retrieve Evaluation Results

Get detailed results and metrics from completed evaluation jobs.

```python { .api }
def retrieve(id: str) -> EvaluationJob:
    """
    Retrieve evaluation job results.

    Args:
        id: Evaluation job identifier

    Returns:
        EvaluationJob with results and metrics
    """
```

46

47

### List Evaluation Jobs

List all evaluation jobs with their statuses and basic information.

```python { .api }
def list() -> List[EvaluationJob]:
    """
    List all evaluation jobs.

    Returns:
        List of EvaluationJob objects
    """
```

60

61

### Async Evaluation Operations

All evaluation operations support asynchronous execution.

```python { .api }
async def create(model: str, evaluation_type: str, dataset: str, **kwargs) -> EvaluationCreateResponse: ...
async def retrieve(id: str) -> EvaluationJob: ...
async def list() -> List[EvaluationJob]: ...
```

70

71

## Usage Examples

### Basic Model Evaluation

```python
from together import Together

client = Together()

# Start evaluation job
evaluation = client.evaluation.create(
    model="meta-llama/Llama-3.2-3B-Instruct-Turbo",
    evaluation_type="classify",
    dataset="standard-benchmark-v1"
)

print(f"Evaluation job created: {evaluation.id}")
print(f"Status: {evaluation.status}")
```

90

91

### Monitor Evaluation Progress

```python
import time

def monitor_evaluation(client: Together, eval_id: str):
    """Monitor evaluation job until completion."""

    while True:
        eval_job = client.evaluation.retrieve(eval_id)
        print(f"Evaluation status: {eval_job.status}")

        if eval_job.status == "completed":
            print("Evaluation completed!")
            return eval_job
        elif eval_job.status == "failed":
            print("Evaluation failed!")
            return eval_job

        time.sleep(30)  # Check every 30 seconds

# Monitor the evaluation
completed_eval = monitor_evaluation(client, evaluation.id)

if completed_eval.status == "completed":
    print(f"Final score: {completed_eval.score}")
    print(f"Metrics: {completed_eval.metrics}")
```

119

120

### Compare Multiple Models

```python
def compare_models(client: Together, models: list, dataset: str):
    """Compare multiple models on the same evaluation dataset."""

    evaluation_jobs = []

    # Start evaluations for all models
    for model in models:
        eval_job = client.evaluation.create(
            model=model,
            evaluation_type="score",
            dataset=dataset
        )
        evaluation_jobs.append({
            'model': model,
            'job_id': eval_job.id,
            'job': eval_job
        })
        print(f"Started evaluation for {model}: {eval_job.id}")

    # Wait for all evaluations to complete
    results = []
    for eval_info in evaluation_jobs:
        completed = monitor_evaluation(client, eval_info['job_id'])
        results.append({
            'model': eval_info['model'],
            'score': completed.score if hasattr(completed, 'score') else None,
            'metrics': completed.metrics if hasattr(completed, 'metrics') else {},
            'status': completed.status
        })

    # Sort by score (highest first)
    successful_results = [r for r in results if r['score'] is not None]
    successful_results.sort(key=lambda x: x['score'], reverse=True)

    return successful_results

# Compare models
models_to_compare = [
    "meta-llama/Llama-3.2-3B-Instruct-Turbo",
    "meta-llama/Llama-3.2-11B-Vision-Instruct-Turbo",
    "Qwen/Qwen2.5-VL-72B-Instruct"
]

comparison_results = compare_models(
    client,
    models_to_compare,
    "reasoning-benchmark-v1"
)

print("Model Comparison Results:")
for i, result in enumerate(comparison_results):
    print(f"{i+1}. {result['model']}: {result['score']:.3f}")
```

176

177

### Custom Evaluation Dataset

```python
def create_custom_evaluation(client: Together, model: str, questions: list):
    """Create custom evaluation with specific questions."""

    # This would typically involve uploading a custom dataset
    # For demonstration, we'll show the structure

    custom_dataset = {
        "name": "custom-qa-evaluation",
        "questions": questions,
        "evaluation_type": "classify",
        "metrics": ["accuracy", "f1_score", "precision", "recall"]
    }

    # Upload custom dataset (hypothetical API)
    # dataset_id = client.datasets.upload(custom_dataset)

    # For now, use a standard dataset
    evaluation = client.evaluation.create(
        model=model,
        evaluation_type="classify",
        dataset="qa-benchmark-v1"
    )

    return evaluation

# Example custom questions
custom_questions = [
    {
        "question": "What is the capital of France?",
        "options": ["London", "Berlin", "Paris", "Madrid"],
        "correct_answer": "Paris",
        "category": "geography"
    },
    {
        "question": "What is 2 + 2?",
        "options": ["3", "4", "5", "6"],
        "correct_answer": "4",
        "category": "mathematics"
    }
]

custom_eval = create_custom_evaluation(
    client,
    "meta-llama/Llama-3.2-3B-Instruct-Turbo",
    custom_questions
)
```

227

228

### Evaluation Metrics Analysis

```python
def analyze_evaluation_results(client: Together, eval_id: str):
    """Analyze detailed evaluation results."""

    eval_job = client.evaluation.retrieve(eval_id)

    if eval_job.status != "completed":
        print(f"Evaluation not completed yet. Status: {eval_job.status}")
        return None

    analysis = {
        'overall_score': eval_job.score,
        'total_questions': 0,
        'correct_answers': 0,
        'category_breakdown': {},
        'difficulty_breakdown': {}
    }

    # Analyze metrics if available
    if hasattr(eval_job, 'metrics') and eval_job.metrics:
        metrics = eval_job.metrics

        analysis.update({
            'accuracy': metrics.get('accuracy', 0),
            'precision': metrics.get('precision', 0),
            'recall': metrics.get('recall', 0),
            'f1_score': metrics.get('f1_score', 0)
        })

        # Category-specific analysis
        for category, stats in metrics.get('categories', {}).items():
            analysis['category_breakdown'][category] = {
                'accuracy': stats.get('accuracy', 0),
                'question_count': stats.get('count', 0)
            }

    return analysis

# Analyze results
analysis = analyze_evaluation_results(client, completed_eval.id)

if analysis:
    print(f"Overall Score: {analysis['overall_score']:.3f}")
    print(f"Accuracy: {analysis['accuracy']:.3f}")
    print(f"F1 Score: {analysis['f1_score']:.3f}")

    if analysis['category_breakdown']:
        print("\nCategory Breakdown:")
        for category, stats in analysis['category_breakdown'].items():
            print(f"  {category}: {stats['accuracy']:.3f} ({stats['question_count']} questions)")
```

281

282

## Types

### Request Types

```python { .api }
class EvaluationRequest:
    model: str
    evaluation_type: str
    dataset: str
    parameters: Optional[Dict[str, Any]] = None

class ClassifyParameters:
    threshold: Optional[float] = None
    categories: Optional[List[str]] = None

class ScoreParameters:
    metric: Optional[str] = None
    scale: Optional[Tuple[float, float]] = None

class CompareParameters:
    baseline_model: Optional[str] = None
    comparison_metric: Optional[str] = None
```

305

306

### Response Types

```python { .api }
class EvaluationCreateResponse:
    id: str
    object: str
    model: str
    evaluation_type: str
    dataset: str
    status: str
    created_at: int

class EvaluationJob:
    id: str
    object: str
    model: str
    evaluation_type: str
    dataset: str
    status: str
    score: Optional[float]
    metrics: Optional[Dict[str, Any]]
    created_at: int
    completed_at: Optional[int]
    error: Optional[str]

class EvaluationStatusResponse:
    id: str
    status: str
    progress: Optional[float]
    estimated_completion: Optional[int]
```

337

338

### Configuration Types

```python { .api }
class EvaluationType:
    CLASSIFY = "classify"
    SCORE = "score"
    COMPARE = "compare"
    CUSTOM = "custom"

class EvaluationStatus:
    PENDING = "pending"
    RUNNING = "running"
    COMPLETED = "completed"
    FAILED = "failed"
    CANCELLED = "cancelled"

class JudgeModelConfig:
    model: str
    temperature: Optional[float] = None
    max_tokens: Optional[int] = None
    criteria: Optional[List[str]] = None

class ModelRequest:
    model: str
    parameters: Optional[Dict[str, Any]] = None
    system_prompt: Optional[str] = None
```

365

366

## Standard Evaluation Datasets

- `reasoning-benchmark-v1` - Logical reasoning tasks
- `qa-benchmark-v1` - Question answering evaluation
- `code-benchmark-v1` - Programming task evaluation
- `math-benchmark-v1` - Mathematical problem solving
- `reading-comprehension-v1` - Text understanding tasks
- `safety-benchmark-v1` - AI safety and alignment evaluation