# Evaluation

Evaluate AI model quality, capabilities, and behavioral characteristics across a range of tasks and benchmarks, using standardized metrics and built-in model comparison.

## Capabilities

### Create Evaluation Job

Start an evaluation job to assess model performance on standardized tasks.

```python { .api }
def create(
    model: str,
    evaluation_type: str,
    dataset: str,
    **kwargs
) -> EvaluationCreateResponse:
    """
    Create an evaluation job.

    Args:
        model: Model identifier to evaluate
        evaluation_type: Type of evaluation ("classify", "score", or "compare")
        dataset: Dataset identifier for evaluation
        **kwargs: Additional evaluation-type-specific parameters

    Returns:
        EvaluationCreateResponse with job information
    """
```

### Retrieve Evaluation Results

Get detailed results and metrics from completed evaluation jobs.

```python { .api }
def retrieve(id: str) -> EvaluationJob:
    """
    Retrieve evaluation job results.

    Args:
        id: Evaluation job identifier

    Returns:
        EvaluationJob with results and metrics
    """
```

### List Evaluation Jobs

List all evaluation jobs with their statuses and basic information.

```python { .api }
def list() -> List[EvaluationJob]:
    """
    List all evaluation jobs.

    Returns:
        List of EvaluationJob objects
    """
```

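Since `list()` returns every job, a common pattern is to group jobs by status on the client side. A minimal sketch, using a simple stand-in class in place of the SDK's `EvaluationJob`:

```python
from collections import defaultdict
from dataclasses import dataclass

@dataclass
class Job:  # stand-in for EvaluationJob
    id: str
    status: str

def group_by_status(jobs):
    """Group evaluation jobs by their status field."""
    groups = defaultdict(list)
    for job in jobs:
        groups[job.status].append(job)
    return groups

jobs = [Job("eval-1", "completed"), Job("eval-2", "running"), Job("eval-3", "completed")]
by_status = group_by_status(jobs)
print([j.id for j in by_status["completed"]])  # → ['eval-1', 'eval-3']
```

With the real client, `jobs` would come from `client.evaluation.list()` instead of being constructed by hand.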
### Async Evaluation Operations

All evaluation operations are also available asynchronously through the async client.

```python { .api }
async def create(model: str, evaluation_type: str, dataset: str, **kwargs) -> EvaluationCreateResponse: ...
async def retrieve(id: str) -> EvaluationJob: ...
async def list() -> List[EvaluationJob]: ...
```

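With the async variants, several evaluations can be launched concurrently via `asyncio.gather`. A sketch of the pattern, with a stub coroutine standing in for the async `create` call (the real code would go through the SDK's async client):

```python
import asyncio

async def create_stub(model: str, evaluation_type: str, dataset: str) -> dict:
    """Stub standing in for the async create() call."""
    await asyncio.sleep(0)  # simulate a network round-trip
    return {"model": model, "status": "pending"}

async def launch_all(models, dataset):
    """Start one evaluation per model, concurrently."""
    tasks = [create_stub(m, "score", dataset) for m in models]
    return await asyncio.gather(*tasks)

jobs = asyncio.run(launch_all(["model-a", "model-b"], "reasoning-benchmark-v1"))
print([j["model"] for j in jobs])  # → ['model-a', 'model-b']
```

`asyncio.gather` preserves input order, so results line up with the `models` list even though the requests run concurrently.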

## Usage Examples

### Basic Model Evaluation

```python
from together import Together

client = Together()

# Start evaluation job
evaluation = client.evaluation.create(
    model="meta-llama/Llama-3.2-3B-Instruct-Turbo",
    evaluation_type="classify",
    dataset="standard-benchmark-v1"
)

print(f"Evaluation job created: {evaluation.id}")
print(f"Status: {evaluation.status}")
```

### Monitor Evaluation Progress

```python
import time

def monitor_evaluation(client: Together, eval_id: str):
    """Monitor an evaluation job until it reaches a terminal status."""

    while True:
        eval_job = client.evaluation.retrieve(eval_id)
        print(f"Evaluation status: {eval_job.status}")

        if eval_job.status == "completed":
            print("Evaluation completed!")
            return eval_job
        elif eval_job.status in ("failed", "cancelled"):
            print(f"Evaluation ended without results: {eval_job.status}")
            return eval_job

        time.sleep(30)  # Check every 30 seconds

# Monitor the evaluation
completed_eval = monitor_evaluation(client, evaluation.id)

if completed_eval.status == "completed":
    print(f"Final score: {completed_eval.score}")
    print(f"Metrics: {completed_eval.metrics}")
```

### Compare Multiple Models

```python
def compare_models(client: Together, models: list, dataset: str):
    """Compare multiple models on the same evaluation dataset."""

    evaluation_jobs = []

    # Start evaluations for all models
    for model in models:
        eval_job = client.evaluation.create(
            model=model,
            evaluation_type="score",
            dataset=dataset
        )
        evaluation_jobs.append({
            'model': model,
            'job_id': eval_job.id,
            'job': eval_job
        })
        print(f"Started evaluation for {model}: {eval_job.id}")

    # Wait for all evaluations to complete
    results = []
    for eval_info in evaluation_jobs:
        completed = monitor_evaluation(client, eval_info['job_id'])
        results.append({
            'model': eval_info['model'],
            'score': getattr(completed, 'score', None),
            'metrics': getattr(completed, 'metrics', None) or {},
            'status': completed.status
        })

    # Sort by score (highest first)
    successful_results = [r for r in results if r['score'] is not None]
    successful_results.sort(key=lambda x: x['score'], reverse=True)

    return successful_results

# Compare models
models_to_compare = [
    "meta-llama/Llama-3.2-3B-Instruct-Turbo",
    "meta-llama/Llama-3.2-11B-Vision-Instruct-Turbo",
    "Qwen/Qwen2.5-VL-72B-Instruct"
]

comparison_results = compare_models(
    client,
    models_to_compare,
    "reasoning-benchmark-v1"
)

print("Model Comparison Results:")
for i, result in enumerate(comparison_results):
    print(f"{i+1}. {result['model']}: {result['score']:.3f}")
```

### Custom Evaluation Dataset

```python
def create_custom_evaluation(client: Together, model: str, questions: list):
    """Create a custom evaluation with specific questions."""

    # This would typically involve uploading a custom dataset.
    # For demonstration, this is the expected structure:
    custom_dataset = {
        "name": "custom-qa-evaluation",
        "questions": questions,
        "evaluation_type": "classify",
        "metrics": ["accuracy", "f1_score", "precision", "recall"]
    }

    # Upload custom dataset (hypothetical API)
    # dataset_id = client.datasets.upload(custom_dataset)

    # For now, use a standard dataset
    evaluation = client.evaluation.create(
        model=model,
        evaluation_type="classify",
        dataset="qa-benchmark-v1"
    )

    return evaluation

# Example custom questions
custom_questions = [
    {
        "question": "What is the capital of France?",
        "options": ["London", "Berlin", "Paris", "Madrid"],
        "correct_answer": "Paris",
        "category": "geography"
    },
    {
        "question": "What is 2 + 2?",
        "options": ["3", "4", "5", "6"],
        "correct_answer": "4",
        "category": "mathematics"
    }
]

custom_eval = create_custom_evaluation(
    client,
    "meta-llama/Llama-3.2-3B-Instruct-Turbo",
    custom_questions
)
```

### Evaluation Metrics Analysis

```python
def analyze_evaluation_results(client: Together, eval_id: str):
    """Analyze detailed evaluation results."""

    eval_job = client.evaluation.retrieve(eval_id)

    if eval_job.status != "completed":
        print(f"Evaluation not completed yet. Status: {eval_job.status}")
        return None

    analysis = {
        'overall_score': eval_job.score,
        'total_questions': 0,
        'category_breakdown': {}
    }

    # Analyze metrics if available
    if getattr(eval_job, 'metrics', None):
        metrics = eval_job.metrics

        analysis.update({
            'accuracy': metrics.get('accuracy', 0),
            'precision': metrics.get('precision', 0),
            'recall': metrics.get('recall', 0),
            'f1_score': metrics.get('f1_score', 0)
        })

        # Category-specific analysis
        for category, stats in metrics.get('categories', {}).items():
            analysis['category_breakdown'][category] = {
                'accuracy': stats.get('accuracy', 0),
                'question_count': stats.get('count', 0)
            }
            analysis['total_questions'] += stats.get('count', 0)

    return analysis

# Analyze results
analysis = analyze_evaluation_results(client, completed_eval.id)

if analysis:
    print(f"Overall Score: {analysis['overall_score']:.3f}")
    print(f"Accuracy: {analysis.get('accuracy', 0):.3f}")
    print(f"F1 Score: {analysis.get('f1_score', 0):.3f}")

    if analysis['category_breakdown']:
        print("\nCategory Breakdown:")
        for category, stats in analysis['category_breakdown'].items():
            print(f"  {category}: {stats['accuracy']:.3f} ({stats['question_count']} questions)")
```

## Types

### Request Types

```python { .api }
class EvaluationRequest:
    model: str
    evaluation_type: str
    dataset: str
    parameters: Optional[Dict[str, Any]] = None

class ClassifyParameters:
    threshold: Optional[float] = None
    categories: Optional[List[str]] = None

class ScoreParameters:
    metric: Optional[str] = None
    scale: Optional[Tuple[float, float]] = None

class CompareParameters:
    baseline_model: Optional[str] = None
    comparison_metric: Optional[str] = None
```

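Each evaluation type accepts its own parameter set. A hypothetical client-side helper (the name and the validation step are illustrative, not part of the SDK) that rejects parameters that don't belong to the chosen evaluation type before they are passed to `create`:

```python
# Allowed extra parameters per evaluation type, mirroring the
# ClassifyParameters / ScoreParameters / CompareParameters classes above.
ALLOWED_PARAMS = {
    "classify": {"threshold", "categories"},
    "score": {"metric", "scale"},
    "compare": {"baseline_model", "comparison_metric"},
}

def validate_params(evaluation_type: str, params: dict) -> dict:
    """Reject parameters that don't belong to the given evaluation type."""
    allowed = ALLOWED_PARAMS.get(evaluation_type)
    if allowed is None:
        raise ValueError(f"unknown evaluation type: {evaluation_type}")
    unknown = set(params) - allowed
    if unknown:
        raise ValueError(f"unexpected parameters for {evaluation_type}: {sorted(unknown)}")
    return params

params = validate_params("score", {"metric": "accuracy", "scale": (0.0, 1.0)})
print(params["metric"])  # → accuracy
```

The validated dict can then be spread into the call, e.g. `client.evaluation.create(model=..., evaluation_type="score", dataset=..., **params)`.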
### Response Types

```python { .api }
class EvaluationCreateResponse:
    id: str
    object: str
    model: str
    evaluation_type: str
    dataset: str
    status: str
    created_at: int

class EvaluationJob:
    id: str
    object: str
    model: str
    evaluation_type: str
    dataset: str
    status: str
    score: Optional[float]
    metrics: Optional[Dict[str, Any]]
    created_at: int
    completed_at: Optional[int]
    error: Optional[str]

class EvaluationStatusResponse:
    id: str
    status: str
    progress: Optional[float]
    estimated_completion: Optional[int]
```

### Configuration Types

```python { .api }
class EvaluationType:
    CLASSIFY = "classify"
    SCORE = "score"
    COMPARE = "compare"
    CUSTOM = "custom"

class EvaluationStatus:
    PENDING = "pending"
    RUNNING = "running"
    COMPLETED = "completed"
    FAILED = "failed"
    CANCELLED = "cancelled"

class JudgeModelConfig:
    model: str
    temperature: Optional[float] = None
    max_tokens: Optional[int] = None
    criteria: Optional[List[str]] = None

class ModelRequest:
    model: str
    parameters: Optional[Dict[str, Any]] = None
    system_prompt: Optional[str] = None
```

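The `EvaluationStatus` constants make the terminal states of a polling loop explicit. A sketch with exponential backoff, using a stub `retrieve` function that steps through statuses (a real loop would call `client.evaluation.retrieve` and `time.sleep`):

```python
import itertools

# Terminal states, per EvaluationStatus above
TERMINAL = {"completed", "failed", "cancelled"}

def poll(retrieve, eval_id, base_delay=1.0, max_delay=30.0, sleep=lambda s: None):
    """Poll retrieve() until a terminal status, backing off exponentially."""
    delay = base_delay
    while True:
        job = retrieve(eval_id)
        if job["status"] in TERMINAL:
            return job
        sleep(delay)
        delay = min(delay * 2, max_delay)

# Stub that reports "running" twice, then "completed".
statuses = itertools.chain(["running", "running"], itertools.repeat("completed"))
job = poll(lambda _id: {"id": _id, "status": next(statuses)}, "eval-42")
print(job["status"])  # → completed
```

Capping the delay at `max_delay` keeps long-running jobs from stretching the interval indefinitely while still easing load on the API early on.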
## Standard Evaluation Datasets

- `reasoning-benchmark-v1` - Logical reasoning tasks
- `qa-benchmark-v1` - Question answering evaluation
- `code-benchmark-v1` - Programming task evaluation
- `math-benchmark-v1` - Mathematical problem solving
- `reading-comprehension-v1` - Text understanding tasks
- `safety-benchmark-v1` - AI safety and alignment evaluation
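
To profile one model across several of these benchmarks, the create-and-monitor pattern from the examples above can simply be looped over dataset identifiers. A sketch with a stub in place of `client.evaluation.create`, showing the shape of the result:

```python
DATASETS = [
    "reasoning-benchmark-v1",
    "qa-benchmark-v1",
    "math-benchmark-v1",
]

def sweep(create, model, datasets):
    """Start one evaluation per dataset and collect the job handles."""
    return {ds: create(model=model, evaluation_type="score", dataset=ds)
            for ds in datasets}

# Stub standing in for client.evaluation.create
jobs = sweep(lambda **kw: {"id": f"eval-{kw['dataset']}", "status": "pending"},
             "meta-llama/Llama-3.2-3B-Instruct-Turbo", DATASETS)
print(jobs["qa-benchmark-v1"]["id"])  # → eval-qa-benchmark-v1
```

Each handle can then be passed to a monitoring loop to collect per-benchmark scores.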