Python client for Together's Cloud Platform, providing comprehensive AI model APIs.
Model performance evaluation with standardized metrics and comparison capabilities for assessing AI model quality, capabilities, and behavioral characteristics across various tasks and benchmarks.
Start an evaluation job to assess model performance on standardized tasks.
def create(
model: str,
evaluation_type: str,
dataset: str,
**kwargs
) -> EvaluationCreateResponse:
"""
Create an evaluation job.
Args:
model: Model identifier to evaluate
evaluation_type: Type of evaluation (classify, score, compare)
dataset: Dataset identifier for evaluation
Returns:
EvaluationCreateResponse with job information
"""Get detailed results and metrics from completed evaluation jobs.
def retrieve(id: str) -> EvaluationJob:
"""
Retrieve evaluation job results.
Args:
id: Evaluation job identifier
Returns:
EvaluationJob with results and metrics
"""List all evaluation jobs with their statuses and basic information.
def list() -> List[EvaluationJob]:
"""
List all evaluation jobs.
Returns:
List of EvaluationJob objects
"""All evaluation operations support asynchronous execution.
All evaluation operations support asynchronous execution.

async def create(model: str, evaluation_type: str, dataset: str, **kwargs) -> EvaluationCreateResponse: ...
async def retrieve(id: str) -> EvaluationJob: ...
async def list() -> List[EvaluationJob]: ...
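A minimal async sketch, assuming the asynchronous client is exposed as AsyncTogether and mirrors the synchronous evaluation interface:

import asyncio

from together import AsyncTogether


async def main():
    # Assumption: AsyncTogether exposes the same .evaluation resource as Together
    async_client = AsyncTogether()
    evaluation = await async_client.evaluation.create(
        model="meta-llama/Llama-3.2-3B-Instruct-Turbo",
        evaluation_type="classify",
        dataset="standard-benchmark-v1",
    )
    job = await async_client.evaluation.retrieve(evaluation.id)
    print(job.status)


asyncio.run(main())

The remaining examples use the synchronous client.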
from together import Together

client = Together()
# Start evaluation job
evaluation = client.evaluation.create(
model="meta-llama/Llama-3.2-3B-Instruct-Turbo",
evaluation_type="classify",
dataset="standard-benchmark-v1"
)
print(f"Evaluation job created: {evaluation.id}")
print(f"Status: {evaluation.status}")import time
def monitor_evaluation(client: Together, eval_id: str):
"""Monitor evaluation job until completion."""
while True:
eval_job = client.evaluation.retrieve(eval_id)
print(f"Evaluation status: {eval_job.status}")
if eval_job.status == "completed":
print("Evaluation completed!")
return eval_job
elif eval_job.status == "failed":
print("Evaluation failed!")
return eval_job
time.sleep(30) # Check every 30 seconds
# Monitor the evaluation
completed_eval = monitor_evaluation(client, evaluation.id)
if completed_eval.status == "completed":
print(f"Final score: {completed_eval.score}")
print(f"Metrics: {completed_eval.metrics}")def compare_models(client: Together, models: list, dataset: str):
"""Compare multiple models on the same evaluation dataset."""
evaluation_jobs = []
# Start evaluations for all models
for model in models:
eval_job = client.evaluation.create(
model=model,
evaluation_type="score",
dataset=dataset
)
evaluation_jobs.append({
'model': model,
'job_id': eval_job.id,
'job': eval_job
})
print(f"Started evaluation for {model}: {eval_job.id}")
# Wait for all evaluations to complete
results = []
for eval_info in evaluation_jobs:
completed = monitor_evaluation(client, eval_info['job_id'])
results.append({
'model': eval_info['model'],
'score': completed.score if hasattr(completed, 'score') else None,
'metrics': completed.metrics if hasattr(completed, 'metrics') else {},
'status': completed.status
})
# Sort by score (highest first)
successful_results = [r for r in results if r['score'] is not None]
successful_results.sort(key=lambda x: x['score'], reverse=True)
return successful_results
# Compare models
models_to_compare = [
"meta-llama/Llama-3.2-3B-Instruct-Turbo",
"meta-llama/Llama-3.2-11B-Vision-Instruct-Turbo",
"Qwen/Qwen2.5-VL-72B-Instruct"
]
comparison_results = compare_models(
client,
models_to_compare,
"reasoning-benchmark-v1"
)
print("Model Comparison Results:")
for i, result in enumerate(comparison_results):
print(f"{i+1}. {result['model']}: {result['score']:.3f}")def create_custom_evaluation(client: Together, model: str, questions: list):
"""Create custom evaluation with specific questions."""
# This would typically involve uploading a custom dataset
# For demonstration, we'll show the structure
custom_dataset = {
"name": "custom-qa-evaluation",
"questions": questions,
"evaluation_type": "classify",
"metrics": ["accuracy", "f1_score", "precision", "recall"]
}
# Upload custom dataset (hypothetical API)
# dataset_id = client.datasets.upload(custom_dataset)
# For now, use a standard dataset
evaluation = client.evaluation.create(
model=model,
evaluation_type="classify",
dataset="qa-benchmark-v1"
)
return evaluation
# Example custom questions
custom_questions = [
{
"question": "What is the capital of France?",
"options": ["London", "Berlin", "Paris", "Madrid"],
"correct_answer": "Paris",
"category": "geography"
},
{
"question": "What is 2 + 2?",
"options": ["3", "4", "5", "6"],
"correct_answer": "4",
"category": "mathematics"
}
]
custom_eval = create_custom_evaluation(
client,
"meta-llama/Llama-3.2-3B-Instruct-Turbo",
custom_questions
)

def analyze_evaluation_results(client: Together, eval_id: str):
"""Analyze detailed evaluation results."""
eval_job = client.evaluation.retrieve(eval_id)
if eval_job.status != "completed":
print(f"Evaluation not completed yet. Status: {eval_job.status}")
return None
analysis = {
'overall_score': eval_job.score,
'total_questions': 0,
'correct_answers': 0,
'category_breakdown': {},
'difficulty_breakdown': {}
}
# Analyze metrics if available
if hasattr(eval_job, 'metrics') and eval_job.metrics:
metrics = eval_job.metrics
analysis.update({
'accuracy': metrics.get('accuracy', 0),
'precision': metrics.get('precision', 0),
'recall': metrics.get('recall', 0),
'f1_score': metrics.get('f1_score', 0)
})
# Category-specific analysis
for category, stats in metrics.get('categories', {}).items():
analysis['category_breakdown'][category] = {
'accuracy': stats.get('accuracy', 0),
'question_count': stats.get('count', 0)
}
return analysis
# Analyze results
analysis = analyze_evaluation_results(client, completed_eval.id)
if analysis:
print(f"Overall Score: {analysis['overall_score']:.3f}")
print(f"Accuracy: {analysis['accuracy']:.3f}")
print(f"F1 Score: {analysis['f1_score']:.3f}")
if analysis['category_breakdown']:
print("\nCategory Breakdown:")
for category, stats in analysis['category_breakdown'].items():
print(f" {category}: {stats['accuracy']:.3f} ({stats['question_count']} questions)")class EvaluationRequest:
model: str
evaluation_type: str
dataset: str
parameters: Optional[Dict[str, Any]] = None
class ClassifyParameters:
threshold: Optional[float] = None
categories: Optional[List[str]] = None
class ScoreParameters:
metric: Optional[str] = None
scale: Optional[Tuple[float, float]] = None
class CompareParameters:
baseline_model: Optional[str] = None
comparison_metric: Optional[str] = None
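For illustration, a hedged sketch of how these parameter types might map onto create()'s **kwargs. The parameters keyword mirrors the EvaluationRequest.parameters field above; the exact keyword and field values accepted by the API are assumptions:

# Assumption: compare-type parameters are passed as a plain dict via **kwargs
evaluation = client.evaluation.create(
    model="meta-llama/Llama-3.2-3B-Instruct-Turbo",
    evaluation_type="compare",
    dataset="reasoning-benchmark-v1",
    parameters={
        "baseline_model": "meta-llama/Llama-3.2-11B-Vision-Instruct-Turbo",
        "comparison_metric": "accuracy",  # hypothetical metric name
    },
)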
class EvaluationCreateResponse:
id: str
object: str
model: str
evaluation_type: str
dataset: str
status: str
created_at: int
class EvaluationJob:
id: str
object: str
model: str
evaluation_type: str
dataset: str
status: str
score: Optional[float]
metrics: Optional[Dict[str, Any]]
created_at: int
completed_at: Optional[int]
error: Optional[str]
class EvaluationStatusResponse:
id: str
status: str
progress: Optional[float]
estimated_completion: Optional[int]

class EvaluationType:
CLASSIFY = "classify"
SCORE = "score"
COMPARE = "compare"
CUSTOM = "custom"
class EvaluationStatus:
PENDING = "pending"
RUNNING = "running"
COMPLETED = "completed"
FAILED = "failed"
CANCELLED = "cancelled"
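A small sketch of polling with these status constants instead of raw strings; the import path for the types is an assumption:

from together.types import EvaluationStatus  # assumed import path

eval_job = client.evaluation.retrieve(evaluation.id)
if eval_job.status == EvaluationStatus.COMPLETED:
    print(f"Score: {eval_job.score}")
elif eval_job.status in (EvaluationStatus.FAILED, EvaluationStatus.CANCELLED):
    # Failed or cancelled jobs carry an error message instead of results
    print(f"Evaluation ended without results: {eval_job.error}")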
class JudgeModelConfig:
model: str
temperature: Optional[float] = None
max_tokens: Optional[int] = None
criteria: Optional[List[str]] = None
class ModelRequest:
model: str
parameters: Optional[Dict[str, Any]] = None
system_prompt: Optional[str] = None

Available benchmark datasets:

reasoning-benchmark-v1 - Logical reasoning tasks
qa-benchmark-v1 - Question answering evaluation
code-benchmark-v1 - Programming task evaluation
math-benchmark-v1 - Mathematical problem solving
reading-comprehension-v1 - Text understanding tasks
safety-benchmark-v1 - AI safety and alignment evaluation
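As a quick sketch (reusing the client constructed in the usage examples above), any of these identifiers can be passed as the dataset argument:

# Start evaluations against two of the listed benchmarks
for dataset in ("reasoning-benchmark-v1", "math-benchmark-v1"):
    job = client.evaluation.create(
        model="meta-llama/Llama-3.2-3B-Instruct-Turbo",
        evaluation_type="score",
        dataset=dataset,
    )
    print(f"{dataset}: started job {job.id}")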
Install with Tessl CLI

npx tessl i tessl/pypi-together