Python client to interact with Aleph Alpha API endpoints
—
Quality: Pending — Does it follow best practices?
Impact: Pending — No eval scenarios have been run.
Evaluate model performance against expected outputs with detailed metrics and analysis. Provides quantitative assessment of model predictions for quality assurance, benchmarking, and optimization.
Configure model evaluation by comparing generated outputs against expected results.
from dataclasses import dataclass
from typing import Any, Dict, Mapping, Optional


# NOTE(review): decorator reconstructed — fields with defaults and no __init__
# imply a dataclass; confirm against the published package source.
@dataclass
class EvaluationRequest:
    """Request for model evaluation against expected output.

    Configure model evaluation by comparing generated outputs against
    expected results.

    Attributes:
        prompt: Input prompt for model evaluation.
        completion_expected: Expected output text for comparison.
        contextual_control_threshold: Threshold for attention controls.
        control_log_additive: Method for applying attention controls.
    """

    prompt: Prompt
    completion_expected: str
    contextual_control_threshold: Optional[float] = None
    control_log_additive: Optional[bool] = True

    def to_json(self) -> Mapping[str, Any]:
        """Serialize request to JSON format, omitting optional fields left as None."""
        payload: Dict[str, Any] = {
            # Prompt carries its own JSON representation.
            "prompt": self.prompt.to_json(),
            "completion_expected": self.completion_expected,
        }
        # Only send optional knobs the caller actually set, so the API
        # applies its own defaults otherwise.
        if self.contextual_control_threshold is not None:
            payload["contextual_control_threshold"] = self.contextual_control_threshold
        if self.control_log_additive is not None:
            payload["control_log_additive"] = self.control_log_additive
        return payload

# Structured response containing evaluation metrics and detailed analysis results.
from dataclasses import dataclass
from typing import Any, Dict, Optional


@dataclass
class EvaluationResponse:
    """Response from model evaluation.

    Attributes:
        model_version: Version of model used for evaluation.
        message: Optional response message or status.
        result: Detailed evaluation metrics and scores.
        num_tokens_prompt_total: Total tokens processed in prompt.
    """

    model_version: str
    message: Optional[str]
    result: Dict[str, Any]
    num_tokens_prompt_total: int

    @staticmethod
    def from_json(json: Dict[str, Any]) -> "EvaluationResponse":
        """Create response from JSON data returned by the API.

        Optional fields are read with .get() so payloads that omit them
        still deserialize.
        """
        return EvaluationResponse(
            model_version=json["model_version"],
            message=json.get("message"),
            result=json["result"],
            # NOTE(review): the 0 default assumes the field may be absent in
            # older payloads — confirm against the API specification.
            num_tokens_prompt_total=json.get("num_tokens_prompt_total", 0),
        )

# Generate evaluation metrics comparing model output against expected results.
def evaluate(
    self,
    request: EvaluationRequest,
    model: str
) -> EvaluationResponse:
    """
    Evaluate model performance against expected output (synchronous).

    Parameters:
    - request: Evaluation configuration with prompt and expected output.
    - model: Name of the model to evaluate against.

    Returns:
        EvaluationResponse with evaluation metrics.

    NOTE(review): the implementation body is not shown in this document —
    presumably it issues a request to the evaluate endpoint and parses the
    response via EvaluationResponse.from_json; confirm against the client source.
    """
async def evaluate(
    self,
    request: EvaluationRequest,
    model: str
) -> EvaluationResponse:
    """
    Evaluate model performance against expected output (async).

    Parameters:
    - request: Evaluation configuration with prompt and expected output.
    - model: Name of the model to evaluate against.

    Returns:
        EvaluationResponse with evaluation metrics.

    NOTE(review): implementation body not shown in this document; confirm
    behavior against the AsyncClient source.
    """

# Comprehensive evaluation examples for quality assessment and benchmarking:
from aleph_alpha_client import Client, EvaluationRequest, Prompt

# Authenticate once; this client instance is reused by every example below.
client = Client(token="your-api-token")

# Basic evaluation - compare model output to expected result
question = Prompt.from_text("What is the capital of France?")
target_answer = "Paris"
basic_request = EvaluationRequest(
    prompt=question,
    completion_expected=target_answer,
)
response = client.evaluate(basic_request, model="luminous-extended")

print(f"Model version: {response.model_version}")
print(f"Evaluation results: {response.result}")
print(f"Tokens processed: {response.num_tokens_prompt_total}")
if response.message:
    print(f"Message: {response.message}")
# Extract specific metrics from results
def extract_metrics(eval_response: EvaluationResponse) -> dict:
    """Extract key metrics from evaluation response.

    Copies over only the well-known metric keys that are actually present
    in the raw result dict.
    """
    raw = eval_response.result
    known_keys = ("log_probability", "perplexity", "likelihood")
    return {key: raw[key] for key in known_keys if key in raw}
# Pull the commonly-used metrics out of the basic-evaluation response above.
metrics = extract_metrics(response)
print(f"Extracted metrics: {metrics}")

# Batch evaluation for benchmarking
# Each case pairs a prompt with its expected completion plus a category
# label used to group results in the reports below.
evaluation_cases = [
    {
        "prompt": "Translate to French: Hello",
        "expected": "Bonjour",
        "category": "translation"
    },
    {
        "prompt": "What is 2 + 2?",
        "expected": "4",
        "category": "math"
    },
    {
        "prompt": "Name the first president of the USA",
        "expected": "George Washington",
        "category": "history"
    },
    {
        "prompt": "What color is the sky?",
        "expected": "blue",
        "category": "general"
    }
]
def run_evaluation_suite(cases: list, model: str) -> dict:
    """Run evaluation suite and collect results by category.

    Issues one evaluate call per case and groups the per-case records
    under their category label.
    """
    grouped: dict = {}
    for case in cases:
        eval_request = EvaluationRequest(
            prompt=Prompt.from_text(case["prompt"]),
            completion_expected=case["expected"],
        )
        eval_response = client.evaluate(eval_request, model=model)
        # setdefault creates the category bucket on first use.
        grouped.setdefault(case["category"], []).append({
            "prompt": case["prompt"],
            "expected": case["expected"],
            "metrics": extract_metrics(eval_response),
            "raw_result": eval_response.result,
        })
    return grouped
# Run the evaluation suite
suite_results = run_evaluation_suite(evaluation_cases, "luminous-extended")

# Analyze results by category
for category, results in suite_results.items():
    print(f"\n{category.upper()} Category Results:")
    # Each entry was built by run_evaluation_suite above.
    for result in results:
        print(f" Prompt: '{result['prompt']}'")
        print(f" Expected: '{result['expected']}'")
        print(f" Metrics: {result['metrics']}")
# Multimodal evaluation
# Text is imported here as well: the original example used Text before its
# import further below, which would raise a NameError when run top-to-bottom.
from aleph_alpha_client import Image, Text

# Evaluate image description task
image = Image.from_file("landscape.jpg")
multimodal_prompt = Prompt([
    Text.from_text("Describe this image in one word:"),
    image
])
multimodal_request = EvaluationRequest(
    prompt=multimodal_prompt,
    completion_expected="landscape"
)
multimodal_response = client.evaluate(multimodal_request, model="luminous-extended")
print(f"Multimodal evaluation: {multimodal_response.result}")
# Evaluation with attention controls
from aleph_alpha_client import Text, TextControl, ControlTokenOverlap

controlled_text = Text(
    text="The most important answer is Paris.",
    controls=[
        TextControl(
            # "The most important answer is " is 29 characters, so "Paris"
            # starts at index 29 (the original example used 27, which
            # covers "s Par" instead of the intended token).
            start=29,
            length=5,  # length of "Paris"
            factor=2.0,
            token_overlap=ControlTokenOverlap.Complete
        )
    ]
)
controlled_prompt = Prompt([controlled_text])
controlled_request = EvaluationRequest(
    prompt=controlled_prompt,
    completion_expected="Paris",
    control_log_additive=True
)
controlled_response = client.evaluate(controlled_request, model="luminous-extended")
print(f"Controlled evaluation: {controlled_response.result}")
# Compare performance across models
# NOTE(review): model names are illustrative; substitute currently available models.
models_to_test = ["luminous-base", "luminous-extended", "luminous-supreme"]
def compare_models(prompt_text: str, expected: str, models: list) -> dict:
    """Compare evaluation results across multiple models.

    The same request is evaluated once per model; failures are recorded
    per model instead of aborting the whole comparison.
    """
    # One request object is enough — it is identical for every model.
    shared_request = EvaluationRequest(
        prompt=Prompt.from_text(prompt_text),
        completion_expected=expected
    )
    outcomes: dict = {}
    for name in models:
        try:
            resp = client.evaluate(shared_request, model=name)
        except Exception as e:
            outcomes[name] = {"error": str(e)}
        else:
            outcomes[name] = {
                "metrics": extract_metrics(resp),
                "tokens": resp.num_tokens_prompt_total,
            }
    return outcomes
# Compare models on a factual question
model_comparison = compare_models(
    "What is the chemical symbol for gold?",
    "Au",
    models_to_test
)

# Print one line per model: metrics/tokens on success, an error dict otherwise.
print("\nModel Comparison Results:")
for model, result in model_comparison.items():
    print(f"{model}: {result}")
# Statistical analysis of evaluation results
def analyze_evaluation_stats(results: list) -> dict:
    """Analyze statistics from multiple evaluation results.

    Parameters:
    - results: list of EvaluationResponse objects.

    Returns:
        A dict of log-probability statistics when any results carry a
        log probability, otherwise a count-only summary.
    """
    import statistics

    metrics_list = [extract_metrics(r) for r in results]
    # Extract log probabilities if available. Use an explicit None check:
    # 0.0 is a valid log probability but is falsy, so the original
    # truthiness test (`if m.get('log_probability')`) silently dropped it.
    log_probs = [
        m["log_probability"]
        for m in metrics_list
        if m.get("log_probability") is not None
    ]
    if log_probs:
        return {
            "count": len(log_probs),
            "mean_log_prob": statistics.mean(log_probs),
            "median_log_prob": statistics.median(log_probs),
            "stdev_log_prob": statistics.stdev(log_probs) if len(log_probs) > 1 else 0
        }
    return {"count": len(results), "log_probs_available": False}
# Collect multiple evaluation results for analysis
multiple_prompts = [
    ("What is water made of?", "H2O"),
    ("Name the largest planet", "Jupiter"),
    ("What is 10 * 10?", "100"),
    ("Capital of Italy?", "Rome")
]

evaluation_results = []
for prompt_text, expected in multiple_prompts:
    request = EvaluationRequest(
        prompt=Prompt.from_text(prompt_text),
        completion_expected=expected
    )
    # One synchronous call per prompt; whole responses are kept so
    # analyze_evaluation_stats can re-extract metrics from them.
    response = client.evaluate(request, model="luminous-extended")
    evaluation_results.append(response)

stats = analyze_evaluation_stats(evaluation_results)
print(f"\nEvaluation Statistics: {stats}")
# Async evaluation for large batches
import asyncio
# The original example used AsyncClient without importing it (NameError).
from aleph_alpha_client import AsyncClient

async def async_evaluation_batch(cases: list, model: str):
    """Run evaluation batch asynchronously.

    Creates one evaluate() coroutine per case and gathers them
    concurrently on a dedicated AsyncClient session.
    """
    async with AsyncClient(token="your-api-token") as async_client:
        tasks = []
        for case in cases:
            request = EvaluationRequest(
                prompt=Prompt.from_text(case["prompt"]),
                completion_expected=case["expected"]
            )
            tasks.append(async_client.evaluate(request, model))
        # gather preserves input order, so results line up with cases.
        results = await asyncio.gather(*tasks)
        return results

# Run async evaluation
# async_results = asyncio.run(async_evaluation_batch(evaluation_cases, "luminous-extended"))
# print(f"Async evaluation completed: {len(async_results)} results")
# Custom evaluation pipeline
class EvaluationPipeline:
    """Custom evaluation pipeline with configurable metrics."""

    def __init__(self, client, model):
        self.client = client
        self.model = model
        self.results = []

    def add_test_case(self, prompt: str, expected: str, category: str = "general"):
        """Add test case to pipeline."""
        case = {
            "prompt": prompt,
            "expected": expected,
            "category": category,
            "completed": False,
        }
        self.results.append(case)

    def run_all(self):
        """Execute all test cases."""
        for case in self.results:
            if case["completed"]:
                continue  # already evaluated on an earlier run
            eval_request = EvaluationRequest(
                prompt=Prompt.from_text(case["prompt"]),
                completion_expected=case["expected"],
            )
            eval_response = self.client.evaluate(eval_request, self.model)
            case["response"] = eval_response
            case["metrics"] = extract_metrics(eval_response)
            case["completed"] = True

    def get_summary(self):
        """Get evaluation summary."""
        finished = [case for case in self.results if case["completed"]]
        by_category: dict = {}
        for case in finished:
            by_category.setdefault(case["category"], []).append(case["metrics"])
        return {
            "total_tests": len(finished),
            "categories": list(by_category.keys()),
            "category_counts": {name: len(items) for name, items in by_category.items()},
        }
# Use custom pipeline
pipeline = EvaluationPipeline(client, "luminous-extended")
pipeline.add_test_case("What is AI?", "Artificial Intelligence", "tech")
pipeline.add_test_case("Color of grass?", "green", "nature")
pipeline.add_test_case("2 + 3 = ?", "5", "math")
pipeline.run_all()  # issues one evaluate call per pending test case
summary = pipeline.get_summary()
print(f"Pipeline summary: {summary}")

Install with Tessl CLI
npx tessl i tessl/pypi-aleph-alpha-client