tessl/pypi-aleph-alpha-client

Python client to interact with Aleph Alpha API endpoints

Evaluation & Testing

Evaluate model performance against expected outputs with detailed metrics and analysis. Provides quantitative assessment of model predictions for quality assurance, benchmarking, and optimization.

Capabilities

Evaluation Requests

Configure model evaluation by comparing generated outputs against expected results.

class EvaluationRequest:
    prompt: Prompt
    completion_expected: str
    contextual_control_threshold: Optional[float] = None
    control_log_additive: Optional[bool] = True
    """
    Request for model evaluation against expected output.
    
    Attributes:
    - prompt: Input prompt for model evaluation
    - completion_expected: Expected output text for comparison
    - contextual_control_threshold: If set, attention controls are also applied to tokens that are contextually similar to the explicitly controlled ones
    - control_log_additive: If True (default), controls are applied by adding log(factor) to attention scores; if False, the scores are scaled by the factor
    """

    def to_json(self) -> Mapping[str, Any]:
        """Serialize request to JSON format."""

Evaluation Responses

Structured response containing evaluation metrics and detailed analysis results.

class EvaluationResponse:
    model_version: str
    message: Optional[str]
    result: Dict[str, Any]
    num_tokens_prompt_total: int
    """
    Response from model evaluation.
    
    Attributes:
    - model_version: Version of model used for evaluation
    - message: Optional response message or status
    - result: Detailed evaluation metrics and scores
    - num_tokens_prompt_total: Total tokens processed in prompt
    """

    @staticmethod
    def from_json(json: Dict[str, Any]) -> EvaluationResponse:
        """Create response from JSON data."""

Model Evaluation

Generate evaluation metrics comparing model output against expected results.

def evaluate(
    self, 
    request: EvaluationRequest, 
    model: str
) -> EvaluationResponse:
    """
    Evaluate model performance against expected output.
    
    Parameters:
    - request: Evaluation configuration with prompt and expected output
    - model: Model name to evaluate
    
    Returns:
    EvaluationResponse with evaluation metrics
    """

async def evaluate(
    self, 
    request: EvaluationRequest, 
    model: str
) -> EvaluationResponse:
    """
    Evaluate model performance against expected output (async).
    
    Parameters:
    - request: Evaluation configuration
    - model: Model name to evaluate
    
    Returns:
    EvaluationResponse with evaluation metrics
    """

Usage Examples

Comprehensive evaluation examples for quality assessment and benchmarking:

from aleph_alpha_client import Client, EvaluationRequest, EvaluationResponse, Prompt

client = Client(token="your-api-token")

# Basic evaluation - compare model output to expected result
prompt = Prompt.from_text("What is the capital of France?")
expected_output = "Paris"

request = EvaluationRequest(
    prompt=prompt,
    completion_expected=expected_output
)

response = client.evaluate(request, model="luminous-extended")

print(f"Model version: {response.model_version}")
print(f"Evaluation results: {response.result}")
print(f"Tokens processed: {response.num_tokens_prompt_total}")

if response.message:
    print(f"Message: {response.message}")

# Extract specific metrics from results
def extract_metrics(eval_response: EvaluationResponse) -> dict:
    """Extract key metrics from evaluation response."""
    results = eval_response.result
    
    # Common metrics that might be present
    metrics = {}
    
    if 'log_probability' in results:
        metrics['log_probability'] = results['log_probability']
    
    if 'perplexity' in results:
        metrics['perplexity'] = results['perplexity']
        
    if 'likelihood' in results:
        metrics['likelihood'] = results['likelihood']
    
    return metrics

metrics = extract_metrics(response)
print(f"Extracted metrics: {metrics}")

# Batch evaluation for benchmarking
evaluation_cases = [
    {
        "prompt": "Translate to French: Hello",
        "expected": "Bonjour",
        "category": "translation"
    },
    {
        "prompt": "What is 2 + 2?",
        "expected": "4",
        "category": "math"
    },
    {
        "prompt": "Name the first president of the USA",
        "expected": "George Washington",
        "category": "history"
    },
    {
        "prompt": "What color is the sky?",
        "expected": "blue",
        "category": "general"
    }
]

def run_evaluation_suite(cases: list, model: str) -> dict:
    """Run evaluation suite and collect results by category."""
    results_by_category = {}
    
    for case in cases:
        prompt = Prompt.from_text(case["prompt"])
        request = EvaluationRequest(
            prompt=prompt,
            completion_expected=case["expected"]
        )
        
        response = client.evaluate(request, model=model)
        
        category = case["category"]
        if category not in results_by_category:
            results_by_category[category] = []
        
        results_by_category[category].append({
            "prompt": case["prompt"],
            "expected": case["expected"],
            "metrics": extract_metrics(response),
            "raw_result": response.result
        })
    
    return results_by_category

# Run the evaluation suite
suite_results = run_evaluation_suite(evaluation_cases, "luminous-extended")

# Analyze results by category
for category, results in suite_results.items():
    print(f"\n{category.upper()} Category Results:")
    for result in results:
        print(f"  Prompt: '{result['prompt']}'")
        print(f"  Expected: '{result['expected']}'")
        print(f"  Metrics: {result['metrics']}")

# Multimodal evaluation
from aleph_alpha_client import Image, Text

# Evaluate image description task
image = Image.from_file("landscape.jpg")
multimodal_prompt = Prompt([
    Text.from_text("Describe this image in one word:"),
    image
])

multimodal_request = EvaluationRequest(
    prompt=multimodal_prompt,
    completion_expected="landscape"
)

multimodal_response = client.evaluate(multimodal_request, model="luminous-extended")
print(f"Multimodal evaluation: {multimodal_response.result}")

# Evaluation with attention controls
from aleph_alpha_client import Text, TextControl, ControlTokenOverlap

controlled_text = Text(
    text="The most important answer is Paris.",
    controls=[
        TextControl(
            start=29,  # Character index where "Paris" starts in the text
            length=5,  # Length of "Paris"
            factor=2.0,
            token_overlap=ControlTokenOverlap.Complete
        )
    ]
)

controlled_prompt = Prompt([controlled_text])
controlled_request = EvaluationRequest(
    prompt=controlled_prompt,
    completion_expected="Paris",
    control_log_additive=True
)

controlled_response = client.evaluate(controlled_request, model="luminous-extended")
print(f"Controlled evaluation: {controlled_response.result}")

# Compare performance across models
models_to_test = ["luminous-base", "luminous-extended", "luminous-supreme"]

def compare_models(prompt_text: str, expected: str, models: list) -> dict:
    """Compare evaluation results across multiple models."""
    comparison = {}
    
    prompt = Prompt.from_text(prompt_text)
    request = EvaluationRequest(
        prompt=prompt,
        completion_expected=expected
    )
    
    for model in models:
        try:
            response = client.evaluate(request, model=model)
            comparison[model] = {
                "metrics": extract_metrics(response),
                "tokens": response.num_tokens_prompt_total
            }
        except Exception as e:
            comparison[model] = {"error": str(e)}
    
    return comparison

# Compare models on a factual question
model_comparison = compare_models(
    "What is the chemical symbol for gold?",
    "Au",
    models_to_test
)

print("\nModel Comparison Results:")
for model, result in model_comparison.items():
    print(f"{model}: {result}")

# Statistical analysis of evaluation results
def analyze_evaluation_stats(results: list) -> dict:
    """Analyze statistics from multiple evaluation results."""
    metrics_list = [extract_metrics(r) for r in results]
    
    # Extract log probabilities if available (compare against None so a 0.0 value is kept)
    log_probs = [m['log_probability'] for m in metrics_list if m.get('log_probability') is not None]
    
    if log_probs:
        import statistics
        return {
            "count": len(log_probs),
            "mean_log_prob": statistics.mean(log_probs),
            "median_log_prob": statistics.median(log_probs),
            "stdev_log_prob": statistics.stdev(log_probs) if len(log_probs) > 1 else 0
        }
    
    return {"count": len(results), "log_probs_available": False}

# Collect multiple evaluation results for analysis
multiple_prompts = [
    ("What is water made of?", "H2O"),
    ("Name the largest planet", "Jupiter"),
    ("What is 10 * 10?", "100"),
    ("Capital of Italy?", "Rome")
]

evaluation_results = []
for prompt_text, expected in multiple_prompts:
    request = EvaluationRequest(
        prompt=Prompt.from_text(prompt_text),
        completion_expected=expected
    )
    response = client.evaluate(request, model="luminous-extended")
    evaluation_results.append(response)

stats = analyze_evaluation_stats(evaluation_results)
print(f"\nEvaluation Statistics: {stats}")

# Async evaluation for large batches
import asyncio

from aleph_alpha_client import AsyncClient

async def async_evaluation_batch(cases: list, model: str):
    """Run evaluation batch asynchronously."""
    async with AsyncClient(token="your-api-token") as async_client:
        tasks = []
        
        for case in cases:
            prompt = Prompt.from_text(case["prompt"])
            request = EvaluationRequest(
                prompt=prompt,
                completion_expected=case["expected"]
            )
            task = async_client.evaluate(request, model)
            tasks.append(task)
        
        results = await asyncio.gather(*tasks)
        return results

# Run async evaluation
# async_results = asyncio.run(async_evaluation_batch(evaluation_cases, "luminous-extended"))
# print(f"Async evaluation completed: {len(async_results)} results")

# Custom evaluation pipeline
class EvaluationPipeline:
    """Custom evaluation pipeline with configurable metrics."""
    
    def __init__(self, client, model):
        self.client = client
        self.model = model
        self.results = []
    
    def add_test_case(self, prompt: str, expected: str, category: str = "general"):
        """Add test case to pipeline."""
        self.results.append({
            "prompt": prompt,
            "expected": expected,
            "category": category,
            "completed": False
        })
    
    def run_all(self):
        """Execute all test cases."""
        for test_case in self.results:
            if not test_case["completed"]:
                request = EvaluationRequest(
                    prompt=Prompt.from_text(test_case["prompt"]),
                    completion_expected=test_case["expected"]
                )
                
                response = self.client.evaluate(request, self.model)
                test_case["response"] = response
                test_case["metrics"] = extract_metrics(response)
                test_case["completed"] = True
    
    def get_summary(self):
        """Get evaluation summary."""
        completed = [r for r in self.results if r["completed"]]
        categories = {}
        
        for result in completed:
            cat = result["category"]
            if cat not in categories:
                categories[cat] = []
            categories[cat].append(result["metrics"])
        
        return {
            "total_tests": len(completed),
            "categories": list(categories.keys()),
            "category_counts": {cat: len(results) for cat, results in categories.items()}
        }

# Use custom pipeline
pipeline = EvaluationPipeline(client, "luminous-extended")
pipeline.add_test_case("What is AI?", "Artificial Intelligence", "tech")
pipeline.add_test_case("Color of grass?", "green", "nature")
pipeline.add_test_case("2 + 3 = ?", "5", "math")

pipeline.run_all()
summary = pipeline.get_summary()
print(f"Pipeline summary: {summary}")

Install with Tessl CLI

npx tessl i tessl/pypi-aleph-alpha-client
