A community-driven, open-source HuggingFace library of evaluation metrics for machine learning models and datasets.
—
Evaluation suites bundle multiple tasks and datasets into a single workflow for thorough model evaluation, enabling systematic benchmarking across diverse scenarios with standardized configurations.
The EvaluationSuite class provides multi-task, multi-dataset evaluation workflows:
class EvaluationSuite:
    """Multi-task, multi-dataset evaluation suite."""

    @staticmethod
    def load(
        path: str,
        download_mode: Optional[DownloadMode] = None,
        revision: Optional[Union[str, Version]] = None,
        download_config: Optional[DownloadConfig] = None
    ) -> EvaluationSuite:
        """Load an evaluation suite from the Hub or a local path."""

    def run(self, model_or_pipeline) -> Dict[str, Any]:
        """Run the complete evaluation suite on a model."""
Usage Example:
import evaluate
# Load a pre-defined evaluation suite
suite = evaluate.EvaluationSuite.load("super_glue")
# Run evaluation on a model
from transformers import pipeline
model = pipeline("text-classification", model="distilbert-base-uncased")
results = suite.run(model)
print(results)
# Results contain scores for all tasks in the suite
# {
# 'boolq': {'accuracy': 0.75},
# 'cb': {'accuracy': 0.82, 'f1': 0.79},
# 'copa': {'accuracy': 0.68},
# # ... more task results
# }
Evaluation suites are defined using JSON configuration files that specify tasks, datasets, and metrics:
Example Suite Configuration:
{
  "suite_name": "my_classification_suite",
  "description": "Custom text classification evaluation suite",
  "tasks": [
    {
      "task_type": "text-classification",
      "dataset": "glue",
      "subset": "sst2",
      "split": "validation",
      "metrics": ["accuracy", "f1"]
    },
    {
      "task_type": "text-classification",
      "dataset": "glue",
      "subset": "mrpc",
      "split": "validation",
      "metrics": ["accuracy", "f1"]
    },
    {
      "task_type": "text-classification",
      "dataset": "imdb",
      "split": "test[:1000]",
      "metrics": ["accuracy"]
    }
  ]
}
Loading Custom Suite:
import evaluate
from transformers import pipeline

# Load custom suite from local file
custom_suite = evaluate.EvaluationSuite.load("./my_suite.json")

# Run on multiple models
models = [
    "distilbert-base-uncased",
    "bert-base-uncased",
    "roberta-base"
]
all_results = {}
for model_name in models:
    print(f"Evaluating {model_name}...")
    model = pipeline("text-classification", model=model_name)
    results = custom_suite.run(model)
    all_results[model_name] = results

# Compare results across models
for task in results.keys():
    print(f"\n{task} Results:")
    for model_name in all_results:
        accuracy = all_results[model_name][task].get('accuracy')
        if accuracy is None:
            print(f"  {model_name}: N/A")
        else:
            print(f"  {model_name}: {accuracy:.3f}")
The library includes several pre-built evaluation suites:
GLUE Suite:
import evaluate
# Load GLUE benchmark suite
glue_suite = evaluate.EvaluationSuite.load("glue")
# Evaluate a model on all GLUE tasks
from transformers import pipeline
model = pipeline("text-classification", model="bert-base-uncased")
glue_results = glue_suite.run(model)
# View results for specific tasks
print(f"CoLA: {glue_results['cola']['matthews_correlation']:.3f}")
print(f"SST-2: {glue_results['sst2']['accuracy']:.3f}")
print(f"MRPC: {glue_results['mrpc']['f1']:.3f}")SuperGLUE Suite:
import evaluate
# Load SuperGLUE benchmark
superglue_suite = evaluate.EvaluationSuite.load("super_glue")
# Run evaluation (reusing the text-classification pipeline from the GLUE example)
results = superglue_suite.run(model)
# SuperGLUE includes more challenging tasks
print(f"BoolQ: {results['boolq']['accuracy']:.3f}")
print(f"RTE: {results['rte']['accuracy']:.3f}")
print(f"WiC: {results['wic']['accuracy']:.3f}")Multi-Modal Suite:
# Configuration for multi-modal evaluation
multimodal_config = {
    "suite_name": "multimodal_suite",
    "description": "Evaluation across text, image, and audio tasks",
    "tasks": [
        {
            "task_type": "text-classification",
            "dataset": "imdb",
            "split": "test[:500]",
            "metrics": ["accuracy"]
        },
        {
            "task_type": "image-classification",
            "dataset": "cifar10",
            "split": "test[:500]",
            "metrics": ["accuracy", "top_5_accuracy"]
        },
        {
            "task_type": "audio-classification",
            "dataset": "superb",
            "subset": "ks",
            "split": "test[:500]",
            "metrics": ["accuracy"]
        }
    ]
}

# Save and load the suite
import json
import evaluate

with open("multimodal_suite.json", "w") as f:
    json.dump(multimodal_config, f, indent=2)

suite = evaluate.EvaluationSuite.load("./multimodal_suite.json")
Domain-Specific Suite:
# Medical text classification suite
medical_suite_config = {
    "suite_name": "medical_text_suite",
    "description": "Medical text classification benchmarks",
    "tasks": [
        {
            "task_type": "text-classification",
            "dataset": "medical_questions_pairs",
            "metrics": ["accuracy", "f1"]
        },
        {
            "task_type": "text-classification",
            "dataset": "pubmed_20k_rct",
            "metrics": ["accuracy", "precision", "recall"]
        }
    ]
}
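As with the multi-modal example above, a domain-specific configuration can be saved to disk, loaded, and run. A minimal sketch following the same pattern; the "medical_suite.json" filename and the "my-org/medical-bert" checkpoint name are placeholders, not part of the library:
import json
import evaluate
from transformers import pipeline

# Save the configuration and load it as a suite (same pattern as the multi-modal example)
with open("medical_suite.json", "w") as f:
    json.dump(medical_suite_config, f, indent=2)
medical_suite = evaluate.EvaluationSuite.load("./medical_suite.json")

# "my-org/medical-bert" is a placeholder; substitute a real fine-tuned checkpoint
medical_model = pipeline("text-classification", model="my-org/medical-bert")
medical_results = medical_suite.run(medical_model)
print(medical_results)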
Comprehensive Results Processing:
import evaluate
import pandas as pd
from transformers import pipeline

# Load and run suite
suite = evaluate.EvaluationSuite.load("glue")
model = pipeline("text-classification", model="bert-base-uncased")
results = suite.run(model)

# Convert to DataFrame for analysis
results_data = []
for task, metrics in results.items():
    for metric_name, value in metrics.items():
        results_data.append({
            'task': task,
            'metric': metric_name,
            'value': value
        })
df = pd.DataFrame(results_data)
print(df.pivot(index='task', columns='metric', values='value'))

# Calculate overall suite score (if applicable)
accuracy_scores = [
    metrics.get('accuracy', 0)
    for metrics in results.values()
    if 'accuracy' in metrics
]
overall_accuracy = sum(accuracy_scores) / len(accuracy_scores)
print(f"Overall Suite Accuracy: {overall_accuracy:.3f}")
Model Comparison with Suites:
import evaluate
from transformers import pipeline

suite = evaluate.EvaluationSuite.load("glue")
models_to_compare = [
    "distilbert-base-uncased",
    "bert-base-uncased",
    "roberta-base"
]
comparison_results = {}
for model_name in models_to_compare:
    model = pipeline("text-classification", model=model_name)
    results = suite.run(model)
    comparison_results[model_name] = results

# Create comparison table
import pandas as pd
comparison_data = []
for model_name, model_results in comparison_results.items():
    for task, metrics in model_results.items():
        for metric_name, value in metrics.items():
            comparison_data.append({
                'model': model_name,
                'task': task,
                'metric': metric_name,
                'value': value
            })
comparison_df = pd.DataFrame(comparison_data)
pivot_table = comparison_df.pivot_table(
    index=['task', 'metric'],
    columns='model',
    values='value'
)
print(pivot_table)
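As a follow-up, the best-scoring model for each task/metric pair can be read directly off the pivot table. A short sketch using pandas, assuming every metric in the suite is higher-is-better (as accuracy and F1 are):
# For each (task, metric) row, find the model column with the highest score
best_model_per_metric = pivot_table.idxmax(axis=1)
print(best_model_per_metric)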
Evaluation suites may raise:
- FileNotFoundError: Suite configuration file not found
- ValueError: Invalid suite configuration format
- ImportError: Missing dependencies for specific tasks
- RuntimeError: Model incompatibility with suite tasks
Example:
import evaluate
from transformers import pipeline

try:
    suite = evaluate.EvaluationSuite.load("nonexistent_suite")
except FileNotFoundError:
    print("Suite not found")

try:
    suite = evaluate.EvaluationSuite.load("glue")
    # Model incompatible with some tasks
    incompatible_model = pipeline("text-generation", model="gpt2")
    results = suite.run(incompatible_model)
except RuntimeError as e:
    print(f"Model incompatibility: {e}")
Install with Tessl CLI
npx tessl i tessl/pypi-evaluate