HuggingFace community-driven open-source library of evaluation metrics for machine learning models and datasets.
—
Functions for sharing evaluation results with the Hugging Face Hub and saving results locally with comprehensive metadata. These tools enable reproducible evaluation workflows and result sharing within the ML community.
Push evaluation results directly to model metadata on the Hugging Face Hub:
def push_to_hub(
model_id: str,
task_type: str,
dataset_type: str,
dataset_name: str,
metric_type: str,
metric_name: str,
metric_value: float,
task_name: Optional[str] = None,
dataset_config: Optional[str] = None,
dataset_split: Optional[str] = None,
dataset_revision: Optional[str] = None,
dataset_args: Optional[Dict[str, int]] = None,
metric_config: Optional[str] = None,
metric_args: Optional[Dict[str, int]] = None,
overwrite: bool = False
):
"""Push evaluation results to a model's metadata on Hugging Face Hub.
Args:
model_id: Model identifier on the Hub (e.g., "username/model-name")
task_type: Task type (must be from Hub's allowed tasks)
dataset_type: Dataset identifier from Hub
dataset_name: Human-readable dataset name
metric_type: Metric identifier from Hub
metric_name: Human-readable metric name
metric_value: Computed metric score
task_name: Human-readable task name (optional)
dataset_config: Dataset configuration/subset name (optional)
dataset_split: Dataset split used ("train", "test", "validation")
dataset_revision: Specific dataset revision/commit (optional)
dataset_args: Additional dataset parameters (optional)
metric_config: Metric configuration name (optional)
metric_args: Additional metric parameters (optional)
overwrite: Whether to overwrite existing results (default: False)
"""

Usage Example:
import evaluate
# Evaluate a model
accuracy = evaluate.load("accuracy")
accuracy.add_batch(predictions=[1, 0, 1], references=[1, 1, 0])
result = accuracy.compute()
# Push results to the model's Hub page
evaluate.push_to_hub(
model_id="my-username/my-model",
task_type="text-classification",
dataset_type="glue",
dataset_name="sst2",
metric_type="accuracy",
metric_name="accuracy",
metric_value=result["accuracy"],
dataset_config="default",
dataset_split="validation"
)

Advanced Example with Multiple Metrics:
import evaluate
# Evaluate with multiple metrics
combined = evaluate.combine(["accuracy", "f1", "precision", "recall"])
results = combined.compute(predictions=[1, 0, 1, 0], references=[1, 1, 0, 0])
# Push each metric separately
for metric_name, metric_value in results.items():
evaluate.push_to_hub(
model_id="my-username/my-classification-model",
task_type="text-classification",
dataset_type="custom",
dataset_name="my-dataset",
metric_type=metric_name,
metric_name=metric_name,
metric_value=metric_value,
dataset_split="test",
overwrite=True # Update existing results
    )

Save evaluation results to local JSON files with comprehensive metadata:
def save(path_or_file: Union[str, Path, TextIOWrapper], **data)

The function automatically includes system metadata such as the timestamp, Python version, evaluate version, and platform (the `_`-prefixed keys in the example output below).
Usage Example:
import evaluate
# Run evaluation
bleu = evaluate.load("bleu")
rouge = evaluate.load("rouge")
bleu_result = bleu.compute(
predictions=["hello there", "general kenobi"],
references=[["hello there"], ["general kenobi"]]
)
rouge_result = rouge.compute(
predictions=["hello there", "general kenobi"],
references=["hello there", "general kenobi"]
)
# Save results with metadata
evaluate.save(
"evaluation_results.json",
model_name="my-model-v1.0",
dataset="custom-test-set",
bleu_score=bleu_result,
rouge_scores=rouge_result,
notes="Initial baseline evaluation"
)

Example Output Structure:
{
"model_name": "my-model-v1.0",
"dataset": "custom-test-set",
"bleu_score": {"bleu": 1.0},
"rouge_scores": {
"rouge1": 1.0,
"rouge2": 1.0,
"rougeL": 1.0,
"rougeLsum": 1.0
},
"notes": "Initial baseline evaluation",
"_timestamp": "2023-12-07T15:30:45.123456",
"_python_version": "3.9.7",
"_evaluate_version": "0.4.5",
"_platform": "Linux-5.4.0-x86_64"
}

Save to File Object:
import evaluate
import json
# Evaluate model
accuracy = evaluate.load("accuracy")
result = accuracy.compute(predictions=[1, 0, 1], references=[1, 1, 0])
# Save to open file object
with open("results.json", "w") as f:
evaluate.save(
f,
experiment_id="exp_001",
model="bert-base-uncased",
accuracy=result["accuracy"],
hyperparameters={"lr": 0.001, "batch_size": 32}
    )

Batch Results Saving:
import evaluate
# Run multiple evaluations
evaluator = evaluate.evaluator("text-classification")
models = [
"distilbert-base-uncased",
"bert-base-uncased",
"roberta-base"
]
all_results = {}
for model_name in models:
results = evaluator.compute(
model_or_pipeline=model_name,
data="imdb",
split="test[:100]"
)
all_results[model_name] = results
# Save comprehensive comparison
evaluate.save(
"model_comparison.json",
experiment_name="IMDB Classification Comparison",
dataset="imdb",
results=all_results,
evaluation_config={
"split": "test[:100]",
"metric": "accuracy",
"task": "text-classification"
}
)

Complete Evaluation and Sharing Workflow:
import evaluate
from transformers import pipeline
# Setup evaluation
model_name = "cardiffnlp/twitter-roberta-base-emotion"
evaluator = evaluate.evaluator("text-classification")
# Run evaluation
results = evaluator.compute(
model_or_pipeline=model_name,
data="emotion",
split="test[:200]",
metric="accuracy"
)
# Save detailed results locally
evaluate.save(
f"evaluation_{model_name.replace('/', '_')}.json",
model=model_name,
dataset="emotion",
split="test[:200]",
results=results,
evaluation_date="2023-12-07"
)
# Share key results on Hub
evaluate.push_to_hub(
model_id=model_name,
task_type="text-classification",
dataset_type="emotion",
dataset_name="emotion",
metric_type="accuracy",
metric_name="accuracy",
metric_value=results["accuracy"],
dataset_split="test"
)
print(f"Evaluation complete. Accuracy: {results['accuracy']:.3f}")

Hub integration functions may raise:
- ConnectionError: Network connectivity issues
- HTTPError: Hub API authentication or permission errors
- ValueError: Invalid model_id format or missing required parameters
- FileNotFoundError: Invalid local file paths for saving
- PermissionError: Insufficient file system permissions

Example:
import evaluate
try:
evaluate.push_to_hub(
model_id="invalid/model/name/format",
task_type="text-classification",
# ... other parameters
)
except ValueError as e:
print(f"Invalid model ID: {e}")
try:
evaluate.save("/invalid/path/results.json", data="test")
except PermissionError as e:
    print(f"Cannot write to path: {e}")

Install with Tessl CLI
npx tessl i tessl/pypi-evaluate