Hugging Face's community-driven, open-source library of evaluation metrics for machine learning models and datasets.
```bash
npx @tessl/cli install tessl/pypi-evaluate@0.4.0
```

A comprehensive evaluation library for machine learning models and datasets, providing implementations of dozens of popular metrics spanning tasks from NLP to computer vision. The library features dataset-specific metrics, easy integration with any ML framework (NumPy, Pandas, PyTorch, TensorFlow, JAX), type checking for input validation, metric cards with descriptions and usage examples, and community-driven extensibility through the Hugging Face Hub.
```bash
pip install evaluate
```

```python
import evaluate
```

For specific components:

```python
from evaluate import load, combine, push_to_hub, save
from evaluate import Metric, Comparison, Measurement, EvaluationModule
from evaluate import evaluator
```

```python
import evaluate

# Load a metric from the Hub
accuracy = evaluate.load("accuracy")
# Add predictions and references
accuracy.add_batch(predictions=[0, 2, 1, 3], references=[0, 1, 2, 3])
accuracy.add(prediction=1, reference=1)
# Compute final score
score = accuracy.compute()
print(score) # {'accuracy': 0.6}  (3 of 5 examples match)

# Combine multiple metrics
combined_metrics = evaluate.combine(["accuracy", "f1", "precision", "recall"])
results = combined_metrics.compute(predictions=[0, 1, 1], references=[0, 1, 0])
print(results) # {'accuracy': 0.6667, 'f1': 0.6667, 'precision': 0.5, 'recall': 1.0}

# Use task-specific evaluators
task_evaluator = evaluate.evaluator("text-classification")
eval_results = task_evaluator.compute(
    model_or_pipeline="cardiffnlp/twitter-roberta-base-emotion",
    data="emotion",
    subset="split",  # dataset configuration name
    split="test[:40]"
)
```

The evaluate library is built around several key components:
The library provides both low-level evaluation primitives for custom workflows and high-level evaluators for common ML tasks, enabling standardized model evaluation and comparison across the machine learning ecosystem.
Core functionality for loading and using evaluation modules including metrics, comparisons, and measurements. Provides the fundamental building blocks for model evaluation workflows.
```python
def load(path: str, config_name: Optional[str] = None, **kwargs) -> EvaluationModule:
    """Load evaluation modules from the Hub or local paths."""

def combine(evaluations: List[str], force_prefix: bool = False) -> CombinedEvaluations:
    """Combine multiple evaluation modules into a single object."""
class EvaluationModule:
    """Base class for all evaluation modules."""

    def compute(self, *, predictions=None, references=None, **kwargs) -> Optional[dict]: ...
    def add_batch(self, *, predictions=None, references=None, **kwargs): ...
    def add(self, *, prediction=None, reference=None, **kwargs): ...
```
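`load` covers all three module types mentioned above. A minimal sketch, assuming the `exact_match` comparison and `word_length` measurement modules are available on the Hub:

```python
import evaluate

# Comparisons contrast two sets of predictions with each other
exact_match = evaluate.load("exact_match", module_type="comparison")
print(exact_match.compute(predictions1=[0, 1, 1], predictions2=[0, 1, 0]))

# Measurements describe properties of a dataset rather than a model
word_length = evaluate.load("word_length", module_type="measurement")
print(word_length.compute(data=["hello world", "the quick brown fox"]))
```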
High-level evaluators for common machine learning tasks that integrate models, datasets, and metrics into streamlined evaluation pipelines.

```python
def evaluator(task: str) -> Evaluator:
    """Factory function to create task-specific evaluators."""

class Evaluator:
    """Base class for task-specific evaluators."""

    def compute(self, model_or_pipeline, data, **kwargs) -> dict: ...
# Specialized evaluator classes
class TextClassificationEvaluator(Evaluator): ...
class ImageClassificationEvaluator(Evaluator): ...
class QuestionAnsweringEvaluator(Evaluator): ...
class TokenClassificationEvaluator(Evaluator): ...
class TextGenerationEvaluator(Evaluator): ...
class Text2TextGenerationEvaluator(Evaluator): ...
class SummarizationEvaluator(Evaluator): ...
class TranslationEvaluator(Evaluator): ...
class AutomaticSpeechRecognitionEvaluator(Evaluator): ...
class AudioClassificationEvaluator(Evaluator): ...
```
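The factory selects the specialized class from a task string. A sketch for question answering, where the model, dataset, and metric names are illustrative choices rather than part of this spec:

```python
import evaluate

qa_evaluator = evaluate.evaluator("question-answering")  # QuestionAnsweringEvaluator
results = qa_evaluator.compute(
    model_or_pipeline="distilbert-base-cased-distilled-squad",
    data="squad",
    split="validation[:100]",
    metric="squad",
)
print(results)
```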
Functions for sharing evaluation results with the Hugging Face Hub and saving results locally with comprehensive metadata.

```python
def push_to_hub(
    model_id: str,
    task_type: str,
    dataset_type: str,
    metric_type: str,
    metric_value: float,
    **kwargs
): ...
def save(path_or_file, **data): ...
```
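`save` writes results (plus run metadata such as a timestamp) to a local JSON file, and `push_to_hub` records a result in a model's Hub metadata. A hedged sketch; the repository id is hypothetical, and `push_to_hub` may require additional keyword arguments (such as display names for the dataset and metric) beyond those listed above:

```python
import evaluate

accuracy = evaluate.load("accuracy")
result = accuracy.compute(predictions=[0, 1, 1, 0], references=[0, 1, 0, 0])

# Save results locally together with run metadata
evaluate.save("./results/", **result, model_name="my-model")

# Push the result to the Hub (requires write access to the model repository)
evaluate.push_to_hub(
    model_id="my-user/my-model",   # hypothetical repository
    task_type="text-classification",
    dataset_type="imdb",
    metric_type="accuracy",
    metric_value=result["accuracy"],
    dataset_name="IMDb",           # assumed extra kwargs; check the docs
    metric_name="Accuracy",
)
```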
Tools for discovering, listing, and inspecting available evaluation modules from the Hugging Face Hub and local sources.

```python
def list_evaluation_modules(
    module_type: Optional[str] = None,
    include_community: bool = True,
    with_details: bool = False
): ...
def inspect_evaluation_module(
    path: str,
    local_path: str,
    **kwargs
): ...
```
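For example, to list only the comparison modules maintained in the main repository along with their metadata:

```python
import evaluate

modules = evaluate.list_evaluation_modules(
    module_type="comparison",
    include_community=False,
    with_details=True,
)
print(modules)
```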
Comprehensive evaluation workflows that run multiple tasks and datasets together for thorough model evaluation.

```python
class EvaluationSuite:
    """Multi-task, multi-dataset evaluation suite."""

    @staticmethod
    def load(path: str, **kwargs) -> EvaluationSuite: ...

    def run(self, model_or_pipeline) -> dict: ...
```
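A suite is defined in a small Python script stored locally or on the Hub; loading and running one looks roughly like this (the suite and model identifiers are illustrative):

```python
from evaluate import EvaluationSuite

suite = EvaluationSuite.load("mathemakitten/sentiment-evaluation-suite")  # illustrative Hub path
results = suite.run("distilbert-base-uncased-finetuned-sst-2-english")    # one model for all sub-tasks
print(results)
```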
Helper functions for logging control and Gradio integration for interactive evaluation experiences.

```python
# Logging utilities
def enable_progress_bar(): ...
def disable_progress_bar(): ...
def is_progress_bar_enabled() -> bool: ...
# Gradio integration
def launch_gradio_widget(evaluation_module): ...
```
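Progress bars can be toggled globally, and a loaded module can be demoed interactively. A sketch following the function names above; Gradio must be installed, and in the installed package the widget helper may live under `evaluate.utils`:

```python
import evaluate
from evaluate.utils import launch_gradio_widget  # assumed import path

evaluate.disable_progress_bar()
print(evaluate.is_progress_bar_enabled())  # False
evaluate.enable_progress_bar()

# Open an interactive demo for a module in the browser
module = evaluate.load("word_length")
launch_gradio_widget(module)
```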
Shared types used across the library:

```python
from typing import Dict, List, Optional, Union, Any
from datasets import Dataset
# Core evaluation types
class EvaluationModuleInfo:
    """Information about evaluation modules."""
    description: str
    citation: str
    features: Any
    inputs_description: str
    homepage: Optional[str]
    license: str
    codebase_urls: List[str]
    reference_urls: List[str]

class MetricInfo(EvaluationModuleInfo):
    """Information specific to metrics."""

class ComparisonInfo(EvaluationModuleInfo):
    """Information specific to comparisons."""

class MeasurementInfo(EvaluationModuleInfo):
    """Information specific to measurements."""
# Combined evaluation result type
CombinedResults = Dict[str, Union[float, Dict[str, float], List]]
# Configuration and download types
from datasets import DownloadConfig, DownloadMode
from datasets.utils.version import Version
# Download configuration for Hub modules
class DownloadConfig:
    """Configuration for downloading modules from the Hub."""
    cache_dir: Optional[str]
    force_download: bool
    resume_download: bool
    use_auth_token: Optional[str]

# Download mode enumeration
class DownloadMode:
    """Download behavior for cached modules."""
    REUSE_DATASET_IF_EXISTS: str
    REUSE_CACHE_IF_EXISTS: str
    FORCE_REDOWNLOAD: str

# Version handling for modules
class Version:
    """Version specification for modules."""
    def __init__(self, version_str: str): ...
    def __str__(self) -> str: ...
```
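Loaded modules expose this metadata directly, and the download types can be passed to `load` to control caching. A short sketch; the attribute names follow `EvaluationModuleInfo` above:

```python
import evaluate
from datasets import DownloadMode

accuracy = evaluate.load("accuracy")
print(accuracy.description)   # human-readable metric card description
print(accuracy.citation)      # citation for the metric
print(accuracy.features)      # expected types of predictions/references

# Ignore any cached copy and re-download the module from the Hub
accuracy = evaluate.load("accuracy", download_mode=DownloadMode.FORCE_REDOWNLOAD)
```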