evaluate: a HuggingFace community-driven open-source library of evaluation metrics for machine learning models and datasets.
—
The core evaluation functionality provides the fundamental building blocks for model evaluation workflows. This includes loading evaluation modules, using metrics/comparisons/measurements, and combining multiple evaluations into unified workflows.
The primary way to access evaluation functionality is through the load function, which retrieves evaluation modules from the Hugging Face Hub or local paths.
def load(
path: str,
config_name: Optional[str] = None,
module_type: Optional[str] = None,
process_id: int = 0,
num_process: int = 1,
cache_dir: Optional[str] = None,
experiment_id: Optional[str] = None,
keep_in_memory: bool = False,
download_config: Optional[DownloadConfig] = None,
download_mode: Optional[DownloadMode] = None,
revision: Optional[Union[str, Version]] = None,
**init_kwargs
) -> EvaluationModule:
"""Load an EvaluationModule (metric, comparison, or measurement).
Args:
path: Path to evaluation module or module identifier from Hub
config_name: Configuration name for the module (e.g., GLUE subset)
module_type: Type of module ('metric', 'comparison', 'measurement')
process_id: Process ID for distributed evaluation (0-based)
num_process: Total number of processes in distributed setup
cache_dir: Directory for caching downloaded modules
experiment_id: Unique identifier for experiment tracking
keep_in_memory: Store all data in memory (not for distributed)
download_config: Configuration for downloading from Hub
download_mode: How to handle existing cached data
revision: Specific revision/version to load
**init_kwargs: Additional initialization arguments for the module
"""Usage Example:
import evaluate
# Load popular metrics
accuracy = evaluate.load("accuracy")
bleu = evaluate.load("bleu")
rouge = evaluate.load("rouge")
# Load with specific configuration
squad_metric = evaluate.load("squad", config_name="v2")
# Load local evaluation module
custom_metric = evaluate.load("./path/to/custom_metric.py")

All evaluation functionality inherits from the EvaluationModule base class, providing a consistent API across metrics, comparisons, and measurements.
class EvaluationModule:
"""Base class for all evaluation modules."""
def compute(
self,
*,
predictions=None,
references=None,
**kwargs
) -> Optional[Dict[str, Any]]:
"""Compute evaluation results from accumulated predictions and references."""
def add_batch(
self,
*,
predictions=None,
references=None,
**kwargs
):
"""Add a batch of predictions and references."""
def add(
self,
*,
prediction=None,
reference=None,
**kwargs
):
"""Add a single prediction and reference pair."""
def download_and_prepare(
self,
download_config: Optional[DownloadConfig] = None,
dl_manager: Optional[DownloadManager] = None
):
"""Download and prepare the evaluation module."""
# Properties
@property
def name(self) -> str:
"""Name of the evaluation module."""
@property
def description(self) -> str:
"""Description of what the module evaluates."""
@property
def citation(self) -> str:
"""Citation information for the evaluation method."""
@property
def features(self) -> Features:
"""Expected input features schema."""
@property
def inputs_description(self) -> str:
"""Description of expected inputs."""
@property
def homepage(self) -> Optional[str]:
"""Homepage URL for the evaluation method."""
@property
def license(self) -> str:
"""License information."""
@property
def codebase_urls(self) -> List[str]:
"""URLs to relevant codebases."""
@property
def reference_urls(self) -> List[str]:
"""URLs to reference papers or documentation."""Usage Example:
import evaluate
# Load and use a metric
accuracy = evaluate.load("accuracy")
# Add individual predictions
accuracy.add(prediction=1, reference=1)
accuracy.add(prediction=0, reference=1)
# Add batch predictions
accuracy.add_batch(
predictions=[1, 0, 1, 1],
references=[1, 1, 0, 1]
)
# Compute final results
result = accuracy.compute()
print(result) # {'accuracy': 0.625}
# Access module information
print(accuracy.description)
print(accuracy.citation)

The library provides specialized classes for different types of evaluation:
class Metric(EvaluationModule):
"""Specialized evaluation module for metrics."""
class Comparison(EvaluationModule):
"""Specialized evaluation module for comparisons between models."""
class Measurement(EvaluationModule):
"""Specialized evaluation module for measurements."""These classes inherit all functionality from EvaluationModule but may have specialized behavior for their specific evaluation type.
The combine function allows you to run multiple evaluation modules together as a single unit:
def combine(
evaluations: Union[List[Union[str, EvaluationModule]], Dict[str, Union[str, EvaluationModule]]],
force_prefix: bool = False
) -> CombinedEvaluations:
"""Combine multiple evaluation modules into a single object.
Args:
evaluations: List or dict of evaluation modules. Can be module names (str)
or loaded EvaluationModule objects. If dict, keys are used as
prefixes for results.
force_prefix: If True, all results are prefixed with module names
"""class CombinedEvaluations:
"""Container for multiple evaluation modules."""
def add(
self,
*,
prediction=None,
reference=None,
**kwargs
):
"""Add prediction/reference to all contained modules."""
def add_batch(
self,
*,
predictions=None,
references=None,
**kwargs
):
"""Add batch predictions/references to all contained modules."""
def compute(
self,
*,
predictions=None,
references=None,
**kwargs
) -> Dict[str, Any]:
"""Compute results from all contained modules."""Usage Example:
import evaluate
# Combine multiple metrics
combined = evaluate.combine(["accuracy", "f1", "precision", "recall"])
# Use like a single metric
combined.add_batch(
predictions=[1, 0, 1, 0],
references=[1, 1, 0, 0]
)
results = combined.compute()
print(results)
# {
# 'accuracy': 0.5,
# 'f1': 0.5,
# 'precision': 0.5,
# 'recall': 0.5
# }
# Combine with custom names to avoid conflicts
combined_with_prefix = evaluate.combine([
("acc", evaluate.load("accuracy")),
("f1_macro", evaluate.load("f1", average="macro"))
], force_prefix=True)

Evaluation modules may raise the following exceptions:
- ValueError: Invalid input data or configuration
- TypeError: Incorrect data types for predictions or references
- ImportError: Missing required dependencies for specific metrics
- FileNotFoundError: Requested evaluation module cannot be found on the Hub or locally
- ConnectionError: Network issues when downloading from Hub

Example:
try:
metric = evaluate.load("nonexistent_metric")
except FileNotFoundError:
print("Metric not found")
try:
accuracy = evaluate.load("accuracy")
accuracy.compute(predictions=[1, 2], references=[1]) # Mismatched lengths
except ValueError as e:
print(f"Input validation error: {e}")Install with Tessl CLI
npx tessl i tessl/pypi-evaluate