HuggingFace community-driven open-source library of evaluation metrics for machine learning models and datasets.
—
Tools for discovering, listing, and inspecting available evaluation modules from the Hugging Face Hub and local sources. These functions help users explore the ecosystem of available metrics, comparisons, and measurements.
Discover all available evaluation modules on the Hugging Face Hub:
def list_evaluation_modules(
module_type: Optional[str] = None,
include_community: bool = True,
with_details: bool = False
) -> List[Union[str, Dict[str, Any]]]:
"""List all evaluation modules available on the Hugging Face Hub.
Args:
module_type: Type filter ('metric', 'comparison', 'measurement', or None for all)
include_community: Whether to include community-contributed modules
with_details: Return full metadata dict instead of just module ID strings
Returns:
List of module IDs (strings) when with_details=False, or
list of metadata dictionaries when with_details=True
"""Parameters:
module_type: Filter by type ("metric", "comparison", "measurement") or None for allinclude_community: Whether to include community-contributed moduleswith_details: Whether to include detailed metadata for each moduleUsage Example:
import evaluate
# List all available evaluation modules
all_modules = evaluate.list_evaluation_modules()
print(f"Found {len(all_modules)} evaluation modules")
# List only metrics
metrics = evaluate.list_evaluation_modules(module_type="metric")
print(f"Available metrics: {len(metrics)}")
# List with detailed information
detailed_metrics = evaluate.list_evaluation_modules(
module_type="metric",
with_details=True
)
for metric in detailed_metrics[:5]: # Show first 5
print(f"- {metric['id']}: {metric.get('description', 'No description')}")

Example Output:
# Basic listing (with_details=False) returns a list of module ID strings
[
'accuracy',
'bleu',
'rouge',
'f1',
# ... more modules
]
# Detailed listing
[
{
'id': 'accuracy',
'description': 'Computes the accuracy classification score.',
'tags': ['evaluation', 'metric'],
'downloads': 50000,
# ... additional metadata
},
# ... more detailed entries
]

Filter by Module Type:
import evaluate
# Get only comparison modules
# Get only comparison modules (returns module ID strings by default)
comparisons = evaluate.list_evaluation_modules(module_type="comparison")
print("Available comparisons:", comparisons)
# Get only measurement modules
measurements = evaluate.list_evaluation_modules(module_type="measurement")
print("Available measurements:", measurements)
# Include only official modules (exclude community)
official_metrics = evaluate.list_evaluation_modules(
module_type="metric",
include_community=False
)

Copy evaluation modules to local directories for inspection and modification:
def inspect_evaluation_module(
path: str,
local_path: str,
download_config: Optional[DownloadConfig] = None,
**download_kwargs
) -> None:
"""Copy an evaluation module locally for inspection and modification.
Args:
path: Path to evaluation module - can be Hub module name
(e.g., 'accuracy') or local path to module
local_path: Local directory path where module will be copied
download_config: Configuration for downloading from Hub (optional)
**download_kwargs: Additional download parameters
"""Parameters:
path: Hub module name or path to local modulelocal_path: Local directory where module will be copieddownload_config: Configuration for downloading from Hub**download_kwargs: Additional download parametersUsage Example:
import evaluate
import os
# Inspect a metric from the Hub
evaluate.inspect_evaluation_module(
path="accuracy",
local_path="./inspected_accuracy"
)
# Check what was downloaded
print("Inspected files:")
for root, dirs, files in os.walk("./inspected_accuracy"):
for file in files:
print(f"- {os.path.join(root, file)}")
# Now you can examine and modify the module
with open("./inspected_accuracy/accuracy.py", "r") as f:
print("Module source:")
print(f.read()[:500] + "...")

Inspect Community Module:
import evaluate
# Inspect a community-contributed module
evaluate.inspect_evaluation_module(
path="username/custom-metric",
local_path="./custom_metric_inspection"
)
# Inspect with specific configuration
from datasets import DownloadConfig
config = DownloadConfig(
cache_dir="./custom_cache",
force_download=True
)
evaluate.inspect_evaluation_module(
path="bleu",
local_path="./bleu_source",
download_config=config
)

Modify and Use Inspected Module:
import evaluate
# First inspect the module
evaluate.inspect_evaluation_module(
path="f1",
local_path="./my_f1_variant"
)
# Modify the local copy (edit files as needed)
# ... make changes to ./my_f1_variant/f1.py ...
# Load your modified version
custom_f1 = evaluate.load("./my_f1_variant")
# Use the modified metric
result = custom_f1.compute(
predictions=[1, 0, 1, 0],
references=[1, 1, 0, 0]
)
print(result)

Explore Available Metrics for a Task:
import evaluate
# Get all metrics and filter by name patterns
all_modules = evaluate.list_evaluation_modules(with_details=True)
# Find text-related metrics
text_metrics = [
module for module in all_modules
if any(keyword in module['id'].lower()
for keyword in ['bleu', 'rouge', 'bertscore', 'meteor'])
]
print("Text generation metrics:")
for metric in text_metrics:
print(f"- {metric['id']}: {metric.get('description', '')}")
# Find classification metrics
classification_metrics = [
module for module in all_modules
if any(keyword in module['id'].lower()
for keyword in ['accuracy', 'f1', 'precision', 'recall'])
]
print("\nClassification metrics:")
for metric in classification_metrics:
print(f"- {metric['id']}: {metric.get('description', '')}")

Research and Development Workflow:
import evaluate
# 1. Discover what's available
metrics = evaluate.list_evaluation_modules(
module_type="metric",
with_details=True
)
# 2. Find metrics of interest
nlp_metrics = [m for m in metrics if 'nlp' in str(m.get('tags', [])).lower()]
# 3. Inspect implementation details
for metric in nlp_metrics[:3]: # Inspect first 3
print(f"Inspecting {metric['id']}...")
evaluate.inspect_evaluation_module(
path=metric['id'],
local_path=f"./inspected_{metric['id']}"
)
# 4. Load and test metrics
for metric in nlp_metrics[:3]:
try:
loaded_metric = evaluate.load(metric['id'])
print(f"✓ Successfully loaded {metric['id']}")
print(f" Description: {loaded_metric.description}")
except Exception as e:
print(f"✗ Failed to load {metric['id']}: {e}")

Compare Module Versions:
import evaluate
# Inspect different versions of a module
versions = ["v1.0.0", "v1.1.0", "main"]
for version in versions:
evaluate.inspect_evaluation_module(
path="bleu",
local_path=f"./bleu_{version}",
revision=version
)
print(f"Inspected BLEU {version}")
# Now compare the implementations manually
# ... examine differences between versions ...

Module discovery functions may raise:
- ConnectionError: Network issues accessing the Hub
- FileNotFoundError: Module not found on Hub or locally
- PermissionError: Cannot write to local inspection directory
- ValueError: Invalid module type or configuration

Example:
import evaluate
try:
modules = evaluate.list_evaluation_modules()
except ConnectionError:
print("Cannot connect to Hub - working offline")
modules = []
try:
evaluate.inspect_evaluation_module(
path="nonexistent-metric",
local_path="./test"
)
except FileNotFoundError:
print("Metric not found on Hub")
try:
evaluate.inspect_evaluation_module(
path="accuracy",
local_path="/root/protected" # May not have write access
)
except PermissionError:
print("Cannot write to protected directory")Install with Tessl CLI
npx tessl i tessl/pypi-evaluate