Hugging Face's community-driven, open-source library of evaluation metrics for machine learning models and datasets.
—
Task-specific evaluators provide high-level evaluation pipelines that integrate models, datasets, and metrics for common machine learning tasks. They simplify the evaluation process by handling data loading, preprocessing, inference, and metric computation in a unified workflow.
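As a rough sketch of what that unified workflow replaces, the snippet below first evaluates a model by hand (loading data, running inference, mapping labels, calling a metric) and then performs the equivalent evaluation in a single compute() call. The IMDb dataset, the SST-2 DistilBERT checkpoint, and the label mapping are illustrative choices, not part of the API described here.

import evaluate
from datasets import load_dataset
from transformers import pipeline

# Manual workflow the evaluator wraps (model and dataset names are examples)
dataset = load_dataset("imdb", split="test[:100]")
classifier = pipeline("text-classification", model="distilbert-base-uncased-finetuned-sst-2-english")
metric = evaluate.load("accuracy")

# Map the pipeline's string labels onto the dataset's integer labels
label_mapping = {"NEGATIVE": 0, "POSITIVE": 1}
predictions = [label_mapping[out["label"]] for out in classifier(dataset["text"])]
manual_results = metric.compute(predictions=predictions, references=dataset["label"])

# Equivalent single call with a task evaluator
task_evaluator = evaluate.evaluator("text-classification")
evaluator_results = task_evaluator.compute(
    model_or_pipeline=classifier,
    data=dataset,
    metric=metric,
    label_mapping=label_mapping,
)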
The evaluator function is the primary way to create task-specific evaluators:
def evaluator(task: str) -> Evaluator:
    """Factory function to create task-specific evaluators.

    Args:
        task: Task type string specifying which evaluator to create.
            Must be one of the supported task types.

    Returns:
        Task-specific evaluator instance with default metric configured.

    Raises:
        ImportError: If transformers is not installed (required for evaluators)
        KeyError: If task type is not supported
    """

Supported tasks:

- "text-classification" (alias: "sentiment-analysis")
- "image-classification"
- "question-answering"
- "token-classification"
- "text-generation"
- "text2text-generation"
- "summarization"
- "translation"
- "automatic-speech-recognition"
- "audio-classification"

Usage Example:
import evaluate
# Create task-specific evaluators
text_evaluator = evaluate.evaluator("text-classification")
qa_evaluator = evaluate.evaluator("question-answering")
img_evaluator = evaluate.evaluator("image-classification")

All task evaluators inherit from the base Evaluator class:
class Evaluator:
    """Abstract base class for task-specific evaluators."""

    def compute(
        self,
        model_or_pipeline,
        data,
        subset: Optional[str] = None,
        split: Optional[str] = None,
        metric: Optional[Union[str, EvaluationModule]] = None,
        tokenizer: Optional[str] = None,
        feature_extractor: Optional[str] = None,
        strategy: str = "simple",
        confidence_level: float = 0.95,
        n_resamples: int = 9999,
        device: Optional[int] = None,
        random_state: Optional[int] = None,
        input_column: str = "text",
        label_column: str = "label",
        label_mapping: Optional[Dict[str, Number]] = None
    ) -> Dict[str, float]

    def load_data(
        self,
        data: Union[str, Dataset],
        subset: Optional[str] = None,
        split: Optional[str] = None
    ) -> Dataset

    def prepare_data(
        self,
        data: Dataset,
        input_column: str,
        label_column: str,
        *args,
        **kwargs
    ) -> Dataset

    def prepare_pipeline(
        self,
        model_or_pipeline,
        tokenizer: Optional[str] = None,
        feature_extractor: Optional[str] = None,
        device: Optional[int] = None
    )

    def prepare_metric(self, metric: Union[str, EvaluationModule]) -> EvaluationModule

Usage Example:
import evaluate
# Create evaluator
evaluator = evaluate.evaluator("text-classification")
# Evaluate a model on a dataset
results = evaluator.compute(
    model_or_pipeline="cardiffnlp/twitter-roberta-base-emotion",
    data="emotion",
    subset="split",
    split="test[:100]",
    metric="accuracy",
    input_column="text",
    label_column="label"
)
print(results)  # {'accuracy': 0.85}
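Besides the default "simple" strategy shown above, compute() exposes bootstrap resampling through the strategy, confidence_level, n_resamples, and random_state parameters. A minimal sketch reusing the same model and dataset; the exact structure of the returned scores (point estimate plus confidence interval and standard error) should be checked against your installed version.

import evaluate

evaluator = evaluate.evaluator("text-classification")

# Bootstrap the metric to estimate confidence intervals; strategy,
# confidence_level, n_resamples, and random_state come from the compute()
# signature above, while the model/dataset pairing is purely illustrative.
bootstrap_results = evaluator.compute(
    model_or_pipeline="cardiffnlp/twitter-roberta-base-emotion",
    data="emotion",
    subset="split",
    split="test[:100]",
    metric="accuracy",
    strategy="bootstrap",
    confidence_level=0.95,
    n_resamples=200,
    random_state=0,
)
# With the bootstrap strategy each score is typically reported together with
# its confidence interval and standard error rather than as a bare float.
print(bootstrap_results)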
Evaluates text classification models using accuracy as the default metric:

class TextClassificationEvaluator(Evaluator):
    """Evaluator for text classification tasks."""
    # Default metric: "accuracy"

Usage Example:
import evaluate
evaluator = evaluate.evaluator("text-classification")
# Evaluate with Transformers pipeline
from transformers import pipeline
classifier = pipeline("text-classification", model="distilbert-base-uncased-finetuned-sst-2-english")
results = evaluator.compute(
    model_or_pipeline=classifier,
    data="glue",
    subset="sst2",
    split="validation[:100]",
    metric="accuracy"
)
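Transformers text-classification pipelines return string labels (for SST-2, "POSITIVE" and "NEGATIVE"), while glue/sst2 stores integer labels in a "label" column and its text in a "sentence" column, so in practice input_column and label_mapping usually need to be set explicitly. A sketch under those assumptions:

import evaluate

evaluator = evaluate.evaluator("text-classification")

# glue/sst2 keeps its text in "sentence"; label_mapping aligns the pipeline's
# string labels with the dataset's integer labels so accuracy can be computed.
results = evaluator.compute(
    model_or_pipeline="distilbert-base-uncased-finetuned-sst-2-english",
    data="glue",
    subset="sst2",
    split="validation[:100]",
    metric="accuracy",
    input_column="sentence",
    label_column="label",
    label_mapping={"NEGATIVE": 0, "POSITIVE": 1},
)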
Evaluates question answering models using the SQuAD metric as the default:

class QuestionAnsweringEvaluator(Evaluator):
    """Evaluator for question answering tasks."""
    # Default metric: "squad"

Usage Example:
import evaluate
evaluator = evaluate.evaluator("question-answering")
results = evaluator.compute(
    model_or_pipeline="distilbert-base-cased-distilled-squad",
    data="squad",
    split="validation[:100]",
    metric="squad"
)
print(results)  # {'exact_match': 78.5, 'f1': 86.2}
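For datasets with unanswerable questions such as squad_v2, the question-answering evaluator is used with the squad_v2 metric; the squad_v2_format flag below is assumed from the evaluator's extended signature and worth verifying against your installed version. A sketch:

import evaluate

evaluator = evaluate.evaluator("question-answering")

# squad_v2 contains unanswerable questions, so the v2 metric and the
# squad_v2_format flag (assumed keyword) are used together; the model
# checkpoint is an illustrative SQuAD v2 model.
results = evaluator.compute(
    model_or_pipeline="deepset/roberta-base-squad2",
    data="squad_v2",
    split="validation[:100]",
    metric="squad_v2",
    squad_v2_format=True,
)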
Evaluates named entity recognition and other token classification tasks:

class TokenClassificationEvaluator(Evaluator):
    """Evaluator for token classification tasks."""
    # Default metric: "seqeval"

Usage Example:
import evaluate
evaluator = evaluate.evaluator("token-classification")
results = evaluator.compute(
    model_or_pipeline="dbmdz/bert-large-cased-finetuned-conll03-english",
    data="conll2003",
    split="test[:100]",
    metric="seqeval"
)
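conll2003 stores pre-split words in a "tokens" column and integer tag ids in "ner_tags", so the column arguments can be passed explicitly when a dataset departs from the evaluator's defaults; the join_by parameter below, which rejoins the word list into a string for the pipeline, is assumed from the token-classification evaluator's extended signature. A sketch:

import evaluate

evaluator = evaluate.evaluator("token-classification")

# conll2003 keeps pre-tokenized words in "tokens" and tag ids in "ner_tags";
# join_by (assumed keyword) controls how words are joined for the pipeline.
results = evaluator.compute(
    model_or_pipeline="dbmdz/bert-large-cased-finetuned-conll03-english",
    data="conll2003",
    split="test[:100]",
    metric="seqeval",
    input_column="tokens",
    label_column="ner_tags",
    join_by=" ",
)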
Evaluates image classification models:

class ImageClassificationEvaluator(Evaluator):
    """Evaluator for image classification tasks."""
    # Default metric: "accuracy"

Usage Example:
import evaluate
evaluator = evaluate.evaluator("image-classification")
results = evaluator.compute(
    model_or_pipeline="google/vit-base-patch16-224",
    data="imagenet-1k",
    split="validation[:100]",
    metric="accuracy",
    input_column="image",
    label_column="label"
)
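imagenet-1k is a gated dataset on the Hub; a small public dataset such as beans works the same way, with its ground-truth column named "labels" rather than "label". The model checkpoint below is an illustrative choice. A sketch under those assumptions:

import evaluate

evaluator = evaluate.evaluator("image-classification")

# beans is small and publicly available; its ground-truth column is "labels",
# so label_column must be overridden from the "label" default.
results = evaluator.compute(
    model_or_pipeline="nateraw/vit-base-beans",
    data="beans",
    split="test[:100]",
    metric="accuracy",
    input_column="image",
    label_column="labels",
)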
Multiple evaluators for different text generation tasks:

class TextGenerationEvaluator(Evaluator):
    """Evaluator for general text generation tasks."""
    # Default metric: "word_count"

class Text2TextGenerationEvaluator(Evaluator):
    """Evaluator for text-to-text generation tasks."""
    # Default metric: "bleu"

class SummarizationEvaluator(Evaluator):
    """Evaluator for summarization tasks."""
    # Default metric: "rouge"

class TranslationEvaluator(Evaluator):
    """Evaluator for translation tasks."""
    # Default metric: "bleu"

Usage Examples:
import evaluate
# Summarization
sum_evaluator = evaluate.evaluator("summarization")
results = sum_evaluator.compute(
    model_or_pipeline="facebook/bart-large-cnn",
    data="cnn_dailymail",
    subset="3.0.0",
    split="test[:100]"
)
# Translation
trans_evaluator = evaluate.evaluator("translation")
results = trans_evaluator.compute(
    model_or_pipeline="Helsinki-NLP/opus-mt-en-de",
    data="wmt14",
    subset="de-en",
    split="test[:100]"
)
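The generation evaluators fall back to generic input and label column names, so dataset-specific columns usually need to be spelled out; cnn_dailymail, for example, keeps its source text in "article" and the reference summaries in "highlights". A minimal sketch assuming those columns:

import evaluate

sum_evaluator = evaluate.evaluator("summarization")

# cnn_dailymail stores the source document in "article" and the reference
# summary in "highlights"; rouge is the summarization default metric.
results = sum_evaluator.compute(
    model_or_pipeline="facebook/bart-large-cnn",
    data="cnn_dailymail",
    subset="3.0.0",
    split="test[:10]",
    input_column="article",
    label_column="highlights",
)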
Evaluators for audio processing tasks:

class AudioClassificationEvaluator(Evaluator):
    """Evaluator for audio classification tasks."""
    # Default metric: "accuracy"

class AutomaticSpeechRecognitionEvaluator(Evaluator):
    """Evaluator for automatic speech recognition tasks."""
    # Default metric: "wer"

Usage Examples:
import evaluate
# Audio classification
audio_evaluator = evaluate.evaluator("audio-classification")
results = audio_evaluator.compute(
    # superb/wav2vec2-base-superb-ks is a keyword-spotting classifier trained
    # on the superb "ks" subset (a plain ASR checkpoint would not classify).
    model_or_pipeline="superb/wav2vec2-base-superb-ks",
    data="superb",
    subset="ks",
    split="test[:100]"
)
# Speech recognition
asr_evaluator = evaluate.evaluator("automatic-speech-recognition")
results = asr_evaluator.compute(
    model_or_pipeline="facebook/wav2vec2-base-960h",
    data="librispeech_asr",
    split="test.clean[:100]",
    metric="wer"
)
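evaluate.combine can bundle several metrics into a single module, and prepare_metric accepts an already-loaded module, so passing the combined module through the metric argument is expected to work; treat that interaction as an assumption to verify against your installed version. A sketch scoring WER and CER together:

import evaluate

asr_evaluator = evaluate.evaluator("automatic-speech-recognition")

# Bundle word error rate and character error rate into one module; passing
# the combined module via `metric` is assumed to work because prepare_metric
# accepts loaded modules as well as metric names.
wer_and_cer = evaluate.combine(["wer", "cer"])

results = asr_evaluator.compute(
    model_or_pipeline="facebook/wav2vec2-base-960h",
    data="librispeech_asr",
    split="test.clean[:100]",
    metric=wer_and_cer,
)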
Task evaluators may raise these exceptions:

- KeyError: Unknown task name provided to evaluator()
- ImportError: Missing transformers library (required for evaluators)
- ValueError: Invalid data format or model incompatibility
- RuntimeError: Evaluation pipeline errors

Example:
try:
    evaluator = evaluate.evaluator("unknown-task")
except KeyError as e:
    print(f"Unsupported task: {e}")

try:
    # This will fail if transformers is not installed
    evaluator = evaluate.evaluator("text-classification")
except ImportError as e:
    print("Install transformers: pip install transformers")