HuggingFace community-driven open-source library of evaluation metrics for machine learning models and datasets.
—
Helper functions for logging control and Gradio integration for interactive evaluation experiences. These utilities enhance the evaluation workflow with progress tracking and interactive interfaces.
Control progress bar display during evaluation operations:
def enable_progress_bar():
"""Enable tqdm progress bars for evaluation operations."""
def disable_progress_bar():
"""Disable tqdm progress bars for evaluation operations."""
def is_progress_bar_enabled() -> bool:
"""Check if progress bars are currently enabled."""These functions are available in the evaluate.utils.logging module.
Usage Example:
import evaluate
# Check current progress bar status
print(f"Progress bars enabled: {evaluate.utils.logging.is_progress_bar_enabled()}")
# Disable progress bars for cleaner output
evaluate.utils.logging.disable_progress_bar()
# Run evaluation without progress bars
accuracy = evaluate.load("accuracy")
# ... no progress bar shown during loading
# Re-enable progress bars
evaluate.utils.logging.enable_progress_bar()
# Now progress bars will be shown again
bleu = evaluate.load("bleu")  # Progress bar visible

Script Configuration:
import evaluate
import argparse
def main():
    """Run a ROUGE evaluation, optionally silencing progress bars via --quiet."""
    arg_parser = argparse.ArgumentParser()
    arg_parser.add_argument("--quiet", action="store_true", help="Disable progress bars")
    cli_args = arg_parser.parse_args()

    # Honor the --quiet flag before any evaluation work starts.
    if cli_args.quiet:
        evaluate.utils.logging.disable_progress_bar()

    # Run evaluation with controlled output
    rouge = evaluate.load("rouge")
    sample = ["hello world"] * 1000
    results = rouge.compute(predictions=sample, references=sample)
    print(f"Results: {results}")
if __name__ == "__main__":
    main()

Interactive evaluation interfaces using Gradio widgets:
def infer_gradio_input_types(features: Features) -> Dict[str, str]:
"""Map metric feature types to Gradio input component types."""
def json_to_string_type(input_type: str) -> str:
"""Convert json input type to string type for Gradio."""
def parse_readme(readme_content: str) -> str:
"""Parse README content and remove YAML frontmatter."""
def parse_gradio_data(data: List[List[Any]]) -> Tuple[List, List]:
"""Parse data from Gradio Dataframe for metric computation."""
def parse_test_cases(test_cases: str) -> Dict[str, List]:
"""Parse test case strings into structured data for Gradio."""
def launch_gradio_widget(evaluation_module: EvaluationModule) -> gradio.Interface:
"""Launch interactive Gradio widget for an evaluation module."""These functions are available in the evaluate.utils.gradio module.
Usage Example:
import evaluate
# Load a metric
accuracy = evaluate.load("accuracy")
# Launch interactive widget
interface = evaluate.utils.gradio.launch_gradio_widget(accuracy)
# The widget allows users to:
# - Input predictions and references interactively
# - See real-time evaluation results
# - Explore metric documentation
# - Try different input formats

Custom Gradio Interface:
import evaluate
import gradio as gr
def create_evaluation_interface():
    """Build a Gradio interface scoring predictions with four classification metrics."""
    # Load each metric once so every interface call reuses the same modules.
    metric_modules = {
        "accuracy": evaluate.load("accuracy"),
        "f1": evaluate.load("f1"),
        "precision": evaluate.load("precision"),
        "recall": evaluate.load("recall"),
    }

    def evaluate_inputs(predictions_text, references_text):
        # Turn "1,0,1,0"-style text into lists of integer labels.
        pred_labels = [int(token.strip()) for token in predictions_text.split(",")]
        ref_labels = [int(token.strip()) for token in references_text.split(",")]
        results = {
            name: module.compute(predictions=pred_labels, references=ref_labels)
            for name, module in metric_modules.items()
        }
        return str(results)

    # Wire the scoring callback into a simple two-textbox interface.
    return gr.Interface(
        fn=evaluate_inputs,
        inputs=[
            gr.Textbox(label="Predictions (comma-separated)", placeholder="1,0,1,0"),
            gr.Textbox(label="References (comma-separated)", placeholder="1,1,0,0"),
        ],
        outputs=gr.Textbox(label="Evaluation Results"),
        title="Multi-Metric Evaluation",
        description="Evaluate predictions with multiple classification metrics",
    )
# Launch custom interface
interface = create_evaluation_interface()
interface.launch()

Batch Evaluation Interface:
import evaluate
import gradio as gr
import pandas as pd
def create_batch_evaluation_interface():
def evaluate_csv_data(csv_file):
# Read CSV file
df = pd.read_csv(csv_file.name)
if 'predictions' not in df.columns or 'references' not in df.columns:
return "Error: CSV must contain 'predictions' and 'references' columns"
predictions = df['predictions'].tolist()
references = df['references'].tolist()
# Run evaluation
combined = evaluate.combine(["accuracy", "f1", "precision", "recall"])
results = combined.compute(predictions=predictions, references=references)
return str(results)
interface = gr.Interface(
fn=evaluate_csv_data,
inputs=gr.File(label="Upload CSV with predictions and references"),
outputs=gr.Textbox(label="Evaluation Results"),
title="Batch Evaluation from CSV",
description="Upload a CSV file with 'predictions' and 'references' columns"
)
    return interface

Helper Functions for Data Processing:
import evaluate
# Parse test cases from string format
test_case_string = """
predictions: [1, 0, 1, 0]
references: [1, 1, 0, 0]
"""
parsed_cases = evaluate.utils.gradio.parse_test_cases(test_case_string)
print(parsed_cases) # {'predictions': [1, 0, 1, 0], 'references': [1, 1, 0, 0]}
# Infer Gradio input types from metric features
accuracy = evaluate.load("accuracy")
input_types = evaluate.utils.gradio.infer_gradio_input_types(accuracy.features)
print(input_types)  # Maps feature types to Gradio component types

README Processing:
import evaluate
# Process metric README with YAML frontmatter
readme_with_yaml = """---
title: Accuracy
emoji: 🎯
tags:
- evaluate
- metric
---
# Accuracy
Accuracy is the fraction of predictions our model got right.
"""
clean_readme = evaluate.utils.gradio.parse_readme(readme_with_yaml)
print(clean_readme)  # Returns content without YAML frontmatter

Complete Interactive Evaluation Setup:
import evaluate
import gradio as gr
def setup_comprehensive_evaluation():
    """Assemble a tabbed Gradio demo covering classification and generation metrics."""
    # Progress bars clutter the widget output, so turn them off up front.
    evaluate.utils.logging.disable_progress_bar()

    # Load every evaluation module once, shared by both tabs.
    loaded = {
        "accuracy": evaluate.load("accuracy"),
        "f1": evaluate.load("f1"),
        "bleu": evaluate.load("bleu"),
        "rouge": evaluate.load("rouge"),
    }
    classification_names = ("accuracy", "f1")
    generation_names = ("bleu", "rouge")

    def evaluate_text_classification(predictions, references):
        # Comma-separated integer labels.
        pred_labels = [piece.strip() for piece in predictions.split(",")]
        ref_labels = [piece.strip() for piece in references.split(",")]
        pred_labels = [int(piece) for piece in pred_labels]
        ref_labels = [int(piece) for piece in ref_labels]
        scores = {
            name: loaded[name].compute(predictions=pred_labels, references=ref_labels)
            for name in classification_names
        }
        return str(scores)

    def evaluate_text_generation(predictions, references):
        # One generated/reference text per line.
        pred_texts = [line.strip() for line in predictions.split("\n")]
        ref_texts = [line.strip() for line in references.split("\n")]
        scores = {}
        for name in generation_names:
            # BLEU expects a list of reference lists per prediction.
            refs = [[text] for text in ref_texts] if name == "bleu" else ref_texts
            scores[name] = loaded[name].compute(predictions=pred_texts, references=refs)
        return str(scores)

    classification_tab = gr.Interface(
        fn=evaluate_text_classification,
        inputs=[
            gr.Textbox(label="Predictions", placeholder="1,0,1,0"),
            gr.Textbox(label="References", placeholder="1,1,0,0"),
        ],
        outputs=gr.Textbox(label="Results"),
        title="Classification Evaluation",
    )
    generation_tab = gr.Interface(
        fn=evaluate_text_generation,
        inputs=[
            gr.Textbox(label="Predictions", lines=5, placeholder="Generated text 1\nGenerated text 2"),
            gr.Textbox(label="References", lines=5, placeholder="Reference text 1\nReference text 2"),
        ],
        outputs=gr.Textbox(label="Results"),
        title="Generation Evaluation",
    )

    # Present both evaluators as tabs of a single demo.
    return gr.TabbedInterface(
        [classification_tab, generation_tab],
        ["Classification", "Generation"],
    )
# Launch comprehensive evaluation interface
demo = setup_comprehensive_evaluation()
demo.launch(share=True)  # Create shareable link

Utility functions may raise:
- ImportError: Missing gradio dependency for widget functions
- ValueError: Invalid input formats for parsing functions
- AttributeError: Incompatible evaluation module for Gradio integration

Example:
import evaluate
try:
# This requires gradio to be installed
interface = evaluate.utils.gradio.launch_gradio_widget(accuracy)
except ImportError:
print("Install gradio: pip install gradio")
try:
# Invalid test case format
cases = evaluate.utils.gradio.parse_test_cases("invalid format")
except ValueError as e:
print(f"Parse error: {e}")Install with Tessl CLI
npx tessl i tessl/pypi-evaluate