CtrlK
BlogDocsLog inGet started
Tessl Logo

tessl/pypi-evaluate

A community-driven, open-source library from Hugging Face providing evaluation metrics for machine learning models and datasets.

Pending
Overview
Eval results
Files

utilities.mddocs/

Utilities

Helper functions for logging control and Gradio integration for interactive evaluation experiences. These utilities enhance the evaluation workflow with progress tracking and interactive interfaces.

Capabilities

Logging Utilities

Control progress bar display during evaluation operations:

def enable_progress_bar():
    """Enable tqdm progress bars for evaluation operations."""

def disable_progress_bar(): 
    """Disable tqdm progress bars for evaluation operations."""

def is_progress_bar_enabled() -> bool:
    """Check if progress bars are currently enabled."""

These functions are available in the evaluate.utils.logging module.

Usage Example:

import evaluate

# Check current progress bar status
print(f"Progress bars enabled: {evaluate.utils.logging.is_progress_bar_enabled()}")

# Disable progress bars for cleaner output
evaluate.utils.logging.disable_progress_bar()

# Run evaluation without progress bars
accuracy = evaluate.load("accuracy")
# ... no progress bar shown during loading

# Re-enable progress bars
evaluate.utils.logging.enable_progress_bar()

# Now progress bars will be shown again
bleu = evaluate.load("bleu")  # Progress bar visible

Script Configuration:

import evaluate
import argparse

def main():
    """Run a ROUGE evaluation, optionally silencing tqdm progress bars via --quiet."""
    arg_parser = argparse.ArgumentParser()
    arg_parser.add_argument("--quiet", action="store_true", help="Disable progress bars")
    options = arg_parser.parse_args()

    # Honor the flag before any evaluate call can emit a progress bar.
    if options.quiet:
        evaluate.utils.logging.disable_progress_bar()

    # Run evaluation with controlled output.
    rouge = evaluate.load("rouge")
    sample = ["hello world"] * 1000
    scores = rouge.compute(predictions=sample, references=sample)
    print(f"Results: {scores}")

if __name__ == "__main__":
    main()

Gradio Integration

Interactive evaluation interfaces using Gradio widgets:

def infer_gradio_input_types(features: Features) -> Dict[str, str]:
    """Map metric feature types to Gradio input component types."""

def json_to_string_type(input_type: str) -> str:
    """Convert json input type to string type for Gradio."""

def parse_readme(readme_content: str) -> str:
    """Parse README content and remove YAML frontmatter."""

def parse_gradio_data(data: List[List[Any]]) -> Tuple[List, List]:
    """Parse data from Gradio Dataframe for metric computation."""

def parse_test_cases(test_cases: str) -> Dict[str, List]:
    """Parse test case strings into structured data for Gradio."""

def launch_gradio_widget(evaluation_module: EvaluationModule) -> gradio.Interface:
    """Launch interactive Gradio widget for an evaluation module."""

These functions are available in the evaluate.utils.gradio module.

Usage Example:

import evaluate

# Load a metric
accuracy = evaluate.load("accuracy")

# Launch interactive widget
interface = evaluate.utils.gradio.launch_gradio_widget(accuracy)

# The widget allows users to:
# - Input predictions and references interactively
# - See real-time evaluation results
# - Explore metric documentation
# - Try different input formats

Custom Gradio Interface:

import evaluate
import gradio as gr

def create_evaluation_interface():
    """Build a Gradio interface that scores predictions with four classification metrics."""
    # Load every metric once, up front, so the callback only has to compute.
    metrics = {
        "accuracy": evaluate.load("accuracy"),
        "f1": evaluate.load("f1"),
        "precision": evaluate.load("precision"),
        "recall": evaluate.load("recall"),
    }

    def evaluate_inputs(predictions_text, references_text):
        # Comma-separated label strings -> lists of ints.
        preds = [int(token.strip()) for token in predictions_text.split(",")]
        refs = [int(token.strip()) for token in references_text.split(",")]

        # Score with every loaded metric (dict preserves insertion order).
        results = {
            name: metric.compute(predictions=preds, references=refs)
            for name, metric in metrics.items()
        }
        return str(results)

    # Wire the callback into a simple two-textbox interface.
    return gr.Interface(
        fn=evaluate_inputs,
        inputs=[
            gr.Textbox(label="Predictions (comma-separated)", placeholder="1,0,1,0"),
            gr.Textbox(label="References (comma-separated)", placeholder="1,1,0,0"),
        ],
        outputs=gr.Textbox(label="Evaluation Results"),
        title="Multi-Metric Evaluation",
        description="Evaluate predictions with multiple classification metrics",
    )

# Launch custom interface
interface = create_evaluation_interface()
interface.launch()

Batch Evaluation Interface:

import evaluate
import gradio as gr
import pandas as pd

def create_batch_evaluation_interface():
    """Build a Gradio interface that scores an uploaded CSV of predictions/references."""

    def evaluate_csv_data(csv_file):
        # Gradio hands over a file wrapper; .name is the path on disk.
        frame = pd.read_csv(csv_file.name)

        # Both columns are required for scoring.
        if any(col not in frame.columns for col in ("predictions", "references")):
            return "Error: CSV must contain 'predictions' and 'references' columns"

        preds = frame['predictions'].tolist()
        refs = frame['references'].tolist()

        # Score all four classification metrics in a single combined pass.
        scorer = evaluate.combine(["accuracy", "f1", "precision", "recall"])
        return str(scorer.compute(predictions=preds, references=refs))

    return gr.Interface(
        fn=evaluate_csv_data,
        inputs=gr.File(label="Upload CSV with predictions and references"),
        outputs=gr.Textbox(label="Evaluation Results"),
        title="Batch Evaluation from CSV",
        description="Upload a CSV file with 'predictions' and 'references' columns",
    )

Advanced Utility Functions

Helper Functions for Data Processing:

import evaluate

# Parse test cases from string format
test_case_string = """
predictions: [1, 0, 1, 0]
references: [1, 1, 0, 0]
"""

parsed_cases = evaluate.utils.gradio.parse_test_cases(test_case_string)
print(parsed_cases)  # {'predictions': [1, 0, 1, 0], 'references': [1, 1, 0, 0]}

# Infer Gradio input types from metric features
accuracy = evaluate.load("accuracy")
input_types = evaluate.utils.gradio.infer_gradio_input_types(accuracy.features)
print(input_types)  # Maps feature types to Gradio component types

README Processing:

import evaluate

# Process metric README with YAML frontmatter
readme_with_yaml = """---
title: Accuracy
emoji: 🎯
tags:
- evaluate
- metric
---

# Accuracy

Accuracy is the fraction of predictions our model got right.
"""

clean_readme = evaluate.utils.gradio.parse_readme(readme_with_yaml)
print(clean_readme)  # Returns content without YAML frontmatter

Integration with Evaluation Workflows

Complete Interactive Evaluation Setup:

import evaluate
import gradio as gr

def setup_comprehensive_evaluation():
    """Assemble a tabbed Gradio demo covering classification and generation metrics."""
    # Progress bars would clutter the widget output, so turn them off first.
    evaluate.utils.logging.disable_progress_bar()

    # One shared pool of metrics serving both tabs.
    metrics = {
        "accuracy": evaluate.load("accuracy"),
        "f1": evaluate.load("f1"),
        "bleu": evaluate.load("bleu"),
        "rouge": evaluate.load("rouge"),
    }

    def evaluate_text_classification(predictions, references):
        # Comma-separated integer labels.
        preds = [int(token.strip()) for token in predictions.split(",")]
        refs = [int(token.strip()) for token in references.split(",")]

        # Only the classification metrics apply here.
        scores = {
            name: metrics[name].compute(predictions=preds, references=refs)
            for name in ("accuracy", "f1")
        }
        return str(scores)

    def evaluate_text_generation(predictions, references):
        # One text sample per line.
        preds = [line.strip() for line in predictions.split("\n")]
        refs = [line.strip() for line in references.split("\n")]

        # Only the generation metrics apply here.
        scores = {}
        for name in ("bleu", "rouge"):
            if name == "bleu":
                # BLEU expects a list of reference lists per prediction.
                scores[name] = metrics[name].compute(
                    predictions=preds, references=[[r] for r in refs]
                )
            else:
                scores[name] = metrics[name].compute(predictions=preds, references=refs)
        return str(scores)

    # One interface per task type.
    classification_tab = gr.Interface(
        fn=evaluate_text_classification,
        inputs=[
            gr.Textbox(label="Predictions", placeholder="1,0,1,0"),
            gr.Textbox(label="References", placeholder="1,1,0,0"),
        ],
        outputs=gr.Textbox(label="Results"),
        title="Classification Evaluation",
    )

    generation_tab = gr.Interface(
        fn=evaluate_text_generation,
        inputs=[
            gr.Textbox(label="Predictions", lines=5, placeholder="Generated text 1\nGenerated text 2"),
            gr.Textbox(label="References", lines=5, placeholder="Reference text 1\nReference text 2"),
        ],
        outputs=gr.Textbox(label="Results"),
        title="Generation Evaluation",
    )

    # Combine both interfaces under a single tabbed demo.
    return gr.TabbedInterface(
        [classification_tab, generation_tab],
        ["Classification", "Generation"],
    )

# Launch comprehensive evaluation interface
demo = setup_comprehensive_evaluation()
demo.launch(share=True)  # Create shareable link

Error Handling

Utility functions may raise:

  • ImportError: Missing gradio dependency for widget functions
  • ValueError: Invalid input formats for parsing functions
  • AttributeError: Incompatible evaluation module for Gradio integration

Example:

import evaluate

# Load the metric the widget will wrap; without this line the first
# example raised NameError on `accuracy` instead of the ImportError
# it is meant to demonstrate.
accuracy = evaluate.load("accuracy")

try:
    # This requires gradio to be installed
    interface = evaluate.utils.gradio.launch_gradio_widget(accuracy)
except ImportError:
    print("Install gradio: pip install gradio")

try:
    # Invalid test case format
    cases = evaluate.utils.gradio.parse_test_cases("invalid format")
except ValueError as e:
    print(f"Parse error: {e}")

Install with Tessl CLI

npx tessl i tessl/pypi-evaluate

docs

core-evaluation.md

evaluation-suites.md

hub-integration.md

index.md

module-discovery.md

task-evaluators.md

utilities.md

tile.json