
# Utilities

Helper functions for logging control and for Gradio integration, enabling interactive evaluation experiences. These utilities enhance the evaluation workflow with progress tracking and interactive interfaces.

## Capabilities

### Logging Utilities

Control progress bar display during evaluation operations:

```python { .api }
def enable_progress_bar():
    """Enable tqdm progress bars for evaluation operations."""

def disable_progress_bar():
    """Disable tqdm progress bars for evaluation operations."""

def is_progress_bar_enabled() -> bool:
    """Check if progress bars are currently enabled."""
```

These functions are available in the `evaluate.utils.logging` module.

**Usage Example:**

```python
import evaluate

# Check current progress bar status
print(f"Progress bars enabled: {evaluate.utils.logging.is_progress_bar_enabled()}")

# Disable progress bars for cleaner output
evaluate.utils.logging.disable_progress_bar()

# Run evaluation without progress bars
accuracy = evaluate.load("accuracy")
# ... no progress bar shown during loading

# Re-enable progress bars
evaluate.utils.logging.enable_progress_bar()

# Now progress bars will be shown again
bleu = evaluate.load("bleu")  # Progress bar visible
```

**Script Configuration:**

```python
import evaluate
import argparse

def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--quiet", action="store_true", help="Disable progress bars")
    args = parser.parse_args()

    if args.quiet:
        evaluate.utils.logging.disable_progress_bar()

    # Run evaluation with controlled output
    metric = evaluate.load("rouge")
    results = metric.compute(
        predictions=["hello world"] * 1000,
        references=["hello world"] * 1000
    )
    print(f"Results: {results}")

if __name__ == "__main__":
    main()
```
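If a script only needs to silence progress bars around a specific block, the same three functions can be wrapped in a small context manager that restores the previous state afterwards. This is a minimal sketch; the `quiet_progress` helper is illustrative and not part of the library:

```python
import contextlib

import evaluate

@contextlib.contextmanager
def quiet_progress():
    """Temporarily disable tqdm progress bars, restoring the previous setting on exit."""
    was_enabled = evaluate.utils.logging.is_progress_bar_enabled()
    evaluate.utils.logging.disable_progress_bar()
    try:
        yield
    finally:
        if was_enabled:
            evaluate.utils.logging.enable_progress_bar()

# Progress bars are suppressed only inside the block
with quiet_progress():
    exact_match = evaluate.load("exact_match")
```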


### Gradio Integration

Build interactive evaluation interfaces with Gradio widgets:

```python { .api }
def infer_gradio_input_types(features: Features) -> Dict[str, str]:
    """Map metric feature types to Gradio input component types."""

def json_to_string_type(input_type: str) -> str:
    """Convert json input type to string type for Gradio."""

def parse_readme(readme_content: str) -> str:
    """Parse README content and remove YAML frontmatter."""

def parse_gradio_data(data: List[List[Any]]) -> Tuple[List, List]:
    """Parse data from Gradio Dataframe for metric computation."""

def parse_test_cases(test_cases: str) -> Dict[str, List]:
    """Parse test case strings into structured data for Gradio."""

def launch_gradio_widget(evaluation_module: EvaluationModule) -> gradio.Interface:
    """Launch interactive Gradio widget for an evaluation module."""
```

These functions are available in the `evaluate.utils.gradio` module.

**Usage Example:**

```python
import evaluate

# Load a metric
accuracy = evaluate.load("accuracy")

# Launch interactive widget
interface = evaluate.utils.gradio.launch_gradio_widget(accuracy)

# The widget allows users to:
# - Input predictions and references interactively
# - See real-time evaluation results
# - Explore metric documentation
# - Try different input formats
```

**Custom Gradio Interface:**

```python
import evaluate
import gradio as gr

def create_evaluation_interface():
    # Load multiple metrics
    accuracy = evaluate.load("accuracy")
    f1 = evaluate.load("f1")
    precision = evaluate.load("precision")
    recall = evaluate.load("recall")

    def evaluate_inputs(predictions_text, references_text):
        # Parse input text to lists
        predictions = [int(x.strip()) for x in predictions_text.split(",")]
        references = [int(x.strip()) for x in references_text.split(",")]

        # Compute all metrics
        results = {
            "accuracy": accuracy.compute(predictions=predictions, references=references),
            "f1": f1.compute(predictions=predictions, references=references),
            "precision": precision.compute(predictions=predictions, references=references),
            "recall": recall.compute(predictions=predictions, references=references)
        }

        return str(results)

    # Create interface
    interface = gr.Interface(
        fn=evaluate_inputs,
        inputs=[
            gr.Textbox(label="Predictions (comma-separated)", placeholder="1,0,1,0"),
            gr.Textbox(label="References (comma-separated)", placeholder="1,1,0,0")
        ],
        outputs=gr.Textbox(label="Evaluation Results"),
        title="Multi-Metric Evaluation",
        description="Evaluate predictions with multiple classification metrics"
    )

    return interface

# Launch custom interface
interface = create_evaluation_interface()
interface.launch()
```

**Batch Evaluation Interface:**

```python
import evaluate
import gradio as gr
import pandas as pd

def create_batch_evaluation_interface():
    def evaluate_csv_data(csv_file):
        # Read CSV file
        df = pd.read_csv(csv_file.name)

        if 'predictions' not in df.columns or 'references' not in df.columns:
            return "Error: CSV must contain 'predictions' and 'references' columns"

        predictions = df['predictions'].tolist()
        references = df['references'].tolist()

        # Run evaluation
        combined = evaluate.combine(["accuracy", "f1", "precision", "recall"])
        results = combined.compute(predictions=predictions, references=references)

        return str(results)

    interface = gr.Interface(
        fn=evaluate_csv_data,
        inputs=gr.File(label="Upload CSV with predictions and references"),
        outputs=gr.Textbox(label="Evaluation Results"),
        title="Batch Evaluation from CSV",
        description="Upload a CSV file with 'predictions' and 'references' columns"
    )

    return interface
```
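To try the batch interface locally, you can write a small CSV with pandas and upload it in the browser. A quick sketch; the `sample.csv` filename is arbitrary:

```python
import pandas as pd

# Create a tiny file with the expected column names
pd.DataFrame({
    "predictions": [1, 0, 1, 0],
    "references": [1, 1, 1, 0],
}).to_csv("sample.csv", index=False)

# Launch the interface defined above and upload sample.csv
interface = create_batch_evaluation_interface()
interface.launch()
```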

### Advanced Utility Functions

**Helper Functions for Data Processing:**

```python
import evaluate

# Parse test cases from string format
test_case_string = """
predictions: [1, 0, 1, 0]
references: [1, 1, 0, 0]
"""

parsed_cases = evaluate.utils.gradio.parse_test_cases(test_case_string)
print(parsed_cases)  # {'predictions': [1, 0, 1, 0], 'references': [1, 1, 0, 0]}

# Infer Gradio input types from metric features
accuracy = evaluate.load("accuracy")
input_types = evaluate.utils.gradio.infer_gradio_input_types(accuracy.features)
print(input_types)  # Maps feature types to Gradio component types
```

**README Processing:**

```python
import evaluate

# Process metric README with YAML frontmatter
readme_with_yaml = """---
title: Accuracy
emoji: 🎯
tags:
- evaluate
- metric
---

# Accuracy

Accuracy is the fraction of predictions our model got right.
"""

clean_readme = evaluate.utils.gradio.parse_readme(readme_with_yaml)
print(clean_readme)  # Returns content without YAML frontmatter
```

### Integration with Evaluation Workflows

**Complete Interactive Evaluation Setup:**

```python
import evaluate
import gradio as gr

def setup_comprehensive_evaluation():
    # Disable progress bars for cleaner interface
    evaluate.utils.logging.disable_progress_bar()

    # Load multiple evaluation modules
    metrics = {
        "accuracy": evaluate.load("accuracy"),
        "f1": evaluate.load("f1"),
        "bleu": evaluate.load("bleu"),
        "rouge": evaluate.load("rouge")
    }

    def evaluate_text_classification(predictions, references):
        pred_list = [int(x.strip()) for x in predictions.split(",")]
        ref_list = [int(x.strip()) for x in references.split(",")]

        results = {}
        for name, metric in metrics.items():
            if name in ["accuracy", "f1"]:  # Classification metrics
                results[name] = metric.compute(predictions=pred_list, references=ref_list)

        return str(results)

    def evaluate_text_generation(predictions, references):
        pred_list = [x.strip() for x in predictions.split("\n")]
        ref_list = [x.strip() for x in references.split("\n")]

        results = {}
        for name, metric in metrics.items():
            if name in ["bleu", "rouge"]:  # Generation metrics
                if name == "bleu":
                    results[name] = metric.compute(predictions=pred_list, references=[[r] for r in ref_list])
                else:
                    results[name] = metric.compute(predictions=pred_list, references=ref_list)

        return str(results)

    # Create tabbed interface
    classification_interface = gr.Interface(
        fn=evaluate_text_classification,
        inputs=[
            gr.Textbox(label="Predictions", placeholder="1,0,1,0"),
            gr.Textbox(label="References", placeholder="1,1,0,0")
        ],
        outputs=gr.Textbox(label="Results"),
        title="Classification Evaluation"
    )

    generation_interface = gr.Interface(
        fn=evaluate_text_generation,
        inputs=[
            gr.Textbox(label="Predictions", lines=5, placeholder="Generated text 1\nGenerated text 2"),
            gr.Textbox(label="References", lines=5, placeholder="Reference text 1\nReference text 2")
        ],
        outputs=gr.Textbox(label="Results"),
        title="Generation Evaluation"
    )

    # Combine interfaces
    demo = gr.TabbedInterface(
        [classification_interface, generation_interface],
        ["Classification", "Generation"]
    )

    return demo

# Launch comprehensive evaluation interface
demo = setup_comprehensive_evaluation()
demo.launch(share=True)  # Create shareable link
```
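Because `disable_progress_bar()` changes a global setting for the whole process, it can be worth restoring it once the demo has shut down. A small variation on the launch step above, assuming the same script:

```python
demo = setup_comprehensive_evaluation()
try:
    demo.launch(share=True)
finally:
    # Restore progress bars for any evaluation code that runs after the demo
    evaluate.utils.logging.enable_progress_bar()
```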

## Error Handling

Utility functions may raise:

- `ImportError`: Missing gradio dependency for widget functions
- `ValueError`: Invalid input formats for parsing functions
- `AttributeError`: Incompatible evaluation module for Gradio integration

**Example:**

```python
import evaluate

accuracy = evaluate.load("accuracy")

try:
    # This requires gradio to be installed
    interface = evaluate.utils.gradio.launch_gradio_widget(accuracy)
except ImportError:
    print("Install gradio: pip install gradio")

try:
    # Invalid test case format
    cases = evaluate.utils.gradio.parse_test_cases("invalid format")
except ValueError as e:
    print(f"Parse error: {e}")
```
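The checks can also be combined into a small defensive wrapper. A sketch; the `safe_launch_widget` helper is illustrative and not part of the library:

```python
import evaluate

def safe_launch_widget(module_name: str):
    """Load an evaluation module and launch its Gradio widget, reporting known failure modes."""
    module = evaluate.load(module_name)
    try:
        return evaluate.utils.gradio.launch_gradio_widget(module)
    except ImportError:
        print("Install gradio: pip install gradio")
    except AttributeError:
        print(f"Module '{module_name}' is not compatible with the Gradio widget")

safe_launch_widget("accuracy")
```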