# Utilities

Helper functions for controlling logging output and for building interactive Gradio interfaces around evaluation modules. These utilities enhance the evaluation workflow with progress-bar control and interactive interfaces.

## Capabilities

### Logging Utilities

Control progress bar display during evaluation operations:

```python { .api }
def enable_progress_bar():
    """Enable tqdm progress bars for evaluation operations."""

def disable_progress_bar():
    """Disable tqdm progress bars for evaluation operations."""

def is_progress_bar_enabled() -> bool:
    """Check if progress bars are currently enabled."""
```

These functions are available in the `evaluate.utils.logging` module.

**Usage Example:**
```python
import evaluate

# Check current progress bar status
print(f"Progress bars enabled: {evaluate.utils.logging.is_progress_bar_enabled()}")

# Disable progress bars for cleaner output
evaluate.utils.logging.disable_progress_bar()

# Run evaluation without progress bars
accuracy = evaluate.load("accuracy")
# ... no progress bar shown during loading

# Re-enable progress bars
evaluate.utils.logging.enable_progress_bar()

# Now progress bars will be shown again
bleu = evaluate.load("bleu")  # Progress bar visible
```

**Script Configuration:**
```python
import evaluate
import argparse

def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--quiet", action="store_true", help="Disable progress bars")
    args = parser.parse_args()

    if args.quiet:
        evaluate.utils.logging.disable_progress_bar()

    # Run evaluation with controlled output
    metric = evaluate.load("rouge")
    results = metric.compute(
        predictions=["hello world"] * 1000,
        references=["hello world"] * 1000
    )
    print(f"Results: {results}")

if __name__ == "__main__":
    main()
```
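
If progress bars should only be silenced for part of a script, the current state can be saved and restored around the noisy section. The helper below is a small sketch built on the functions documented above (it is not part of the `evaluate` API):

```python
import contextlib

import evaluate

@contextlib.contextmanager
def progress_bars_disabled():
    # Hypothetical helper: remember the current setting so it can be restored afterwards
    was_enabled = evaluate.utils.logging.is_progress_bar_enabled()
    evaluate.utils.logging.disable_progress_bar()
    try:
        yield
    finally:
        if was_enabled:
            evaluate.utils.logging.enable_progress_bar()

# Load a metric quietly, then return to the previous progress bar setting
with progress_bars_disabled():
    exact_match = evaluate.load("exact_match")
```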

### Gradio Integration

Interactive evaluation interfaces using Gradio widgets:

```python { .api }
def infer_gradio_input_types(features: Features) -> Dict[str, str]:
    """Map metric feature types to Gradio input component types."""

def json_to_string_type(input_type: str) -> str:
    """Convert json input type to string type for Gradio."""

def parse_readme(readme_content: str) -> str:
    """Parse README content and remove YAML frontmatter."""

def parse_gradio_data(data: List[List[Any]]) -> Tuple[List, List]:
    """Parse data from Gradio Dataframe for metric computation."""

def parse_test_cases(test_cases: str) -> Dict[str, List]:
    """Parse test case strings into structured data for Gradio."""

def launch_gradio_widget(evaluation_module: EvaluationModule) -> gradio.Interface:
    """Launch interactive Gradio widget for an evaluation module."""
```

These functions are available in the `evaluate.utils.gradio` module.

**Usage Example:**
```python
import evaluate

# Load a metric
accuracy = evaluate.load("accuracy")

# Launch interactive widget
interface = evaluate.utils.gradio.launch_gradio_widget(accuracy)

# The widget allows users to:
# - Input predictions and references interactively
# - See real-time evaluation results
# - Explore metric documentation
# - Try different input formats
```

**Custom Gradio Interface:**
```python
import evaluate
import gradio as gr

def create_evaluation_interface():
    # Load multiple metrics
    accuracy = evaluate.load("accuracy")
    f1 = evaluate.load("f1")
    precision = evaluate.load("precision")
    recall = evaluate.load("recall")

    def evaluate_inputs(predictions_text, references_text):
        # Parse input text to lists
        predictions = [int(x.strip()) for x in predictions_text.split(",")]
        references = [int(x.strip()) for x in references_text.split(",")]

        # Compute all metrics
        results = {
            "accuracy": accuracy.compute(predictions=predictions, references=references),
            "f1": f1.compute(predictions=predictions, references=references),
            "precision": precision.compute(predictions=predictions, references=references),
            "recall": recall.compute(predictions=predictions, references=references)
        }

        return str(results)

    # Create interface
    interface = gr.Interface(
        fn=evaluate_inputs,
        inputs=[
            gr.Textbox(label="Predictions (comma-separated)", placeholder="1,0,1,0"),
            gr.Textbox(label="References (comma-separated)", placeholder="1,1,0,0")
        ],
        outputs=gr.Textbox(label="Evaluation Results"),
        title="Multi-Metric Evaluation",
        description="Evaluate predictions with multiple classification metrics"
    )

    return interface

# Launch custom interface
interface = create_evaluation_interface()
interface.launch()
```

**Batch Evaluation Interface:**
```python
import evaluate
import gradio as gr
import pandas as pd

def create_batch_evaluation_interface():
    def evaluate_csv_data(csv_file):
        # Read CSV file
        df = pd.read_csv(csv_file.name)

        if 'predictions' not in df.columns or 'references' not in df.columns:
            return "Error: CSV must contain 'predictions' and 'references' columns"

        predictions = df['predictions'].tolist()
        references = df['references'].tolist()

        # Run evaluation
        combined = evaluate.combine(["accuracy", "f1", "precision", "recall"])
        results = combined.compute(predictions=predictions, references=references)

        return str(results)

    interface = gr.Interface(
        fn=evaluate_csv_data,
        inputs=gr.File(label="Upload CSV with predictions and references"),
        outputs=gr.Textbox(label="Evaluation Results"),
        title="Batch Evaluation from CSV",
        description="Upload a CSV file with 'predictions' and 'references' columns"
    )

    return interface
```
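
As with the custom interface above, the batch interface is created and then served (a brief usage sketch that reuses the function defined in the previous block):

```python
# Build and launch the CSV-based evaluation interface
interface = create_batch_evaluation_interface()
interface.launch()
```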

### Advanced Utility Functions

**Helper Functions for Data Processing:**
```python
import evaluate

# Parse test cases from string format
test_case_string = """
predictions: [1, 0, 1, 0]
references: [1, 1, 0, 0]
"""

parsed_cases = evaluate.utils.gradio.parse_test_cases(test_case_string)
print(parsed_cases)  # {'predictions': [1, 0, 1, 0], 'references': [1, 1, 0, 0]}

# Infer Gradio input types from metric features
accuracy = evaluate.load("accuracy")
input_types = evaluate.utils.gradio.infer_gradio_input_types(accuracy.features)
print(input_types)  # Maps feature types to Gradio component types
```

**README Processing:**
```python
import evaluate

# Process metric README with YAML frontmatter
readme_with_yaml = """---
title: Accuracy
emoji: 🎯
tags:
- evaluate
- metric
---

# Accuracy

Accuracy is the fraction of predictions our model got right.
"""

clean_readme = evaluate.utils.gradio.parse_readme(readme_with_yaml)
print(clean_readme)  # Returns content without YAML frontmatter
```

### Integration with Evaluation Workflows

**Complete Interactive Evaluation Setup:**
```python
import evaluate
import gradio as gr

def setup_comprehensive_evaluation():
    # Disable progress bars for cleaner interface
    evaluate.utils.logging.disable_progress_bar()

    # Load multiple evaluation modules
    metrics = {
        "accuracy": evaluate.load("accuracy"),
        "f1": evaluate.load("f1"),
        "bleu": evaluate.load("bleu"),
        "rouge": evaluate.load("rouge")
    }

    def evaluate_text_classification(predictions, references):
        pred_list = [int(x.strip()) for x in predictions.split(",")]
        ref_list = [int(x.strip()) for x in references.split(",")]

        results = {}
        for name, metric in metrics.items():
            if name in ["accuracy", "f1"]:  # Classification metrics
                results[name] = metric.compute(predictions=pred_list, references=ref_list)

        return str(results)

    def evaluate_text_generation(predictions, references):
        pred_list = [x.strip() for x in predictions.split("\n")]
        ref_list = [x.strip() for x in references.split("\n")]

        results = {}
        for name, metric in metrics.items():
            if name in ["bleu", "rouge"]:  # Generation metrics
                if name == "bleu":
                    results[name] = metric.compute(predictions=pred_list, references=[[r] for r in ref_list])
                else:
                    results[name] = metric.compute(predictions=pred_list, references=ref_list)

        return str(results)

    # Create tabbed interface
    classification_interface = gr.Interface(
        fn=evaluate_text_classification,
        inputs=[
            gr.Textbox(label="Predictions", placeholder="1,0,1,0"),
            gr.Textbox(label="References", placeholder="1,1,0,0")
        ],
        outputs=gr.Textbox(label="Results"),
        title="Classification Evaluation"
    )

    generation_interface = gr.Interface(
        fn=evaluate_text_generation,
        inputs=[
            gr.Textbox(label="Predictions", lines=5, placeholder="Generated text 1\nGenerated text 2"),
            gr.Textbox(label="References", lines=5, placeholder="Reference text 1\nReference text 2")
        ],
        outputs=gr.Textbox(label="Results"),
        title="Generation Evaluation"
    )

    # Combine interfaces
    demo = gr.TabbedInterface(
        [classification_interface, generation_interface],
        ["Classification", "Generation"]
    )

    return demo

# Launch comprehensive evaluation interface
demo = setup_comprehensive_evaluation()
demo.launch(share=True)  # Create shareable link
```
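
Because `disable_progress_bar()` changes process-wide state, progress bars remain off after the demo is shut down; switch them back on if the rest of the script should show them:

```python
# Restore progress bars once the interactive session is finished
evaluate.utils.logging.enable_progress_bar()
```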

## Error Handling

Utility functions may raise:

- `ImportError`: Missing gradio dependency for widget functions
- `ValueError`: Invalid input formats for parsing functions
- `AttributeError`: Incompatible evaluation module for Gradio integration

**Example:**
```python
import evaluate

# The widget below needs a loaded evaluation module
accuracy = evaluate.load("accuracy")

try:
    # This requires gradio to be installed
    interface = evaluate.utils.gradio.launch_gradio_widget(accuracy)
except ImportError:
    print("Install gradio: pip install gradio")

try:
    # Invalid test case format
    cases = evaluate.utils.gradio.parse_test_cases("invalid format")
except ValueError as e:
    print(f"Parse error: {e}")
```
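
If you prefer to check for the optional dependency up front rather than catching `ImportError`, a standard-library check works; this is a general Python pattern, not part of the `evaluate` API:

```python
import importlib.util

# Detect whether gradio is installed before building any widgets
if importlib.util.find_spec("gradio") is None:
    print("Install gradio: pip install gradio")
```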