# Evaluation Suites

Evaluation suites are comprehensive evaluation workflows that run multiple tasks and datasets together for thorough model evaluation. They enable systematic benchmarking across diverse scenarios with standardized configurations.

## Capabilities

### EvaluationSuite Class

The `EvaluationSuite` class provides multi-task, multi-dataset evaluation workflows:
```python { .api }
class EvaluationSuite:
    """Multi-task, multi-dataset evaluation suite."""

    @staticmethod
    def load(
        path: str,
        download_mode: Optional[DownloadMode] = None,
        revision: Optional[Union[str, Version]] = None,
        download_config: Optional[DownloadConfig] = None
    ) -> EvaluationSuite:
        """Load an evaluation suite from Hub or local path."""

    def run(self, model_or_pipeline) -> Dict[str, Any]:
        """Run the complete evaluation suite on a model."""
```
**Usage Example:**
```python
import evaluate

# Load a pre-defined evaluation suite
suite = evaluate.EvaluationSuite.load("super_glue")

# Run evaluation on a model
from transformers import pipeline
model = pipeline("text-classification", model="distilbert-base-uncased")

results = suite.run(model)
print(results)

# Results contain scores for all tasks in the suite
# {
#     'boolq': {'accuracy': 0.75},
#     'cb': {'accuracy': 0.82, 'f1': 0.79},
#     'copa': {'accuracy': 0.68},
#     ... more task results
# }
```
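Because `run` returns a plain dictionary of per-task scores (as sketched in the comment above), the output can be saved with the standard library and reloaded later without re-running the suite. A minimal sketch; the filename is illustrative:

```python
import json

# Persist the per-task scores (filename is illustrative)
with open("super_glue_results.json", "w") as f:
    json.dump(results, f, indent=2)

# Reload them later for comparison without re-running the suite
with open("super_glue_results.json") as f:
    saved_results = json.load(f)
print(list(saved_results.keys()))
```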
### Creating Custom Evaluation Suites

Evaluation suites are defined using JSON configuration files that specify tasks, datasets, and metrics:

**Example Suite Configuration:**
```json
{
    "suite_name": "my_classification_suite",
    "description": "Custom text classification evaluation suite",
    "tasks": [
        {
            "task_type": "text-classification",
            "dataset": "glue",
            "subset": "sst2",
            "split": "validation",
            "metrics": ["accuracy", "f1"]
        },
        {
            "task_type": "text-classification",
            "dataset": "glue",
            "subset": "mrpc",
            "split": "validation",
            "metrics": ["accuracy", "f1"]
        },
        {
            "task_type": "text-classification",
            "dataset": "imdb",
            "split": "test[:1000]",
            "metrics": ["accuracy"]
        }
    ]
}
```
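The configuration can be kept as a checked-in JSON file, or written from Python before loading. A short sketch that produces the `my_suite.json` file used in the next example; the dictionary simply mirrors the JSON above:

```python
import json

# Mirror the JSON configuration above as a Python dict
suite_config = {
    "suite_name": "my_classification_suite",
    "description": "Custom text classification evaluation suite",
    "tasks": [
        {"task_type": "text-classification", "dataset": "glue",
         "subset": "sst2", "split": "validation", "metrics": ["accuracy", "f1"]},
        {"task_type": "text-classification", "dataset": "glue",
         "subset": "mrpc", "split": "validation", "metrics": ["accuracy", "f1"]},
        {"task_type": "text-classification", "dataset": "imdb",
         "split": "test[:1000]", "metrics": ["accuracy"]},
    ],
}

# Write it to disk so it can be loaded with EvaluationSuite.load("./my_suite.json")
with open("my_suite.json", "w") as f:
    json.dump(suite_config, f, indent=2)
```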
**Loading Custom Suite:**
```python
import evaluate
from transformers import pipeline

# Load custom suite from local file
custom_suite = evaluate.EvaluationSuite.load("./my_suite.json")

# Run on multiple models
models = [
    "distilbert-base-uncased",
    "bert-base-uncased",
    "roberta-base"
]

all_results = {}
for model_name in models:
    print(f"Evaluating {model_name}...")
    model = pipeline("text-classification", model=model_name)
    results = custom_suite.run(model)
    all_results[model_name] = results

# Compare results across models
tasks = next(iter(all_results.values())).keys()
for task in tasks:
    print(f"\n{task} Results:")
    for model_name in all_results:
        accuracy = all_results[model_name][task].get('accuracy')
        if accuracy is None:
            print(f"  {model_name}: N/A")
        else:
            print(f"  {model_name}: {accuracy:.3f}")
```
### Pre-built Evaluation Suites

The library includes several pre-built evaluation suites:

**GLUE Suite:**
```python
import evaluate

# Load GLUE benchmark suite
glue_suite = evaluate.EvaluationSuite.load("glue")

# Evaluate a model on all GLUE tasks
from transformers import pipeline
model = pipeline("text-classification", model="bert-base-uncased")

glue_results = glue_suite.run(model)

# View results for specific tasks
print(f"CoLA: {glue_results['cola']['matthews_correlation']:.3f}")
print(f"SST-2: {glue_results['sst2']['accuracy']:.3f}")
print(f"MRPC: {glue_results['mrpc']['f1']:.3f}")
```
**SuperGLUE Suite:**
```python
import evaluate
from transformers import pipeline

# Load SuperGLUE benchmark
superglue_suite = evaluate.EvaluationSuite.load("super_glue")

# Run evaluation
model = pipeline("text-classification", model="bert-base-uncased")
results = superglue_suite.run(model)

# SuperGLUE includes more challenging tasks
print(f"BoolQ: {results['boolq']['accuracy']:.3f}")
print(f"RTE: {results['rte']['accuracy']:.3f}")
print(f"WiC: {results['wic']['accuracy']:.3f}")
```
### Advanced Suite Configuration

**Multi-Modal Suite:**
```python
import json
import evaluate

# Configuration for multi-modal evaluation
multimodal_config = {
    "suite_name": "multimodal_suite",
    "description": "Evaluation across text, image, and audio tasks",
    "tasks": [
        {
            "task_type": "text-classification",
            "dataset": "imdb",
            "split": "test[:500]",
            "metrics": ["accuracy"]
        },
        {
            "task_type": "image-classification",
            "dataset": "cifar10",
            "split": "test[:500]",
            "metrics": ["accuracy", "top_5_accuracy"]
        },
        {
            "task_type": "audio-classification",
            "dataset": "superb",
            "subset": "ks",
            "split": "test[:500]",
            "metrics": ["accuracy"]
        }
    ]
}

# Save and load the suite
with open("multimodal_suite.json", "w") as f:
    json.dump(multimodal_config, f, indent=2)

suite = evaluate.EvaluationSuite.load("./multimodal_suite.json")
```
**Domain-Specific Suite:**
```python
# Medical text classification suite
medical_suite_config = {
    "suite_name": "medical_text_suite",
    "description": "Medical text classification benchmarks",
    "tasks": [
        {
            "task_type": "text-classification",
            "dataset": "medical_questions_pairs",
            "metrics": ["accuracy", "f1"]
        },
        {
            "task_type": "text-classification",
            "dataset": "pubmed_20k_rct",
            "metrics": ["accuracy", "precision", "recall"]
        }
    ]
}
```
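Following the same pattern as the multi-modal example, this configuration can be saved to disk, loaded as a suite, and run. A sketch assuming the dataset names above are available on the Hub; the model name is illustrative:

```python
import json
import evaluate
from transformers import pipeline

# Save the configuration and load it as a suite
with open("medical_text_suite.json", "w") as f:
    json.dump(medical_suite_config, f, indent=2)

medical_suite = evaluate.EvaluationSuite.load("./medical_text_suite.json")

# Evaluate a domain-adapted model (model name is illustrative)
model = pipeline("text-classification", model="emilyalsentzer/Bio_ClinicalBERT")
results = medical_suite.run(model)
print(results)
```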
### Suite Results Analysis

**Comprehensive Results Processing:**
```python
import evaluate
import pandas as pd
from transformers import pipeline

# Load and run suite
suite = evaluate.EvaluationSuite.load("glue")
model = pipeline("text-classification", model="bert-base-uncased")
results = suite.run(model)

# Convert to DataFrame for analysis
results_data = []
for task, metrics in results.items():
    for metric_name, value in metrics.items():
        results_data.append({
            'task': task,
            'metric': metric_name,
            'value': value
        })

df = pd.DataFrame(results_data)
print(df.pivot(index='task', columns='metric', values='value'))

# Calculate overall suite score (if applicable)
accuracy_scores = [
    metrics.get('accuracy', 0)
    for metrics in results.values()
    if 'accuracy' in metrics
]
overall_accuracy = sum(accuracy_scores) / len(accuracy_scores)
print(f"Overall Suite Accuracy: {overall_accuracy:.3f}")
```
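The per-task DataFrame also lends itself to a quick visual summary. A small sketch assuming `matplotlib` is installed (it is not a dependency of `evaluate` itself):

```python
import matplotlib.pyplot as plt

# Bar chart of one metric across tasks, built from the DataFrame above
accuracy_df = df[df['metric'] == 'accuracy']
plt.bar(accuracy_df['task'], accuracy_df['value'])
plt.ylabel('accuracy')
plt.title('Suite accuracy by task')
plt.tight_layout()
plt.show()
```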
**Model Comparison with Suites:**
```python
import evaluate
import pandas as pd
from transformers import pipeline

suite = evaluate.EvaluationSuite.load("glue")
models_to_compare = [
    "distilbert-base-uncased",
    "bert-base-uncased",
    "roberta-base"
]

comparison_results = {}
for model_name in models_to_compare:
    model = pipeline("text-classification", model=model_name)
    results = suite.run(model)
    comparison_results[model_name] = results

# Create comparison table
comparison_data = []
for model_name, model_results in comparison_results.items():
    for task, metrics in model_results.items():
        for metric_name, value in metrics.items():
            comparison_data.append({
                'model': model_name,
                'task': task,
                'metric': metric_name,
                'value': value
            })

comparison_df = pd.DataFrame(comparison_data)
pivot_table = comparison_df.pivot_table(
    index=['task', 'metric'],
    columns='model',
    values='value'
)
print(pivot_table)
```
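The pivot table can also be exported for reporting; a one-line sketch using pandas, with an illustrative filename:

```python
# Save the model comparison for later reporting (filename is illustrative)
pivot_table.to_csv("glue_model_comparison.csv")
```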
## Error Handling

Evaluation suites may raise:

- `FileNotFoundError`: Suite configuration file not found
- `ValueError`: Invalid suite configuration format
- `ImportError`: Missing dependencies for specific tasks
- `RuntimeError`: Model incompatibility with suite tasks
**Example:**
```python
import evaluate
from transformers import pipeline

try:
    suite = evaluate.EvaluationSuite.load("nonexistent_suite")
except FileNotFoundError:
    print("Suite not found")

try:
    suite = evaluate.EvaluationSuite.load("glue")
    # Model incompatible with some tasks
    incompatible_model = pipeline("text-generation", model="gpt2")
    results = suite.run(incompatible_model)
except RuntimeError as e:
    print(f"Model incompatibility: {e}")
```
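The remaining exceptions from the list above can be handled the same way. A sketch of one defensive pattern around loading a local configuration; the file path is illustrative:

```python
import evaluate

try:
    suite = evaluate.EvaluationSuite.load("./my_suite.json")
except ValueError:
    print("Suite configuration is malformed")
except ImportError as e:
    print(f"Missing dependency for one of the suite's tasks: {e}")
```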