# Evaluation Suites

Evaluation suites are comprehensive evaluation workflows that run multiple tasks and datasets together for thorough model evaluation. They enable systematic benchmarking across diverse scenarios with standardized configurations.

## Capabilities

### EvaluationSuite Class

The `EvaluationSuite` class provides multi-task, multi-dataset evaluation workflows:
```python { .api }
class EvaluationSuite:
    """Multi-task, multi-dataset evaluation suite."""

    @staticmethod
    def load(
        path: str,
        download_mode: Optional[DownloadMode] = None,
        revision: Optional[Union[str, Version]] = None,
        download_config: Optional[DownloadConfig] = None
    ) -> EvaluationSuite:
        """Load an evaluation suite from Hub or local path."""

    def run(self, model_or_pipeline) -> Dict[str, Any]:
        """Run the complete evaluation suite on a model."""
```
**Usage Example:**
```python
import evaluate

# Load a pre-defined evaluation suite
suite = evaluate.EvaluationSuite.load("super_glue")

# Run evaluation on a model
from transformers import pipeline
model = pipeline("text-classification", model="distilbert-base-uncased")

results = suite.run(model)
print(results)

# Results contain scores for all tasks in the suite
# {
#     'boolq': {'accuracy': 0.75},
#     'cb': {'accuracy': 0.82, 'f1': 0.79},
#     'copa': {'accuracy': 0.68},
#     ... more task results
# }
```
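Because `run` returns a plain dictionary of per-task scores (as sketched in the comment above), the output can be saved with the standard library and reloaded later without re-running the suite. A minimal sketch; the filename is illustrative:

```python
import json

# Persist the per-task scores (filename is illustrative)
with open("super_glue_results.json", "w") as f:
    json.dump(results, f, indent=2)

# Reload them later for comparison without re-running the suite
with open("super_glue_results.json") as f:
    saved_results = json.load(f)
print(list(saved_results.keys()))
```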
### Creating Custom Evaluation Suites

Evaluation suites are defined using JSON configuration files that specify tasks, datasets, and metrics:

**Example Suite Configuration:**
```json
{
    "suite_name": "my_classification_suite",
    "description": "Custom text classification evaluation suite",
    "tasks": [
        {
            "task_type": "text-classification",
            "dataset": "glue",
            "subset": "sst2",
            "split": "validation",
            "metrics": ["accuracy", "f1"]
        },
        {
            "task_type": "text-classification",
            "dataset": "glue",
            "subset": "mrpc",
            "split": "validation",
            "metrics": ["accuracy", "f1"]
        },
        {
            "task_type": "text-classification",
            "dataset": "imdb",
            "split": "test[:1000]",
            "metrics": ["accuracy"]
        }
    ]
}
```
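The configuration can be kept as a checked-in JSON file, or written from Python before loading. A short sketch that produces the `my_suite.json` file used in the next example; the dictionary simply mirrors the JSON above:

```python
import json

# Mirror the JSON configuration above as a Python dict
suite_config = {
    "suite_name": "my_classification_suite",
    "description": "Custom text classification evaluation suite",
    "tasks": [
        {"task_type": "text-classification", "dataset": "glue",
         "subset": "sst2", "split": "validation", "metrics": ["accuracy", "f1"]},
        {"task_type": "text-classification", "dataset": "glue",
         "subset": "mrpc", "split": "validation", "metrics": ["accuracy", "f1"]},
        {"task_type": "text-classification", "dataset": "imdb",
         "split": "test[:1000]", "metrics": ["accuracy"]},
    ],
}

# Write it to disk so it can be loaded with EvaluationSuite.load("./my_suite.json")
with open("my_suite.json", "w") as f:
    json.dump(suite_config, f, indent=2)
```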
**Loading Custom Suite:**
```python
import evaluate
from transformers import pipeline

# Load custom suite from local file
custom_suite = evaluate.EvaluationSuite.load("./my_suite.json")

# Run on multiple models
models = [
    "distilbert-base-uncased",
    "bert-base-uncased",
    "roberta-base"
]

all_results = {}
for model_name in models:
    print(f"Evaluating {model_name}...")
    model = pipeline("text-classification", model=model_name)
    results = custom_suite.run(model)
    all_results[model_name] = results

# Compare results across models
tasks = next(iter(all_results.values())).keys()
for task in tasks:
    print(f"\n{task} Results:")
    for model_name in all_results:
        accuracy = all_results[model_name][task].get('accuracy')
        if accuracy is None:
            print(f"  {model_name}: N/A")
        else:
            print(f"  {model_name}: {accuracy:.3f}")
```
### Pre-built Evaluation Suites

The library includes several pre-built evaluation suites:

**GLUE Suite:**
```python
import evaluate

# Load GLUE benchmark suite
glue_suite = evaluate.EvaluationSuite.load("glue")

# Evaluate a model on all GLUE tasks
from transformers import pipeline
model = pipeline("text-classification", model="bert-base-uncased")

glue_results = glue_suite.run(model)

# View results for specific tasks
print(f"CoLA: {glue_results['cola']['matthews_correlation']:.3f}")
print(f"SST-2: {glue_results['sst2']['accuracy']:.3f}")
print(f"MRPC: {glue_results['mrpc']['f1']:.3f}")
```
**SuperGLUE Suite:**
```python
import evaluate
from transformers import pipeline

# Load SuperGLUE benchmark
superglue_suite = evaluate.EvaluationSuite.load("super_glue")

# Run evaluation
model = pipeline("text-classification", model="bert-base-uncased")
results = superglue_suite.run(model)

# SuperGLUE includes more challenging tasks
print(f"BoolQ: {results['boolq']['accuracy']:.3f}")
print(f"RTE: {results['rte']['accuracy']:.3f}")
print(f"WiC: {results['wic']['accuracy']:.3f}")
```
### Advanced Suite Configuration

**Multi-Modal Suite:**
```python
import json
import evaluate

# Configuration for multi-modal evaluation
multimodal_config = {
    "suite_name": "multimodal_suite",
    "description": "Evaluation across text, image, and audio tasks",
    "tasks": [
        {
            "task_type": "text-classification",
            "dataset": "imdb",
            "split": "test[:500]",
            "metrics": ["accuracy"]
        },
        {
            "task_type": "image-classification",
            "dataset": "cifar10",
            "split": "test[:500]",
            "metrics": ["accuracy", "top_5_accuracy"]
        },
        {
            "task_type": "audio-classification",
            "dataset": "superb",
            "subset": "ks",
            "split": "test[:500]",
            "metrics": ["accuracy"]
        }
    ]
}

# Save and load the suite
with open("multimodal_suite.json", "w") as f:
    json.dump(multimodal_config, f, indent=2)

suite = evaluate.EvaluationSuite.load("./multimodal_suite.json")
```
**Domain-Specific Suite:**
```python
# Medical text classification suite
medical_suite_config = {
    "suite_name": "medical_text_suite",
    "description": "Medical text classification benchmarks",
    "tasks": [
        {
            "task_type": "text-classification",
            "dataset": "medical_questions_pairs",
            "metrics": ["accuracy", "f1"]
        },
        {
            "task_type": "text-classification",
            "dataset": "pubmed_20k_rct",
            "metrics": ["accuracy", "precision", "recall"]
        }
    ]
}
```
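Following the same pattern as the multi-modal example, this configuration can be saved to disk, loaded as a suite, and run. A sketch assuming the dataset names above are available on the Hub; the model name is illustrative:

```python
import json
import evaluate
from transformers import pipeline

# Save the configuration and load it as a suite
with open("medical_text_suite.json", "w") as f:
    json.dump(medical_suite_config, f, indent=2)

medical_suite = evaluate.EvaluationSuite.load("./medical_text_suite.json")

# Evaluate a domain-adapted model (model name is illustrative)
model = pipeline("text-classification", model="emilyalsentzer/Bio_ClinicalBERT")
results = medical_suite.run(model)
print(results)
```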
### Suite Results Analysis

**Comprehensive Results Processing:**
```python
import evaluate
import pandas as pd
from transformers import pipeline

# Load and run suite
suite = evaluate.EvaluationSuite.load("glue")
model = pipeline("text-classification", model="bert-base-uncased")
results = suite.run(model)

# Convert to DataFrame for analysis
results_data = []
for task, metrics in results.items():
    for metric_name, value in metrics.items():
        results_data.append({
            'task': task,
            'metric': metric_name,
            'value': value
        })

df = pd.DataFrame(results_data)
print(df.pivot(index='task', columns='metric', values='value'))

# Calculate overall suite score (if applicable)
accuracy_scores = [
    metrics.get('accuracy', 0)
    for metrics in results.values()
    if 'accuracy' in metrics
]
overall_accuracy = sum(accuracy_scores) / len(accuracy_scores)
print(f"Overall Suite Accuracy: {overall_accuracy:.3f}")
```
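The per-task DataFrame also lends itself to a quick visual summary. A small sketch assuming `matplotlib` is installed (it is not a dependency of `evaluate` itself):

```python
import matplotlib.pyplot as plt

# Bar chart of one metric across tasks, built from the DataFrame above
accuracy_df = df[df['metric'] == 'accuracy']
plt.bar(accuracy_df['task'], accuracy_df['value'])
plt.ylabel('accuracy')
plt.title('Suite accuracy by task')
plt.tight_layout()
plt.show()
```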
**Model Comparison with Suites:**
```python
import evaluate
import pandas as pd
from transformers import pipeline

suite = evaluate.EvaluationSuite.load("glue")
models_to_compare = [
    "distilbert-base-uncased",
    "bert-base-uncased",
    "roberta-base"
]

comparison_results = {}
for model_name in models_to_compare:
    model = pipeline("text-classification", model=model_name)
    results = suite.run(model)
    comparison_results[model_name] = results

# Create comparison table
comparison_data = []
for model_name, model_results in comparison_results.items():
    for task, metrics in model_results.items():
        for metric_name, value in metrics.items():
            comparison_data.append({
                'model': model_name,
                'task': task,
                'metric': metric_name,
                'value': value
            })

comparison_df = pd.DataFrame(comparison_data)
pivot_table = comparison_df.pivot_table(
    index=['task', 'metric'],
    columns='model',
    values='value'
)
print(pivot_table)
```
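The pivot table can also be exported for reporting; a one-line sketch using pandas, with an illustrative filename:

```python
# Save the model comparison for later reporting (filename is illustrative)
pivot_table.to_csv("glue_model_comparison.csv")
```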
## Error Handling

Evaluation suites may raise:

- `FileNotFoundError`: Suite configuration file not found
- `ValueError`: Invalid suite configuration format
- `ImportError`: Missing dependencies for specific tasks
- `RuntimeError`: Model incompatibility with suite tasks
**Example:**
```python
import evaluate
from transformers import pipeline

try:
    suite = evaluate.EvaluationSuite.load("nonexistent_suite")
except FileNotFoundError:
    print("Suite not found")

try:
    suite = evaluate.EvaluationSuite.load("glue")
    # Model incompatible with some tasks
    incompatible_model = pipeline("text-generation", model="gpt2")
    results = suite.run(incompatible_model)
except RuntimeError as e:
    print(f"Model incompatibility: {e}")
```
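The remaining exceptions from the list above can be handled the same way. A sketch of one defensive pattern around loading a local configuration; the file path is illustrative:

```python
import evaluate

try:
    suite = evaluate.EvaluationSuite.load("./my_suite.json")
except ValueError:
    print("Suite configuration is malformed")
except ImportError as e:
    print(f"Missing dependency for one of the suite's tasks: {e}")
```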