# Evaluation Suites

Evaluation suites are comprehensive evaluation workflows that run multiple tasks and datasets together, enabling systematic benchmarking across diverse scenarios with standardized configurations.

## Capabilities

### EvaluationSuite Class

The `EvaluationSuite` class provides multi-task, multi-dataset evaluation workflows:

```python { .api }
class EvaluationSuite:
    """Multi-task, multi-dataset evaluation suite."""

    @staticmethod
    def load(
        path: str,
        download_mode: Optional[DownloadMode] = None,
        revision: Optional[Union[str, Version]] = None,
        download_config: Optional[DownloadConfig] = None
    ) -> EvaluationSuite:
        """Load an evaluation suite from Hub or local path."""

    def run(self, model_or_pipeline) -> Dict[str, Any]:
        """Run the complete evaluation suite on a model."""
```

**Usage Example:**
```python
import evaluate
from transformers import pipeline

# Load a pre-defined evaluation suite
suite = evaluate.EvaluationSuite.load("super_glue")

# Run evaluation on a model
model = pipeline("text-classification", model="distilbert-base-uncased")

results = suite.run(model)
print(results)

# Results contain scores for all tasks in the suite
# {
#     'boolq': {'accuracy': 0.75},
#     'cb': {'accuracy': 0.82, 'f1': 0.79},
#     'copa': {'accuracy': 0.68},
#     # ... more task results
# }
```

### Creating Custom Evaluation Suites

Evaluation suites are defined using JSON configuration files that specify tasks, datasets, and metrics:

**Example Suite Configuration:**
```json
{
  "suite_name": "my_classification_suite",
  "description": "Custom text classification evaluation suite",
  "tasks": [
    {
      "task_type": "text-classification",
      "dataset": "glue",
      "subset": "sst2",
      "split": "validation",
      "metrics": ["accuracy", "f1"]
    },
    {
      "task_type": "text-classification",
      "dataset": "glue",
      "subset": "mrpc",
      "split": "validation",
      "metrics": ["accuracy", "f1"]
    },
    {
      "task_type": "text-classification",
      "dataset": "imdb",
      "split": "test[:1000]",
      "metrics": ["accuracy"]
    }
  ]
}
```
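If you prefer to build this configuration in Python rather than writing the JSON by hand, a minimal sketch (it writes the same structure to `my_suite.json`, the path assumed by the loading example below):

```python
import json

# Build the same suite configuration as a plain Python dict
config = {
    "suite_name": "my_classification_suite",
    "description": "Custom text classification evaluation suite",
    "tasks": [
        {"task_type": "text-classification", "dataset": "glue",
         "subset": "sst2", "split": "validation", "metrics": ["accuracy", "f1"]},
        {"task_type": "text-classification", "dataset": "glue",
         "subset": "mrpc", "split": "validation", "metrics": ["accuracy", "f1"]},
        {"task_type": "text-classification", "dataset": "imdb",
         "split": "test[:1000]", "metrics": ["accuracy"]},
    ],
}

# Write it to the path used by the loading example below
with open("my_suite.json", "w") as f:
    json.dump(config, f, indent=2)
```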

**Loading Custom Suite:**
```python
import evaluate
from transformers import pipeline

# Load custom suite from local file
custom_suite = evaluate.EvaluationSuite.load("./my_suite.json")

# Run on multiple models
models = [
    "distilbert-base-uncased",
    "bert-base-uncased",
    "roberta-base"
]

all_results = {}
for model_name in models:
    print(f"Evaluating {model_name}...")
    model = pipeline("text-classification", model=model_name)
    results = custom_suite.run(model)
    all_results[model_name] = results

# Compare results across models, task by task
tasks = next(iter(all_results.values())).keys()
for task in tasks:
    print(f"\n{task} Results:")
    for model_name in all_results:
        accuracy = all_results[model_name][task].get('accuracy')
        if accuracy is not None:
            print(f"  {model_name}: {accuracy:.3f}")
        else:
            print(f"  {model_name}: N/A")
```

### Pre-built Evaluation Suites

The library includes several pre-built evaluation suites:

**GLUE Suite:**
```python
import evaluate
from transformers import pipeline

# Load GLUE benchmark suite
glue_suite = evaluate.EvaluationSuite.load("glue")

# Evaluate a model on all GLUE tasks
model = pipeline("text-classification", model="bert-base-uncased")

glue_results = glue_suite.run(model)

# View results for specific tasks
print(f"CoLA: {glue_results['cola']['matthews_correlation']:.3f}")
print(f"SST-2: {glue_results['sst2']['accuracy']:.3f}")
print(f"MRPC: {glue_results['mrpc']['f1']:.3f}")
```

**SuperGLUE Suite:**
```python
import evaluate
from transformers import pipeline

# Load SuperGLUE benchmark
superglue_suite = evaluate.EvaluationSuite.load("super_glue")

# Run evaluation with a text-classification pipeline
model = pipeline("text-classification", model="bert-base-uncased")
results = superglue_suite.run(model)

# SuperGLUE includes more challenging tasks
print(f"BoolQ: {results['boolq']['accuracy']:.3f}")
print(f"RTE: {results['rte']['accuracy']:.3f}")
print(f"WiC: {results['wic']['accuracy']:.3f}")
```

### Advanced Suite Configuration

**Multi-Modal Suite:**
```python
import json

import evaluate

# Configuration for multi-modal evaluation
multimodal_config = {
    "suite_name": "multimodal_suite",
    "description": "Evaluation across text, image, and audio tasks",
    "tasks": [
        {
            "task_type": "text-classification",
            "dataset": "imdb",
            "split": "test[:500]",
            "metrics": ["accuracy"]
        },
        {
            "task_type": "image-classification",
            "dataset": "cifar10",
            "split": "test[:500]",
            "metrics": ["accuracy", "top_5_accuracy"]
        },
        {
            "task_type": "audio-classification",
            "dataset": "superb",
            "subset": "ks",
            "split": "test[:500]",
            "metrics": ["accuracy"]
        }
    ]
}

# Save and load the suite
with open("multimodal_suite.json", "w") as f:
    json.dump(multimodal_config, f, indent=2)

suite = evaluate.EvaluationSuite.load("./multimodal_suite.json")
```

**Domain-Specific Suite:**
```python
# Medical text classification suite
medical_suite_config = {
    "suite_name": "medical_text_suite",
    "description": "Medical text classification benchmarks",
    "tasks": [
        {
            "task_type": "text-classification",
            "dataset": "medical_questions_pairs",
            "metrics": ["accuracy", "f1"]
        },
        {
            "task_type": "text-classification",
            "dataset": "pubmed_20k_rct",
            "metrics": ["accuracy", "precision", "recall"]
        }
    ]
}
```
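The medical configuration above is only defined; a minimal sketch that saves and loads it, following the same pattern as the multi-modal example (the `medical_text_suite.json` file name is an assumption):

```python
import json

import evaluate

# Persist the configuration defined above (file name is illustrative)
with open("medical_text_suite.json", "w") as f:
    json.dump(medical_suite_config, f, indent=2)

# Load it back as an evaluation suite
medical_suite = evaluate.EvaluationSuite.load("./medical_text_suite.json")
```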

### Suite Results Analysis

**Comprehensive Results Processing:**
```python
import evaluate
import pandas as pd
from transformers import pipeline

# Load and run suite
suite = evaluate.EvaluationSuite.load("glue")
model = pipeline("text-classification", model="bert-base-uncased")
results = suite.run(model)

# Convert to DataFrame for analysis
results_data = []
for task, metrics in results.items():
    for metric_name, value in metrics.items():
        results_data.append({
            'task': task,
            'metric': metric_name,
            'value': value
        })

df = pd.DataFrame(results_data)
print(df.pivot(index='task', columns='metric', values='value'))

# Calculate overall suite score (if applicable)
accuracy_scores = [
    metrics['accuracy']
    for metrics in results.values()
    if 'accuracy' in metrics
]
overall_accuracy = sum(accuracy_scores) / len(accuracy_scores)
print(f"Overall Suite Accuracy: {overall_accuracy:.3f}")
```

**Model Comparison with Suites:**
```python
import evaluate
import pandas as pd
from transformers import pipeline

suite = evaluate.EvaluationSuite.load("glue")
models_to_compare = [
    "distilbert-base-uncased",
    "bert-base-uncased",
    "roberta-base"
]

comparison_results = {}
for model_name in models_to_compare:
    model = pipeline("text-classification", model=model_name)
    results = suite.run(model)
    comparison_results[model_name] = results

# Create comparison table
comparison_data = []
for model_name, model_results in comparison_results.items():
    for task, metrics in model_results.items():
        for metric_name, value in metrics.items():
            comparison_data.append({
                'model': model_name,
                'task': task,
                'metric': metric_name,
                'value': value
            })

comparison_df = pd.DataFrame(comparison_data)
pivot_table = comparison_df.pivot_table(
    index=['task', 'metric'],
    columns='model',
    values='value'
)
print(pivot_table)
```
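As an optional follow-up, and assuming every metric in the table is higher-is-better, one way to read off the strongest model for each (task, metric) row of the pivot table:

```python
# For each (task, metric) row, pick the column (model) with the highest score
best_model_per_row = pivot_table.idxmax(axis=1)
print(best_model_per_row)
```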

## Error Handling

Evaluation suites may raise:

- `FileNotFoundError`: Suite configuration file not found
- `ValueError`: Invalid suite configuration format
- `ImportError`: Missing dependencies for specific tasks
- `RuntimeError`: Model incompatibility with suite tasks

**Example:**
```python
import evaluate
from transformers import pipeline

try:
    suite = evaluate.EvaluationSuite.load("nonexistent_suite")
except FileNotFoundError:
    print("Suite not found")

try:
    suite = evaluate.EvaluationSuite.load("glue")
    # Model incompatible with some tasks
    incompatible_model = pipeline("text-generation", model="gpt2")
    results = suite.run(incompatible_model)
except RuntimeError as e:
    print(f"Model incompatibility: {e}")
```
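The example above covers `FileNotFoundError` and `RuntimeError`. A minimal sketch for the remaining cases, assuming a local suite file such as `./my_suite.json` and relying on the error types listed above:

```python
import evaluate
from transformers import pipeline

try:
    # Malformed configuration files are reported as ValueError,
    # missing task dependencies as ImportError (per the list above)
    suite = evaluate.EvaluationSuite.load("./my_suite.json")
    model = pipeline("text-classification", model="distilbert-base-uncased")
    results = suite.run(model)
except ValueError as e:
    print(f"Invalid suite configuration: {e}")
except ImportError as e:
    print(f"Missing dependency for a suite task: {e}")
```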