
# Utilities

Helper functions for logging control and for Gradio integration, enabling interactive evaluation experiences. These utilities enhance the evaluation workflow with progress tracking and interactive interfaces.

## Capabilities

### Logging Utilities

Control progress bar display during evaluation operations:

```python { .api }
def enable_progress_bar():
    """Enable tqdm progress bars for evaluation operations."""

def disable_progress_bar():
    """Disable tqdm progress bars for evaluation operations."""

def is_progress_bar_enabled() -> bool:
    """Check if progress bars are currently enabled."""
```

These functions are available in the `evaluate.utils.logging` module.

**Usage Example:**

```python
import evaluate

# Check current progress bar status
print(f"Progress bars enabled: {evaluate.utils.logging.is_progress_bar_enabled()}")

# Disable progress bars for cleaner output
evaluate.utils.logging.disable_progress_bar()

# Run evaluation without progress bars
accuracy = evaluate.load("accuracy")
# ... no progress bar shown during loading

# Re-enable progress bars
evaluate.utils.logging.enable_progress_bar()

# Now progress bars will be shown again
bleu = evaluate.load("bleu")  # Progress bar visible
```

**Script Configuration:**

```python
import evaluate
import argparse

def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--quiet", action="store_true", help="Disable progress bars")
    args = parser.parse_args()

    if args.quiet:
        evaluate.utils.logging.disable_progress_bar()

    # Run evaluation with controlled output
    metric = evaluate.load("rouge")
    results = metric.compute(
        predictions=["hello world"] * 1000,
        references=["hello world"] * 1000
    )
    print(f"Results: {results}")

if __name__ == "__main__":
    main()
```
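If a script only needs to silence progress bars around a specific block, the same three functions can be wrapped in a small context manager that restores the previous state afterwards. This is a minimal sketch; the `quiet_progress` helper is illustrative and not part of the library:

```python
import contextlib

import evaluate

@contextlib.contextmanager
def quiet_progress():
    """Temporarily disable tqdm progress bars, restoring the previous setting on exit."""
    was_enabled = evaluate.utils.logging.is_progress_bar_enabled()
    evaluate.utils.logging.disable_progress_bar()
    try:
        yield
    finally:
        if was_enabled:
            evaluate.utils.logging.enable_progress_bar()

# Progress bars are suppressed only inside the block
with quiet_progress():
    exact_match = evaluate.load("exact_match")
```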


### Gradio Integration

Build interactive evaluation interfaces with Gradio widgets:

```python { .api }
def infer_gradio_input_types(features: Features) -> Dict[str, str]:
    """Map metric feature types to Gradio input component types."""

def json_to_string_type(input_type: str) -> str:
    """Convert json input type to string type for Gradio."""

def parse_readme(readme_content: str) -> str:
    """Parse README content and remove YAML frontmatter."""

def parse_gradio_data(data: List[List[Any]]) -> Tuple[List, List]:
    """Parse data from Gradio Dataframe for metric computation."""

def parse_test_cases(test_cases: str) -> Dict[str, List]:
    """Parse test case strings into structured data for Gradio."""

def launch_gradio_widget(evaluation_module: EvaluationModule) -> gradio.Interface:
    """Launch interactive Gradio widget for an evaluation module."""
```

These functions are available in the `evaluate.utils.gradio` module.

**Usage Example:**

```python
import evaluate

# Load a metric
accuracy = evaluate.load("accuracy")

# Launch interactive widget
interface = evaluate.utils.gradio.launch_gradio_widget(accuracy)

# The widget allows users to:
# - Input predictions and references interactively
# - See real-time evaluation results
# - Explore metric documentation
# - Try different input formats
```

**Custom Gradio Interface:**

```python
import evaluate
import gradio as gr

def create_evaluation_interface():
    # Load multiple metrics
    accuracy = evaluate.load("accuracy")
    f1 = evaluate.load("f1")
    precision = evaluate.load("precision")
    recall = evaluate.load("recall")

    def evaluate_inputs(predictions_text, references_text):
        # Parse input text to lists
        predictions = [int(x.strip()) for x in predictions_text.split(",")]
        references = [int(x.strip()) for x in references_text.split(",")]

        # Compute all metrics
        results = {
            "accuracy": accuracy.compute(predictions=predictions, references=references),
            "f1": f1.compute(predictions=predictions, references=references),
            "precision": precision.compute(predictions=predictions, references=references),
            "recall": recall.compute(predictions=predictions, references=references)
        }

        return str(results)

    # Create interface
    interface = gr.Interface(
        fn=evaluate_inputs,
        inputs=[
            gr.Textbox(label="Predictions (comma-separated)", placeholder="1,0,1,0"),
            gr.Textbox(label="References (comma-separated)", placeholder="1,1,0,0")
        ],
        outputs=gr.Textbox(label="Evaluation Results"),
        title="Multi-Metric Evaluation",
        description="Evaluate predictions with multiple classification metrics"
    )

    return interface

# Launch custom interface
interface = create_evaluation_interface()
interface.launch()
```

**Batch Evaluation Interface:**

```python
import evaluate
import gradio as gr
import pandas as pd

def create_batch_evaluation_interface():
    def evaluate_csv_data(csv_file):
        # Read CSV file
        df = pd.read_csv(csv_file.name)

        if 'predictions' not in df.columns or 'references' not in df.columns:
            return "Error: CSV must contain 'predictions' and 'references' columns"

        predictions = df['predictions'].tolist()
        references = df['references'].tolist()

        # Run evaluation
        combined = evaluate.combine(["accuracy", "f1", "precision", "recall"])
        results = combined.compute(predictions=predictions, references=references)

        return str(results)

    interface = gr.Interface(
        fn=evaluate_csv_data,
        inputs=gr.File(label="Upload CSV with predictions and references"),
        outputs=gr.Textbox(label="Evaluation Results"),
        title="Batch Evaluation from CSV",
        description="Upload a CSV file with 'predictions' and 'references' columns"
    )

    return interface
```
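To try the batch interface locally, you can write a small CSV with pandas and upload it in the browser. A quick sketch; the `sample.csv` filename is arbitrary:

```python
import pandas as pd

# Create a tiny file with the expected column names
pd.DataFrame({
    "predictions": [1, 0, 1, 0],
    "references": [1, 1, 1, 0],
}).to_csv("sample.csv", index=False)

# Launch the interface defined above and upload sample.csv
interface = create_batch_evaluation_interface()
interface.launch()
```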

### Advanced Utility Functions

**Helper Functions for Data Processing:**

```python
import evaluate

# Parse test cases from string format
test_case_string = """
predictions: [1, 0, 1, 0]
references: [1, 1, 0, 0]
"""

parsed_cases = evaluate.utils.gradio.parse_test_cases(test_case_string)
print(parsed_cases)  # {'predictions': [1, 0, 1, 0], 'references': [1, 1, 0, 0]}

# Infer Gradio input types from metric features
accuracy = evaluate.load("accuracy")
input_types = evaluate.utils.gradio.infer_gradio_input_types(accuracy.features)
print(input_types)  # Maps feature types to Gradio component types
```

**README Processing:**

```python
import evaluate

# Process metric README with YAML frontmatter
readme_with_yaml = """---
title: Accuracy
emoji: 🎯
tags:
- evaluate
- metric
---

# Accuracy

Accuracy is the fraction of predictions our model got right.
"""

clean_readme = evaluate.utils.gradio.parse_readme(readme_with_yaml)
print(clean_readme)  # Returns content without YAML frontmatter
```

### Integration with Evaluation Workflows

**Complete Interactive Evaluation Setup:**

```python
import evaluate
import gradio as gr

def setup_comprehensive_evaluation():
    # Disable progress bars for cleaner interface
    evaluate.utils.logging.disable_progress_bar()

    # Load multiple evaluation modules
    metrics = {
        "accuracy": evaluate.load("accuracy"),
        "f1": evaluate.load("f1"),
        "bleu": evaluate.load("bleu"),
        "rouge": evaluate.load("rouge")
    }

    def evaluate_text_classification(predictions, references):
        pred_list = [int(x.strip()) for x in predictions.split(",")]
        ref_list = [int(x.strip()) for x in references.split(",")]

        results = {}
        for name, metric in metrics.items():
            if name in ["accuracy", "f1"]:  # Classification metrics
                results[name] = metric.compute(predictions=pred_list, references=ref_list)

        return str(results)

    def evaluate_text_generation(predictions, references):
        pred_list = [x.strip() for x in predictions.split("\n")]
        ref_list = [x.strip() for x in references.split("\n")]

        results = {}
        for name, metric in metrics.items():
            if name in ["bleu", "rouge"]:  # Generation metrics
                if name == "bleu":
                    results[name] = metric.compute(predictions=pred_list, references=[[r] for r in ref_list])
                else:
                    results[name] = metric.compute(predictions=pred_list, references=ref_list)

        return str(results)

    # Create tabbed interface
    classification_interface = gr.Interface(
        fn=evaluate_text_classification,
        inputs=[
            gr.Textbox(label="Predictions", placeholder="1,0,1,0"),
            gr.Textbox(label="References", placeholder="1,1,0,0")
        ],
        outputs=gr.Textbox(label="Results"),
        title="Classification Evaluation"
    )

    generation_interface = gr.Interface(
        fn=evaluate_text_generation,
        inputs=[
            gr.Textbox(label="Predictions", lines=5, placeholder="Generated text 1\nGenerated text 2"),
            gr.Textbox(label="References", lines=5, placeholder="Reference text 1\nReference text 2")
        ],
        outputs=gr.Textbox(label="Results"),
        title="Generation Evaluation"
    )

    # Combine interfaces
    demo = gr.TabbedInterface(
        [classification_interface, generation_interface],
        ["Classification", "Generation"]
    )

    return demo

# Launch comprehensive evaluation interface
demo = setup_comprehensive_evaluation()
demo.launch(share=True)  # Create shareable link
```
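Because `disable_progress_bar()` changes a global setting for the whole process, it can be worth restoring it once the demo has shut down. A small variation on the launch step above, assuming the same script:

```python
demo = setup_comprehensive_evaluation()
try:
    demo.launch(share=True)
finally:
    # Restore progress bars for any evaluation code that runs after the demo
    evaluate.utils.logging.enable_progress_bar()
```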

## Error Handling

Utility functions may raise:

- `ImportError`: Missing gradio dependency for widget functions
- `ValueError`: Invalid input formats for parsing functions
- `AttributeError`: Incompatible evaluation module for Gradio integration

**Example:**

```python
import evaluate

accuracy = evaluate.load("accuracy")

try:
    # This requires gradio to be installed
    interface = evaluate.utils.gradio.launch_gradio_widget(accuracy)
except ImportError:
    print("Install gradio: pip install gradio")

try:
    # Invalid test case format
    cases = evaluate.utils.gradio.parse_test_cases("invalid format")
except ValueError as e:
    print(f"Parse error: {e}")
```
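The checks can also be combined into a small defensive wrapper. A sketch; the `safe_launch_widget` helper is illustrative and not part of the library:

```python
import evaluate

def safe_launch_widget(module_name: str):
    """Load an evaluation module and launch its Gradio widget, reporting known failure modes."""
    module = evaluate.load(module_name)
    try:
        return evaluate.utils.gradio.launch_gradio_widget(module)
    except ImportError:
        print("Install gradio: pip install gradio")
    except AttributeError:
        print(f"Module '{module_name}' is not compatible with the Gradio widget")

safe_launch_widget("accuracy")
```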