docs/evaluation-metrics.md

# Evaluation Metrics

Metrics for evaluating model performance on various tasks including text generation, translation, and classification. Keras Hub provides implementations of standard NLP evaluation metrics.

## Capabilities

### Text Generation Metrics

Metrics for evaluating the quality of generated text against reference texts.

```python { .api }
class Bleu:
    """
    BLEU (Bilingual Evaluation Understudy) score for machine translation
    and text generation evaluation. Measures n-gram overlap between
    generated and reference texts.
    """
    def __init__(
        self,
        max_order: int = 4,
        smooth: bool = False,
        name: str = "bleu",
        dtype: str = None,
        **kwargs
    ): ...

    def update_state(self, y_true, y_pred, sample_weight=None): ...
    def result(self): ...
    def reset_state(self): ...

class RougeN:
    """
    ROUGE-N score for evaluating summarization and text generation.
    Measures n-gram recall between generated and reference texts.
    """
    def __init__(
        self,
        order: int = 1,
        use_stemmer: bool = False,
        name: str = None,
        dtype: str = None,
        **kwargs
    ): ...

    def update_state(self, y_true, y_pred, sample_weight=None): ...
    def result(self): ...
    def reset_state(self): ...

class RougeL:
    """
    ROUGE-L score based on Longest Common Subsequence (LCS).
    Evaluates fluency and coherence in generated text.
    """
    def __init__(
        self,
        use_stemmer: bool = False,
        name: str = "rouge_l",
        dtype: str = None,
        **kwargs
    ): ...

    def update_state(self, y_true, y_pred, sample_weight=None): ...
    def result(self): ...
    def reset_state(self): ...
```

### Language Model Metrics

Metrics specifically designed for evaluating language models.

```python { .api }
class Perplexity:
    """
    Perplexity metric for language model evaluation.
    Measures how well a probability model predicts a sample.
    Lower perplexity indicates better model performance.
    """
    def __init__(
        self,
        from_logits: bool = True,
        mask_token_id: int = None,
        name: str = "perplexity",
        dtype: str = None,
        **kwargs
    ): ...

    def update_state(self, y_true, y_pred, sample_weight=None): ...
    def result(self): ...
    def reset_state(self): ...
```

### String Distance Metrics

Metrics for measuring similarity between text sequences.

```python { .api }
class EditDistance:
    """
    Edit distance (Levenshtein distance) metric.
    Measures the minimum number of single-character edits
    required to transform one string into another.
    """
    def __init__(
        self,
        normalize: bool = False,
        name: str = "edit_distance",
        dtype: str = None,
        **kwargs
    ): ...

    def update_state(self, y_true, y_pred, sample_weight=None): ...
    def result(self): ...
    def reset_state(self): ...
```

## Usage Examples

### BLEU Score for Translation Evaluation

```python
import keras_hub
import numpy as np

# Create BLEU metric
bleu_metric = keras_hub.metrics.Bleu(max_order=4, smooth=True)

# Reference and generated texts
# In practice, these would be tokenized sequences
references = [
    [1, 2, 3, 4, 5],  # Reference translation
    [6, 7, 8, 9]      # Another reference
]

predictions = [
    [1, 2, 3, 4, 6],  # Generated translation
    [6, 7, 8, 10]     # Another generated translation
]

# Update metric with batch of data
bleu_metric.update_state(references, predictions)

# Get BLEU score
bleu_score = bleu_metric.result()
print(f"BLEU Score: {bleu_score:.4f}")

# Reset for new evaluation
bleu_metric.reset_state()
```

### ROUGE Metrics for Summarization

```python
import keras_hub

# ROUGE-1 (unigram overlap)
rouge1_metric = keras_hub.metrics.RougeN(order=1)

# ROUGE-2 (bigram overlap)
rouge2_metric = keras_hub.metrics.RougeN(order=2)

# ROUGE-L (longest common subsequence)
rougel_metric = keras_hub.metrics.RougeL()

# Reference and generated summaries
reference_summaries = [
    "The quick brown fox jumps over the lazy dog",
    "Machine learning is transforming many industries"
]

generated_summaries = [
    "A quick brown fox jumps over a lazy dog",
    "Machine learning transforms many different industries"
]

# Evaluate with different ROUGE metrics
for metric, name in [(rouge1_metric, "ROUGE-1"),
                     (rouge2_metric, "ROUGE-2"),
                     (rougel_metric, "ROUGE-L")]:
    metric.update_state(reference_summaries, generated_summaries)
    score = metric.result()
    print(f"{name} Score: {score:.4f}")
    metric.reset_state()
```

### Perplexity for Language Model Evaluation

```python
import keras_hub
import numpy as np

# Create perplexity metric
perplexity_metric = keras_hub.metrics.Perplexity(from_logits=True)

# Simulate language model predictions and targets
# In practice, these come from your language model
batch_size, sequence_length, vocab_size = 2, 10, 1000

# True token IDs
true_tokens = np.random.randint(0, vocab_size, (batch_size, sequence_length))

# Model logits (before softmax)
predicted_logits = np.random.randn(batch_size, sequence_length, vocab_size)

# Update perplexity metric
perplexity_metric.update_state(true_tokens, predicted_logits)

# Get perplexity score
perplexity = perplexity_metric.result()
print(f"Perplexity: {perplexity:.2f}")
```

### Edit Distance for Text Similarity

```python
import keras_hub

# Create edit distance metric
edit_distance_metric = keras_hub.metrics.EditDistance(normalize=True)

# Compare generated text with reference
reference_texts = ["hello world", "machine learning"]
generated_texts = ["helo world", "machine learning"]

# Update metric
edit_distance_metric.update_state(reference_texts, generated_texts)

# Get normalized edit distance (0 = identical, 1 = completely different)
distance = edit_distance_metric.result()
print(f"Normalized Edit Distance: {distance:.4f}")
```

### Using Metrics in Model Training

```python
import keras_hub
import keras

# Load a language model
model = keras_hub.models.GPT2CausalLM.from_preset("gpt2_base_en")

# Compile with perplexity metric
model.compile(
    optimizer="adam",
    loss="sparse_categorical_crossentropy",
    metrics=[keras_hub.metrics.Perplexity()]
)

# During training, perplexity will be computed and logged
# model.fit(train_data, validation_data=val_data, epochs=3)
```

### Batch Evaluation with Multiple Metrics

```python
import keras_hub

# Create multiple metrics
metrics = {
    "BLEU": keras_hub.metrics.Bleu(),
    "ROUGE-1": keras_hub.metrics.RougeN(order=1),
    "ROUGE-L": keras_hub.metrics.RougeL(),
    "Edit Distance": keras_hub.metrics.EditDistance(normalize=True)
}

# Batch of reference and generated texts
references = [
    "The cat sat on the mat",
    "AI is revolutionizing technology",
    "Python is a programming language"
]

predictions = [
    "A cat sat on the mat",
    "AI revolutionizes technology",
    "Python is a programming language"
]

# Evaluate with all metrics
results = {}
for name, metric in metrics.items():
    metric.update_state(references, predictions)
    results[name] = metric.result().numpy()
    metric.reset_state()

# Print results
for name, score in results.items():
    print(f"{name}: {score:.4f}")
```

### Evaluating Text Generation Model

```python
import keras_hub

def evaluate_generation_model(model, test_prompts, reference_continuations):
    """
    Comprehensive evaluation of a text generation model.
    """
    # Generate text for test prompts
    generated_texts = []
    for prompt in test_prompts:
        generated = model.generate(prompt, max_length=50)
        # Extract only the generated part (remove prompt)
        generated_part = generated[len(prompt):]
        generated_texts.append(generated_part)

    # Initialize metrics
    bleu = keras_hub.metrics.Bleu()
    rouge1 = keras_hub.metrics.RougeN(order=1)
    rougel = keras_hub.metrics.RougeL()
    edit_dist = keras_hub.metrics.EditDistance(normalize=True)

    # Compute metrics
    metrics_results = {}

    for metric, name in [(bleu, "BLEU"), (rouge1, "ROUGE-1"),
                         (rougel, "ROUGE-L"), (edit_dist, "Edit Distance")]:
        metric.update_state(reference_continuations, generated_texts)
        metrics_results[name] = metric.result().numpy()
        metric.reset_state()

    return metrics_results

# Example usage
model = keras_hub.models.GPT2CausalLM.from_preset("gpt2_base_en")

test_prompts = ["The weather today is", "In the future, AI will"]
references = ["sunny and warm", "help solve many problems"]

results = evaluate_generation_model(model, test_prompts, references)
print("Generation Model Evaluation:")
for metric, score in results.items():
    print(f"  {metric}: {score:.4f}")
```

### Custom Metric Usage in Callbacks

```python
import keras_hub
import keras

class RougeCallback(keras.callbacks.Callback):
    """Custom callback to compute ROUGE score during training."""

    def __init__(self, validation_data):
        self.validation_data = validation_data
        self.rouge_metric = keras_hub.metrics.RougeL()

    def on_epoch_end(self, epoch, logs=None):
        # Generate predictions for validation data
        val_references, val_predictions = self.validation_data

        # Update ROUGE metric
        self.rouge_metric.update_state(val_references, val_predictions)
        rouge_score = self.rouge_metric.result()

        # Log the score
        logs = logs or {}
        logs['val_rouge_l'] = rouge_score

        print(f"Epoch {epoch + 1} - ROUGE-L: {rouge_score:.4f}")

        # Reset metric for next epoch
        self.rouge_metric.reset_state()

# Use callback during training
# validation_texts = (references, predictions)
# callback = RougeCallback(validation_texts)
# model.fit(train_data, callbacks=[callback])
```