# Evaluation Metrics

Metrics for evaluating model performance on various tasks including text generation, translation, and classification. Keras Hub provides implementations of standard NLP evaluation metrics.

## Capabilities

### Text Generation Metrics

Metrics for evaluating the quality of generated text against reference texts.

```python { .api }
class Bleu:
    """
    BLEU (Bilingual Evaluation Understudy) score for machine translation
    and text generation evaluation. Measures n-gram overlap between
    generated and reference texts.
    """
    def __init__(
        self,
        max_order: int = 4,
        smooth: bool = False,
        name: str = "bleu",
        dtype: str = None,
        **kwargs
    ): ...

    def update_state(self, y_true, y_pred, sample_weight=None): ...
    def result(self): ...
    def reset_state(self): ...

class RougeN:
    """
    ROUGE-N score for evaluating summarization and text generation.
    Measures n-gram recall between generated and reference texts.
    """
    def __init__(
        self,
        order: int = 1,
        use_stemmer: bool = False,
        name: str = None,
        dtype: str = None,
        **kwargs
    ): ...

    def update_state(self, y_true, y_pred, sample_weight=None): ...
    def result(self): ...
    def reset_state(self): ...

class RougeL:
    """
    ROUGE-L score based on Longest Common Subsequence (LCS).
    Evaluates fluency and coherence in generated text.
    """
    def __init__(
        self,
        use_stemmer: bool = False,
        name: str = "rouge_l",
        dtype: str = None,
        **kwargs
    ): ...

    def update_state(self, y_true, y_pred, sample_weight=None): ...
    def result(self): ...
    def reset_state(self): ...
```

### Language Model Metrics

Metrics specifically designed for evaluating language models.

```python { .api }
class Perplexity:
    """
    Perplexity metric for language model evaluation.
    Measures how well a probability model predicts a sample.
    Lower perplexity indicates better model performance.
    """
    def __init__(
        self,
        from_logits: bool = True,
        mask_token_id: int = None,
        name: str = "perplexity",
        dtype: str = None,
        **kwargs
    ): ...

    def update_state(self, y_true, y_pred, sample_weight=None): ...
    def result(self): ...
    def reset_state(self): ...
```

### String Distance Metrics

Metrics for measuring similarity between text sequences.

```python { .api }
class EditDistance:
    """
    Edit distance (Levenshtein distance) metric.
    Measures the minimum number of single-character edits
    required to transform one string into another.
    """
    def __init__(
        self,
        normalize: bool = False,
        name: str = "edit_distance",
        dtype: str = None,
        **kwargs
    ): ...

    def update_state(self, y_true, y_pred, sample_weight=None): ...
    def result(self): ...
    def reset_state(self): ...
```

## Usage Examples

### BLEU Score for Translation Evaluation

```python
import keras_hub
import numpy as np

# Create BLEU metric
bleu_metric = keras_hub.metrics.Bleu(max_order=4, smooth=True)

# Reference and generated texts
# In practice, these would be tokenized sequences
references = [
    [1, 2, 3, 4, 5],  # Reference translation
    [6, 7, 8, 9]      # Another reference
]

predictions = [
    [1, 2, 3, 4, 6],  # Generated translation
    [6, 7, 8, 10]     # Another generated translation
]

# Update metric with batch of data
bleu_metric.update_state(references, predictions)

# Get BLEU score
bleu_score = bleu_metric.result()
print(f"BLEU Score: {bleu_score:.4f}")

# Reset for new evaluation
bleu_metric.reset_state()
```

### ROUGE Metrics for Summarization

```python
import keras_hub

# ROUGE-1 (unigram overlap)
rouge1_metric = keras_hub.metrics.RougeN(order=1)

# ROUGE-2 (bigram overlap)
rouge2_metric = keras_hub.metrics.RougeN(order=2)

# ROUGE-L (longest common subsequence)
rougel_metric = keras_hub.metrics.RougeL()

# Reference and generated summaries
reference_summaries = [
    "The quick brown fox jumps over the lazy dog",
    "Machine learning is transforming many industries"
]

generated_summaries = [
    "A quick brown fox jumps over a lazy dog",
    "Machine learning transforms many different industries"
]

# Evaluate with different ROUGE metrics
for metric, name in [(rouge1_metric, "ROUGE-1"),
                     (rouge2_metric, "ROUGE-2"),
                     (rougel_metric, "ROUGE-L")]:
    metric.update_state(reference_summaries, generated_summaries)
    score = metric.result()
    print(f"{name} Score: {score:.4f}")
    metric.reset_state()
```

### Perplexity for Language Model Evaluation

```python
import keras_hub
import numpy as np

# Create perplexity metric
perplexity_metric = keras_hub.metrics.Perplexity(from_logits=True)

# Simulate language model predictions and targets
# In practice, these come from your language model
batch_size, sequence_length, vocab_size = 2, 10, 1000

# True token IDs
true_tokens = np.random.randint(0, vocab_size, (batch_size, sequence_length))

# Model logits (before softmax)
predicted_logits = np.random.randn(batch_size, sequence_length, vocab_size)

# Update perplexity metric
perplexity_metric.update_state(true_tokens, predicted_logits)

# Get perplexity score
perplexity = perplexity_metric.result()
print(f"Perplexity: {perplexity:.2f}")
```

### Edit Distance for Text Similarity

```python
import keras_hub

# Create edit distance metric
edit_distance_metric = keras_hub.metrics.EditDistance(normalize=True)

# Compare generated text with reference
reference_texts = ["hello world", "machine learning"]
generated_texts = ["helo world", "machine learning"]

# Update metric
edit_distance_metric.update_state(reference_texts, generated_texts)

# Get normalized edit distance (0 = identical, 1 = completely different)
distance = edit_distance_metric.result()
print(f"Normalized Edit Distance: {distance:.4f}")
```

### Using Metrics in Model Training

```python
import keras_hub
import keras

# Load a language model
model = keras_hub.models.GPT2CausalLM.from_preset("gpt2_base_en")

# Compile with perplexity metric
model.compile(
    optimizer="adam",
    loss="sparse_categorical_crossentropy",
    metrics=[keras_hub.metrics.Perplexity()]
)

# During training, perplexity will be computed and logged
# model.fit(train_data, validation_data=val_data, epochs=3)
```

### Batch Evaluation with Multiple Metrics

```python
import keras_hub

# Create multiple metrics
metrics = {
    "BLEU": keras_hub.metrics.Bleu(),
    "ROUGE-1": keras_hub.metrics.RougeN(order=1),
    "ROUGE-L": keras_hub.metrics.RougeL(),
    "Edit Distance": keras_hub.metrics.EditDistance(normalize=True)
}

# Batch of reference and generated texts
references = [
    "The cat sat on the mat",
    "AI is revolutionizing technology",
    "Python is a programming language"
]

predictions = [
    "A cat sat on the mat",
    "AI revolutionizes technology",
    "Python is a programming language"
]

# Evaluate with all metrics
results = {}
for name, metric in metrics.items():
    metric.update_state(references, predictions)
    results[name] = metric.result().numpy()
    metric.reset_state()

# Print results
for name, score in results.items():
    print(f"{name}: {score:.4f}")
```

### Evaluating Text Generation Model

```python
import keras_hub

def evaluate_generation_model(model, test_prompts, reference_continuations):
    """
    Comprehensive evaluation of a text generation model.
    """
    # Generate text for test prompts
    generated_texts = []
    for prompt in test_prompts:
        generated = model.generate(prompt, max_length=50)
        # Extract only the generated part (remove prompt)
        generated_part = generated[len(prompt):]
        generated_texts.append(generated_part)

    # Initialize metrics
    bleu = keras_hub.metrics.Bleu()
    rouge1 = keras_hub.metrics.RougeN(order=1)
    rougel = keras_hub.metrics.RougeL()
    edit_dist = keras_hub.metrics.EditDistance(normalize=True)

    # Compute metrics
    metrics_results = {}

    for metric, name in [(bleu, "BLEU"), (rouge1, "ROUGE-1"),
                         (rougel, "ROUGE-L"), (edit_dist, "Edit Distance")]:
        metric.update_state(reference_continuations, generated_texts)
        metrics_results[name] = metric.result().numpy()
        metric.reset_state()

    return metrics_results

# Example usage
model = keras_hub.models.GPT2CausalLM.from_preset("gpt2_base_en")

test_prompts = ["The weather today is", "In the future, AI will"]
references = ["sunny and warm", "help solve many problems"]

results = evaluate_generation_model(model, test_prompts, references)
print("Generation Model Evaluation:")
for metric, score in results.items():
    print(f"  {metric}: {score:.4f}")
```

### Custom Metric Usage in Callbacks

```python
import keras_hub
import keras

class RougeCallback(keras.callbacks.Callback):
    """Custom callback to compute ROUGE score during training."""

    def __init__(self, validation_data):
        self.validation_data = validation_data
        self.rouge_metric = keras_hub.metrics.RougeL()

    def on_epoch_end(self, epoch, logs=None):
        # Generate predictions for validation data
        val_references, val_predictions = self.validation_data

        # Update ROUGE metric
        self.rouge_metric.update_state(val_references, val_predictions)
        rouge_score = self.rouge_metric.result()

        # Log the score
        logs = logs or {}
        logs['val_rouge_l'] = rouge_score

        print(f"Epoch {epoch + 1} - ROUGE-L: {rouge_score:.4f}")

        # Reset metric for next epoch
        self.rouge_metric.reset_state()

# Use callback during training
# validation_texts = (references, predictions)
# callback = RougeCallback(validation_texts)
# model.fit(train_data, callbacks=[callback])
```