Keras Hub: pretrained models for Keras with multi-framework compatibility.

Metrics for evaluating model performance on tasks such as text generation, translation, and classification. Keras Hub provides implementations of standard NLP evaluation metrics.

Text-generation metrics evaluate the quality of generated text against reference texts.
class Bleu:
"""
BLEU (Bilingual Evaluation Understudy) score for machine translation
and text generation evaluation. Measures n-gram overlap between
generated and reference texts.
"""
def __init__(
self,
max_order: int = 4,
smooth: bool = False,
name: str = "bleu",
dtype: str = None,
**kwargs
): ...
def update_state(self, y_true, y_pred, sample_weight=None): ...
def result(self): ...
def reset_state(self): ...
class RougeN:
"""
ROUGE-N score for evaluating summarization and text generation.
Measures n-gram recall between generated and reference texts.
"""
def __init__(
self,
order: int = 1,
use_stemmer: bool = False,
name: str = None,
dtype: str = None,
**kwargs
): ...
def update_state(self, y_true, y_pred, sample_weight=None): ...
def result(self): ...
def reset_state(self): ...
class RougeL:
"""
ROUGE-L score based on Longest Common Subsequence (LCS).
Evaluates fluency and coherence in generated text.
"""
def __init__(
self,
use_stemmer: bool = False,
name: str = "rouge_l",
dtype: str = None,
**kwargs
): ...
def update_state(self, y_true, y_pred, sample_weight=None): ...
def result(self): ...
def reset_state(self): ...Metrics specifically designed for evaluating language models.
class Perplexity:
"""
Perplexity metric for language model evaluation.
Measures how well a probability model predicts a sample.
Lower perplexity indicates better model performance.
"""
def __init__(
self,
from_logits: bool = True,
mask_token_id: int = None,
name: str = "perplexity",
dtype: str = None,
**kwargs
): ...
def update_state(self, y_true, y_pred, sample_weight=None): ...
def result(self): ...
def reset_state(self): ...Metrics for measuring similarity between text sequences.
class EditDistance:
    """Edit distance (Levenshtein distance) metric.

    Measures the minimum number of single-character edits required to
    transform one string into another.

    Args:
        normalize: Whether to normalize the distance by sequence length.
        name: Name under which the metric is reported.
        dtype: Optional dtype for the metric result.
        **kwargs: Forwarded to the base metric class.
    """

    def __init__(
        self,
        normalize: bool = False,
        name: str = "edit_distance",
        dtype: str | None = None,
        **kwargs,
    ): ...

    def update_state(self, y_true, y_pred, sample_weight=None): ...

    def result(self): ...

    def reset_state(self): ...


import keras_hub
import numpy as np

# Create the BLEU metric (smoothing helps short sequences score sensibly).
bleu_metric = keras_hub.metrics.Bleu(max_order=4, smooth=True)

# Reference and generated texts.
# In practice, these would be tokenized sequences.
references = [
    [1, 2, 3, 4, 5],  # Reference translation
    [6, 7, 8, 9],  # Another reference
]
predictions = [
    [1, 2, 3, 4, 6],  # Generated translation
    [6, 7, 8, 10],  # Another generated translation
]

# Update metric with a batch of data.
bleu_metric.update_state(references, predictions)

# Get the aggregated BLEU score.
bleu_score = bleu_metric.result()
print(f"BLEU Score: {bleu_score:.4f}")

# Reset accumulated state before the next evaluation run.
bleu_metric.reset_state()

import keras_hub
# ROUGE-1 (unigram overlap)
rouge1_metric = keras_hub.metrics.RougeN(order=1)
# ROUGE-2 (bigram overlap)
rouge2_metric = keras_hub.metrics.RougeN(order=2)
# ROUGE-L (longest common subsequence)
rougel_metric = keras_hub.metrics.RougeL()

# Reference and generated summaries.
reference_summaries = [
    "The quick brown fox jumps over the lazy dog",
    "Machine learning is transforming many industries",
]
generated_summaries = [
    "A quick brown fox jumps over a lazy dog",
    "Machine learning transforms many different industries",
]

# Evaluate with each ROUGE variant, resetting state between metrics.
for metric, name in [
    (rouge1_metric, "ROUGE-1"),
    (rouge2_metric, "ROUGE-2"),
    (rougel_metric, "ROUGE-L"),
]:
    metric.update_state(reference_summaries, generated_summaries)
    score = metric.result()
    print(f"{name} Score: {score:.4f}")
    metric.reset_state()

import keras_hub
import numpy as np

# Create the perplexity metric (model outputs are raw logits).
perplexity_metric = keras_hub.metrics.Perplexity(from_logits=True)

# Simulate language-model predictions and targets.
# In practice, these come from your language model.
batch_size, sequence_length, vocab_size = 2, 10, 1000

# True token IDs.
true_tokens = np.random.randint(0, vocab_size, (batch_size, sequence_length))
# Model logits (pre-softmax scores over the vocabulary).
predicted_logits = np.random.randn(batch_size, sequence_length, vocab_size)

# Update the metric with this batch.
perplexity_metric.update_state(true_tokens, predicted_logits)

# Get the perplexity score (lower is better).
perplexity = perplexity_metric.result()
print(f"Perplexity: {perplexity:.2f}")

import keras_hub
# Create the edit-distance metric, normalized by sequence length.
edit_distance_metric = keras_hub.metrics.EditDistance(normalize=True)

# Compare generated text with references.
reference_texts = ["hello world", "machine learning"]
generated_texts = ["helo world", "machine learning"]

# Update the metric.
edit_distance_metric.update_state(reference_texts, generated_texts)

# Normalized edit distance: 0 = identical, 1 = completely different.
distance = edit_distance_metric.result()
print(f"Normalized Edit Distance: {distance:.4f}")

import keras_hub
import keras

# Load a pretrained language model.
model = keras_hub.models.GPT2CausalLM.from_preset("gpt2_base_en")

# Compile with perplexity as a tracked metric.
model.compile(
    optimizer="adam",
    loss="sparse_categorical_crossentropy",
    metrics=[keras_hub.metrics.Perplexity()],
)

# During training, perplexity will be computed and logged:
# model.fit(train_data, validation_data=val_data, epochs=3)

import keras_hub
# Create a suite of metrics keyed by display name.
metrics = {
    "BLEU": keras_hub.metrics.Bleu(),
    "ROUGE-1": keras_hub.metrics.RougeN(order=1),
    "ROUGE-L": keras_hub.metrics.RougeL(),
    "Edit Distance": keras_hub.metrics.EditDistance(normalize=True),
}

# Batch of reference and generated texts.
references = [
    "The cat sat on the mat",
    "AI is revolutionizing technology",
    "Python is a programming language",
]
predictions = [
    "A cat sat on the mat",
    "AI revolutionizes technology",
    "Python is a programming language",
]

# Evaluate with every metric, resetting state after each.
results = {}
for name, metric in metrics.items():
    metric.update_state(references, predictions)
    results[name] = metric.result().numpy()
    metric.reset_state()

# Print results.
for name, score in results.items():
    print(f"{name}: {score:.4f}")

import keras_hub
def evaluate_generation_model(model, test_prompts, reference_continuations):
    """Comprehensively evaluate a text-generation model.

    Args:
        model: A generative model exposing ``generate(prompt, max_length)``.
        test_prompts: Prompts to feed the model.
        reference_continuations: Expected continuations, aligned with prompts.

    Returns:
        Dict mapping metric name to its scalar score.
    """
    # Generate a continuation for each prompt.
    generated_texts = []
    for prompt in test_prompts:
        generated = model.generate(prompt, max_length=50)
        # Keep only the newly generated part (strip the prompt prefix).
        generated_part = generated[len(prompt):]
        generated_texts.append(generated_part)

    # Initialize the metrics.
    bleu = keras_hub.metrics.Bleu()
    rouge1 = keras_hub.metrics.RougeN(order=1)
    rougel = keras_hub.metrics.RougeL()
    edit_dist = keras_hub.metrics.EditDistance(normalize=True)

    # Compute each metric against the references.
    metrics_results = {}
    for metric, name in [
        (bleu, "BLEU"),
        (rouge1, "ROUGE-1"),
        (rougel, "ROUGE-L"),
        (edit_dist, "Edit Distance"),
    ]:
        metric.update_state(reference_continuations, generated_texts)
        metrics_results[name] = metric.result().numpy()
        metric.reset_state()
    return metrics_results


# Example usage
model = keras_hub.models.GPT2CausalLM.from_preset("gpt2_base_en")
test_prompts = ["The weather today is", "In the future, AI will"]
references = ["sunny and warm", "help solve many problems"]
results = evaluate_generation_model(model, test_prompts, references)
print("Generation Model Evaluation:")
for metric, score in results.items():
    print(f" {metric}: {score:.4f}")

import keras_hub
import keras


class RougeCallback(keras.callbacks.Callback):
    """Custom callback to compute a ROUGE-L score at the end of each epoch.

    Args:
        validation_data: Tuple of (references, predictions) text sequences.
    """

    def __init__(self, validation_data):
        # Call the base initializer so Keras can wire up model/params.
        super().__init__()
        self.validation_data = validation_data
        self.rouge_metric = keras_hub.metrics.RougeL()

    def on_epoch_end(self, epoch, logs=None):
        # Score the held-out references against the predictions.
        val_references, val_predictions = self.validation_data
        self.rouge_metric.update_state(val_references, val_predictions)
        rouge_score = self.rouge_metric.result()
        # Record the score in the logs dict Keras passed in. (The original
        # `logs = logs or {}` built a fresh dict when logs was None, which
        # the caller never saw — the write was silently lost.)
        if logs is not None:
            logs['val_rouge_l'] = rouge_score
        print(f"Epoch {epoch + 1} - ROUGE-L: {rouge_score:.4f}")
        # Reset accumulated state for the next epoch.
        self.rouge_metric.reset_state()


# Use the callback during training:
# validation_texts = (references, predictions)
# callback = RougeCallback(validation_texts)
# model.fit(train_data, callbacks=[callback])

# Install with Tessl CLI
npx tessl i tessl/pypi-keras-hub