tessl/pypi-keras-hub

Pretrained models for Keras with multi-framework compatibility.

Evaluation Metrics

Metrics for evaluating model performance on various tasks including text generation, translation, and classification. Keras Hub provides implementations of standard NLP evaluation metrics.

Capabilities

Text Generation Metrics

Metrics for evaluating the quality of generated text against reference texts.

class Bleu:
    """
    BLEU (Bilingual Evaluation Understudy) score for machine translation
    and text generation evaluation. Measures n-gram overlap between
    generated and reference texts.
    """
    def __init__(
        self,
        max_order: int = 4,
        smooth: bool = False,
        name: str = "bleu",
        dtype: str = None,
        **kwargs
    ): ...
    
    def update_state(self, y_true, y_pred, sample_weight=None): ...
    def result(self): ...
    def reset_state(self): ...
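
Conceptually, BLEU combines clipped n-gram precisions with a brevity penalty for short outputs. The snippet below is a minimal pure-Python sketch of that formula for a single sentence pair, intended for intuition only; it is not the keras_hub implementation, which additionally handles batching, tokenization, and smoothing.

import math
from collections import Counter

def bleu_sketch(reference, candidate, max_order=4):
    # Clipped n-gram precision for each order 1..max_order
    precisions = []
    for n in range(1, max_order + 1):
        ref_ngrams = Counter(tuple(reference[i:i + n]) for i in range(len(reference) - n + 1))
        cand_ngrams = Counter(tuple(candidate[i:i + n]) for i in range(len(candidate) - n + 1))
        overlap = sum(min(count, ref_ngrams[ngram]) for ngram, count in cand_ngrams.items())
        precisions.append(overlap / max(sum(cand_ngrams.values()), 1))
    if min(precisions) == 0:
        return 0.0  # without smoothing, any empty order zeroes the score
    # Geometric mean of the precisions, scaled by a brevity penalty
    geo_mean = math.exp(sum(math.log(p) for p in precisions) / max_order)
    brevity_penalty = min(1.0, math.exp(1 - len(reference) / len(candidate)))
    return brevity_penalty * geo_mean

print(bleu_sketch("the cat sat on the mat".split(), "the cat sat on a mat".split()))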

class RougeN:
    """
    ROUGE-N score for evaluating summarization and text generation.
    Measures n-gram overlap (reported as precision, recall, and F1)
    between generated and reference texts.
    """
    def __init__(
        self,
        order: int = 1,
        use_stemmer: bool = False,
        name: str = None,
        dtype: str = None,
        **kwargs
    ): ...
    
    def update_state(self, y_true, y_pred, sample_weight=None): ...
    def result(self): ...
    def reset_state(self): ...

class RougeL:
    """
    ROUGE-L score based on Longest Common Subsequence (LCS).
    Captures sentence-level structure by rewarding in-order (not
    necessarily contiguous) overlap with the reference text.
    """
    def __init__(
        self,
        use_stemmer: bool = False,
        name: str = "rouge_l",
        dtype: str = None,
        **kwargs
    ): ...
    
    def update_state(self, y_true, y_pred, sample_weight=None): ...
    def result(self): ...
    def reset_state(self): ...
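
For intuition, ROUGE-L scores a candidate by the length of the longest common subsequence (LCS) it shares with the reference, turned into precision, recall, and an F-measure. A minimal single-pair sketch of that computation (not the keras_hub implementation) follows:

def rouge_l_sketch(reference, candidate):
    ref, cand = reference.split(), candidate.split()
    # Longest common subsequence length via dynamic programming
    dp = [[0] * (len(cand) + 1) for _ in range(len(ref) + 1)]
    for i, r in enumerate(ref, 1):
        for j, c in enumerate(cand, 1):
            dp[i][j] = dp[i - 1][j - 1] + 1 if r == c else max(dp[i - 1][j], dp[i][j - 1])
    lcs = dp[-1][-1]
    if lcs == 0:
        return 0.0
    # F-measure over LCS-based precision and recall
    precision, recall = lcs / len(cand), lcs / len(ref)
    return 2 * precision * recall / (precision + recall)

print(rouge_l_sketch("the quick brown fox", "the fast brown fox"))  # 0.75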

Language Model Metrics

Metrics specifically designed for evaluating language models.

class Perplexity:
    """
    Perplexity metric for language model evaluation.
    Measures how well a probability model predicts a sample.
    Lower perplexity indicates better model performance.
    """
    def __init__(
        self,
        from_logits: bool = True,
        mask_token_id: int = None,
        name: str = "perplexity",
        dtype: str = None,
        **kwargs
    ): ...
    
    def update_state(self, y_true, y_pred, sample_weight=None): ...
    def result(self): ...
    def reset_state(self): ...
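
Perplexity is the exponential of the average per-token cross-entropy, optionally skipping masked (padding) positions. The NumPy sketch below illustrates that arithmetic on a batch of logits; it is an illustration of the formula, not the keras_hub implementation.

import numpy as np

def perplexity_sketch(token_ids, logits, mask_token_id=None):
    # Softmax over the vocabulary axis
    probs = np.exp(logits - logits.max(axis=-1, keepdims=True))
    probs /= probs.sum(axis=-1, keepdims=True)
    # Negative log-likelihood of each true token
    nll = -np.log(np.take_along_axis(probs, token_ids[..., None], axis=-1))[..., 0]
    if mask_token_id is not None:
        nll = nll[token_ids != mask_token_id]  # skip padding positions
    return float(np.exp(nll.mean()))

token_ids = np.array([[4, 2, 7, 0, 0]])   # 0 used as padding here
logits = np.random.randn(1, 5, 10)        # (batch, sequence, vocab)
print(perplexity_sketch(token_ids, logits, mask_token_id=0))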

String Distance Metrics

Metrics for measuring similarity between text sequences.

class EditDistance:
    """
    Edit distance (Levenshtein distance) metric.
    Measures the minimum number of single-character edits
    required to transform one string into another.
    """
    def __init__(
        self,
        normalize: bool = False,
        name: str = "edit_distance",
        dtype: str = None,
        **kwargs
    ): ...
    
    def update_state(self, y_true, y_pred, sample_weight=None): ...
    def result(self): ...
    def reset_state(self): ...
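
Edit distance counts the fewest insertions, deletions, and substitutions needed to turn one sequence into the other; the normalized variant divides that count by the reference length. Below is a compact character-level sketch of the dynamic program, for intuition only; the keras_hub metric operates on batches and may tokenize its inputs differently.

def edit_distance_sketch(reference, candidate, normalize=False):
    # Levenshtein distance with a single rolling row
    dp = list(range(len(candidate) + 1))
    for i, r in enumerate(reference, 1):
        prev, dp[0] = dp[0], i
        for j, c in enumerate(candidate, 1):
            cost = 0 if r == c else 1
            prev, dp[j] = dp[j], min(dp[j] + 1, dp[j - 1] + 1, prev + cost)
    distance = dp[-1]
    # Normalizing by the reference length gives edits per reference character
    return distance / len(reference) if normalize else distance

print(edit_distance_sketch("hello world", "helo world", normalize=True))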

Usage Examples

BLEU Score for Translation Evaluation

import keras_hub
import numpy as np

# Create BLEU metric
bleu_metric = keras_hub.metrics.Bleu(max_order=4, smooth=True)

# Reference and generated texts
# In practice, these would be tokenized sequences
references = [
    [1, 2, 3, 4, 5],  # Reference translation
    [6, 7, 8, 9]      # Another reference
]

predictions = [
    [1, 2, 3, 4, 6],  # Generated translation
    [6, 7, 8, 10]     # Another generated translation
]

# Update metric with batch of data
bleu_metric.update_state(references, predictions)

# Get BLEU score (convert the scalar result to a Python float for printing)
bleu_score = float(bleu_metric.result())
print(f"BLEU Score: {bleu_score:.4f}")

# Reset for new evaluation
bleu_metric.reset_state()
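
The smooth=True setting above applies smoothing so that an n-gram order with no matches does not drive the whole score to zero, which matters for short outputs. For comparison, the same inputs can be scored without smoothing (reusing the references and predictions defined above):

# Without smoothing, any n-gram order with zero matches zeroes the BLEU score
unsmoothed_bleu = keras_hub.metrics.Bleu(max_order=4, smooth=False)
unsmoothed_bleu.update_state(references, predictions)
print(f"Unsmoothed BLEU Score: {float(unsmoothed_bleu.result()):.4f}")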

ROUGE Metrics for Summarization

import keras_hub

# ROUGE-1 (unigram overlap)
rouge1_metric = keras_hub.metrics.RougeN(order=1)

# ROUGE-2 (bigram overlap)  
rouge2_metric = keras_hub.metrics.RougeN(order=2)

# ROUGE-L (longest common subsequence)
rougel_metric = keras_hub.metrics.RougeL()

# Reference and generated summaries
reference_summaries = [
    "The quick brown fox jumps over the lazy dog",
    "Machine learning is transforming many industries"
]

generated_summaries = [
    "A quick brown fox jumps over a lazy dog",
    "Machine learning transforms many different industries"
]

# Evaluate with different ROUGE metrics
# (ROUGE metrics report precision, recall, and F1; print the F1 score)
for metric, name in [(rouge1_metric, "ROUGE-1"),
                     (rouge2_metric, "ROUGE-2"),
                     (rougel_metric, "ROUGE-L")]:
    metric.update_state(reference_summaries, generated_summaries)
    score = float(metric.result()["f1_score"])
    print(f"{name} Score: {score:.4f}")
    metric.reset_state()

Perplexity for Language Model Evaluation

import keras_hub
import numpy as np

# Create perplexity metric
perplexity_metric = keras_hub.metrics.Perplexity(from_logits=True)

# Simulate language model predictions and targets
# In practice, these come from your language model
batch_size, sequence_length, vocab_size = 2, 10, 1000

# True token IDs
true_tokens = np.random.randint(0, vocab_size, (batch_size, sequence_length))

# Model logits (before softmax)
predicted_logits = np.random.randn(batch_size, sequence_length, vocab_size)

# Update perplexity metric
perplexity_metric.update_state(true_tokens, predicted_logits)

# Get perplexity score
perplexity = float(perplexity_metric.result())
print(f"Perplexity: {perplexity:.2f}")

Edit Distance for Text Similarity

import keras_hub

# Create edit distance metric
edit_distance_metric = keras_hub.metrics.EditDistance(normalize=True)

# Compare generated text with reference
reference_texts = ["hello world", "machine learning"]
generated_texts = ["helo world", "machine learning"]

# Update metric
edit_distance_metric.update_state(reference_texts, generated_texts)

# Get normalized edit distance (0 = identical; higher values mean more edits
# relative to the reference length)
distance = float(edit_distance_metric.result())
print(f"Normalized Edit Distance: {distance:.4f}")

Using Metrics in Model Training

import keras_hub
import keras

# Load a language model
model = keras_hub.models.GPT2CausalLM.from_preset("gpt2_base_en")

# Compile with perplexity metric
# (the model outputs logits, so use from_logits=True for both the loss and the metric)
model.compile(
    optimizer="adam",
    loss=keras.losses.SparseCategoricalCrossentropy(from_logits=True),
    metrics=[keras_hub.metrics.Perplexity(from_logits=True)]
)

# During training, perplexity will be computed and logged
# model.fit(train_data, validation_data=val_data, epochs=3)

Batch Evaluation with Multiple Metrics

import keras_hub

# Create multiple metrics
metrics = {
    "BLEU": keras_hub.metrics.Bleu(),
    "ROUGE-1": keras_hub.metrics.RougeN(order=1),
    "ROUGE-L": keras_hub.metrics.RougeL(),
    "Edit Distance": keras_hub.metrics.EditDistance(normalize=True)
}

# Batch of reference and generated texts
references = [
    "The cat sat on the mat",
    "AI is revolutionizing technology",
    "Python is a programming language"
]

predictions = [
    "A cat sat on the mat",
    "AI revolutionizes technology",
    "Python is a programming language"
]

# Evaluate with all metrics
results = {}
for name, metric in metrics.items():
    metric.update_state(references, predictions)
    result = metric.result()
    # ROUGE metrics report precision/recall/F1; keep the F1 score
    if isinstance(result, dict):
        result = result["f1_score"]
    results[name] = float(result)
    metric.reset_state()

# Print results
for name, score in results.items():
    print(f"{name}: {score:.4f}")

Evaluating Text Generation Model

import keras_hub

def evaluate_generation_model(model, test_prompts, reference_continuations):
    """
    Comprehensive evaluation of a text generation model.
    """
    # Generate text for test prompts
    generated_texts = []
    for prompt in test_prompts:
        generated = model.generate(prompt, max_length=50)
        # Extract only the generated part (remove prompt)
        generated_part = generated[len(prompt):]
        generated_texts.append(generated_part)
    
    # Initialize metrics
    bleu = keras_hub.metrics.Bleu()
    rouge1 = keras_hub.metrics.RougeN(order=1)
    rougel = keras_hub.metrics.RougeL()
    edit_dist = keras_hub.metrics.EditDistance(normalize=True)
    
    # Compute metrics
    metrics_results = {}

    for metric, name in [(bleu, "BLEU"), (rouge1, "ROUGE-1"),
                         (rougel, "ROUGE-L"), (edit_dist, "Edit Distance")]:
        metric.update_state(reference_continuations, generated_texts)
        result = metric.result()
        # ROUGE metrics report precision/recall/F1; keep the F1 score
        if isinstance(result, dict):
            result = result["f1_score"]
        metrics_results[name] = float(result)
        metric.reset_state()
    
    return metrics_results

# Example usage
model = keras_hub.models.GPT2CausalLM.from_preset("gpt2_base_en")

test_prompts = ["The weather today is", "In the future, AI will"]
references = ["sunny and warm", "help solve many problems"]

results = evaluate_generation_model(model, test_prompts, references)
print("Generation Model Evaluation:")
for metric, score in results.items():
    print(f"  {metric}: {score:.4f}")

Custom Metric Usage in Callbacks

import keras_hub
import keras

class RougeCallback(keras.callbacks.Callback):
    """Custom callback to compute ROUGE score during training."""
    
    def __init__(self, validation_data):
        super().__init__()
        self.validation_data = validation_data
        self.rouge_metric = keras_hub.metrics.RougeL()
        
    def on_epoch_end(self, epoch, logs=None):
        # Precomputed references and predictions for the validation set
        val_references, val_predictions = self.validation_data

        # Update ROUGE metric (RougeL reports precision/recall/F1; log the F1 score)
        self.rouge_metric.update_state(val_references, val_predictions)
        rouge_score = float(self.rouge_metric.result()["f1_score"])
        
        # Log the score
        logs = logs or {}
        logs['val_rouge_l'] = rouge_score
        
        print(f"Epoch {epoch + 1} - ROUGE-L: {rouge_score:.4f}")
        
        # Reset metric for next epoch
        self.rouge_metric.reset_state()

# Use callback during training
# validation_texts = (references, predictions)
# callback = RougeCallback(validation_texts)
# model.fit(train_data, callbacks=[callback])
