Pretrained models for Keras with multi-framework compatibility.
---
Audio processing models for speech recognition, audio-to-text conversion, and audio understanding tasks. Keras Hub provides implementations of state-of-the-art audio models including Whisper and Moonshine.
Whisper is a robust speech recognition model that can transcribe audio in multiple languages and handle various audio conditions.
class WhisperBackbone(Backbone):
    """Whisper transformer backbone for speech recognition.

    Args:
        vocabulary_size: Size of the token vocabulary.
        num_layers: Number of transformer layers.
        num_heads: Number of attention heads per layer.
        hidden_dim: Size of the transformer hidden states.
        intermediate_dim: Size of the feed-forward (intermediate) layers.
        num_mels: Number of mel spectrogram bins in the audio input.
            Defaults to 80.
        dropout: Dropout rate. Defaults to 0.0.
        max_encoder_sequence_length: Maximum length of audio frame
            sequences accepted by the encoder. Defaults to 3000.
        max_decoder_sequence_length: Maximum length of token sequences
            accepted by the decoder. Defaults to 448.
        **kwargs: Forwarded to the `Backbone` base constructor.
    """
    def __init__(
        self,
        vocabulary_size: int,
        num_layers: int,
        num_heads: int,
        hidden_dim: int,
        intermediate_dim: int,
        num_mels: int = 80,
        dropout: float = 0.0,
        max_encoder_sequence_length: int = 3000,
        max_decoder_sequence_length: int = 448,
        **kwargs
    ): ...
class WhisperTokenizer:
    """Whisper tokenizer for text processing.

    Args:
        vocabulary: Optional token-to-id vocabulary mapping.
            Defaults to None.
        **kwargs: Additional keyword arguments.
    """
    def __init__(
        self,
        vocabulary: dict = None,
        **kwargs
    ): ...


# Moonshine is an efficient speech recognition model optimized for fast
# inference and low resource usage.
class MoonshineBackbone(Backbone):
    """Moonshine backbone for audio-to-text conversion.

    Args:
        vocabulary_size: Size of the token vocabulary.
        num_layers: Number of transformer layers.
        hidden_dim: Size of the transformer hidden states.
        num_heads: Number of attention heads per layer.
        **kwargs: Forwarded to the `Backbone` base constructor.
    """
    def __init__(
        self,
        vocabulary_size: int,
        num_layers: int,
        hidden_dim: int,
        num_heads: int,
        **kwargs
    ): ...
class MoonshineAudioToText:
    """Moonshine model for audio-to-text conversion.

    Args:
        backbone: A `MoonshineBackbone` instance.
        preprocessor: Optional `Preprocessor` used to convert raw audio
            into model inputs. Defaults to None.
        **kwargs: Additional keyword arguments.
    """
    def __init__(
        self,
        backbone: MoonshineBackbone,
        preprocessor: Preprocessor = None,
        **kwargs
    ): ...
class MoonshineAudioToTextPreprocessor:
    """Preprocessor for Moonshine audio-to-text.

    Args:
        audio_converter: An `AudioConverter` that turns raw audio into
            model-ready features.
        tokenizer: A `MoonshineTokenizer` for the text side.
        **kwargs: Additional keyword arguments.
    """
    def __init__(
        self,
        audio_converter: AudioConverter,
        tokenizer: MoonshineTokenizer,
        **kwargs
    ): ...
class MoonshineTokenizer:
    """Moonshine tokenizer for text processing.

    Args:
        vocabulary: Optional token-to-id vocabulary mapping.
            Defaults to None.
        **kwargs: Additional keyword arguments.
    """
    def __init__(
        self,
        vocabulary: dict = None,
        **kwargs
    ): ...
class MoonshineAudioConverter:
    """Audio converter for Moonshine models.

    Args:
        sample_rate: Audio sample rate in Hz. Defaults to 16000.
        num_mels: Number of mel spectrogram bins. Defaults to 80.
        hop_length: Hop length (in samples) between analysis frames.
            Defaults to 160.
        win_length: Analysis window length in samples. Defaults to 400.
        **kwargs: Additional keyword arguments.
    """
    def __init__(
        self,
        sample_rate: int = 16000,
        num_mels: int = 80,
        hop_length: int = 160,
        win_length: int = 400,
        **kwargs
    ): ...


# Base class for audio preprocessing and conversion.
class AudioConverter:
    """Base class for audio data conversion.

    Args:
        sample_rate: Audio sample rate in Hz. Defaults to 16000.
        **kwargs: Additional keyword arguments.
    """
    def __init__(
        self,
        sample_rate: int = 16000,
        **kwargs
    ): ...

    def __call__(self, audio_data): ...


import keras_hub
import numpy as np

# Load pretrained Moonshine model
model = keras_hub.models.MoonshineAudioToText.from_preset("moonshine_base")

# Prepare audio data (example with synthetic data).
# In practice, you would load actual audio files.
audio_data = np.random.random((16000,))  # 1 second of audio at 16kHz
audio_batch = np.expand_dims(audio_data, axis=0)  # Add batch dimension

# Transcribe audio
transcription = model.predict(audio_batch)
print("Transcription:", transcription)

import keras_hub
# Create audio converter
audio_converter = keras_hub.layers.MoonshineAudioConverter(
    sample_rate=16000,
    num_mels=80
)

# Convert audio to mel spectrogram features
audio_features = audio_converter(audio_data)
print(f"Audio features shape: {audio_features.shape}")

import keras_hub
# Load backbone and create custom model
backbone = keras_hub.models.MoonshineBackbone.from_preset("moonshine_base")

# Create preprocessor
preprocessor = keras_hub.models.MoonshineAudioToTextPreprocessor(
    audio_converter=keras_hub.layers.MoonshineAudioConverter(),
    tokenizer=keras_hub.tokenizers.MoonshineTokenizer.from_preset("moonshine_base")
)

# Create custom model
model = keras_hub.models.MoonshineAudioToText(
    backbone=backbone,
    preprocessor=preprocessor
)

# Compile and use model
model.compile(optimizer="adam", loss="sparse_categorical_crossentropy")

# Install with Tessl CLI
npx tessl i tessl/pypi-keras-hub