Pretrained models for Keras with multi-framework compatibility.
---
Models that process multiple modalities like text and images together for advanced AI capabilities. Keras Hub provides implementations of CLIP, SigLIP, PaliGemma, and other multimodal architectures.
CLIP learns visual concepts from natural language supervision by jointly training text and image encoders.
class CLIPBackbone(Backbone):
    """CLIP multimodal backbone.

    Pairs a text encoder with a vision encoder so that text and images
    can be embedded into a shared space (contrastive dual-encoder setup).

    Args:
        text_encoder: A `CLIPTextEncoder` instance for the text branch.
        vision_encoder: A `CLIPVisionEncoder` instance for the image branch.
        **kwargs: Forwarded to the `Backbone` base class.
    """
    def __init__(
        self,
        text_encoder: CLIPTextEncoder,
        vision_encoder: CLIPVisionEncoder,
        **kwargs
    ): ...
class CLIPTextEncoder(Backbone):
    """CLIP text encoder using transformer architecture.

    Args:
        vocabulary_size: Number of tokens in the vocabulary.
        num_layers: Number of transformer layers.
        num_heads: Number of attention heads per layer.
        hidden_dim: Width of the transformer hidden state.
        intermediate_dim: Width of the feed-forward intermediate layer.
        max_sequence_length: Maximum token sequence length. Defaults to 77,
            the context length used by the original CLIP models.
        **kwargs: Forwarded to the `Backbone` base class.
    """
    def __init__(
        self,
        vocabulary_size: int,
        num_layers: int,
        num_heads: int,
        hidden_dim: int,
        intermediate_dim: int,
        max_sequence_length: int = 77,
        **kwargs
    ): ...
class CLIPVisionEncoder(Backbone):
    """CLIP vision encoder using Vision Transformer architecture.

    Args:
        image_shape: Expected input image shape as (height, width, channels).
            Defaults to (224, 224, 3).
        patch_size: Side length in pixels of each square ViT patch.
            Defaults to 32.
        num_layers: Number of transformer layers. Defaults to 12.
        num_heads: Number of attention heads per layer. Defaults to 12.
        hidden_dim: Width of the transformer hidden state. Defaults to 768.
        intermediate_dim: Width of the feed-forward intermediate layer.
            Defaults to 3072.
        **kwargs: Forwarded to the `Backbone` base class.
    """
    def __init__(
        self,
        image_shape: tuple = (224, 224, 3),
        patch_size: int = 32,
        num_layers: int = 12,
        num_heads: int = 12,
        hidden_dim: int = 768,
        intermediate_dim: int = 3072,
        **kwargs
    ): ...
class CLIPPreprocessor:
    """Preprocessor for CLIP multimodal inputs.

    Bundles a tokenizer (text branch) and an image converter (image
    branch) so raw text/image pairs can be turned into model-ready inputs.

    Args:
        tokenizer: `CLIPTokenizer` used to tokenize input text.
        image_converter: `CLIPImageConverter` used to resize/format images.
        **kwargs: Additional keyword arguments.
    """
    def __init__(
        self,
        tokenizer: CLIPTokenizer,
        image_converter: CLIPImageConverter,
        **kwargs
    ): ...
class CLIPTokenizer:
    """CLIP tokenizer for text processing.

    Args:
        vocabulary: Mapping from token string to integer id. Defaults to
            None (presumably supplied by a preset when omitted — confirm).
        merges: Merge rules for the tokenizer (BPE-style merges,
            presumably). Defaults to None.
        **kwargs: Additional keyword arguments.
    """
    def __init__(
        self,
        vocabulary: dict = None,
        merges: list = None,
        **kwargs
    ): ...
class CLIPImageConverter:
    """Image converter for CLIP models.

    Resizes input images to a fixed size for the vision encoder.

    Args:
        height: Target image height in pixels. Defaults to 224.
        width: Target image width in pixels. Defaults to 224.
        crop_to_aspect_ratio: Whether to crop so the source aspect ratio is
            preserved when resizing (per the flag name — TODO confirm exact
            semantics). Defaults to True.
        interpolation: Resampling method used when resizing. Defaults to
            "bilinear".
        **kwargs: Additional keyword arguments.
    """
    def __init__(
        self,
        height: int = 224,
        width: int = 224,
        crop_to_aspect_ratio: bool = True,
        interpolation: str = "bilinear",
        **kwargs
    ): ...

SigLIP is an improved version of CLIP using sigmoid loss for better multimodal understanding.
class SigLIPBackbone(Backbone):
    """SigLIP multimodal backbone.

    Pairs a text encoder with a vision encoder, mirroring the CLIP
    dual-encoder layout.

    Args:
        text_encoder: A `SigLIPTextEncoder` instance for the text branch.
        vision_encoder: A `SigLIPVisionEncoder` instance for the image branch.
        **kwargs: Forwarded to the `Backbone` base class.
    """
    def __init__(
        self,
        text_encoder: SigLIPTextEncoder,
        vision_encoder: SigLIPVisionEncoder,
        **kwargs
    ): ...
class SigLIPTextEncoder(Backbone):
    """SigLIP text encoder.

    Args:
        vocabulary_size: Number of tokens in the vocabulary.
        num_layers: Number of transformer layers.
        num_heads: Number of attention heads per layer.
        hidden_dim: Width of the transformer hidden state.
        intermediate_dim: Width of the feed-forward intermediate layer.
        max_sequence_length: Maximum token sequence length. Defaults to 77.
        **kwargs: Forwarded to the `Backbone` base class.
    """
    def __init__(
        self,
        vocabulary_size: int,
        num_layers: int,
        num_heads: int,
        hidden_dim: int,
        intermediate_dim: int,
        max_sequence_length: int = 77,
        **kwargs
    ): ...
class SigLIPVisionEncoder(Backbone):
    """SigLIP vision encoder.

    Args:
        image_shape: Expected input image shape as (height, width, channels).
            Defaults to (224, 224, 3).
        patch_size: Side length in pixels of each square patch. Defaults to
            16 (smaller than CLIP's default of 32 in this reference).
        num_layers: Number of transformer layers. Defaults to 12.
        num_heads: Number of attention heads per layer. Defaults to 12.
        hidden_dim: Width of the transformer hidden state. Defaults to 768.
        intermediate_dim: Width of the feed-forward intermediate layer.
            Defaults to 3072.
        **kwargs: Forwarded to the `Backbone` base class.
    """
    def __init__(
        self,
        image_shape: tuple = (224, 224, 3),
        patch_size: int = 16,
        num_layers: int = 12,
        num_heads: int = 12,
        hidden_dim: int = 768,
        intermediate_dim: int = 3072,
        **kwargs
    ): ...
class SigLIPPreprocessor:
    """Preprocessor for SigLIP multimodal inputs.

    Args:
        tokenizer: `SigLIPTokenizer` used to tokenize input text.
        image_converter: `SigLIPImageConverter` used to resize/format images.
        **kwargs: Additional keyword arguments.
    """
    def __init__(
        self,
        tokenizer: SigLIPTokenizer,
        image_converter: SigLIPImageConverter,
        **kwargs
    ): ...
class SigLIPTokenizer:
    """SigLIP tokenizer for text processing.

    Args:
        vocabulary: Mapping from token string to integer id. Defaults to
            None (presumably supplied by a preset when omitted — confirm).
        **kwargs: Additional keyword arguments.
    """
    def __init__(
        self,
        vocabulary: dict = None,
        **kwargs
    ): ...
class SigLIPImageConverter:
    """Image converter for SigLIP models.

    Args:
        height: Target image height in pixels. Defaults to 224.
        width: Target image width in pixels. Defaults to 224.
        crop_to_aspect_ratio: Whether to crop so the source aspect ratio is
            preserved when resizing (per the flag name — TODO confirm exact
            semantics). Defaults to True.
        interpolation: Resampling method used when resizing. Defaults to
            "bilinear".
        **kwargs: Additional keyword arguments.
    """
    def __init__(
        self,
        height: int = 224,
        width: int = 224,
        crop_to_aspect_ratio: bool = True,
        interpolation: str = "bilinear",
        **kwargs
    ): ...

PaliGemma combines vision and language understanding in a unified architecture for multimodal tasks.
class PaliGemmaBackbone(Backbone):
    """PaliGemma multimodal backbone.

    Args:
        vocabulary_size: Number of tokens in the text vocabulary.
        image_size: Input image side length in pixels (square images,
            presumably — confirm against the API).
        num_layers: Number of transformer layers.
        num_heads: Number of attention heads per layer.
        hidden_dim: Width of the transformer hidden state.
        intermediate_dim: Width of the feed-forward intermediate layer.
        **kwargs: Forwarded to the `Backbone` base class.
    """
    def __init__(
        self,
        vocabulary_size: int,
        image_size: int,
        num_layers: int,
        num_heads: int,
        hidden_dim: int,
        intermediate_dim: int,
        **kwargs
    ): ...
class PaliGemmaCausalLM(CausalLM):
    """PaliGemma model for multimodal causal language modeling.

    Args:
        backbone: A `PaliGemmaBackbone` instance.
        preprocessor: Optional `Preprocessor`; when None, inputs are
            presumably expected to be already preprocessed — confirm.
            Defaults to None.
        **kwargs: Forwarded to the `CausalLM` base class.
    """
    def __init__(
        self,
        backbone: PaliGemmaBackbone,
        preprocessor: Preprocessor = None,
        **kwargs
    ): ...
class PaliGemmaCausalLMPreprocessor:
    """Preprocessor for PaliGemma causal language modeling.

    Args:
        tokenizer: `PaliGemmaTokenizer` used to tokenize input text.
        image_converter: `PaliGemmaImageConverter` used to resize/format
            images.
        sequence_length: Length to which token sequences are padded or
            truncated (presumably — confirm). Defaults to 1024.
        **kwargs: Additional keyword arguments.
    """
    def __init__(
        self,
        tokenizer: PaliGemmaTokenizer,
        image_converter: PaliGemmaImageConverter,
        sequence_length: int = 1024,
        **kwargs
    ): ...
class PaliGemmaTokenizer:
    """PaliGemma tokenizer for text processing.

    Args:
        vocabulary: Mapping from token string to integer id. Defaults to
            None (presumably supplied by a preset when omitted — confirm).
        **kwargs: Additional keyword arguments.
    """
    def __init__(
        self,
        vocabulary: dict = None,
        **kwargs
    ): ...
class PaliGemmaImageConverter:
    """Image converter for PaliGemma models.

    Args:
        height: Target image height in pixels. Defaults to 224.
        width: Target image width in pixels. Defaults to 224.
        crop_to_aspect_ratio: Whether to crop so the source aspect ratio is
            preserved when resizing (per the flag name — TODO confirm exact
            semantics). Defaults to True.
        interpolation: Resampling method used when resizing. Defaults to
            "bilinear".
        **kwargs: Additional keyword arguments.
    """
    def __init__(
        self,
        height: int = 224,
        width: int = 224,
        crop_to_aspect_ratio: bool = True,
        interpolation: str = "bilinear",
        **kwargs
    ): ...

Gemma3 includes vision capabilities for multimodal understanding.
class Gemma3VisionEncoder(Backbone):
    """Gemma3 vision encoder for multimodal tasks.

    Args:
        image_shape: Expected input image shape as (height, width, channels).
            Defaults to (224, 224, 3).
        patch_size: Side length in pixels of each square patch. Defaults to 16.
        num_layers: Number of transformer layers. Defaults to 12.
        num_heads: Number of attention heads per layer. Defaults to 12.
        hidden_dim: Width of the transformer hidden state. Defaults to 768.
        **kwargs: Forwarded to the `Backbone` base class. Note this
            signature exposes no `intermediate_dim`, unlike the CLIP/SigLIP
            encoders above.
    """
    def __init__(
        self,
        image_shape: tuple = (224, 224, 3),
        patch_size: int = 16,
        num_layers: int = 12,
        num_heads: int = 12,
        hidden_dim: int = 768,
        **kwargs
    ): ...
class Gemma3ImageConverter:
    """Image converter for Gemma3 models.

    Args:
        height: Target image height in pixels. Defaults to 224.
        width: Target image width in pixels. Defaults to 224.
        crop_to_aspect_ratio: Whether to crop so the source aspect ratio is
            preserved when resizing (per the flag name — TODO confirm exact
            semantics). Defaults to True.
        interpolation: Resampling method used when resizing. Defaults to
            "bilinear".
        **kwargs: Additional keyword arguments.
    """
    def __init__(
        self,
        height: int = 224,
        width: int = 224,
        crop_to_aspect_ratio: bool = True,
        interpolation: str = "bilinear",
        **kwargs
    ): ...

import keras_hub
import numpy as np

# Example: zero-shot text-image similarity with a pretrained CLIP model.
# NOTE(review): relies on the `import keras_hub` just above this snippet.

# Load pretrained CLIP model
clip_model = keras_hub.models.CLIPBackbone.from_preset("clip_vit_base_patch32")

# Prepare text and image data
texts = ["a cat sitting on a table", "a dog running in a park"]
images = np.random.random((2, 224, 224, 3))  # Example images

# Get embeddings
# NOTE(review): raw strings/arrays are passed straight to the encoders here;
# presumably tokenization/image preprocessing happens first in practice —
# confirm against the CLIPPreprocessor usage.
text_embeddings = clip_model.text_encoder(texts)
image_embeddings = clip_model.vision_encoder(images)

# Compute similarity
# NOTE(review): CLIP similarity is conventionally cosine similarity over
# L2-normalized embeddings; this raw dot product is unnormalized.
similarity = np.dot(text_embeddings, image_embeddings.T)
print("Text-image similarity:", similarity)

import keras_hub
# Example: image-conditioned text generation with PaliGemma.

# Load PaliGemma model
model = keras_hub.models.PaliGemmaCausalLM.from_preset("paligemma_3b_mix_224")

# Prepare multimodal input (image + text prompt)
# NOTE(review): relies on `np` imported in the previous snippet.
image = np.random.random((224, 224, 3))
text_prompt = "Describe what you see in the image:"

# Generate text based on image and prompt
# NOTE(review): the exact input format `generate` expects (list vs a dict of
# images/prompts) should be confirmed against the KerasHub API.
response = model.generate([image, text_prompt], max_length=100)
print("Generated description:", response)

import keras_hub
# Example: using the CLIP text and vision encoders independently.

# Load CLIP text encoder
text_encoder = keras_hub.models.CLIPTextEncoder.from_preset("clip_vit_base_patch32")

# Load CLIP vision encoder
vision_encoder = keras_hub.models.CLIPVisionEncoder.from_preset("clip_vit_base_patch32")

# Process text
text_features = text_encoder(["a beautiful sunset"])

# Process image
# NOTE(review): `image` is defined in the previous snippet; this example is
# not self-contained.
image_features = vision_encoder([image])

# Use features for downstream tasks
print("Text features shape:", text_features.shape)
print("Image features shape:", image_features.shape)

import keras_hub
# Example: assembling a custom CLIP-style dual-encoder from scratch.

# Text tower: 12-layer transformer over a 50k-token vocabulary.
txt_tower = keras_hub.models.CLIPTextEncoder(
    vocabulary_size=50000,
    num_layers=12,
    num_heads=12,
    hidden_dim=768,
    intermediate_dim=3072,
)

# Vision tower: ViT over 224x224 RGB images split into 32x32 patches.
vis_tower = keras_hub.models.CLIPVisionEncoder(
    image_shape=(224, 224, 3),
    patch_size=32,
    num_layers=12,
    num_heads=12,
    hidden_dim=768,
)

# Join the two towers into a single multimodal backbone.
multimodal_model = keras_hub.models.CLIPBackbone(
    text_encoder=txt_tower,
    vision_encoder=vis_tower,
)

# Preprocessing: pretrained tokenizer plus a default image converter.
clip_tokenizer = keras_hub.tokenizers.CLIPTokenizer.from_preset("clip_vit_base_patch32")
clip_image_converter = keras_hub.layers.CLIPImageConverter()
preprocessor = keras_hub.models.CLIPPreprocessor(
    tokenizer=clip_tokenizer,
    image_converter=clip_image_converter,
)

# Use for training or inference
# multimodal_model.compile(optimizer="adam", loss="contrastive_loss")

Install with the Tessl CLI:

    npx tessl i tessl/pypi-keras-hub