# Multimodal Models

Models that process multiple modalities like text and images together for advanced AI capabilities. Keras Hub provides implementations of CLIP, SigLIP, PaliGemma, and other multimodal architectures.

## Capabilities

### CLIP (Contrastive Language-Image Pre-Training)

CLIP learns visual concepts from natural language supervision by jointly training text and image encoders.

```python { .api }
class CLIPBackbone(Backbone):
    """CLIP multimodal backbone."""
    def __init__(
        self,
        text_encoder: CLIPTextEncoder,
        vision_encoder: CLIPVisionEncoder,
        **kwargs
    ): ...

class CLIPTextEncoder(Backbone):
    """CLIP text encoder using transformer architecture."""
    def __init__(
        self,
        vocabulary_size: int,
        num_layers: int,
        num_heads: int,
        hidden_dim: int,
        intermediate_dim: int,
        max_sequence_length: int = 77,
        **kwargs
    ): ...

class CLIPVisionEncoder(Backbone):
    """CLIP vision encoder using Vision Transformer architecture."""
    def __init__(
        self,
        image_shape: tuple = (224, 224, 3),
        patch_size: int = 32,
        num_layers: int = 12,
        num_heads: int = 12,
        hidden_dim: int = 768,
        intermediate_dim: int = 3072,
        **kwargs
    ): ...

class CLIPPreprocessor:
    """Preprocessor for CLIP multimodal inputs."""
    def __init__(
        self,
        tokenizer: CLIPTokenizer,
        image_converter: CLIPImageConverter,
        **kwargs
    ): ...

class CLIPTokenizer:
    """CLIP tokenizer for text processing."""
    def __init__(
        self,
        vocabulary: dict = None,
        merges: list = None,
        **kwargs
    ): ...

class CLIPImageConverter:
    """Image converter for CLIP models."""
    def __init__(
        self,
        height: int = 224,
        width: int = 224,
        crop_to_aspect_ratio: bool = True,
        interpolation: str = "bilinear",
        **kwargs
    ): ...
```

### SigLIP (Sigmoid Loss for Language Image Pre-Training)

SigLIP is an improved version of CLIP using sigmoid loss for better multimodal understanding.

```python { .api }
class SigLIPBackbone(Backbone):
    """SigLIP multimodal backbone."""
    def __init__(
        self,
        text_encoder: SigLIPTextEncoder,
        vision_encoder: SigLIPVisionEncoder,
        **kwargs
    ): ...

class SigLIPTextEncoder(Backbone):
    """SigLIP text encoder."""
    def __init__(
        self,
        vocabulary_size: int,
        num_layers: int,
        num_heads: int,
        hidden_dim: int,
        intermediate_dim: int,
        max_sequence_length: int = 77,
        **kwargs
    ): ...

class SigLIPVisionEncoder(Backbone):
    """SigLIP vision encoder."""
    def __init__(
        self,
        image_shape: tuple = (224, 224, 3),
        patch_size: int = 16,
        num_layers: int = 12,
        num_heads: int = 12,
        hidden_dim: int = 768,
        intermediate_dim: int = 3072,
        **kwargs
    ): ...

class SigLIPPreprocessor:
    """Preprocessor for SigLIP multimodal inputs."""
    def __init__(
        self,
        tokenizer: SigLIPTokenizer,
        image_converter: SigLIPImageConverter,
        **kwargs
    ): ...

class SigLIPTokenizer:
    """SigLIP tokenizer for text processing."""
    def __init__(
        self,
        vocabulary: dict = None,
        **kwargs
    ): ...

class SigLIPImageConverter:
    """Image converter for SigLIP models."""
    def __init__(
        self,
        height: int = 224,
        width: int = 224,
        crop_to_aspect_ratio: bool = True,
        interpolation: str = "bilinear",
        **kwargs
    ): ...
```

### PaliGemma (Pathways Language and Image model based on Gemma)

PaliGemma combines vision and language understanding in a unified architecture for multimodal tasks.

```python { .api }
class PaliGemmaBackbone(Backbone):
    """PaliGemma multimodal backbone."""
    def __init__(
        self,
        vocabulary_size: int,
        image_size: int,
        num_layers: int,
        num_heads: int,
        hidden_dim: int,
        intermediate_dim: int,
        **kwargs
    ): ...

class PaliGemmaCausalLM(CausalLM):
    """PaliGemma model for multimodal causal language modeling."""
    def __init__(
        self,
        backbone: PaliGemmaBackbone,
        preprocessor: Preprocessor = None,
        **kwargs
    ): ...

class PaliGemmaCausalLMPreprocessor:
    """Preprocessor for PaliGemma causal language modeling."""
    def __init__(
        self,
        tokenizer: PaliGemmaTokenizer,
        image_converter: PaliGemmaImageConverter,
        sequence_length: int = 1024,
        **kwargs
    ): ...

class PaliGemmaTokenizer:
    """PaliGemma tokenizer for text processing."""
    def __init__(
        self,
        vocabulary: dict = None,
        **kwargs
    ): ...

class PaliGemmaImageConverter:
    """Image converter for PaliGemma models."""
    def __init__(
        self,
        height: int = 224,
        width: int = 224,
        crop_to_aspect_ratio: bool = True,
        interpolation: str = "bilinear",
        **kwargs
    ): ...
```

### Gemma3 Vision Components

Gemma3 includes vision capabilities for multimodal understanding.

```python { .api }
class Gemma3VisionEncoder(Backbone):
    """Gemma3 vision encoder for multimodal tasks."""
    def __init__(
        self,
        image_shape: tuple = (224, 224, 3),
        patch_size: int = 16,
        num_layers: int = 12,
        num_heads: int = 12,
        hidden_dim: int = 768,
        **kwargs
    ): ...

class Gemma3ImageConverter:
    """Image converter for Gemma3 models."""
    def __init__(
        self,
        height: int = 224,
        width: int = 224,
        crop_to_aspect_ratio: bool = True,
        interpolation: str = "bilinear",
        **kwargs
    ): ...
```

## Usage Examples

### Image-Text Similarity with CLIP

```python
import keras_hub
import numpy as np

# Load pretrained CLIP model
clip_model = keras_hub.models.CLIPBackbone.from_preset("clip_vit_base_patch32")

# Prepare text and image data
texts = ["a cat sitting on a table", "a dog running in a park"]
images = np.random.random((2, 224, 224, 3))  # Example images

# Get embeddings
text_embeddings = clip_model.text_encoder(texts)
image_embeddings = clip_model.vision_encoder(images)

# Compute similarity
similarity = np.dot(text_embeddings, image_embeddings.T)
print("Text-image similarity:", similarity)
```

### Multimodal Text Generation with PaliGemma

```python
import keras_hub
import numpy as np

# Load PaliGemma model
model = keras_hub.models.PaliGemmaCausalLM.from_preset("paligemma_3b_mix_224")

# Prepare multimodal input (image + text prompt)
image = np.random.random((224, 224, 3))
text_prompt = "Describe what you see in the image:"

# Generate text based on image and prompt
response = model.generate([image, text_prompt], max_length=100)
print("Generated description:", response)
```

### Using CLIP Components Separately

```python
import keras_hub
import numpy as np

# Load CLIP text encoder
text_encoder = keras_hub.models.CLIPTextEncoder.from_preset("clip_vit_base_patch32")

# Load CLIP vision encoder
vision_encoder = keras_hub.models.CLIPVisionEncoder.from_preset("clip_vit_base_patch32")

# Process text
text_features = text_encoder(["a beautiful sunset"])

# Process image
image = np.random.random((224, 224, 3))  # Example image
image_features = vision_encoder([image])

# Use features for downstream tasks
print("Text features shape:", text_features.shape)
print("Image features shape:", image_features.shape)
```

### Custom Multimodal Pipeline

```python
import keras_hub

# Create custom CLIP-like model
text_encoder = keras_hub.models.CLIPTextEncoder(
    vocabulary_size=50000,
    num_layers=12,
    num_heads=12,
    hidden_dim=768,
    intermediate_dim=3072
)

vision_encoder = keras_hub.models.CLIPVisionEncoder(
    image_shape=(224, 224, 3),
    patch_size=32,
    num_layers=12,
    num_heads=12,
    hidden_dim=768
)

# Combine encoders
multimodal_model = keras_hub.models.CLIPBackbone(
    text_encoder=text_encoder,
    vision_encoder=vision_encoder
)

# Create preprocessor
preprocessor = keras_hub.models.CLIPPreprocessor(
    tokenizer=keras_hub.tokenizers.CLIPTokenizer.from_preset("clip_vit_base_patch32"),
    image_converter=keras_hub.layers.CLIPImageConverter()
)

# Use for training or inference
# multimodal_model.compile(optimizer="adam", loss="contrastive_loss")
```