# Multimodal Models

Models that process multiple modalities like text and images together for advanced AI capabilities. Keras Hub provides implementations of CLIP, SigLIP, PaliGemma, and other multimodal architectures.

## Capabilities

### CLIP (Contrastive Language-Image Pre-Training)

CLIP learns visual concepts from natural language supervision by jointly training text and image encoders.

```python { .api }
class CLIPBackbone(Backbone):
    """CLIP multimodal backbone."""
    def __init__(
        self,
        text_encoder: CLIPTextEncoder,
        vision_encoder: CLIPVisionEncoder,
        **kwargs
    ): ...

class CLIPTextEncoder(Backbone):
    """CLIP text encoder using transformer architecture."""
    def __init__(
        self,
        vocabulary_size: int,
        num_layers: int,
        num_heads: int,
        hidden_dim: int,
        intermediate_dim: int,
        max_sequence_length: int = 77,
        **kwargs
    ): ...

class CLIPVisionEncoder(Backbone):
    """CLIP vision encoder using Vision Transformer architecture."""
    def __init__(
        self,
        image_shape: tuple = (224, 224, 3),
        patch_size: int = 32,
        num_layers: int = 12,
        num_heads: int = 12,
        hidden_dim: int = 768,
        intermediate_dim: int = 3072,
        **kwargs
    ): ...

class CLIPPreprocessor:
    """Preprocessor for CLIP multimodal inputs."""
    def __init__(
        self,
        tokenizer: CLIPTokenizer,
        image_converter: CLIPImageConverter,
        **kwargs
    ): ...

class CLIPTokenizer:
    """CLIP tokenizer for text processing."""
    def __init__(
        self,
        vocabulary: dict = None,
        merges: list = None,
        **kwargs
    ): ...

class CLIPImageConverter:
    """Image converter for CLIP models."""
    def __init__(
        self,
        height: int = 224,
        width: int = 224,
        crop_to_aspect_ratio: bool = True,
        interpolation: str = "bilinear",
        **kwargs
    ): ...
```

### SigLIP (Sigmoid Loss for Language Image Pre-Training)

SigLIP is an improved version of CLIP using sigmoid loss for better multimodal understanding.

```python { .api }
class SigLIPBackbone(Backbone):
    """SigLIP multimodal backbone."""
    def __init__(
        self,
        text_encoder: SigLIPTextEncoder,
        vision_encoder: SigLIPVisionEncoder,
        **kwargs
    ): ...

class SigLIPTextEncoder(Backbone):
    """SigLIP text encoder."""
    def __init__(
        self,
        vocabulary_size: int,
        num_layers: int,
        num_heads: int,
        hidden_dim: int,
        intermediate_dim: int,
        max_sequence_length: int = 77,
        **kwargs
    ): ...

class SigLIPVisionEncoder(Backbone):
    """SigLIP vision encoder."""
    def __init__(
        self,
        image_shape: tuple = (224, 224, 3),
        patch_size: int = 16,
        num_layers: int = 12,
        num_heads: int = 12,
        hidden_dim: int = 768,
        intermediate_dim: int = 3072,
        **kwargs
    ): ...

class SigLIPPreprocessor:
    """Preprocessor for SigLIP multimodal inputs."""
    def __init__(
        self,
        tokenizer: SigLIPTokenizer,
        image_converter: SigLIPImageConverter,
        **kwargs
    ): ...

class SigLIPTokenizer:
    """SigLIP tokenizer for text processing."""
    def __init__(
        self,
        vocabulary: dict = None,
        **kwargs
    ): ...

class SigLIPImageConverter:
    """Image converter for SigLIP models."""
    def __init__(
        self,
        height: int = 224,
        width: int = 224,
        crop_to_aspect_ratio: bool = True,
        interpolation: str = "bilinear",
        **kwargs
    ): ...
```

### PaliGemma (Pathways Language and Image model based on Gemma)

PaliGemma combines vision and language understanding in a unified architecture for multimodal tasks.

```python { .api }
class PaliGemmaBackbone(Backbone):
    """PaliGemma multimodal backbone."""
    def __init__(
        self,
        vocabulary_size: int,
        image_size: int,
        num_layers: int,
        num_heads: int,
        hidden_dim: int,
        intermediate_dim: int,
        **kwargs
    ): ...

class PaliGemmaCausalLM(CausalLM):
    """PaliGemma model for multimodal causal language modeling."""
    def __init__(
        self,
        backbone: PaliGemmaBackbone,
        preprocessor: Preprocessor = None,
        **kwargs
    ): ...

class PaliGemmaCausalLMPreprocessor:
    """Preprocessor for PaliGemma causal language modeling."""
    def __init__(
        self,
        tokenizer: PaliGemmaTokenizer,
        image_converter: PaliGemmaImageConverter,
        sequence_length: int = 1024,
        **kwargs
    ): ...

class PaliGemmaTokenizer:
    """PaliGemma tokenizer for text processing."""
    def __init__(
        self,
        vocabulary: dict = None,
        **kwargs
    ): ...

class PaliGemmaImageConverter:
    """Image converter for PaliGemma models."""
    def __init__(
        self,
        height: int = 224,
        width: int = 224,
        crop_to_aspect_ratio: bool = True,
        interpolation: str = "bilinear",
        **kwargs
    ): ...
```

### Gemma3 Vision Components

Gemma3 includes vision capabilities for multimodal understanding.

```python { .api }
class Gemma3VisionEncoder(Backbone):
    """Gemma3 vision encoder for multimodal tasks."""
    def __init__(
        self,
        image_shape: tuple = (224, 224, 3),
        patch_size: int = 16,
        num_layers: int = 12,
        num_heads: int = 12,
        hidden_dim: int = 768,
        **kwargs
    ): ...

class Gemma3ImageConverter:
    """Image converter for Gemma3 models."""
    def __init__(
        self,
        height: int = 224,
        width: int = 224,
        crop_to_aspect_ratio: bool = True,
        interpolation: str = "bilinear",
        **kwargs
    ): ...
```

## Usage Examples

### Image-Text Similarity with CLIP

```python
import keras_hub
import numpy as np

# Load pretrained CLIP model
clip_model = keras_hub.models.CLIPBackbone.from_preset("clip_vit_base_patch32")

# Prepare text and image data
texts = ["a cat sitting on a table", "a dog running in a park"]
images = np.random.random((2, 224, 224, 3))  # Example images

# Get embeddings
text_embeddings = clip_model.text_encoder(texts)
image_embeddings = clip_model.vision_encoder(images)

# Compute similarity
similarity = np.dot(text_embeddings, image_embeddings.T)
print("Text-image similarity:", similarity)
```

### Multimodal Text Generation with PaliGemma

```python
import keras_hub
import numpy as np

# Load PaliGemma model
model = keras_hub.models.PaliGemmaCausalLM.from_preset("paligemma_3b_mix_224")

# Prepare multimodal input (image + text prompt)
image = np.random.random((224, 224, 3))
text_prompt = "Describe what you see in the image:"

# Generate text based on image and prompt
response = model.generate([image, text_prompt], max_length=100)
print("Generated description:", response)
```

### Using CLIP Components Separately

```python
import keras_hub
import numpy as np

# Load CLIP text encoder
text_encoder = keras_hub.models.CLIPTextEncoder.from_preset("clip_vit_base_patch32")

# Load CLIP vision encoder
vision_encoder = keras_hub.models.CLIPVisionEncoder.from_preset("clip_vit_base_patch32")

# Process text
text_features = text_encoder(["a beautiful sunset"])

# Process image
image = np.random.random((224, 224, 3))  # Example image
image_features = vision_encoder([image])

# Use features for downstream tasks
print("Text features shape:", text_features.shape)
print("Image features shape:", image_features.shape)
```

### Custom Multimodal Pipeline

```python
import keras_hub

# Create custom CLIP-like model
text_encoder = keras_hub.models.CLIPTextEncoder(
    vocabulary_size=50000,
    num_layers=12,
    num_heads=12,
    hidden_dim=768,
    intermediate_dim=3072
)

vision_encoder = keras_hub.models.CLIPVisionEncoder(
    image_shape=(224, 224, 3),
    patch_size=32,
    num_layers=12,
    num_heads=12,
    hidden_dim=768
)

# Combine encoders
multimodal_model = keras_hub.models.CLIPBackbone(
    text_encoder=text_encoder,
    vision_encoder=vision_encoder
)

# Create preprocessor
preprocessor = keras_hub.models.CLIPPreprocessor(
    tokenizer=keras_hub.tokenizers.CLIPTokenizer.from_preset("clip_vit_base_patch32"),
    image_converter=keras_hub.layers.CLIPImageConverter()
)

# Use for training or inference
# multimodal_model.compile(optimizer="adam", loss="contrastive_loss")
```