# Multimodal Models

Models that process multiple modalities like text and images together for advanced AI capabilities. Keras Hub provides implementations of CLIP, SigLIP, PaliGemma, and other multimodal architectures.

## Capabilities

### CLIP (Contrastive Language-Image Pre-Training)

CLIP learns visual concepts from natural language supervision by jointly training text and image encoders.

```python { .api }
class CLIPBackbone(Backbone):
    """CLIP multimodal backbone."""
    def __init__(
        self,
        text_encoder: CLIPTextEncoder,
        vision_encoder: CLIPVisionEncoder,
        **kwargs
    ): ...

class CLIPTextEncoder(Backbone):
    """CLIP text encoder using transformer architecture."""
    def __init__(
        self,
        vocabulary_size: int,
        num_layers: int,
        num_heads: int,
        hidden_dim: int,
        intermediate_dim: int,
        max_sequence_length: int = 77,
        **kwargs
    ): ...

class CLIPVisionEncoder(Backbone):
    """CLIP vision encoder using Vision Transformer architecture."""
    def __init__(
        self,
        image_shape: tuple = (224, 224, 3),
        patch_size: int = 32,
        num_layers: int = 12,
        num_heads: int = 12,
        hidden_dim: int = 768,
        intermediate_dim: int = 3072,
        **kwargs
    ): ...

class CLIPPreprocessor:
    """Preprocessor for CLIP multimodal inputs."""
    def __init__(
        self,
        tokenizer: CLIPTokenizer,
        image_converter: CLIPImageConverter,
        **kwargs
    ): ...

class CLIPTokenizer:
    """CLIP tokenizer for text processing."""
    def __init__(
        self,
        vocabulary: dict = None,
        merges: list = None,
        **kwargs
    ): ...

class CLIPImageConverter:
    """Image converter for CLIP models."""
    def __init__(
        self,
        height: int = 224,
        width: int = 224,
        crop_to_aspect_ratio: bool = True,
        interpolation: str = "bilinear",
        **kwargs
    ): ...
```

### SigLIP (Sigmoid Loss for Language Image Pre-Training)

SigLIP is an improved version of CLIP using sigmoid loss for better multimodal understanding.

```python { .api }
class SigLIPBackbone(Backbone):
    """SigLIP multimodal backbone."""
    def __init__(
        self,
        text_encoder: SigLIPTextEncoder,
        vision_encoder: SigLIPVisionEncoder,
        **kwargs
    ): ...

class SigLIPTextEncoder(Backbone):
    """SigLIP text encoder."""
    def __init__(
        self,
        vocabulary_size: int,
        num_layers: int,
        num_heads: int,
        hidden_dim: int,
        intermediate_dim: int,
        max_sequence_length: int = 77,
        **kwargs
    ): ...

class SigLIPVisionEncoder(Backbone):
    """SigLIP vision encoder."""
    def __init__(
        self,
        image_shape: tuple = (224, 224, 3),
        patch_size: int = 16,
        num_layers: int = 12,
        num_heads: int = 12,
        hidden_dim: int = 768,
        intermediate_dim: int = 3072,
        **kwargs
    ): ...

class SigLIPPreprocessor:
    """Preprocessor for SigLIP multimodal inputs."""
    def __init__(
        self,
        tokenizer: SigLIPTokenizer,
        image_converter: SigLIPImageConverter,
        **kwargs
    ): ...

class SigLIPTokenizer:
    """SigLIP tokenizer for text processing."""
    def __init__(
        self,
        vocabulary: dict = None,
        **kwargs
    ): ...

class SigLIPImageConverter:
    """Image converter for SigLIP models."""
    def __init__(
        self,
        height: int = 224,
        width: int = 224,
        crop_to_aspect_ratio: bool = True,
        interpolation: str = "bilinear",
        **kwargs
    ): ...
```

### PaliGemma (Pathways Language and Image model based on Gemma)

PaliGemma combines vision and language understanding in a unified architecture for multimodal tasks.

```python { .api }
class PaliGemmaBackbone(Backbone):
    """PaliGemma multimodal backbone."""
    def __init__(
        self,
        vocabulary_size: int,
        image_size: int,
        num_layers: int,
        num_heads: int,
        hidden_dim: int,
        intermediate_dim: int,
        **kwargs
    ): ...

class PaliGemmaCausalLM(CausalLM):
    """PaliGemma model for multimodal causal language modeling."""
    def __init__(
        self,
        backbone: PaliGemmaBackbone,
        preprocessor: Preprocessor = None,
        **kwargs
    ): ...

class PaliGemmaCausalLMPreprocessor:
    """Preprocessor for PaliGemma causal language modeling."""
    def __init__(
        self,
        tokenizer: PaliGemmaTokenizer,
        image_converter: PaliGemmaImageConverter,
        sequence_length: int = 1024,
        **kwargs
    ): ...

class PaliGemmaTokenizer:
    """PaliGemma tokenizer for text processing."""
    def __init__(
        self,
        vocabulary: dict = None,
        **kwargs
    ): ...

class PaliGemmaImageConverter:
    """Image converter for PaliGemma models."""
    def __init__(
        self,
        height: int = 224,
        width: int = 224,
        crop_to_aspect_ratio: bool = True,
        interpolation: str = "bilinear",
        **kwargs
    ): ...
```

### Gemma3 Vision Components

Gemma3 includes vision capabilities for multimodal understanding.

```python { .api }
class Gemma3VisionEncoder(Backbone):
    """Gemma3 vision encoder for multimodal tasks."""
    def __init__(
        self,
        image_shape: tuple = (224, 224, 3),
        patch_size: int = 16,
        num_layers: int = 12,
        num_heads: int = 12,
        hidden_dim: int = 768,
        **kwargs
    ): ...

class Gemma3ImageConverter:
    """Image converter for Gemma3 models."""
    def __init__(
        self,
        height: int = 224,
        width: int = 224,
        crop_to_aspect_ratio: bool = True,
        interpolation: str = "bilinear",
        **kwargs
    ): ...
```

## Usage Examples

### Image-Text Similarity with CLIP

```python
import keras_hub
import numpy as np

# Load pretrained CLIP model
clip_model = keras_hub.models.CLIPBackbone.from_preset("clip_vit_base_patch32")

# Prepare text and image data
texts = ["a cat sitting on a table", "a dog running in a park"]
images = np.random.random((2, 224, 224, 3))  # Example images

# Get embeddings
text_embeddings = clip_model.text_encoder(texts)
image_embeddings = clip_model.vision_encoder(images)

# Compute similarity
similarity = np.dot(text_embeddings, image_embeddings.T)
print("Text-image similarity:", similarity)
```

### Multimodal Text Generation with PaliGemma

```python
import keras_hub
import numpy as np

# Load PaliGemma model
model = keras_hub.models.PaliGemmaCausalLM.from_preset("paligemma_3b_mix_224")

# Prepare multimodal input (image + text prompt)
image = np.random.random((224, 224, 3))
text_prompt = "Describe what you see in the image:"

# Generate text based on image and prompt
response = model.generate([image, text_prompt], max_length=100)
print("Generated description:", response)
```

### Using CLIP Components Separately

```python
import keras_hub
import numpy as np

# Load CLIP text encoder
text_encoder = keras_hub.models.CLIPTextEncoder.from_preset("clip_vit_base_patch32")

# Load CLIP vision encoder
vision_encoder = keras_hub.models.CLIPVisionEncoder.from_preset("clip_vit_base_patch32")

# Process text
text_features = text_encoder(["a beautiful sunset"])

# Process image
image = np.random.random((224, 224, 3))  # Example image
image_features = vision_encoder([image])

# Use features for downstream tasks
print("Text features shape:", text_features.shape)
print("Image features shape:", image_features.shape)
```

### Custom Multimodal Pipeline

```python
import keras_hub

# Create custom CLIP-like model
text_encoder = keras_hub.models.CLIPTextEncoder(
    vocabulary_size=50000,
    num_layers=12,
    num_heads=12,
    hidden_dim=768,
    intermediate_dim=3072
)

vision_encoder = keras_hub.models.CLIPVisionEncoder(
    image_shape=(224, 224, 3),
    patch_size=32,
    num_layers=12,
    num_heads=12,
    hidden_dim=768
)

# Combine encoders
multimodal_model = keras_hub.models.CLIPBackbone(
    text_encoder=text_encoder,
    vision_encoder=vision_encoder
)

# Create preprocessor
preprocessor = keras_hub.models.CLIPPreprocessor(
    tokenizer=keras_hub.tokenizers.CLIPTokenizer.from_preset("clip_vit_base_patch32"),
    image_converter=keras_hub.layers.CLIPImageConverter()
)

# Use for training or inference
# multimodal_model.compile(optimizer="adam", loss="contrastive_loss")
```