or run

npx @tessl/cli init
Log in

Version

Tile

Overview

Evals

Files

Files

docs

caching.md chat-completion.md grammar.md index.md llama-model.md low-level.md server.md tokenization.md vision.md

docs/vision.md

0

# Vision and Multimodal

1

2

LLaVA vision model integration for processing images alongside text, supporting various image formats and multimodal conversation flows with visual understanding capabilities.

3

4

## Capabilities

5

6

### Image Embedding

7

8

Create embeddings from images for vision-language processing.

9

10

```python { .api }

11

def llava_image_embed_make_with_filename(

12

ctx_clip,

13

n_threads: int,

14

image_path: bytes

15

) -> llava_image_embed:

16

"""

17

Create image embedding from image file.

18

19

Args:

20

ctx_clip: CLIP context pointer

21

n_threads: Number of threads to use for processing

22

image_path: Path to image file (as bytes)

23

24

Returns:

25

Image embedding structure pointer

26

"""

27

28

def llava_image_embed_make_with_bytes(

29

ctx_clip,

30

image_bytes: bytes,

31

image_bytes_length: int

32

) -> llava_image_embed:

33

"""

34

Create image embedding from image bytes.

35

36

Args:

37

ctx_clip: CLIP context pointer

38

image_bytes: Raw image data

39

image_bytes_length: Length of image data

40

41

Returns:

42

Image embedding structure pointer

43

"""

44

45

def llava_image_embed_free(embed) -> None:

46

"""

47

Free image embedding memory.

48

49

Args:

50

embed: Image embedding to free

51

"""

52

```

53

54

### Vision Model Validation

55

56

Validate compatibility between text and vision model embeddings.

57

58

```python { .api }

59

def llava_validate_embed_size(

60

n_embd: int,

61

n_image_embd: int

62

) -> bool:

63

"""

64

Validate that text and image embedding dimensions are compatible.

65

66

Args:

67

n_embd: Text model embedding dimensions

68

n_image_embd: Image model embedding dimensions

69

70

Returns:

71

True if embeddings are compatible

72

"""

73

```

74

75

### CLIP Context Management

76

77

Manage CLIP vision encoder context for image processing.

78

79

```python { .api }

80

# Type definitions for vision processing

81

clip_ctx_p = ctypes.POINTER(ctypes.c_void_p) # CLIP context pointer type

82

83

class llava_image_embed(ctypes.Structure):

84

"""Image embedding structure for vision models."""

85

_fields_ = [

86

("embed", ctypes.POINTER(ctypes.c_float)),

87

("n_image_pos", ctypes.c_int),

88

]

89

```

90

91

## Usage Examples

92

93

### Basic Image Processing

94

95

```python

96

from llama_cpp import Llama

97

import llama_cpp.llama_cpp as llama_cpp

98

99

# Initialize vision-capable model (LLaVA)

100

llm = Llama(

101

model_path="./models/llava-v1.5-7b.gguf",

102

clip_model_path="./models/mmproj-model.gguf", # Vision projector

103

n_ctx=2048,

104

verbose=False,

105

)

106

107

# Process image with text prompt

108

response = llm.create_chat_completion(

109

messages=[

110

{

111

"role": "user",

112

"content": [

113

{"type": "text", "text": "What do you see in this image?"},

114

{"type": "image_url", "image_url": {"url": "file://./image.jpg"}}

115

]

116

}

117

],

118

max_tokens=200,

119

)

120

121

print("Vision response:", response['choices'][0]['message']['content'])

122

```

123

124

### Image Analysis Conversation

125

126

```python

127

# Multi-turn conversation about an image

128

messages = [

129

{

130

"role": "user",

131

"content": [

132

{"type": "text", "text": "Describe this image in detail."},

133

{"type": "image_url", "image_url": {"url": "file://./photo.jpg"}}

134

]

135

}

136

]

137

138

# First response

139

response = llm.create_chat_completion(messages=messages, max_tokens=150)

140

messages.append({

141

"role": "assistant",

142

"content": response['choices'][0]['message']['content']

143

})

144

145

# Follow-up question

146

messages.append({

147

"role": "user",

148

"content": "What colors are most prominent in this image?"

149

})

150

151

response = llm.create_chat_completion(messages=messages, max_tokens=100)

152

print("Color analysis:", response['choices'][0]['message']['content'])

153

```

154

155

### Batch Image Processing

156

157

```python

158

import os

159

from pathlib import Path

160

161

# Process multiple images

162

image_dir = Path("./images")

163

image_files = list(image_dir.glob("*.jpg")) + list(image_dir.glob("*.png"))

164

165

for image_file in image_files[:5]: # Process first 5 images

166

messages = [

167

{

168

"role": "user",

169

"content": [

170

{"type": "text", "text": "Provide a brief caption for this image."},

171

{"type": "image_url", "image_url": {"url": f"file://{image_file}"}}

172

]

173

}

174

]

175

176

response = llm.create_chat_completion(

177

messages=messages,

178

max_tokens=50,

179

temperature=0.7,

180

)

181

182

caption = response['choices'][0]['message']['content']

183

print(f"{image_file.name}: {caption}")

184

```

185

186

### Image-based Question Answering

187

188

```python

189

def ask_about_image(image_path: str, question: str) -> str:

190

"""Ask a specific question about an image."""

191

messages = [

192

{

193

"role": "user",

194

"content": [

195

{"type": "text", "text": question},

196

{"type": "image_url", "image_url": {"url": f"file://{image_path}"}}

197

]

198

}

199

]

200

201

response = llm.create_chat_completion(

202

messages=messages,

203

max_tokens=100,

204

temperature=0.3, # Lower temperature for more factual responses

205

)

206

207

return response['choices'][0]['message']['content']

208

209

# Example questions

210

image_path = "./sample_image.jpg"

211

questions = [

212

"How many people are in this image?",

213

"What is the main activity happening?",

214

"What is the setting or location?",

215

"What emotions are visible on people's faces?",

216

"Are there any text or signs visible?",

217

]

218

219

for question in questions:

220

answer = ask_about_image(image_path, question)

221

print(f"Q: {question}")

222

print(f"A: {answer}\n")

223

```

224

225

### Low-Level Image Embedding

226

227

```python

228

import llama_cpp.llama_cpp as llama_cpp

229

from ctypes import c_void_p, cast

230

231

# Assuming you have access to CLIP context (advanced usage)

232

# This would typically be handled internally by the Llama class

233

234

def process_image_embedding(image_path: str, ctx_clip, n_threads: int = 4):

    """Process image embedding at low level."""

    # Create image embedding from file
    embed = llama_cpp.llava_image_embed_make_with_filename(
        ctx_clip,
        n_threads,
        image_path.encode('utf-8')
    )

242

243

if embed:

244

print(f"Created embedding for {image_path}")

245

print(f"Image positions: {embed.contents.n_image_pos}")

246

247

# Process embedding (your custom logic here)

248

# ...

249

250

# Free embedding memory

251

llama_cpp.llava_image_embed_free(embed)

252

print("Embedding memory freed")

253

else:

254

print(f"Failed to create embedding for {image_path}")

255

256

# Note: This is advanced usage and requires proper CLIP context setup

257

```

258

259

### Image Format Support

260

261

```python

262

import base64

263

from io import BytesIO

264

from PIL import Image

265

266

def process_base64_image(base64_data: str, question: str) -> str:

267

"""Process image provided as base64 data."""

268

269

# Convert base64 to image URL format

270

image_url = f"data:image/jpeg;base64,{base64_data}"

271

272

messages = [

273

{

274

"role": "user",

275

"content": [

276

{"type": "text", "text": question},

277

{"type": "image_url", "image_url": {"url": image_url}}

278

]

279

}

280

]

281

282

response = llm.create_chat_completion(messages=messages, max_tokens=150)

283

return response['choices'][0]['message']['content']

284

285

def resize_and_encode_image(image_path: str, max_size: tuple = (512, 512)) -> str:

286

"""Resize image and convert to base64 for processing."""

287

with Image.open(image_path) as img:

288

# Resize image to reduce processing time

289

img.thumbnail(max_size, Image.Resampling.LANCZOS)

290

291

# Convert to RGB if necessary

292

if img.mode != 'RGB':

293

img = img.convert('RGB')

294

295

# Save to base64

296

buffer = BytesIO()

297

img.save(buffer, format='JPEG', quality=85)

298

base64_data = base64.b64encode(buffer.getvalue()).decode('utf-8')

299

300

return base64_data

301

302

# Process resized image

303

image_path = "./large_image.jpg"

304

base64_image = resize_and_encode_image(image_path)

305

result = process_base64_image(base64_image, "What are the main objects in this image?")

306

print(result)

307

```

308

309

### Vision Model Performance Tuning

310

311

```python

312

# Initialize vision model with optimized settings

313

llm = Llama(

314

model_path="./models/llava-1.6-mistral-7b.gguf",

315

clip_model_path="./models/mmproj-model.gguf",

316

n_ctx=4096, # Larger context for complex vision tasks

317

n_gpu_layers=30, # Offload to GPU for faster processing

318

verbose=False,

319

n_threads=8,

320

)

321

322

def benchmark_vision_processing(image_path: str, num_runs: int = 3):

323

"""Benchmark vision processing performance."""

324

import time

325

326

messages = [

327

{

328

"role": "user",

329

"content": [

330

{"type": "text", "text": "Describe this image concisely."},

331

{"type": "image_url", "image_url": {"url": f"file://{image_path}"}}

332

]

333

}

334

]

335

336

times = []

337

for i in range(num_runs):

338

start_time = time.time()

339

340

response = llm.create_chat_completion(

341

messages=messages,

342

max_tokens=100,

343

temperature=0.5,

344

)

345

346

end_time = time.time()

347

processing_time = end_time - start_time

348

times.append(processing_time)

349

350

print(f"Run {i+1}: {processing_time:.2f}s")

351

if i == 0: # Print response from first run

352

print(f"Response: {response['choices'][0]['message']['content']}")

353

354

avg_time = sum(times) / len(times)

355

print(f"Average processing time: {avg_time:.2f}s")

356

357

benchmark_vision_processing("./test_image.jpg")

358

```

359

360

### Image Comparison

361

362

```python

363

def compare_images(image1_path: str, image2_path: str) -> str:

364

"""Compare two images and describe differences."""

365

messages = [

366

{

367

"role": "user",

368

"content": [

369

{"type": "text", "text": "Compare these two images and describe the main differences:"},

370

{"type": "image_url", "image_url": {"url": f"file://{image1_path}"}},

371

{"type": "text", "text": "versus"},

372

{"type": "image_url", "image_url": {"url": f"file://{image2_path}"}}

373

]

374

}

375

]

376

377

response = llm.create_chat_completion(

378

messages=messages,

379

max_tokens=200,

380

temperature=0.3,

381

)

382

383

return response['choices'][0]['message']['content']

384

385

# Compare two images

386

comparison = compare_images("./before.jpg", "./after.jpg")

387

print("Image comparison:", comparison)

388

```

389

390

### Visual Chat Interface

391

392

```python

393

class VisualChatbot:

394

def __init__(self, model_path: str, clip_model_path: str):

395

self.llm = Llama(

396

model_path=model_path,

397

clip_model_path=clip_model_path,

398

n_ctx=2048,

399

verbose=False,

400

)

401

self.conversation_history = []

402

403

def add_text_message(self, text: str):

404

"""Add text message to conversation."""

405

self.conversation_history.append({

406

"role": "user",

407

"content": text

408

})

409

410

def add_image_message(self, image_path: str, text: str = ""):

411

"""Add image with optional text to conversation."""

412

content = []

413

if text:

414

content.append({"type": "text", "text": text})

415

content.append({"type": "image_url", "image_url": {"url": f"file://{image_path}"}})

416

417

self.conversation_history.append({

418

"role": "user",

419

"content": content

420

})

421

422

def get_response(self, max_tokens: int = 150) -> str:

423

"""Get response from the model."""

424

response = self.llm.create_chat_completion(

425

messages=self.conversation_history,

426

max_tokens=max_tokens,

427

)

428

429

assistant_message = response['choices'][0]['message']['content']

430

self.conversation_history.append({

431

"role": "assistant",

432

"content": assistant_message

433

})

434

435

return assistant_message

436

437

# Example usage

438

chatbot = VisualChatbot(

439

"./models/llava-v1.5-7b.gguf",

440

"./models/mmproj-model.gguf"

441

)

442

443

chatbot.add_image_message("./vacation_photo.jpg", "Look at this vacation photo!")

444

response1 = chatbot.get_response()

445

print("Bot:", response1)

446

447

chatbot.add_text_message("What activities would you recommend at this location?")

448

response2 = chatbot.get_response()

449

print("Bot:", response2)

450

```