# Vision and Multimodal

LLaVA vision model integration for processing images alongside text, supporting various image formats and multimodal conversation flows with visual understanding capabilities.

## Capabilities

### Image Embedding

Create embeddings from images for vision-language processing.

```python { .api }
def llava_image_embed_make_with_filename(
    ctx_clip,
    n_threads: int,
    image_path: bytes
) -> llava_image_embed:
    """
    Create image embedding from image file.

    Args:
        ctx_clip: CLIP context pointer
        n_threads: Number of threads to use for processing
        image_path: Path to image file (as bytes)

    Returns:
        Image embedding structure pointer
    """

def llava_image_embed_make_with_bytes(
    ctx_clip,
    n_threads: int,
    image_bytes: bytes,
    image_bytes_length: int
) -> llava_image_embed:
    """
    Create image embedding from image bytes.

    Args:
        ctx_clip: CLIP context pointer
        n_threads: Number of threads to use for processing
        image_bytes: Raw image data
        image_bytes_length: Length of image data

    Returns:
        Image embedding structure
    """

def llava_image_embed_free(embed) -> None:
    """
    Free image embedding memory.

    Args:
        embed: Image embedding to free
    """
```

### Vision Model Validation

Validate compatibility between text and vision model embeddings.

```python { .api }
def llava_validate_embed_size(
    n_embd: int,
    n_image_embd: int
) -> bool:
    """
    Validate that text and image embedding dimensions are compatible.

    Args:
        n_embd: Text model embedding dimensions
        n_image_embd: Image model embedding dimensions

    Returns:
        True if embeddings are compatible
    """
```

### CLIP Context Management

Manage CLIP vision encoder context for image processing.

```python { .api }
# Type definitions for vision processing
clip_ctx_p = ctypes.POINTER(ctypes.c_void_p)  # CLIP context pointer type

class llava_image_embed(ctypes.Structure):
    """Image embedding structure for vision models."""
    _fields_ = [
        ("embed", ctypes.POINTER(ctypes.c_float)),
        ("n_image_pos", ctypes.c_int),
    ]
```

## Usage Examples

### Basic Image Processing

```python
from llama_cpp import Llama
import llama_cpp.llama_cpp as llama_cpp

# Initialize vision-capable model (LLaVA)
llm = Llama(
    model_path="./models/llava-v1.5-7b.gguf",
    clip_model_path="./models/mmproj-model.gguf",  # Vision projector
    n_ctx=2048,
    verbose=False,
)

# Process image with text prompt
response = llm.create_chat_completion(
    messages=[
        {
            "role": "user",
            "content": [
                {"type": "text", "text": "What do you see in this image?"},
                {"type": "image_url", "image_url": {"url": "file://./image.jpg"}}
            ]
        }
    ],
    max_tokens=200,
)

print("Vision response:", response['choices'][0]['message']['content'])
```

### Image Analysis Conversation

```python
# Multi-turn conversation about an image
messages = [
    {
        "role": "user",
        "content": [
            {"type": "text", "text": "Describe this image in detail."},
            {"type": "image_url", "image_url": {"url": "file://./photo.jpg"}}
        ]
    }
]

# First response
response = llm.create_chat_completion(messages=messages, max_tokens=150)
messages.append({
    "role": "assistant",
    "content": response['choices'][0]['message']['content']
})

# Follow-up question
messages.append({
    "role": "user",
    "content": "What colors are most prominent in this image?"
})

response = llm.create_chat_completion(messages=messages, max_tokens=100)
print("Color analysis:", response['choices'][0]['message']['content'])
```

### Batch Image Processing

```python
import os
from pathlib import Path

# Process multiple images
image_dir = Path("./images")
image_files = list(image_dir.glob("*.jpg")) + list(image_dir.glob("*.png"))

for image_file in image_files[:5]:  # Process first 5 images
    messages = [
        {
            "role": "user",
            "content": [
                {"type": "text", "text": "Provide a brief caption for this image."},
                {"type": "image_url", "image_url": {"url": f"file://{image_file}"}}
            ]
        }
    ]

    response = llm.create_chat_completion(
        messages=messages,
        max_tokens=50,
        temperature=0.7,
    )

    caption = response['choices'][0]['message']['content']
    print(f"{image_file.name}: {caption}")
```

### Image-based Question Answering

```python
def ask_about_image(image_path: str, question: str) -> str:
    """Ask a specific question about an image."""
    messages = [
        {
            "role": "user",
            "content": [
                {"type": "text", "text": question},
                {"type": "image_url", "image_url": {"url": f"file://{image_path}"}}
            ]
        }
    ]

    response = llm.create_chat_completion(
        messages=messages,
        max_tokens=100,
        temperature=0.3,  # Lower temperature for more factual responses
    )

    return response['choices'][0]['message']['content']

# Example questions
image_path = "./sample_image.jpg"
questions = [
    "How many people are in this image?",
    "What is the main activity happening?",
    "What is the setting or location?",
    "What emotions are visible on people's faces?",
    "Are there any text or signs visible?",
]

for question in questions:
    answer = ask_about_image(image_path, question)
    print(f"Q: {question}")
    print(f"A: {answer}\n")
```

### Low-Level Image Embedding

```python
import llama_cpp.llama_cpp as llama_cpp
from ctypes import c_void_p, cast

# Assuming you have access to CLIP context (advanced usage)
# This would typically be handled internally by the Llama class

def process_image_embedding(image_path: str, ctx_clip, n_threads: int = 4):
    """Process image embedding at low level."""

    # Create image embedding from file
    embed = llama_cpp.llava_image_embed_make_with_filename(
        ctx_clip,
        n_threads,
        image_path.encode('utf-8')
    )

    if embed:
        print(f"Created embedding for {image_path}")
        print(f"Image positions: {embed.contents.n_image_pos}")

        # Process embedding (your custom logic here)
        # ...

        # Free embedding memory
        llama_cpp.llava_image_embed_free(embed)
        print("Embedding memory freed")
    else:
        print(f"Failed to create embedding for {image_path}")

# Note: This is advanced usage and requires proper CLIP context setup
```

### Image Format Support

```python
import base64
from io import BytesIO
from PIL import Image

def process_base64_image(base64_data: str, question: str) -> str:
    """Process image provided as base64 data."""

    # Convert base64 to image URL format
    image_url = f"data:image/jpeg;base64,{base64_data}"

    messages = [
        {
            "role": "user",
            "content": [
                {"type": "text", "text": question},
                {"type": "image_url", "image_url": {"url": image_url}}
            ]
        }
    ]

    response = llm.create_chat_completion(messages=messages, max_tokens=150)
    return response['choices'][0]['message']['content']

def resize_and_encode_image(image_path: str, max_size: tuple = (512, 512)) -> str:
    """Resize image and convert to base64 for processing."""
    with Image.open(image_path) as img:
        # Resize image to reduce processing time
        img.thumbnail(max_size, Image.Resampling.LANCZOS)

        # Convert to RGB if necessary
        if img.mode != 'RGB':
            img = img.convert('RGB')

        # Save to base64
        buffer = BytesIO()
        img.save(buffer, format='JPEG', quality=85)
        base64_data = base64.b64encode(buffer.getvalue()).decode('utf-8')

    return base64_data

# Process resized image
image_path = "./large_image.jpg"
base64_image = resize_and_encode_image(image_path)
result = process_base64_image(base64_image, "What are the main objects in this image?")
print(result)
```

### Vision Model Performance Tuning

```python
# Initialize vision model with optimized settings
llm = Llama(
    model_path="./models/llava-1.6-mistral-7b.gguf",
    clip_model_path="./models/mmproj-model.gguf",
    n_ctx=4096,  # Larger context for complex vision tasks
    n_gpu_layers=30,  # Offload to GPU for faster processing
    verbose=False,
    n_threads=8,
)

def benchmark_vision_processing(image_path: str, num_runs: int = 3):
    """Benchmark vision processing performance."""
    import time

    messages = [
        {
            "role": "user",
            "content": [
                {"type": "text", "text": "Describe this image concisely."},
                {"type": "image_url", "image_url": {"url": f"file://{image_path}"}}
            ]
        }
    ]

    times = []
    for i in range(num_runs):
        start_time = time.time()

        response = llm.create_chat_completion(
            messages=messages,
            max_tokens=100,
            temperature=0.5,
        )

        end_time = time.time()
        processing_time = end_time - start_time
        times.append(processing_time)

        print(f"Run {i+1}: {processing_time:.2f}s")
        if i == 0:  # Print response from first run
            print(f"Response: {response['choices'][0]['message']['content']}")

    avg_time = sum(times) / len(times)
    print(f"Average processing time: {avg_time:.2f}s")

benchmark_vision_processing("./test_image.jpg")
```

### Image Comparison

```python
def compare_images(image1_path: str, image2_path: str) -> str:
    """Compare two images and describe differences."""
    messages = [
        {
            "role": "user",
            "content": [
                {"type": "text", "text": "Compare these two images and describe the main differences:"},
                {"type": "image_url", "image_url": {"url": f"file://{image1_path}"}},
                {"type": "text", "text": "versus"},
                {"type": "image_url", "image_url": {"url": f"file://{image2_path}"}}
            ]
        }
    ]

    response = llm.create_chat_completion(
        messages=messages,
        max_tokens=200,
        temperature=0.3,
    )

    return response['choices'][0]['message']['content']

# Compare two images
comparison = compare_images("./before.jpg", "./after.jpg")
print("Image comparison:", comparison)
```

### Visual Chat Interface

```python
class VisualChatbot:
    def __init__(self, model_path: str, clip_model_path: str):
        self.llm = Llama(
            model_path=model_path,
            clip_model_path=clip_model_path,
            n_ctx=2048,
            verbose=False,
        )
        self.conversation_history = []

    def add_text_message(self, text: str):
        """Add text message to conversation."""
        self.conversation_history.append({
            "role": "user",
            "content": text
        })

    def add_image_message(self, image_path: str, text: str = ""):
        """Add image with optional text to conversation."""
        content = []
        if text:
            content.append({"type": "text", "text": text})
        content.append({"type": "image_url", "image_url": {"url": f"file://{image_path}"}})

        self.conversation_history.append({
            "role": "user",
            "content": content
        })

    def get_response(self, max_tokens: int = 150) -> str:
        """Get response from the model."""
        response = self.llm.create_chat_completion(
            messages=self.conversation_history,
            max_tokens=max_tokens,
        )

        assistant_message = response['choices'][0]['message']['content']
        self.conversation_history.append({
            "role": "assistant",
            "content": assistant_message
        })

        return assistant_message

# Example usage
chatbot = VisualChatbot(
    "./models/llava-v1.5-7b.gguf",
    "./models/mmproj-model.gguf"
)

chatbot.add_image_message("./vacation_photo.jpg", "Look at this vacation photo!")
response1 = chatbot.get_response()
print("Bot:", response1)

chatbot.add_text_message("What activities would you recommend at this location?")
response2 = chatbot.get_response()
print("Bot:", response2)
```