# Vision and Multimodal

LLaVA vision model integration for processing images alongside text, supporting various image formats and multimodal conversation flows with visual understanding capabilities.

## Capabilities

### Image Embedding

Create embeddings from images for vision-language processing.

```python { .api }
def llava_image_embed_make_with_filename(
    ctx_clip,
    n_threads: int,
    image_path: bytes
) -> llava_image_embed:
    """
    Create image embedding from image file.

    Args:
        ctx_clip: CLIP context pointer
        n_threads: Number of threads to use for processing
        image_path: Path to image file (as bytes)

    Returns:
        Image embedding structure pointer
    """

def llava_image_embed_make_with_bytes(
    ctx_clip,
    n_threads: int,
    image_bytes: bytes,
    image_bytes_length: int
) -> llava_image_embed:
    """
    Create image embedding from image bytes.

    Args:
        ctx_clip: CLIP context pointer
        n_threads: Number of threads to use for processing
        image_bytes: Raw image data
        image_bytes_length: Length of image data

    Returns:
        Image embedding structure
    """

def llava_image_embed_free(embed) -> None:
    """
    Free image embedding memory.

    Args:
        embed: Image embedding to free
    """
```

### Vision Model Validation

Validate compatibility between text and vision model embeddings.

```python { .api }
def llava_validate_embed_size(
    n_embd: int,
    n_image_embd: int
) -> bool:
    """
    Validate that text and image embedding dimensions are compatible.

    Args:
        n_embd: Text model embedding dimensions
        n_image_embd: Image model embedding dimensions

    Returns:
        True if embeddings are compatible
    """
```

### CLIP Context Management

Manage CLIP vision encoder context for image processing.

```python { .api }
# Type definitions for vision processing
clip_ctx_p = ctypes.POINTER(ctypes.c_void_p)  # CLIP context pointer type

class llava_image_embed(ctypes.Structure):
    """Image embedding structure for vision models."""
    _fields_ = [
        ("embed", ctypes.POINTER(ctypes.c_float)),
        ("n_image_pos", ctypes.c_int),
    ]
```

## Usage Examples

### Basic Image Processing

```python
from llama_cpp import Llama
import llama_cpp.llama_cpp as llama_cpp

# Initialize vision-capable model (LLaVA)
llm = Llama(
    model_path="./models/llava-v1.5-7b.gguf",
    clip_model_path="./models/mmproj-model.gguf",  # Vision projector
    n_ctx=2048,
    verbose=False,
)

# Process image with text prompt
response = llm.create_chat_completion(
    messages=[
        {
            "role": "user",
            "content": [
                {"type": "text", "text": "What do you see in this image?"},
                {"type": "image_url", "image_url": {"url": "file://./image.jpg"}}
            ]
        }
    ],
    max_tokens=200,
)

print("Vision response:", response['choices'][0]['message']['content'])
```

### Image Analysis Conversation

```python
# Multi-turn conversation about an image
messages = [
    {
        "role": "user",
        "content": [
            {"type": "text", "text": "Describe this image in detail."},
            {"type": "image_url", "image_url": {"url": "file://./photo.jpg"}}
        ]
    }
]

# First response
response = llm.create_chat_completion(messages=messages, max_tokens=150)
messages.append({
    "role": "assistant",
    "content": response['choices'][0]['message']['content']
})

# Follow-up question
messages.append({
    "role": "user",
    "content": "What colors are most prominent in this image?"
})

response = llm.create_chat_completion(messages=messages, max_tokens=100)
print("Color analysis:", response['choices'][0]['message']['content'])
```

### Batch Image Processing

```python
import os
from pathlib import Path

# Process multiple images
image_dir = Path("./images")
image_files = list(image_dir.glob("*.jpg")) + list(image_dir.glob("*.png"))

for image_file in image_files[:5]:  # Process first 5 images
    messages = [
        {
            "role": "user",
            "content": [
                {"type": "text", "text": "Provide a brief caption for this image."},
                {"type": "image_url", "image_url": {"url": f"file://{image_file}"}}
            ]
        }
    ]

    response = llm.create_chat_completion(
        messages=messages,
        max_tokens=50,
        temperature=0.7,
    )

    caption = response['choices'][0]['message']['content']
    print(f"{image_file.name}: {caption}")
```

### Image-based Question Answering

```python
def ask_about_image(image_path: str, question: str) -> str:
    """Ask a specific question about an image."""
    messages = [
        {
            "role": "user",
            "content": [
                {"type": "text", "text": question},
                {"type": "image_url", "image_url": {"url": f"file://{image_path}"}}
            ]
        }
    ]

    response = llm.create_chat_completion(
        messages=messages,
        max_tokens=100,
        temperature=0.3,  # Lower temperature for more factual responses
    )

    return response['choices'][0]['message']['content']

# Example questions
image_path = "./sample_image.jpg"
questions = [
    "How many people are in this image?",
    "What is the main activity happening?",
    "What is the setting or location?",
    "What emotions are visible on people's faces?",
    "Are there any text or signs visible?",
]

for question in questions:
    answer = ask_about_image(image_path, question)
    print(f"Q: {question}")
    print(f"A: {answer}\n")
```

### Low-Level Image Embedding

```python
import llama_cpp.llama_cpp as llama_cpp
from ctypes import c_void_p, cast

# Assuming you have access to CLIP context (advanced usage)
# This would typically be handled internally by the Llama class

def process_image_embedding(image_path: str, ctx_clip, n_threads: int = 4):
    """Process image embedding at low level."""

    # Create image embedding from file
    embed = llama_cpp.llava_image_embed_make_with_filename(
        ctx_clip,
        n_threads,
        image_path.encode('utf-8')
    )

    if embed:
        print(f"Created embedding for {image_path}")
        print(f"Image positions: {embed.contents.n_image_pos}")

        # Process embedding (your custom logic here)
        # ...

        # Free embedding memory
        llama_cpp.llava_image_embed_free(embed)
        print("Embedding memory freed")
    else:
        print(f"Failed to create embedding for {image_path}")

# Note: This is advanced usage and requires proper CLIP context setup
```

### Image Format Support

```python
import base64
from io import BytesIO
from PIL import Image

def process_base64_image(base64_data: str, question: str) -> str:
    """Process image provided as base64 data."""

    # Convert base64 to image URL format
    image_url = f"data:image/jpeg;base64,{base64_data}"

    messages = [
        {
            "role": "user",
            "content": [
                {"type": "text", "text": question},
                {"type": "image_url", "image_url": {"url": image_url}}
            ]
        }
    ]

    response = llm.create_chat_completion(messages=messages, max_tokens=150)
    return response['choices'][0]['message']['content']

def resize_and_encode_image(image_path: str, max_size: tuple = (512, 512)) -> str:
    """Resize image and convert to base64 for processing."""
    with Image.open(image_path) as img:
        # Resize image to reduce processing time
        img.thumbnail(max_size, Image.Resampling.LANCZOS)

        # Convert to RGB if necessary
        if img.mode != 'RGB':
            img = img.convert('RGB')

        # Save to base64
        buffer = BytesIO()
        img.save(buffer, format='JPEG', quality=85)
        base64_data = base64.b64encode(buffer.getvalue()).decode('utf-8')

    return base64_data

# Process resized image
image_path = "./large_image.jpg"
base64_image = resize_and_encode_image(image_path)
result = process_base64_image(base64_image, "What are the main objects in this image?")
print(result)
```

### Vision Model Performance Tuning

```python
# Initialize vision model with optimized settings
llm = Llama(
    model_path="./models/llava-1.6-mistral-7b.gguf",
    clip_model_path="./models/mmproj-model.gguf",
    n_ctx=4096,  # Larger context for complex vision tasks
    n_gpu_layers=30,  # Offload to GPU for faster processing
    verbose=False,
    n_threads=8,
)

def benchmark_vision_processing(image_path: str, num_runs: int = 3):
    """Benchmark vision processing performance."""
    import time

    messages = [
        {
            "role": "user",
            "content": [
                {"type": "text", "text": "Describe this image concisely."},
                {"type": "image_url", "image_url": {"url": f"file://{image_path}"}}
            ]
        }
    ]

    times = []
    for i in range(num_runs):
        start_time = time.time()

        response = llm.create_chat_completion(
            messages=messages,
            max_tokens=100,
            temperature=0.5,
        )

        end_time = time.time()
        processing_time = end_time - start_time
        times.append(processing_time)

        print(f"Run {i+1}: {processing_time:.2f}s")
        if i == 0:  # Print response from first run
            print(f"Response: {response['choices'][0]['message']['content']}")

    avg_time = sum(times) / len(times)
    print(f"Average processing time: {avg_time:.2f}s")

benchmark_vision_processing("./test_image.jpg")
```

### Image Comparison

```python
def compare_images(image1_path: str, image2_path: str) -> str:
    """Compare two images and describe differences."""
    messages = [
        {
            "role": "user",
            "content": [
                {"type": "text", "text": "Compare these two images and describe the main differences:"},
                {"type": "image_url", "image_url": {"url": f"file://{image1_path}"}},
                {"type": "text", "text": "versus"},
                {"type": "image_url", "image_url": {"url": f"file://{image2_path}"}}
            ]
        }
    ]

    response = llm.create_chat_completion(
        messages=messages,
        max_tokens=200,
        temperature=0.3,
    )

    return response['choices'][0]['message']['content']

# Compare two images
comparison = compare_images("./before.jpg", "./after.jpg")
print("Image comparison:", comparison)
```

### Visual Chat Interface

```python
class VisualChatbot:
    def __init__(self, model_path: str, clip_model_path: str):
        self.llm = Llama(
            model_path=model_path,
            clip_model_path=clip_model_path,
            n_ctx=2048,
            verbose=False,
        )
        self.conversation_history = []

    def add_text_message(self, text: str):
        """Add text message to conversation."""
        self.conversation_history.append({
            "role": "user",
            "content": text
        })

    def add_image_message(self, image_path: str, text: str = ""):
        """Add image with optional text to conversation."""
        content = []
        if text:
            content.append({"type": "text", "text": text})
        content.append({"type": "image_url", "image_url": {"url": f"file://{image_path}"}})

        self.conversation_history.append({
            "role": "user",
            "content": content
        })

    def get_response(self, max_tokens: int = 150) -> str:
        """Get response from the model."""
        response = self.llm.create_chat_completion(
            messages=self.conversation_history,
            max_tokens=max_tokens,
        )

        assistant_message = response['choices'][0]['message']['content']
        self.conversation_history.append({
            "role": "assistant",
            "content": assistant_message
        })

        return assistant_message

# Example usage
chatbot = VisualChatbot(
    "./models/llava-v1.5-7b.gguf",
    "./models/mmproj-model.gguf"
)

chatbot.add_image_message("./vacation_photo.jpg", "Look at this vacation photo!")
response1 = chatbot.get_response()
print("Bot:", response1)

chatbot.add_text_message("What activities would you recommend at this location?")
response2 = chatbot.get_response()
print("Bot:", response2)
```