Python bindings for the llama.cpp library providing high-performance LLM inference with OpenAI-compatible APIs.
—
LLaVA vision model integration for processing images alongside text, supporting various image formats and multimodal conversation flows with visual understanding capabilities.
Create embeddings from images for vision-language processing.
def llava_image_embed_make_with_filename(
    ctx_clip,
    n_threads: int,
    image_path: bytes,
) -> "llava_image_embed":
    """
    Create an image embedding from an image file on disk.

    The return annotation is a quoted forward reference because the
    ``llava_image_embed`` structure is declared later in this file.

    Args:
        ctx_clip: CLIP context pointer used to encode the image.
        n_threads: Number of threads to use for processing.
        image_path: Path to the image file (as bytes).

    Returns:
        Pointer to an ``llava_image_embed`` structure; release it with
        ``llava_image_embed_free`` when finished.
    """
def llava_image_embed_make_with_bytes(
    ctx_clip,
    image_bytes: bytes,
    image_bytes_length: int,
) -> "llava_image_embed":
    """
    Create an image embedding from raw in-memory image data.

    The return annotation is a quoted forward reference because the
    ``llava_image_embed`` structure is declared later in this file.

    Args:
        ctx_clip: CLIP context pointer used to encode the image.
        image_bytes: Raw image data.
        image_bytes_length: Length of the image data in bytes.

    Returns:
        Pointer to an ``llava_image_embed`` structure; release it with
        ``llava_image_embed_free`` when finished.
    """
def llava_image_embed_free(embed) -> None:
    """
    Release the memory held by an image embedding.

    Args:
        embed: Image embedding structure to free.
    """


# Validate compatibility between text and vision model embeddings.
def llava_validate_embed_size(
    n_embd: int,
    n_image_embd: int,
) -> bool:
    """
    Check that text and image embedding dimensions are compatible.

    Args:
        n_embd: Embedding dimensionality of the text model.
        n_image_embd: Embedding dimensionality of the image model.

    Returns:
        True if the two embedding sizes are compatible.
    """


# Manage CLIP vision encoder context for image processing.
# Type definitions for vision processing.
clip_ctx_p = ctypes.POINTER(ctypes.c_void_p)  # opaque CLIP context pointer type


class llava_image_embed(ctypes.Structure):
    """Image embedding structure produced by the LLaVA vision encoder."""

    _fields_ = [
        # Flat float buffer holding the embedding values.
        ("embed", ctypes.POINTER(ctypes.c_float)),
        # Number of image token positions covered by the embedding.
        ("n_image_pos", ctypes.c_int),
    ]


from llama_cpp import Llama
import llama_cpp.llama_cpp as llama_cpp
# Initialize a vision-capable model (LLaVA) with its CLIP projector.
llm = Llama(
    model_path="./models/llava-v1.5-7b.gguf",
    clip_model_path="./models/mmproj-model.gguf",  # vision projector weights
    n_ctx=2048,
    verbose=False,
)

# Ask about a local image by combining text and image content parts
# in a single OpenAI-style chat message.
_vision_prompt = [
    {
        "role": "user",
        "content": [
            {"type": "text", "text": "What do you see in this image?"},
            {"type": "image_url", "image_url": {"url": "file://./image.jpg"}},
        ],
    }
]
response = llm.create_chat_completion(messages=_vision_prompt, max_tokens=200)
print("Vision response:", response['choices'][0]['message']['content'])

# Multi-turn conversation about an image
# Seed the conversation history with an image-grounded request.
messages = [
    {
        "role": "user",
        "content": [
            {"type": "text", "text": "Describe this image in detail."},
            {"type": "image_url", "image_url": {"url": "file://./photo.jpg"}},
        ],
    }
]

# First turn: get the description and record it in the history.
response = llm.create_chat_completion(messages=messages, max_tokens=150)
assistant_reply = response['choices'][0]['message']['content']
messages.append({"role": "assistant", "content": assistant_reply})

# Second turn: a follow-up question that refers back to the same image.
messages.append({
    "role": "user",
    "content": "What colors are most prominent in this image?",
})
response = llm.create_chat_completion(messages=messages, max_tokens=100)
print("Color analysis:", response['choices'][0]['message']['content'])

import os
from pathlib import Path
# Caption a small batch of images from a directory.
image_dir = Path("./images")
image_files = list(image_dir.glob("*.jpg")) + list(image_dir.glob("*.png"))

for image_file in image_files[:5]:  # only the first 5 images
    messages = [
        {
            "role": "user",
            "content": [
                {"type": "text", "text": "Provide a brief caption for this image."},
                {"type": "image_url", "image_url": {"url": f"file://{image_file}"}},
            ],
        }
    ]
    response = llm.create_chat_completion(
        messages=messages,
        max_tokens=50,
        temperature=0.7,
    )
    caption = response['choices'][0]['message']['content']
    print(f"{image_file.name}: {caption}")


def ask_about_image(image_path: str, question: str) -> str:
    """Ask a specific question about an image."""
    query = [
        {
            "role": "user",
            "content": [
                {"type": "text", "text": question},
                {"type": "image_url", "image_url": {"url": f"file://{image_path}"}},
            ],
        }
    ]
    # Lower temperature keeps the answer factual rather than creative.
    response = llm.create_chat_completion(
        messages=query,
        max_tokens=100,
        temperature=0.3,
    )
    return response['choices'][0]['message']['content']


# Walk through a checklist of questions about one sample image.
image_path = "./sample_image.jpg"
questions = [
    "How many people are in this image?",
    "What is the main activity happening?",
    "What is the setting or location?",
    "What emotions are visible on people's faces?",
    "Are there any text or signs visible?",
]
for question in questions:
    answer = ask_about_image(image_path, question)
    print(f"Q: {question}")
    print(f"A: {answer}\n")

import llama_cpp.llama_cpp as llama_cpp
from ctypes import c_void_p, cast
# Assuming you have access to a CLIP context (advanced usage).
# This would typically be handled internally by the Llama class.
def process_image_embedding(image_path: str, ctx_clip, n_threads: int = 4):
    """
    Create, inspect, and free a LLaVA image embedding at the C-API level.

    Args:
        image_path: Path to the image file to embed.
        ctx_clip: CLIP context pointer obtained from the loaded projector.
        n_threads: Number of threads the encoder may use (default 4).
    """
    # The C API signature is (ctx_clip, n_threads, image_path); the original
    # snippet omitted n_threads. The path must be passed as bytes.
    embed = llama_cpp.llava_image_embed_make_with_filename(
        ctx_clip,
        n_threads,
        image_path.encode('utf-8'),
    )
    if embed:
        print(f"Created embedding for {image_path}")
        print(f"Image positions: {embed.contents.n_image_pos}")
        # Process embedding (your custom logic here)
        # ...
        # Always free the embedding to avoid leaking native memory.
        llama_cpp.llava_image_embed_free(embed)
        print("Embedding memory freed")
    else:
        print(f"Failed to create embedding for {image_path}")

# Note: This is advanced usage and requires proper CLIP context setup.

import base64
from io import BytesIO
from PIL import Image
def process_base64_image(base64_data: str, question: str) -> str:
    """Answer a question about an image supplied as base64-encoded JPEG data."""
    # Wrap the raw base64 payload in a data URL the chat API understands.
    image_url = f"data:image/jpeg;base64,{base64_data}"
    request = [
        {
            "role": "user",
            "content": [
                {"type": "text", "text": question},
                {"type": "image_url", "image_url": {"url": image_url}},
            ],
        }
    ]
    response = llm.create_chat_completion(messages=request, max_tokens=150)
    return response['choices'][0]['message']['content']
def resize_and_encode_image(image_path: str, max_size: tuple = (512, 512)) -> str:
    """Resize an image and return it as base64-encoded JPEG data."""
    with Image.open(image_path) as img:
        # Shrink in place (preserving aspect ratio) to reduce processing time.
        img.thumbnail(max_size, Image.Resampling.LANCZOS)
        # JPEG requires RGB, so normalize palette/alpha modes first.
        if img.mode != 'RGB':
            img = img.convert('RGB')
        # Serialize to an in-memory JPEG, then base64-encode it.
        buffer = BytesIO()
        img.save(buffer, format='JPEG', quality=85)
        encoded = base64.b64encode(buffer.getvalue()).decode('utf-8')
    return encoded
# Downscale a large image before sending it to the model.
image_path = "./large_image.jpg"
base64_image = resize_and_encode_image(image_path)
result = process_base64_image(base64_image, "What are the main objects in this image?")
print(result)

# Initialize vision model with optimized settings
# Reload the model with settings tuned for heavier multimodal workloads.
llm = Llama(
    model_path="./models/llava-1.6-mistral-7b.gguf",
    clip_model_path="./models/mmproj-model.gguf",
    n_ctx=4096,        # larger context for complex vision tasks
    n_gpu_layers=30,   # offload layers to the GPU for faster processing
    n_threads=8,
    verbose=False,
)
def benchmark_vision_processing(image_path: str, num_runs: int = 3):
    """
    Time repeated vision requests against the same image and report stats.

    Args:
        image_path: Path to the image used for every benchmark run.
        num_runs: How many timed requests to issue (default 3).
    """
    import time

    request = [
        {
            "role": "user",
            "content": [
                {"type": "text", "text": "Describe this image concisely."},
                {"type": "image_url", "image_url": {"url": f"file://{image_path}"}},
            ],
        }
    ]
    durations = []
    for run_index in range(num_runs):
        started = time.time()
        response = llm.create_chat_completion(
            messages=request,
            max_tokens=100,
            temperature=0.5,
        )
        elapsed = time.time() - started
        durations.append(elapsed)
        print(f"Run {run_index + 1}: {elapsed:.2f}s")
        if run_index == 0:  # show the model output once, from the first run
            print(f"Response: {response['choices'][0]['message']['content']}")
    avg_time = sum(durations) / len(durations)
    print(f"Average processing time: {avg_time:.2f}s")
# Run the benchmark once against a sample image.
benchmark_vision_processing("./test_image.jpg")


def compare_images(image1_path: str, image2_path: str) -> str:
    """Compare two images and describe their main differences."""
    # Interleave both images with text so the model sees them as one request.
    request = [
        {
            "role": "user",
            "content": [
                {"type": "text", "text": "Compare these two images and describe the main differences:"},
                {"type": "image_url", "image_url": {"url": f"file://{image1_path}"}},
                {"type": "text", "text": "versus"},
                {"type": "image_url", "image_url": {"url": f"file://{image2_path}"}},
            ],
        }
    ]
    response = llm.create_chat_completion(
        messages=request,
        max_tokens=200,
        temperature=0.3,  # low temperature for a factual comparison
    )
    return response['choices'][0]['message']['content']


# Compare two images
comparison = compare_images("./before.jpg", "./after.jpg")
print("Image comparison:", comparison)


class VisualChatbot:
    """Stateful chat assistant that mixes text and image messages."""

    def __init__(self, model_path: str, clip_model_path: str):
        # Load the language model together with its CLIP vision projector.
        self.llm = Llama(
            model_path=model_path,
            clip_model_path=clip_model_path,
            n_ctx=2048,
            verbose=False,
        )
        self.conversation_history = []  # running list of chat messages

    def add_text_message(self, text: str):
        """Add text message to conversation."""
        self.conversation_history.append({"role": "user", "content": text})

    def add_image_message(self, image_path: str, text: str = ""):
        """Add image with optional text to conversation."""
        parts = []
        if text:
            parts.append({"type": "text", "text": text})
        parts.append({"type": "image_url", "image_url": {"url": f"file://{image_path}"}})
        self.conversation_history.append({"role": "user", "content": parts})

    def get_response(self, max_tokens: int = 150) -> str:
        """Get response from the model."""
        response = self.llm.create_chat_completion(
            messages=self.conversation_history,
            max_tokens=max_tokens,
        )
        reply = response['choices'][0]['message']['content']
        # Record the reply so later turns keep the full conversation context.
        self.conversation_history.append({"role": "assistant", "content": reply})
        return reply


# Example usage
chatbot = VisualChatbot(
    "./models/llava-v1.5-7b.gguf",
    "./models/mmproj-model.gguf",
)
chatbot.add_image_message("./vacation_photo.jpg", "Look at this vacation photo!")
response1 = chatbot.get_response()
print("Bot:", response1)
chatbot.add_text_message("What activities would you recommend at this location?")
response2 = chatbot.get_response()
print("Bot:", response2)

# Install with Tessl CLI:
#   npx tessl i tessl/pypi-llama-cpp-python