CtrlK
Blog · Docs · Log in · Get started
Tessl Logo

tessl/pypi-llama-cpp-python

Python bindings for the llama.cpp library providing high-performance LLM inference with OpenAI-compatible APIs.

Pending
Overview
Eval results
Files

docs/vision.md

Vision and Multimodal

LLaVA vision model integration for processing images alongside text, supporting various image formats and multimodal conversation flows with visual understanding capabilities.

Capabilities

Image Embedding

Create embeddings from images for vision-language processing.

def llava_image_embed_make_with_filename(
    ctx_clip,
    n_threads: int,
    image_path: bytes
) -> llava_image_embed:
    """
    Build a llava_image_embed by loading and encoding an image from disk.

    Args:
        ctx_clip: Pointer to an initialized CLIP vision context.
        n_threads: Worker thread count used while encoding the image.
        image_path: Filesystem path to the image, encoded as bytes.

    Returns:
        Pointer to the resulting image embedding structure.
    """

def llava_image_embed_make_with_bytes(
    ctx_clip,
    image_bytes: bytes,
    image_bytes_length: int
) -> llava_image_embed:
    """
    Create image embedding from image bytes.

    NOTE(review): upstream llama.cpp llava.h declares this function as
    (ctx_clip, n_threads, image_bytes, image_bytes_length); the n_threads
    parameter appears to be missing from this signature — verify against
    the actual binding before relying on it.

    Args:
        ctx_clip: CLIP context pointer
        image_bytes: Raw image data
        image_bytes_length: Length of image data
        
    Returns:
        Image embedding structure
    """

def llava_image_embed_free(embed) -> None:
    """
    Release the native memory held by an image embedding.

    Args:
        embed: Embedding structure previously returned by one of the
            llava_image_embed_make_* functions.
    """

Vision Model Validation

Validate compatibility between text and vision model embeddings.

def llava_validate_embed_size(
    n_embd: int,
    n_image_embd: int
) -> bool:
    """
    Check that the text model and vision model embedding widths match.

    Args:
        n_embd: Embedding dimensionality of the text model.
        n_image_embd: Embedding dimensionality of the image encoder.

    Returns:
        True when the two embedding sizes are compatible.
    """

CLIP Context Management

Manage CLIP vision encoder context for image processing.

# Type definitions for vision processing
# Opaque handle to the native CLIP vision encoder; it is only ever passed
# back into the C API, never dereferenced from Python.
clip_ctx_p = ctypes.POINTER(ctypes.c_void_p)  # CLIP context pointer type

class llava_image_embed(ctypes.Structure):
    """Image embedding structure for vision models.

    Mirrors the C struct layout, so field order and types must not change:
    ``embed`` points at the float buffer of embedding values and
    ``n_image_pos`` is the number of image token positions it covers.
    """
    _fields_ = [
        ("embed", ctypes.POINTER(ctypes.c_float)),
        ("n_image_pos", ctypes.c_int),
    ]

Usage Examples

Basic Image Processing

from llama_cpp import Llama
import llama_cpp.llama_cpp as llama_cpp

# Load a LLaVA checkpoint together with its CLIP projector so the model
# can consume images alongside text.
llm = Llama(
    model_path="./models/llava-v1.5-7b.gguf",
    clip_model_path="./models/mmproj-model.gguf",  # Vision projector
    n_ctx=2048,
    verbose=False,
)

# Ask about an image by mixing text and image_url parts in one user turn.
user_message = {
    "role": "user",
    "content": [
        {"type": "text", "text": "What do you see in this image?"},
        {"type": "image_url", "image_url": {"url": "file://./image.jpg"}},
    ],
}
response = llm.create_chat_completion(messages=[user_message], max_tokens=200)

print("Vision response:", response['choices'][0]['message']['content'])

Image Analysis Conversation

# Hold a running conversation about a single image across several turns.
messages = [
    {
        "role": "user",
        "content": [
            {"type": "text", "text": "Describe this image in detail."},
            {"type": "image_url", "image_url": {"url": "file://./photo.jpg"}},
        ],
    }
]

# Turn 1: full description of the photo, appended back into the history.
first_reply = llm.create_chat_completion(messages=messages, max_tokens=150)
messages.append({
    "role": "assistant",
    "content": first_reply['choices'][0]['message']['content'],
})

# Turn 2: follow-up that relies on the image context established above.
messages.append({
    "role": "user",
    "content": "What colors are most prominent in this image?",
})

response = llm.create_chat_completion(messages=messages, max_tokens=100)
print("Color analysis:", response['choices'][0]['message']['content'])

Batch Image Processing

from pathlib import Path

# Caption a directory of images in one pass.
# (The original example also imported ``os`` but never used it; pathlib
# covers everything needed here.)
image_dir = Path("./images")
image_files = list(image_dir.glob("*.jpg")) + list(image_dir.glob("*.png"))

for image_file in image_files[:5]:  # Process first 5 images only, to keep the demo fast
    messages = [
        {
            "role": "user",
            "content": [
                {"type": "text", "text": "Provide a brief caption for this image."},
                {"type": "image_url", "image_url": {"url": f"file://{image_file}"}}
            ]
        }
    ]

    # A mild temperature yields varied but still relevant captions.
    response = llm.create_chat_completion(
        messages=messages,
        max_tokens=50,
        temperature=0.7,
    )

    caption = response['choices'][0]['message']['content']
    print(f"{image_file.name}: {caption}")

Image-based Question Answering

def ask_about_image(image_path: str, question: str) -> str:
    """Pose a single question about the image at *image_path* and return the answer."""
    prompt = [
        {
            "role": "user",
            "content": [
                {"type": "text", "text": question},
                {"type": "image_url", "image_url": {"url": f"file://{image_path}"}},
            ],
        }
    ]

    result = llm.create_chat_completion(
        messages=prompt,
        max_tokens=100,
        temperature=0.3,  # Lower temperature for more factual responses
    )
    return result['choices'][0]['message']['content']

# Run a battery of factual questions against one sample image.
image_path = "./sample_image.jpg"
questions = [
    "How many people are in this image?",
    "What is the main activity happening?",
    "What is the setting or location?",
    "What emotions are visible on people's faces?",
    "Are there any text or signs visible?",
]

for question in questions:
    answer = ask_about_image(image_path, question)
    print(f"Q: {question}")
    print(f"A: {answer}\n")

Low-Level Image Embedding

import llama_cpp.llama_cpp as llama_cpp
from ctypes import c_void_p, cast

# Assuming you have access to CLIP context (advanced usage)
# This would typically be handled internally by the Llama class

def process_image_embedding(image_path: str, ctx_clip, n_threads: int = 4):
    """Create, inspect, and free an image embedding at the low level.

    Args:
        image_path: Path to the image file on disk.
        ctx_clip: CLIP context pointer obtained from the native API.
        n_threads: Thread count forwarded to the native encoder
            (new optional parameter; defaults to 4).
    """

    # BUG FIX: llava_image_embed_make_with_filename takes
    # (ctx_clip, n_threads, image_path) — the original example omitted
    # the required n_threads argument.
    embed = llama_cpp.llava_image_embed_make_with_filename(
        ctx_clip,
        n_threads,
        image_path.encode('utf-8')
    )

    if embed:
        print(f"Created embedding for {image_path}")
        print(f"Image positions: {embed.contents.n_image_pos}")

        # Process embedding (your custom logic here)
        # ...

        # Always release the native allocation when done.
        llama_cpp.llava_image_embed_free(embed)
        print("Embedding memory freed")
    else:
        print(f"Failed to create embedding for {image_path}")

# Note: This is advanced usage and requires proper CLIP context setup

Image Format Support

import base64
from io import BytesIO
from PIL import Image

def process_base64_image(base64_data: str, question: str) -> str:
    """Answer *question* about an image supplied as base64-encoded JPEG data."""
    # Wrap the raw base64 payload in a data URL the chat API understands.
    data_url = f"data:image/jpeg;base64,{base64_data}"

    chat = [
        {
            "role": "user",
            "content": [
                {"type": "text", "text": question},
                {"type": "image_url", "image_url": {"url": data_url}},
            ],
        }
    ]

    reply = llm.create_chat_completion(messages=chat, max_tokens=150)
    return reply['choices'][0]['message']['content']

def resize_and_encode_image(image_path: str, max_size: tuple = (512, 512)) -> str:
    """Shrink an image to fit within *max_size* and return it as base64 JPEG."""
    with Image.open(image_path) as img:
        # Downscale in place; thumbnail preserves the aspect ratio.
        img.thumbnail(max_size, Image.Resampling.LANCZOS)

        # JPEG cannot carry an alpha channel, so normalise to RGB first.
        if img.mode != 'RGB':
            img = img.convert('RGB')

        # Encode to an in-memory JPEG, then to base64 text.
        buffer = BytesIO()
        img.save(buffer, format='JPEG', quality=85)
        return base64.b64encode(buffer.getvalue()).decode('utf-8')

# Shrink a large photo before sending it to the model.
image_path = "./large_image.jpg"
encoded = resize_and_encode_image(image_path)
result = process_base64_image(encoded, "What are the main objects in this image?")
print(result)

Vision Model Performance Tuning

# Spin the vision model back up with throughput-oriented settings.
llm = Llama(
    model_path="./models/llava-1.6-mistral-7b.gguf",
    clip_model_path="./models/mmproj-model.gguf",
    n_ctx=4096,       # Larger context window for complex vision tasks
    n_threads=8,      # CPU threads for the non-offloaded work
    n_gpu_layers=30,  # Offload most layers to the GPU for speed
    verbose=False,
)

def benchmark_vision_processing(image_path: str, num_runs: int = 3):
    """Time *num_runs* identical vision completions and report the average."""
    import time

    prompt = [
        {
            "role": "user",
            "content": [
                {"type": "text", "text": "Describe this image concisely."},
                {"type": "image_url", "image_url": {"url": f"file://{image_path}"}},
            ],
        }
    ]

    durations = []
    for run_index in range(num_runs):
        started = time.time()

        reply = llm.create_chat_completion(
            messages=prompt,
            max_tokens=100,
            temperature=0.5,
        )

        elapsed = time.time() - started
        durations.append(elapsed)

        print(f"Run {run_index+1}: {elapsed:.2f}s")
        if run_index == 0:  # Print response from first run
            print(f"Response: {reply['choices'][0]['message']['content']}")

    average = sum(durations) / len(durations)
    print(f"Average processing time: {average:.2f}s")

benchmark_vision_processing("./test_image.jpg")

Image Comparison

def compare_images(image1_path: str, image2_path: str) -> str:
    """Describe the main differences between the two given images."""
    prompt = [
        {
            "role": "user",
            "content": [
                {"type": "text", "text": "Compare these two images and describe the main differences:"},
                {"type": "image_url", "image_url": {"url": f"file://{image1_path}"}},
                {"type": "text", "text": "versus"},
                {"type": "image_url", "image_url": {"url": f"file://{image2_path}"}},
            ],
        }
    ]

    reply = llm.create_chat_completion(
        messages=prompt,
        max_tokens=200,
        temperature=0.3,  # Low temperature keeps the comparison factual
    )
    return reply['choices'][0]['message']['content']

# Compare two images
comparison = compare_images("./before.jpg", "./after.jpg")
print("Image comparison:", comparison)

Visual Chat Interface

class VisualChatbot:
    """Stateful chat helper that can mix text and image turns."""

    def __init__(self, model_path: str, clip_model_path: str):
        # Load the LLaVA weights plus the CLIP projector once, up front.
        self.llm = Llama(
            model_path=model_path,
            clip_model_path=clip_model_path,
            n_ctx=2048,
            verbose=False,
        )
        # Full OpenAI-style message history, grown turn by turn.
        self.conversation_history = []

    def add_text_message(self, text: str):
        """Queue a plain-text user turn."""
        self.conversation_history.append({"role": "user", "content": text})

    def add_image_message(self, image_path: str, text: str = ""):
        """Queue a user turn containing an image, optionally preceded by text."""
        parts = []
        if text:
            parts.append({"type": "text", "text": text})
        parts.append({"type": "image_url", "image_url": {"url": f"file://{image_path}"}})
        self.conversation_history.append({"role": "user", "content": parts})

    def get_response(self, max_tokens: int = 150) -> str:
        """Run the model over the accumulated history and record its reply."""
        reply = self.llm.create_chat_completion(
            messages=self.conversation_history,
            max_tokens=max_tokens,
        )

        answer = reply['choices'][0]['message']['content']
        self.conversation_history.append({"role": "assistant", "content": answer})
        return answer

# Walk through a short two-turn visual conversation.
chatbot = VisualChatbot(
    "./models/llava-v1.5-7b.gguf",
    "./models/mmproj-model.gguf"
)

# Turn 1: show the bot a photo.
chatbot.add_image_message("./vacation_photo.jpg", "Look at this vacation photo!")
print("Bot:", chatbot.get_response())

# Turn 2: ask a follow-up grounded in that photo.
chatbot.add_text_message("What activities would you recommend at this location?")
print("Bot:", chatbot.get_response())

Install with Tessl CLI

npx tessl i tessl/pypi-llama-cpp-python

docs

caching.md

chat-completion.md

grammar.md

index.md

llama-model.md

low-level.md

server.md

tokenization.md

vision.md

tile.json