CtrlK
BlogDocsLog inGet started
Tessl Logo

tessl/pypi-llama-cpp-python

Python bindings for the llama.cpp library providing high-performance LLM inference with OpenAI-compatible APIs.

Pending
Overview
Eval results
Files

low-level.mddocs/

Low-Level API

Direct access to llama.cpp C functions through ctypes bindings, providing maximum control over model loading, context management, backend operations, and hardware-specific optimizations.

Capabilities

Backend Management

Initialize and manage the llama.cpp backend system.

def llama_backend_init() -> None:
    """Initialize the llama.cpp backend; call before any other API function."""

def llama_backend_free() -> None:
    """Release all backend resources; call once when shutting down."""

def llama_numa_init(numa_strategy: int) -> None:
    """Initialize NUMA support.

    Args:
        numa_strategy: NUMA initialization strategy to apply.
    """

Model Management

Low-level model loading, saving, and memory management.

def llama_model_load_from_file(
    path_model: bytes, 
    params
) -> llama_model_p:
    """
    Load model from file.
    
    Args:
        path_model: Path to model file, encoded as bytes (the C API takes a char*)
        params: Model parameters structure (see llama_model_default_params)
        
    Returns:
        Model pointer, or a null (falsy) pointer on failure - check truthiness
    """

def llama_model_free(model: llama_model_p) -> None:
    """
    Free model memory.
    
    Args:
        model: Model pointer to free; must not be used afterwards
    """

def llama_model_save_to_file(
    model: llama_model_p, 
    fname: bytes, 
    **kwargs
) -> bool:
    """
    Save model to file.
    
    Args:
        model: Model pointer
        fname: Output filename, encoded as bytes
        **kwargs: Additional save parameters
        
    Returns:
        True if successful
    """

def llama_model_default_params():
    """
    Get default model loading parameters.
    
    Returns:
        Default parameter structure suitable for llama_model_load_from_file
    """

def llama_model_quantize_default_params():
    """
    Get default quantization parameters.
    
    Returns:
        Default quantization parameter structure
    """

Context Management

Create and manage model contexts for inference.

def llama_new_context_with_model(
    model: llama_model_p, 
    params
) -> llama_context_p:
    """
    Create new context with model.
    
    Args:
        model: Model pointer (from llama_model_load_from_file)
        params: Context parameters (see llama_context_default_params)
        
    Returns:
        Context pointer, or a null (falsy) pointer on failure - check truthiness
    """

def llama_free(ctx: llama_context_p) -> None:
    """
    Free context memory.
    
    Args:
        ctx: Context pointer to free; must not be used afterwards
    """

def llama_context_default_params():
    """
    Get default context parameters.
    
    Returns:
        Default parameter structure suitable for llama_new_context_with_model
    """

System Information

Query system capabilities and model properties.

def llama_supports_mmap() -> bool:
    """Return True if memory-mapped model loading is supported."""

def llama_supports_mlock() -> bool:
    """Return True if locking model memory (mlock) is supported."""

def llama_supports_gpu_offload() -> bool:
    """Return True if model layers can be offloaded to a GPU."""

def llama_max_devices() -> int:
    """Return the maximum number of usable devices."""

def llama_time_us() -> int:
    """Return the current time in microseconds."""

def llama_n_ctx(ctx: llama_context_p) -> int:
    """
    Get context size.
    
    Args:
        ctx: Context pointer (from llama_new_context_with_model)
        
    Returns:
        Context size in tokens
    """

def llama_n_embd(model: llama_model_p) -> int:
    """
    Get embedding dimensions.
    
    Args:
        model: Model pointer (from llama_model_load_from_file)
        
    Returns:
        Embedding dimension count (the model's hidden size)
    """

Core Constants

Default Values

LLAMA_DEFAULT_SEED: int = 0xFFFFFFFF  # Default random seed (likely a "randomize" sentinel - confirm against llama.h)
LLAMA_TOKEN_NULL: int = -1            # Sentinel token value meaning "no token"
LLAMA_MAX_DEVICES: int                # Maximum device count (value supplied by the C library)

File Format Magic Numbers

# Magic numbers identifying llama.cpp on-disk formats; the concrete values
# come from the C headers and are not shown here.
LLAMA_FILE_MAGIC_GGLA: int    # GGLA file format identifier
LLAMA_FILE_MAGIC_GGSN: int    # GGSN file format identifier
LLAMA_FILE_MAGIC_GGSQ: int    # GGSQ file format identifier
LLAMA_SESSION_MAGIC: int      # Session file magic number
LLAMA_SESSION_VERSION: int    # Session file format version
LLAMA_STATE_SEQ_MAGIC: int    # State sequence magic number
LLAMA_STATE_SEQ_VERSION: int  # State sequence format version

Vocabulary Types

# Tokenizer/vocabulary kinds (mirror the C library's llama_vocab_type enum).
LLAMA_VOCAB_TYPE_NONE: int = 0  # No vocabulary
LLAMA_VOCAB_TYPE_SPM: int = 1   # SentencePiece model
LLAMA_VOCAB_TYPE_BPE: int = 2   # Byte pair encoding
LLAMA_VOCAB_TYPE_WPM: int = 3   # WordPiece model
LLAMA_VOCAB_TYPE_UGM: int = 4   # Unigram model
LLAMA_VOCAB_TYPE_RWKV: int = 5  # RWKV tokenizer

GGML Quantization Types

# Tensor element types (mirror the ggml_type enum in ggml.h). The integer
# values are assigned by the C library; treat these as opaque identifiers.

# Float types
GGML_TYPE_F32: int  # 32-bit float
GGML_TYPE_F16: int  # 16-bit float

# Quantized types
GGML_TYPE_Q4_0: int  # 4-bit quantization, type 0
GGML_TYPE_Q4_1: int  # 4-bit quantization, type 1
GGML_TYPE_Q5_0: int  # 5-bit quantization, type 0
GGML_TYPE_Q5_1: int  # 5-bit quantization, type 1
GGML_TYPE_Q8_0: int  # 8-bit quantization, type 0
GGML_TYPE_Q8_1: int  # 8-bit quantization, type 1

# K-quantization types
GGML_TYPE_Q2_K: int  # 2-bit K-quantization
GGML_TYPE_Q3_K: int  # 3-bit K-quantization
GGML_TYPE_Q4_K: int  # 4-bit K-quantization
GGML_TYPE_Q5_K: int  # 5-bit K-quantization
GGML_TYPE_Q6_K: int  # 6-bit K-quantization
GGML_TYPE_Q8_K: int  # 8-bit K-quantization

# "IQ" quantization types
# NOTE(review): commonly read as "i-quants" rather than plain integer
# quantization - confirm naming against ggml.h.
GGML_TYPE_IQ2_XXS: int  # Integer quantization 2-bit, XXS
GGML_TYPE_IQ2_XS: int   # Integer quantization 2-bit, XS
GGML_TYPE_IQ3_XXS: int  # Integer quantization 3-bit, XXS
GGML_TYPE_IQ1_S: int    # Integer quantization 1-bit, S
GGML_TYPE_IQ4_NL: int   # Integer quantization 4-bit, NL
GGML_TYPE_IQ3_S: int    # Integer quantization 3-bit, S
GGML_TYPE_IQ2_S: int    # Integer quantization 2-bit, S
GGML_TYPE_IQ4_XS: int   # Integer quantization 4-bit, XS
GGML_TYPE_IQ1_M: int    # Integer quantization 1-bit, M

# Standard integer types
GGML_TYPE_I8: int   # 8-bit signed integer
GGML_TYPE_I16: int  # 16-bit signed integer
GGML_TYPE_I32: int  # 32-bit signed integer
GGML_TYPE_I64: int  # 64-bit signed integer

Pointer Types

# Core pointer types
# NOTE(review): the real bindings declare distinct opaque struct pointers;
# the void-pointer aliases shown here are a simplification - confirm against
# the installed llama_cpp.llama_cpp module.
llama_model_p = ctypes.POINTER(ctypes.c_void_p)    # Opaque model handle
llama_context_p = ctypes.POINTER(ctypes.c_void_p)  # Opaque context handle
llama_token = ctypes.c_int32                       # Token id (32-bit signed int)

Usage Examples

Basic Low-Level Setup

import llama_cpp.llama_cpp as llama_cpp
import ctypes

# Initialize backend; required before any other llama_cpp call.
llama_cpp.llama_backend_init()
print("Backend initialized")

# Pre-bind handles so the finally block frees exactly what was created,
# instead of probing locals() and potentially freeing a null pointer.
model = None
context = None

try:
    # Get default parameters
    model_params = llama_cpp.llama_model_default_params()
    context_params = llama_cpp.llama_context_default_params()

    # Load model (path must be bytes: the C API takes a char*)
    model_path = b"./models/llama-2-7b.gguf"
    model = llama_cpp.llama_model_load_from_file(model_path, model_params)

    # A null (falsy) pointer signals failure from the C API.
    if not model:
        raise RuntimeError("Failed to load model")
    print("Model loaded successfully")

    # Create context
    context = llama_cpp.llama_new_context_with_model(model, context_params)

    if not context:
        raise RuntimeError("Failed to create context")
    print("Context created successfully")

    # Get model information
    n_ctx = llama_cpp.llama_n_ctx(context)
    n_embd = llama_cpp.llama_n_embd(model)

    print(f"Context size: {n_ctx}")
    print(f"Embedding dimensions: {n_embd}")

finally:
    # Free in reverse order of creation: context before model, backend last.
    if context:
        llama_cpp.llama_free(context)
    if model:
        llama_cpp.llama_model_free(model)

    llama_cpp.llama_backend_free()
    print("Cleanup completed")

System Capability Detection

import llama_cpp.llama_cpp as llama_cpp

# Check system capabilities
capabilities = {
    "mmap_support": llama_cpp.llama_supports_mmap(),
    "mlock_support": llama_cpp.llama_supports_mlock(),
    "gpu_offload": llama_cpp.llama_supports_gpu_offload(),
    "max_devices": llama_cpp.llama_max_devices(),
}

print("System capabilities:")
# NOTE: "max_devices" is an int, so the checkmark treats any non-zero
# device count as "supported".
for capability, supported in capabilities.items():
    status = "✓" if supported else "✗"
    print(f"  {status} {capability}: {supported}")

# Timing utilities
start_time = llama_cpp.llama_time_us()
# ... some operation ...
end_time = llama_cpp.llama_time_us()
duration_ms = (end_time - start_time) / 1000  # microseconds -> milliseconds
print(f"Operation took {duration_ms:.2f}ms")

Custom Parameter Configuration

import llama_cpp.llama_cpp as llama_cpp
import ctypes

# Initialize backend
llama_cpp.llama_backend_init()

# Get and modify default parameters
model_params = llama_cpp.llama_model_default_params()
context_params = llama_cpp.llama_context_default_params()

# Modify model parameters (example - actual field names depend on structure)
# model_params.n_gpu_layers = 35
# model_params.use_mmap = True
# model_params.use_mlock = False

# Modify context parameters
# context_params.n_ctx = 4096
# context_params.n_batch = 512
# context_params.n_threads = 8

print("Custom parameters configured")

try:
    # Load with custom parameters
    model = llama_cpp.llama_model_load_from_file(
        b"./models/model.gguf", 
        model_params
    )
    
    if model:
        context = llama_cpp.llama_new_context_with_model(model, context_params)
        if context:
            print("Model and context created with custom parameters")
            
            # Read back actual values - the library may clamp/adjust requests
            actual_ctx = llama_cpp.llama_n_ctx(context)
            actual_embd = llama_cpp.llama_n_embd(model)
            print(f"Actual context size: {actual_ctx}")
            print(f"Actual embedding dimensions: {actual_embd}")
            
            llama_cpp.llama_free(context)
        # Free the model whether or not context creation succeeded.
        llama_cpp.llama_model_free(model)
        
finally:
    # Backend teardown runs even if loading raised.
    llama_cpp.llama_backend_free()

Memory Management Patterns

import llama_cpp.llama_cpp as llama_cpp
import gc
import psutil
import os

def get_memory_usage():
    """Return this process's resident set size (RSS) in megabytes."""
    rss_bytes = psutil.Process(os.getpid()).memory_info().rss
    return rss_bytes / 1024 / 1024

class LowLevelLlama:
    """Manage a llama.cpp model/context pair with deterministic cleanup.

    Wraps the raw ctypes handles so that resources are always released in
    the correct order: context before model, backend last.
    """

    def __init__(self):
        self.model = None                 # model pointer, or None when absent
        self.context = None               # context pointer, or None when absent
        self.backend_initialized = False  # True after llama_backend_init

    def initialize_backend(self):
        """Initialize backend if not already done (safe to call repeatedly)."""
        if not self.backend_initialized:
            llama_cpp.llama_backend_init()
            self.backend_initialized = True

    def load_model(self, model_path: str):
        """Load a model from *model_path*, replacing any loaded model.

        Args:
            model_path: Filesystem path to the model file.

        Returns:
            True on success.

        Raises:
            RuntimeError: if llama.cpp could not load the file.
        """
        self.initialize_backend()

        # Release any previously loaded model before replacing the handle.
        if self.model:
            self.free_model()

        initial_memory = get_memory_usage()

        model_params = llama_cpp.llama_model_default_params()
        self.model = llama_cpp.llama_model_load_from_file(
            model_path.encode('utf-8'),
            model_params
        )

        # A null (falsy) pointer signals failure from the C API.
        if not self.model:
            raise RuntimeError(f"Failed to load model: {model_path}")

        final_memory = get_memory_usage()
        memory_increase = final_memory - initial_memory

        print(f"Model loaded: {memory_increase:.1f}MB memory increase")
        return True

    def create_context(self):
        """Create an inference context, replacing any existing one.

        Raises:
            RuntimeError: if no model is loaded or context creation fails.
        """
        if not self.model:
            raise RuntimeError("Model must be loaded first")

        # Release any previous context before replacing the handle.
        if self.context:
            self.free_context()

        context_params = llama_cpp.llama_context_default_params()
        self.context = llama_cpp.llama_new_context_with_model(
            self.model,
            context_params
        )

        if not self.context:
            raise RuntimeError("Failed to create context")

        print("Context created successfully")
        return True

    def free_context(self):
        """Free context memory (no-op if no context is held)."""
        if self.context:
            llama_cpp.llama_free(self.context)
            self.context = None
            gc.collect()  # encourage prompt release of Python-side wrappers

    def free_model(self):
        """Free model memory (no-op if no model is held)."""
        if self.model:
            llama_cpp.llama_model_free(self.model)
            self.model = None
            gc.collect()

    def cleanup(self):
        """Free all resources: context, then model, then the backend."""
        self.free_context()
        self.free_model()

        if self.backend_initialized:
            llama_cpp.llama_backend_free()
            self.backend_initialized = False

    def __del__(self):
        """Best-effort cleanup on garbage collection.

        During interpreter shutdown, module globals (llama_cpp, gc) may
        already be torn down, so never let __del__ raise.
        """
        try:
            self.cleanup()
        except Exception:
            pass

# Usage example
llama = LowLevelLlama()

try:
    print(f"Initial memory: {get_memory_usage():.1f}MB")
    
    llama.load_model("./models/model.gguf")
    print(f"After model load: {get_memory_usage():.1f}MB")
    
    llama.create_context()
    print(f"After context creation: {get_memory_usage():.1f}MB")
    
    # Use model...
    
finally:
    # cleanup() frees context, then model, then the backend.
    llama.cleanup()
    print(f"After cleanup: {get_memory_usage():.1f}MB")

Error Handling and Validation

import llama_cpp.llama_cpp as llama_cpp
import ctypes

def validate_model_file(file_path: str) -> bool:
    """Run cheap sanity checks on a model file before attempting to load it.

    Checks existence, a minimum size, and a known file extension. Prints a
    diagnostic and returns False on the first failed check.
    """
    import os

    if not os.path.exists(file_path):
        print(f"Model file not found: {file_path}")
        return False

    size = os.path.getsize(file_path)
    if size < 1024:
        # Anything under 1KB cannot be a real model file.
        print(f"Model file too small: {size} bytes")
        return False

    known_extensions = ('.gguf', '.ggml', '.bin')
    if not file_path.lower().endswith(known_extensions):
        print(f"Unexpected file extension: {file_path}")
        return False

    return True

def safe_model_loading(model_path: str):
    """Demonstrate safe model loading with error handling.

    Validates the file, initializes the backend, loads the model, then
    sanity-checks it by creating (and immediately freeing) a context.

    Args:
        model_path: Filesystem path to the model file.

    Returns:
        A model pointer on success, or None on any failure. On success the
        caller owns the model and must free it with llama_model_free; the
        caller is also responsible for calling llama_backend_free.
    """
    
    if not validate_model_file(model_path):
        return None
    
    llama_cpp.llama_backend_init()
    
    try:
        # Check system capabilities first
        if not llama_cpp.llama_supports_mmap():
            print("Warning: Memory mapping not supported")
        
        # Get default parameters
        model_params = llama_cpp.llama_model_default_params()
        
        # Attempt to load model
        print(f"Loading model: {model_path}")
        model = llama_cpp.llama_model_load_from_file(
            model_path.encode('utf-8'), 
            model_params
        )
        
        if not model:
            # A null (falsy) pointer signals failure from the C API.
            print("Model loading failed - check file format and permissions")
            return None
        
        # Validate model properties
        try:
            context_params = llama_cpp.llama_context_default_params()
            context = llama_cpp.llama_new_context_with_model(model, context_params)
            
            if context:
                n_ctx = llama_cpp.llama_n_ctx(context)
                n_embd = llama_cpp.llama_n_embd(model)
                
                print(f"Model validation successful:")
                print(f"  Context size: {n_ctx}")
                print(f"  Embeddings: {n_embd}")
                
                # The probe context is freed; only the model is handed back.
                llama_cpp.llama_free(context)
                return model
            else:
                print("Context creation failed - insufficient memory?")
                llama_cpp.llama_model_free(model)
                return None
                
        except Exception as e:
            # Any validation error means the model is unusable: free it.
            print(f"Model validation error: {e}")
            llama_cpp.llama_model_free(model)
            return None
    
    except Exception as e:
        print(f"Unexpected error during model loading: {e}")
        return None
    
    finally:
        # Backend cleanup handled by caller
        pass

# Usage
model = safe_model_loading("./models/test-model.gguf")
if model:
    print("Model ready for use")
    # Use model...
    # Caller owns the returned pointer and must free it.
    llama_cpp.llama_model_free(model)

# safe_model_loading leaves the backend initialized; free it here.
llama_cpp.llama_backend_free()

Performance Monitoring

import llama_cpp.llama_cpp as llama_cpp
import time
import contextlib

@contextlib.contextmanager
def performance_monitor(operation_name: str):
    """Context manager reporting wall time and memory delta for a block.

    Timing uses llama_cpp.llama_time_us (microsecond resolution).
    NOTE(review): relies on a module-level get_memory_usage() helper (shown
    in the memory-management example) - confirm it is in scope before use.

    Args:
        operation_name: Label printed alongside the measurements.
    """
    start_time = llama_cpp.llama_time_us()
    start_memory = get_memory_usage()
    
    try:
        yield
    finally:
        # Measure even when the monitored block raises.
        end_time = llama_cpp.llama_time_us()
        end_memory = get_memory_usage()
        
        duration_ms = (end_time - start_time) / 1000
        memory_change = end_memory - start_memory
        
        print(f"{operation_name}:")
        print(f"  Duration: {duration_ms:.2f}ms")
        print(f"  Memory change: {memory_change:+.1f}MB")

# Usage example
llama_cpp.llama_backend_init()

try:
    with performance_monitor("Model Loading"):
        model_params = llama_cpp.llama_model_default_params()
        model = llama_cpp.llama_model_load_from_file(
            b"./models/model.gguf", 
            model_params
        )
    
    if model:
        with performance_monitor("Context Creation"):
            context_params = llama_cpp.llama_context_default_params()
            context = llama_cpp.llama_new_context_with_model(model, context_params)
        
        if context:
            with performance_monitor("Model Info Retrieval"):
                n_ctx = llama_cpp.llama_n_ctx(context)
                n_embd = llama_cpp.llama_n_embd(model)
                print(f"Context: {n_ctx}, Embeddings: {n_embd}")
            
            # Free in reverse order of creation: context before model.
            llama_cpp.llama_free(context)
        
        llama_cpp.llama_model_free(model)

finally:
    # Backend teardown runs even if loading raised.
    llama_cpp.llama_backend_free()

Install with Tessl CLI

npx tessl i tessl/pypi-llama-cpp-python

docs

caching.md

chat-completion.md

grammar.md

index.md

llama-model.md

low-level.md

server.md

tokenization.md

vision.md

tile.json