Python bindings for the llama.cpp library providing high-performance LLM inference with OpenAI-compatible APIs.

Direct access to llama.cpp C functions through ctypes bindings, providing maximum control over model loading, context management, backend operations, and hardware-specific optimizations.

Initialize and manage the llama.cpp backend system.
def llama_backend_init() -> None:
    """
    Initialize llama.cpp backend.

    Must be called before using any other functions.
    """
def llama_backend_free() -> None:
    """
    Free backend resources.

    Should be called when shutting down.
    """
def llama_numa_init(numa_strategy: int) -> None:
    """
    Initialize NUMA support.

    Args:
        numa_strategy: NUMA initialization strategy
    """

# Low-level model loading, saving, and memory management.
def llama_model_load_from_file(
    path_model: bytes,
    params
) -> "llama_model_p":  # quoted: llama_model_p is defined later in this file
    """
    Load model from file.

    Args:
        path_model: Path to model file (bytes)
        params: Model parameters structure
    Returns:
        Model pointer or null on failure
    """
def llama_model_free(model: "llama_model_p") -> None:  # quoted forward reference
    """
    Free model memory.

    Args:
        model: Model pointer to free
    """
def llama_model_save_to_file(
    model: "llama_model_p",  # quoted forward reference
    fname: bytes,
    **kwargs
) -> bool:
    """
    Save model to file.

    Args:
        model: Model pointer
        fname: Output filename (bytes)
        **kwargs: Additional save parameters
    Returns:
        True if successful
    """
def llama_model_default_params():
    """
    Get default model loading parameters.

    Returns:
        Default parameter structure
    """
def llama_model_quantize_default_params():
    """
    Get default quantization parameters.

    Returns:
        Default quantization structure
    """

# Create and manage model contexts for inference.
def llama_new_context_with_model(
    model: "llama_model_p",  # quoted: both pointer aliases are defined later in this file
    params
) -> "llama_context_p":
    """
    Create new context with model.

    Args:
        model: Model pointer
        params: Context parameters
    Returns:
        Context pointer or null on failure
    """
def llama_free(ctx: "llama_context_p") -> None:  # quoted forward reference
    """
    Free context memory.

    Args:
        ctx: Context pointer to free
    """
def llama_context_default_params():
    """
    Get default context parameters.

    Returns:
        Default parameter structure
    """

# Query system capabilities and model properties.
def llama_supports_mmap() -> bool:
    """Check if memory mapping is supported."""

def llama_supports_mlock() -> bool:
    """Check if memory locking is supported."""

def llama_supports_gpu_offload() -> bool:
    """Check if GPU offload is supported."""

def llama_max_devices() -> int:
    """Get maximum number of devices."""

def llama_time_us() -> int:
    """Get current time in microseconds."""
def llama_n_ctx(ctx: "llama_context_p") -> int:  # quoted forward reference
    """
    Get context size.

    Args:
        ctx: Context pointer
    Returns:
        Context size in tokens
    """
def llama_n_embd(model: "llama_model_p") -> int:  # quoted forward reference
    """
    Get embedding dimensions.

    Args:
        model: Model pointer
    Returns:
        Embedding dimension count
    """

LLAMA_DEFAULT_SEED: int = 0xFFFFFFFF  # Default random seed
LLAMA_TOKEN_NULL: int = -1  # Null token value
LLAMA_MAX_DEVICES: int  # Maximum device count

# File-format and session magic numbers
LLAMA_FILE_MAGIC_GGLA: int  # GGLA file format identifier
LLAMA_FILE_MAGIC_GGSN: int  # GGSN file format identifier
LLAMA_FILE_MAGIC_GGSQ: int  # GGSQ file format identifier
LLAMA_SESSION_MAGIC: int  # Session file magic number
LLAMA_SESSION_VERSION: int  # Session file version
LLAMA_STATE_SEQ_MAGIC: int  # State sequence magic number
LLAMA_STATE_SEQ_VERSION: int  # State sequence version

# Vocabulary (tokenizer) types
LLAMA_VOCAB_TYPE_NONE: int = 0  # No vocabulary
LLAMA_VOCAB_TYPE_SPM: int = 1  # SentencePiece model
LLAMA_VOCAB_TYPE_BPE: int = 2  # Byte pair encoding
LLAMA_VOCAB_TYPE_WPM: int = 3  # WordPiece model
LLAMA_VOCAB_TYPE_UGM: int = 4  # Unigram model
LLAMA_VOCAB_TYPE_RWKV: int = 5  # RWKV tokenizer

# Float types
GGML_TYPE_F32: int  # 32-bit float
GGML_TYPE_F16: int  # 16-bit float
# Quantized types
GGML_TYPE_Q4_0: int  # 4-bit quantization, type 0
GGML_TYPE_Q4_1: int  # 4-bit quantization, type 1
GGML_TYPE_Q5_0: int  # 5-bit quantization, type 0
GGML_TYPE_Q5_1: int  # 5-bit quantization, type 1
GGML_TYPE_Q8_0: int  # 8-bit quantization, type 0
GGML_TYPE_Q8_1: int  # 8-bit quantization, type 1
# K-quantization types
GGML_TYPE_Q2_K: int  # 2-bit K-quantization
GGML_TYPE_Q3_K: int  # 3-bit K-quantization
GGML_TYPE_Q4_K: int  # 4-bit K-quantization
GGML_TYPE_Q5_K: int  # 5-bit K-quantization
GGML_TYPE_Q6_K: int  # 6-bit K-quantization
GGML_TYPE_Q8_K: int  # 8-bit K-quantization
# Integer quantization types
GGML_TYPE_IQ2_XXS: int  # Integer quantization 2-bit, XXS
GGML_TYPE_IQ2_XS: int  # Integer quantization 2-bit, XS
GGML_TYPE_IQ3_XXS: int  # Integer quantization 3-bit, XXS
GGML_TYPE_IQ1_S: int  # Integer quantization 1-bit, S
GGML_TYPE_IQ4_NL: int  # Integer quantization 4-bit, NL
GGML_TYPE_IQ3_S: int  # Integer quantization 3-bit, S
GGML_TYPE_IQ2_S: int  # Integer quantization 2-bit, S
GGML_TYPE_IQ4_XS: int  # Integer quantization 4-bit, XS
GGML_TYPE_IQ1_M: int  # Integer quantization 1-bit, M
# Standard integer types
GGML_TYPE_I8: int  # 8-bit signed integer
GGML_TYPE_I16: int  # 16-bit signed integer
GGML_TYPE_I32: int  # 32-bit signed integer
GGML_TYPE_I64: int  # 64-bit signed integer

# Core pointer types
llama_model_p = ctypes.POINTER(ctypes.c_void_p)  # Model pointer
llama_context_p = ctypes.POINTER(ctypes.c_void_p)  # Context pointer
llama_token = ctypes.c_int32  # Token type
# Example: end-to-end model load, context creation, and cleanup.
import llama_cpp.llama_cpp as llama_cpp
import ctypes

# Initialize backend
llama_cpp.llama_backend_init()
print("Backend initialized")
try:
    # Get default parameters
    model_params = llama_cpp.llama_model_default_params()
    context_params = llama_cpp.llama_context_default_params()

    # Load model
    model_path = b"./models/llama-2-7b.gguf"
    model = llama_cpp.llama_model_load_from_file(model_path, model_params)
    if not model:
        raise Exception("Failed to load model")
    print("Model loaded successfully")

    # Create context
    context = llama_cpp.llama_new_context_with_model(model, context_params)
    if not context:
        raise Exception("Failed to create context")
    print("Context created successfully")

    # Get model information
    n_ctx = llama_cpp.llama_n_ctx(context)
    n_embd = llama_cpp.llama_n_embd(model)
    print(f"Context size: {n_ctx}")
    print(f"Embedding dimensions: {n_embd}")
finally:
    # Cleanup: free in reverse order of creation (context, model, backend)
    if 'context' in locals():
        llama_cpp.llama_free(context)
    if 'model' in locals():
        llama_cpp.llama_model_free(model)
    llama_cpp.llama_backend_free()
    print("Cleanup completed")
# Example: query system capabilities and use the timing utility.
import llama_cpp.llama_cpp as llama_cpp

# Check system capabilities
capabilities = {
    "mmap_support": llama_cpp.llama_supports_mmap(),
    "mlock_support": llama_cpp.llama_supports_mlock(),
    "gpu_offload": llama_cpp.llama_supports_gpu_offload(),
    "max_devices": llama_cpp.llama_max_devices(),
}
print("System capabilities:")
for capability, supported in capabilities.items():
    status = "✓" if supported else "✗"
    print(f"  {status} {capability}: {supported}")

# Timing utilities
start_time = llama_cpp.llama_time_us()
# ... some operation ...
end_time = llama_cpp.llama_time_us()
duration_ms = (end_time - start_time) / 1000  # microseconds -> milliseconds
print(f"Operation took {duration_ms:.2f}ms")
# Example: loading a model with customized parameter structures.
import llama_cpp.llama_cpp as llama_cpp
import ctypes

# Initialize backend
llama_cpp.llama_backend_init()

# Get and modify default parameters
model_params = llama_cpp.llama_model_default_params()
context_params = llama_cpp.llama_context_default_params()

# Modify model parameters (example - actual field names depend on structure)
# model_params.n_gpu_layers = 35
# model_params.use_mmap = True
# model_params.use_mlock = False

# Modify context parameters
# context_params.n_ctx = 4096
# context_params.n_batch = 512
# context_params.n_threads = 8
print("Custom parameters configured")
try:
    # Load with custom parameters
    model = llama_cpp.llama_model_load_from_file(
        b"./models/model.gguf",
        model_params
    )
    if model:
        context = llama_cpp.llama_new_context_with_model(model, context_params)
        if context:
            print("Model and context created with custom parameters")
            # Get actual values
            actual_ctx = llama_cpp.llama_n_ctx(context)
            actual_embd = llama_cpp.llama_n_embd(model)
            print(f"Actual context size: {actual_ctx}")
            print(f"Actual embedding dimensions: {actual_embd}")
            llama_cpp.llama_free(context)
        llama_cpp.llama_model_free(model)
finally:
    llama_cpp.llama_backend_free()
import gc
import psutil
import os
def get_memory_usage():
    """Return the current process's resident set size (RSS) in MB."""
    process = psutil.Process(os.getpid())
    return process.memory_info().rss / 1024 / 1024
class LowLevelLlama:
    """RAII-style wrapper around the low-level llama.cpp bindings.

    Tracks backend/model/context handles and frees them in the correct
    order (context before model, then backend).
    """

    def __init__(self):
        self.model = None                 # model handle, or None
        self.context = None               # context handle, or None
        self.backend_initialized = False  # whether llama_backend_init() ran

    def initialize_backend(self):
        """Initialize backend if not already done."""
        if not self.backend_initialized:
            llama_cpp.llama_backend_init()
            self.backend_initialized = True

    def load_model(self, model_path: str):
        """Load model with automatic cleanup of any previously loaded model.

        Args:
            model_path: Filesystem path to the model file.
        Raises:
            RuntimeError: if the model cannot be loaded.
        """
        self.initialize_backend()
        # Clean up existing model so handles are never leaked
        if self.model:
            self.free_model()
        initial_memory = get_memory_usage()
        model_params = llama_cpp.llama_model_default_params()
        self.model = llama_cpp.llama_model_load_from_file(
            model_path.encode('utf-8'),
            model_params
        )
        if not self.model:
            raise RuntimeError(f"Failed to load model: {model_path}")
        final_memory = get_memory_usage()
        memory_increase = final_memory - initial_memory
        print(f"Model loaded: {memory_increase:.1f}MB memory increase")
        return True

    def create_context(self):
        """Create context with automatic cleanup of any existing context.

        Raises:
            RuntimeError: if no model is loaded or context creation fails.
        """
        if not self.model:
            raise RuntimeError("Model must be loaded first")
        # Clean up existing context
        if self.context:
            self.free_context()
        context_params = llama_cpp.llama_context_default_params()
        self.context = llama_cpp.llama_new_context_with_model(
            self.model,
            context_params
        )
        if not self.context:
            raise RuntimeError("Failed to create context")
        print("Context created successfully")
        return True

    def free_context(self):
        """Free context memory (no-op if no context is held)."""
        if self.context:
            llama_cpp.llama_free(self.context)
            self.context = None
            gc.collect()  # Force garbage collection

    def free_model(self):
        """Free model memory (no-op if no model is held)."""
        if self.model:
            llama_cpp.llama_model_free(self.model)
            self.model = None
            gc.collect()

    def cleanup(self):
        """Full cleanup: context, then model, then backend."""
        self.free_context()
        self.free_model()
        if self.backend_initialized:
            llama_cpp.llama_backend_free()
            self.backend_initialized = False

    def __del__(self):
        """Destructor for automatic cleanup."""
        # Guard: during interpreter shutdown, module globals (llama_cpp, gc)
        # may already be torn down — never raise from __del__.
        try:
            self.cleanup()
        except Exception:
            pass
# Usage example: memory-tracked load/context lifecycle with LowLevelLlama.
llama = LowLevelLlama()
try:
    print(f"Initial memory: {get_memory_usage():.1f}MB")
    llama.load_model("./models/model.gguf")
    print(f"After model load: {get_memory_usage():.1f}MB")
    llama.create_context()
    print(f"After context creation: {get_memory_usage():.1f}MB")
    # Use model...
finally:
    llama.cleanup()
    print(f"After cleanup: {get_memory_usage():.1f}MB")

import llama_cpp.llama_cpp as llama_cpp
import ctypes
def validate_model_file(file_path: str) -> bool:
    """Validate model file before loading.

    Checks existence, a minimum plausible size, and a known extension.

    Args:
        file_path: Path to the candidate model file.
    Returns:
        True if the file looks like a loadable model, False otherwise
        (with an explanatory message printed).
    """
    import os
    if not os.path.exists(file_path):
        print(f"Model file not found: {file_path}")
        return False
    file_size = os.path.getsize(file_path)
    if file_size < 1024:  # Less than 1KB is suspicious
        print(f"Model file too small: {file_size} bytes")
        return False
    # Check file extension
    if not file_path.lower().endswith(('.gguf', '.ggml', '.bin')):
        print(f"Unexpected file extension: {file_path}")
        return False
    return True
def safe_model_loading(model_path: str):
    """Demonstrate safe model loading with error handling.

    Validates the file, loads the model, and sanity-checks it by creating
    (and immediately freeing) a context.

    Args:
        model_path: Filesystem path to the model file.
    Returns:
        A model pointer on success, or None on any failure.  On success the
        caller owns the model and must free it; the caller is also
        responsible for calling llama_backend_free().
    """
    if not validate_model_file(model_path):
        return None
    llama_cpp.llama_backend_init()
    try:
        # Check system capabilities first
        if not llama_cpp.llama_supports_mmap():
            print("Warning: Memory mapping not supported")
        # Get default parameters
        model_params = llama_cpp.llama_model_default_params()
        # Attempt to load model
        print(f"Loading model: {model_path}")
        model = llama_cpp.llama_model_load_from_file(
            model_path.encode('utf-8'),
            model_params
        )
        if not model:
            print("Model loading failed - check file format and permissions")
            return None
        # Validate model properties by round-tripping a context
        try:
            context_params = llama_cpp.llama_context_default_params()
            context = llama_cpp.llama_new_context_with_model(model, context_params)
            if context:
                n_ctx = llama_cpp.llama_n_ctx(context)
                n_embd = llama_cpp.llama_n_embd(model)
                print(f"Model validation successful:")
                print(f"  Context size: {n_ctx}")
                print(f"  Embeddings: {n_embd}")
                llama_cpp.llama_free(context)
                return model
            else:
                print("Context creation failed - insufficient memory?")
                llama_cpp.llama_model_free(model)
                return None
        except Exception as e:
            # Free the model on any validation failure so it is not leaked
            print(f"Model validation error: {e}")
            llama_cpp.llama_model_free(model)
            return None
    except Exception as e:
        print(f"Unexpected error during model loading: {e}")
        return None
    finally:
        # Backend cleanup handled by caller
        pass
# Usage: safe_model_loading returns an owned model pointer (or None).
model = safe_model_loading("./models/test-model.gguf")
if model:
    print("Model ready for use")
    # Use model...
    llama_cpp.llama_model_free(model)
llama_cpp.llama_backend_free()

import llama_cpp.llama_cpp as llama_cpp
import time
import contextlib
@contextlib.contextmanager
def performance_monitor(operation_name: str):
    """Context manager for performance monitoring.

    Reports wall-clock duration (via llama_time_us) and resident-memory
    change (via get_memory_usage) of the wrapped operation.

    Args:
        operation_name: Label used in the printed report.
    """
    start_time = llama_cpp.llama_time_us()
    start_memory = get_memory_usage()
    try:
        yield
    finally:
        # Report even if the wrapped operation raised
        end_time = llama_cpp.llama_time_us()
        end_memory = get_memory_usage()
        duration_ms = (end_time - start_time) / 1000
        memory_change = end_memory - start_memory
        print(f"{operation_name}:")
        print(f"  Duration: {duration_ms:.2f}ms")
        print(f"  Memory change: {memory_change:+.1f}MB")
# Usage example: time each phase of the load/context workflow.
llama_cpp.llama_backend_init()
try:
    with performance_monitor("Model Loading"):
        model_params = llama_cpp.llama_model_default_params()
        model = llama_cpp.llama_model_load_from_file(
            b"./models/model.gguf",
            model_params
        )
    if model:
        with performance_monitor("Context Creation"):
            context_params = llama_cpp.llama_context_default_params()
            context = llama_cpp.llama_new_context_with_model(model, context_params)
        if context:
            with performance_monitor("Model Info Retrieval"):
                n_ctx = llama_cpp.llama_n_ctx(context)
                n_embd = llama_cpp.llama_n_embd(model)
                print(f"Context: {n_ctx}, Embeddings: {n_embd}")
            llama_cpp.llama_free(context)
        llama_cpp.llama_model_free(model)
finally:
    llama_cpp.llama_backend_free()

# Install with Tessl CLI
# npx tessl i tessl/pypi-llama-cpp-python