Python bindings for the llama.cpp library providing high-performance LLM inference with OpenAI-compatible APIs.
—
FastAPI-based web server with OpenAI-compatible endpoints, settings management, and multi-model configuration support for production deployments and REST API access.
Configure web server parameters and hosting options.
class ServerSettings:
    # Settings model for the FastAPI server process.
    # NOTE(review): the base class (presumably pydantic BaseSettings) is not
    # visible in this chunk — field semantics documented from the stub only.

    # Address the HTTP server binds to; default is loopback-only.
    host: str = "127.0.0.1"
    # TCP port the server listens on.
    port: int = 8000
    # When True, a new generation request may interrupt an in-flight one.
    interrupt_requests: bool = True

    def __init__(
        self,
        host: str = "127.0.0.1",
        port: int = 8000,
        interrupt_requests: bool = True,
        **kwargs
    ):
        """
        Initialize server configuration.

        Args:
            host: Server bind address
            port: Server port number
            interrupt_requests: Allow request interruption
"""Configure model parameters for server deployment.
class ModelSettings:
    """Per-model configuration for server deployment (public API stub)."""

    model: str  # path to the model file (required)
    model_alias: Optional[str] = None  # name exposed via the API instead of the file path
    n_ctx: int = 2048  # context window size in tokens
    n_threads: Optional[int] = None  # None: auto-detect thread count
    n_gpu_layers: int = 0  # layers to offload to GPU (0 = CPU only)
    main_gpu: int = 0  # index of the primary GPU
    tensor_split: Optional[List[float]] = None  # per-GPU split of the model — confirm semantics
    vocab_only: bool = False  # load only the vocabulary, not the weights
    use_mmap: bool = True  # memory-map the model file
    use_mlock: bool = False  # lock model memory to prevent swapping
    kv_overrides: Optional[Dict[str, Union[bool, int, float]]] = None  # model metadata overrides
    seed: int = 0xFFFFFFFF  # RNG seed; 0xFFFFFFFF presumably means "random" — confirm
    n_batch: int = 512  # prompt-processing batch size
    n_threads_batch: Optional[int] = None  # batch threads; None: auto-detect
    rope_scaling_type: int = -1  # RoPE scaling type; -1 presumably = unspecified — confirm
    rope_freq_base: float = 0.0  # RoPE base frequency (0.0 presumably = model default)
    rope_freq_scale: float = 0.0  # RoPE frequency scale (0.0 presumably = model default)
    yarn_ext_factor: float = -1.0  # YaRN tuning parameter
    yarn_attn_factor: float = 1.0  # YaRN tuning parameter
    yarn_beta_fast: float = 32.0  # YaRN tuning parameter
    yarn_beta_slow: float = 1.0  # YaRN tuning parameter
    yarn_orig_ctx: int = 0  # YaRN original context size
    mul_mat_q: bool = True  # toggle for quantized matmul path — confirm against llama.cpp
    f16_kv: bool = True  # keep the KV cache in float16
    logits_all: bool = False  # return logits for every token, not just the last
    embedding: bool = False  # enable embedding mode
    offload_kqv: bool = True  # offload KQV tensors to the GPU
    flash_attn: bool = False  # enable flash attention
    last_n_tokens_size: int = 64  # token window used for repetition penalties
    lora_base: Optional[str] = None  # base model path for LoRA
    lora_scale: float = 1.0  # LoRA adapter scaling factor
    lora_path: Optional[str] = None  # path to the LoRA adapter file
    numa: Union[bool, int] = False  # NUMA policy (bool toggle or strategy id)
    chat_format: Optional[str] = None  # chat template name (e.g. "llama-2")
    chat_handler: Optional[object] = None  # custom chat-completion handler
    draft_model: Optional[object] = None  # draft model (speculative decoding) — confirm
    tokenizer: Optional[object] = None  # external tokenizer override
    hf_pretrained_model_name_or_path: Optional[str] = None  # HF tokenizer/model source
    hf_model_repo_id: Optional[str] = None  # HF repo id for the model
    clip_model_path: Optional[str] = None  # CLIP projector for multimodal models
    cache: bool = False  # enable the prompt cache
    cache_type: str = "ram"  # "ram" or "disk" (both appear in examples below)
    cache_size: int = 2 << 30  # cache capacity in bytes (2 << 30 = 2 GiB)
    verbose: bool = True  # emit verbose llama.cpp logging

    def __init__(
        self,
        model: str,
        **kwargs
    ):
        """
        Initialize model configuration.

        Args:
            model: Path to model file
            **kwargs: Additional model parameters
"""Unified configuration combining server and model settings.
class Settings(ServerSettings, ModelSettings):
    """Unified settings: multiple inheritance merges server and model fields."""

    def __init__(
        self,
        model: str,
        **kwargs
    ):
        """
        Combined server and model settings.

        Args:
            model: Path to model file
            **kwargs: Server and model parameters
"""Configuration from file for serving multiple models.
class ConfigFileSettings:
    """Settings loaded from a configuration file for serving multiple models."""

    # Path the configuration was loaded from.
    config_file: str
    # One ModelSettings entry per served model.
    models: List[ModelSettings]

    def __init__(
        self,
        config_file: str,
        **kwargs
    ):
        """
        Initialize configuration from file.

        Args:
            config_file: Path to configuration file
        """

    @classmethod
    def from_file(cls, config_file: str) -> "ConfigFileSettings":
        """
        Load configuration from file.

        Args:
            config_file: Path to YAML/JSON config file

        Returns:
            ConfigFileSettings instance
"""Type definitions for REST API endpoints.
# Reusable field definitions shared by the REST API request models.
# NOTE(review): `Field` is not imported in this chunk — presumably pydantic.Field.

# Sampling temperature, clamped to [0.0, 2.0]; 0.0 is (near-)deterministic.
temperature_field = Field(
    default=0.8,
    ge=0.0,
    le=2.0,
    description="Sampling temperature"
)
# Nucleus (top-p) sampling cutoff, clamped to [0.0, 1.0].
top_p_field = Field(
    default=0.95,
    ge=0.0,
    le=1.0,
    description="Nucleus sampling parameter"
)
# Maximum number of tokens to generate; must be at least 1.
max_tokens_field = Field(
    default=16,
    ge=1,
    description="Maximum tokens to generate"
)
# Whether the response is streamed (server-sent events) or returned whole.
stream_field = Field(
    default=False,
    description="Enable streaming response"
)
# Sequences that terminate generation when produced.
stop_field = Field(
    default=None,
    description="Stop sequences for generation"
)
# Model name echoed back in response metadata.
model_field = Field(
    default=None,
    description="Model name for response metadata"
)
# OpenAI-style frequency penalty, clamped to [-2.0, 2.0].
frequency_penalty_field = Field(
    default=0.0,
    ge=-2.0,
    le=2.0,
    description="Frequency penalty for token repetition"
)
# OpenAI-style presence penalty, clamped to [-2.0, 2.0].
# (Closing parenthesis follows on the next line.)
presence_penalty_field = Field(
    default=0.0,
    ge=-2.0,
    le=2.0,
    description="Presence penalty for new topics"
)

from llama_cpp.server.settings import Settings
import uvicorn

# Example: minimal single-model server configuration.
settings = Settings(
    model="./models/llama-2-7b-chat.gguf",
    host="0.0.0.0",  # Allow external connections
    port=8000,
    n_ctx=2048,
    n_gpu_layers=35,  # Offload to GPU
    chat_format="llama-2",
)

# This would typically be handled by the server startup script
print(f"Server configured to run on {settings.host}:{settings.port}")
print(f"Model: {settings.model}")
print(f"Context size: {settings.n_ctx}")
print(f"GPU layers: {settings.n_gpu_layers}")import yaml
from llama_cpp.server.settings import ConfigFileSettings

# Example: build a multi-model configuration file, then load it back.
# Top-level keys ("host", "port", ...) are server settings; each entry
# under "models" is a ModelSettings payload.
config = {
    "models": [
        {
            "model": "./models/llama-2-7b-chat.gguf",
            "model_alias": "llama-7b",
            "n_ctx": 2048,
            "n_gpu_layers": 35,
            "chat_format": "llama-2",
        },
        {
            "model": "./models/mistral-7b-instruct.gguf",
            "model_alias": "mistral-7b",
            "n_ctx": 4096,
            "n_gpu_layers": 35,
            "chat_format": "mistral-instruct",
        },
        {
            "model": "./models/codellama-13b.gguf",
            "model_alias": "codellama-13b",
            "n_ctx": 2048,
            "n_gpu_layers": 40,
            "chat_format": "codellama-instruct",
        }
    ],
    "host": "0.0.0.0",
    "port": 8000,
    "interrupt_requests": True,
}

# Save configuration file
with open("server_config.yaml", "w") as f:
    yaml.dump(config, f)

# Load configuration
config_settings = ConfigFileSettings.from_file("server_config.yaml")
print(f"Loaded {len(config_settings.models)} model configurations")from llama_cpp.server.settings import Settings
# Production-ready configuration: larger context, GPU offload, disk cache,
# minimal logging.
production_settings = Settings(
    model="./models/production-model.gguf",
    host="0.0.0.0",
    port=8080,
    # Performance settings
    n_ctx=4096,
    n_threads=16,
    n_gpu_layers=50,
    n_batch=512,
    # Memory optimization
    use_mmap=True,
    use_mlock=True,
    f16_kv=True,
    # Caching
    cache=True,
    cache_type="disk",
    cache_size=4 << 30,  # 4GB cache
    # Security
    interrupt_requests=True,
    # Logging
    verbose=False,
)

print("Production server configuration:")
print(f"- Host: {production_settings.host}:{production_settings.port}")
print(f"- Context: {production_settings.n_ctx} tokens")
print(f"- GPU layers: {production_settings.n_gpu_layers}")
print(f"- Cache: {production_settings.cache_type} ({production_settings.cache_size // (1024**3)}GB)")# Development configuration with debugging
# Development configuration: small CPU-only model, verbose output, no cache.
dev_settings = Settings(
    model="./models/small-model.gguf",
    host="127.0.0.1",  # Local only
    port=8000,
    # Smaller model for faster iteration
    n_ctx=1024,
    n_threads=4,
    n_gpu_layers=0,  # CPU only for debugging
    # Debug settings
    verbose=True,
    logits_all=True,  # For debugging token probabilities
    # No caching for development
    cache=False,
)

print("Development server configuration:")
print(f"- Local access only: {dev_settings.host}:{dev_settings.port}")
print(f"- CPU-only processing")
print(f"- Verbose logging enabled")# Server with custom chat format
# Server with a custom chat format, vision projector, and a LoRA adapter.
custom_chat_settings = Settings(
    model="./models/custom-model.gguf",
    host="0.0.0.0",
    port=8000,
    n_ctx=2048,
    # Custom format
    chat_format="custom",  # Requires custom handler registration
    # Vision support
    clip_model_path="./models/vision-projector.gguf",
    # LoRA adapter
    lora_path="./adapters/domain-specific-lora.bin",
    lora_scale=0.8,
)

print("Custom model server configuration:")
print(f"- Chat format: {custom_chat_settings.chat_format}")
print(f"- Vision support: {'Yes' if custom_chat_settings.clip_model_path else 'No'}")
print(f"- LoRA adapter: {custom_chat_settings.lora_path}")import os
from llama_cpp.server.settings import Settings

# Twelve-factor style: every knob comes from an environment variable,
# with a hard-coded fallback default.
env_settings = Settings(
    model=os.getenv("LLAMA_MODEL_PATH", "./models/default.gguf"),
    host=os.getenv("LLAMA_HOST", "127.0.0.1"),
    port=int(os.getenv("LLAMA_PORT", "8000")),
    n_ctx=int(os.getenv("LLAMA_N_CTX", "2048")),
    n_gpu_layers=int(os.getenv("LLAMA_N_GPU_LAYERS", "0")),
    n_threads=int(os.getenv("LLAMA_N_THREADS", "4")),
    chat_format=os.getenv("LLAMA_CHAT_FORMAT", "llama-2"),
    # Any value other than "true" (case-insensitive) disables verbosity.
    verbose=os.getenv("LLAMA_VERBOSE", "false").lower() == "true",
)

print("Environment-based configuration:")
print(f"- Model: {env_settings.model}")
print(f"- Server: {env_settings.host}:{env_settings.port}")
print(f"- GPU layers: {env_settings.n_gpu_layers}")
print(f"- Chat format: {env_settings.chat_format}")# Server configuration with health monitoring
# Server configuration tuned for responsiveness under health monitoring.
monitoring_settings = Settings(
    model="./models/model.gguf",
    host="0.0.0.0",
    port=8000,
    # Enable request interruption for health checks
    interrupt_requests=True,
    # Optimized for responsiveness
    n_ctx=1024,
    n_batch=128,
    # Minimal logging for production
    verbose=False,
)

# Example health check endpoint configuration (illustrative dict only;
# wiring it to an actual endpoint is not shown here).
health_check_config = {
    "endpoint": "/health",
    "timeout": 5.0,
    "check_model_loaded": True,
    "check_memory_usage": True,
    "max_memory_percent": 90,
}

print("Health monitoring configuration:")
print(f"- Health endpoint: {health_check_config['endpoint']}")
print(f"- Timeout: {health_check_config['timeout']}s")
print(f"- Memory limit: {health_check_config['max_memory_percent']}%")# Multiple server instances for load balancing
# Build three replica configurations, alternating GPUs, for load balancing.
servers = []
base_port = 8000

for i in range(3):  # 3 server instances
    server_settings = Settings(
        model=f"./models/model-replica-{i}.gguf",
        host="127.0.0.1",
        port=base_port + i,
        # Distributed GPU usage
        main_gpu=i % 2,  # Alternate between GPUs
        n_gpu_layers=30,
        # Instance-specific settings
        n_ctx=2048,
        n_threads=8,
        # Consistent behavior
        seed=42,  # Fixed seed for reproducibility
        # NOTE(review): `temperature` is not a declared ModelSettings field
        # above — it is absorbed by **kwargs; confirm it is honored.
        temperature=0.7,
    )
    servers.append(server_settings)
    print(f"Server {i+1}: port {server_settings.port}, GPU {server_settings.main_gpu}")

# Load balancer would distribute requests across these instances
load_balancer_config = {
    "strategy": "round_robin",
    "health_check_interval": 30,
    "retry_attempts": 3,
    "timeout": 30.0,
}
print(f"Load balancer: {load_balancer_config['strategy']} across {len(servers)} instances")# Configuration optimized for Docker deployment
# Configuration optimized for running inside a Docker container.
docker_settings = Settings(
    model="/app/models/model.gguf",  # Container path
    host="0.0.0.0",  # Bind to all interfaces
    port=8000,
    # Container resource limits
    n_ctx=2048,
    n_threads=None,  # Auto-detect container CPU limits
    n_gpu_layers=40,  # Assume GPU availability
    # Container-friendly settings
    use_mmap=True,  # Efficient memory usage
    verbose=False,  # Reduce log volume
    # Caching in container
    cache=True,
    cache_type="ram",  # Avoid persistent storage issues
    cache_size=1 << 30,  # 1GB RAM cache
)

# Environment variables for Docker (mirrors the env-based example above)
docker_env = {
    "LLAMA_MODEL_PATH": docker_settings.model,
    "LLAMA_HOST": docker_settings.host,
    "LLAMA_PORT": str(docker_settings.port),
    "LLAMA_N_CTX": str(docker_settings.n_ctx),
    "LLAMA_N_GPU_LAYERS": str(docker_settings.n_gpu_layers),
    "LLAMA_CACHE_SIZE": str(docker_settings.cache_size),
}

print("Docker deployment configuration:")
for key, value in docker_env.items():
print(f"- {key}={value}")Install with Tessl CLI
npx tessl i tessl/pypi-llama-cpp-python