CtrlK
BlogDocsLog inGet started
Tessl Logo

tessl/pypi-llama-cpp-python

Python bindings for the llama.cpp library providing high-performance LLM inference with OpenAI-compatible APIs.

Pending
Overview
Eval results
Files

docs/server.md

Server Components

FastAPI-based web server with OpenAI-compatible endpoints, settings management, and multi-model configuration support for production deployments and REST API access.

Capabilities

Server Settings

Configure web server parameters and hosting options.

class ServerSettings:
    """Web-server configuration for the OpenAI-compatible FastAPI app."""

    # Class-level defaults; instances override them in __init__.
    host: str = "127.0.0.1"
    port: int = 8000
    interrupt_requests: bool = True

    def __init__(
        self,
        host: str = "127.0.0.1",
        port: int = 8000,
        interrupt_requests: bool = True,
        **kwargs,
    ):
        """
        Initialize server configuration.

        Args:
            host: Server bind address.
            port: Server port number.
            interrupt_requests: Allow in-flight requests to be interrupted.
            **kwargs: Additional options, stored as attributes for forward
                compatibility.
        """
        # Original stub never assigned these, so constructor arguments were
        # silently ignored and reads always fell back to class defaults.
        self.host = host
        self.port = port
        self.interrupt_requests = interrupt_requests
        for key, value in kwargs.items():
            setattr(self, key, value)

Model Settings

Configure model parameters for server deployment.

class ModelSettings:
    """Configuration for a single model served by the web server.

    All class attributes act as defaults; ``model`` is the only required
    value. Keyword overrides passed to the constructor take precedence.
    """

    # Model identity
    model: str
    model_alias: Optional[str] = None
    # Context and threading
    n_ctx: int = 2048
    n_threads: Optional[int] = None
    # GPU placement
    n_gpu_layers: int = 0
    main_gpu: int = 0
    tensor_split: Optional[List[float]] = None
    # Loading behavior
    vocab_only: bool = False
    use_mmap: bool = True
    use_mlock: bool = False
    kv_overrides: Optional[Dict[str, Union[bool, int, float]]] = None
    # Sampling / batching
    seed: int = 0xFFFFFFFF  # 0xFFFFFFFF means "random seed" in llama.cpp
    n_batch: int = 512
    n_threads_batch: Optional[int] = None
    # RoPE / YaRN scaling
    rope_scaling_type: int = -1
    rope_freq_base: float = 0.0
    rope_freq_scale: float = 0.0
    yarn_ext_factor: float = -1.0
    yarn_attn_factor: float = 1.0
    yarn_beta_fast: float = 32.0
    yarn_beta_slow: float = 1.0
    yarn_orig_ctx: int = 0
    # Kernel / memory options
    mul_mat_q: bool = True
    f16_kv: bool = True
    logits_all: bool = False
    embedding: bool = False
    offload_kqv: bool = True
    flash_attn: bool = False
    last_n_tokens_size: int = 64
    # LoRA adapters
    lora_base: Optional[str] = None
    lora_scale: float = 1.0
    lora_path: Optional[str] = None
    # NUMA, chat handling, auxiliary models
    numa: Union[bool, int] = False
    chat_format: Optional[str] = None
    chat_handler: Optional[object] = None
    draft_model: Optional[object] = None
    tokenizer: Optional[object] = None
    hf_pretrained_model_name_or_path: Optional[str] = None
    hf_model_repo_id: Optional[str] = None
    clip_model_path: Optional[str] = None
    # Response caching
    cache: bool = False
    cache_type: str = "ram"
    cache_size: int = 2 << 30  # 2 GiB
    verbose: bool = True

    def __init__(
        self,
        model: str,
        **kwargs,
    ):
        """
        Initialize model configuration.

        Args:
            model: Path to the model file.
            **kwargs: Any of the class-level settings above; values override
                the class defaults on this instance.
        """
        # Original stub never stored ``model`` (reading it would raise
        # AttributeError) and dropped all keyword overrides.
        self.model = model
        for key, value in kwargs.items():
            setattr(self, key, value)

Combined Settings

Unified configuration combining server and model settings.

class Settings(ServerSettings, ModelSettings):
    """Unified configuration combining server and model settings."""

    def __init__(
        self,
        model: str,
        **kwargs,
    ):
        """
        Combined server and model settings.

        Args:
            model: Path to the model file.
            **kwargs: Any server (host, port, ...) or model (n_ctx,
                n_gpu_layers, ...) parameter; stored as instance attributes.
        """
        # Assign directly rather than delegating to the parents so the
        # constructor works even with stub base implementations.
        self.model = model
        for key, value in kwargs.items():
            setattr(self, key, value)

Multi-Model Configuration

Configuration from file for serving multiple models.

class ConfigFileSettings:
    """Multi-model server configuration loaded from a YAML or JSON file."""

    config_file: str
    models: List[ModelSettings]

    def __init__(
        self,
        config_file: str,
        **kwargs,
    ):
        """
        Initialize configuration from file data.

        Args:
            config_file: Path to the configuration file.
            **kwargs: Parsed configuration values; a ``models`` key (list of
                dicts or ModelSettings) is converted to ModelSettings
                instances, everything else is stored as an attribute.
        """
        self.config_file = config_file
        raw_models = kwargs.pop("models", [])
        self.models = [
            m if isinstance(m, ModelSettings) else ModelSettings(**m)
            for m in raw_models
        ]
        for key, value in kwargs.items():
            setattr(self, key, value)

    @classmethod
    def from_file(cls, config_file: str) -> "ConfigFileSettings":
        """
        Load configuration from a YAML or JSON file.

        Args:
            config_file: Path to YAML/JSON config file.

        Returns:
            ConfigFileSettings instance built from the file contents.
        """
        import json

        with open(config_file, "r", encoding="utf-8") as f:
            text = f.read()
        try:
            # YAML is a superset of JSON, so prefer it when available.
            import yaml  # type: ignore
            data = yaml.safe_load(text)
        except ImportError:
            data = json.loads(text)
        return cls(config_file=config_file, **(data or {}))

Request/Response Models

Type definitions for REST API endpoints.

# NOTE(review): these look like pydantic ``Field`` definitions shared by the
# OpenAI-compatible request models; ``Field`` is presumably imported from
# pydantic elsewhere in the file — confirm. ``ge``/``le`` are inclusive
# lower/upper bounds enforced at validation time.

# Temperature field definition (sampling randomness; higher = more random)
temperature_field = Field(
    default=0.8, 
    ge=0.0, 
    le=2.0, 
    description="Sampling temperature"
)

# Top-p field definition (nucleus sampling cutoff)
top_p_field = Field(
    default=0.95,
    ge=0.0,
    le=1.0,
    description="Nucleus sampling parameter"
)

# Max tokens field definition (generation length cap; must be at least 1)
max_tokens_field = Field(
    default=16,
    ge=1,
    description="Maximum tokens to generate"
)

# Stream field definition (SSE streaming toggle)
stream_field = Field(
    default=False,
    description="Enable streaming response"
)

# Stop field definition (None = no stop sequences)
stop_field = Field(
    default=None,
    description="Stop sequences for generation"
)

# Model field definition (optional model name echoed in response metadata)
model_field = Field(
    default=None,
    description="Model name for response metadata"
)

# Frequency penalty field definition (OpenAI-compatible range [-2, 2])
frequency_penalty_field = Field(
    default=0.0,
    ge=-2.0,
    le=2.0,
    description="Frequency penalty for token repetition"
)

# Presence penalty field definition (OpenAI-compatible range [-2, 2])
presence_penalty_field = Field(
    default=0.0,
    ge=-2.0,
    le=2.0,
    description="Presence penalty for new topics"
)

Usage Examples

Basic Server Setup

from llama_cpp.server.settings import Settings
import uvicorn

# Assemble the configuration for a single chat model served over HTTP.
config = Settings(
    model="./models/llama-2-7b-chat.gguf",
    host="0.0.0.0",  # Allow external connections
    port=8000,
    n_ctx=2048,
    n_gpu_layers=35,  # Offload to GPU
    chat_format="llama-2",
)

# This would typically be handled by the server startup script
print(f"Server configured to run on {config.host}:{config.port}")
print(f"Model: {config.model}")
print(f"Context size: {config.n_ctx}")
print(f"GPU layers: {config.n_gpu_layers}")

Multi-Model Configuration

import yaml
from llama_cpp.server.settings import ConfigFileSettings


def _model_entry(path, alias, ctx, gpu_layers, fmt):
    """One model record for the multi-model config file."""
    return {
        "model": path,
        "model_alias": alias,
        "n_ctx": ctx,
        "n_gpu_layers": gpu_layers,
        "chat_format": fmt,
    }


# Multi-model configuration: three models plus server-level options.
server_config = {
    "models": [
        _model_entry("./models/llama-2-7b-chat.gguf", "llama-7b", 2048, 35, "llama-2"),
        _model_entry("./models/mistral-7b-instruct.gguf", "mistral-7b", 4096, 35, "mistral-instruct"),
        _model_entry("./models/codellama-13b.gguf", "codellama-13b", 2048, 40, "codellama-instruct"),
    ],
    "host": "0.0.0.0",
    "port": 8000,
    "interrupt_requests": True,
}

# Persist the configuration to disk...
with open("server_config.yaml", "w") as fh:
    yaml.dump(server_config, fh)

# ...then load it back through the settings API.
config_settings = ConfigFileSettings.from_file("server_config.yaml")
print(f"Loaded {len(config_settings.models)} model configurations")

Production Server Configuration

from llama_cpp.server.settings import Settings

# Production-ready configuration, assembled from option groups by concern.
_performance = dict(n_ctx=4096, n_threads=16, n_gpu_layers=50, n_batch=512)
_memory = dict(use_mmap=True, use_mlock=True, f16_kv=True)
_caching = dict(cache=True, cache_type="disk", cache_size=4 << 30)  # 4GB cache

production_settings = Settings(
    model="./models/production-model.gguf",
    host="0.0.0.0",
    port=8080,
    interrupt_requests=True,  # Security: allow cancelling in-flight requests
    verbose=False,  # Keep production logs quiet
    **_performance,
    **_memory,
    **_caching,
)

print("Production server configuration:")
print(f"- Host: {production_settings.host}:{production_settings.port}")
print(f"- Context: {production_settings.n_ctx} tokens")
print(f"- GPU layers: {production_settings.n_gpu_layers}")
print(f"- Cache: {production_settings.cache_type} ({production_settings.cache_size // (1024**3)}GB)")

Development Server Configuration

# Development configuration: small model, CPU only, maximum visibility.
dev_settings = Settings(
    model="./models/small-model.gguf",
    host="127.0.0.1",  # Local only
    port=8000,
    n_ctx=1024,  # Smaller context for faster iteration
    n_threads=4,
    n_gpu_layers=0,  # CPU only for debugging
    verbose=True,  # Full logging while developing
    logits_all=True,  # For debugging token probabilities
    cache=False,  # No caching for development
)

print("Development server configuration:")
print(f"- Local access only: {dev_settings.host}:{dev_settings.port}")
print("- CPU-only processing")
print("- Verbose logging enabled")

Custom Chat Format Configuration

# Server using a custom chat template plus vision and LoRA extensions.
custom_chat_settings = Settings(
    model="./models/custom-model.gguf",
    host="0.0.0.0",
    port=8000,
    n_ctx=2048,
    chat_format="custom",  # Requires custom handler registration
    clip_model_path="./models/vision-projector.gguf",  # Vision support
    lora_path="./adapters/domain-specific-lora.bin",  # LoRA adapter
    lora_scale=0.8,
)

print("Custom model server configuration:")
print(f"- Chat format: {custom_chat_settings.chat_format}")
vision = "Yes" if custom_chat_settings.clip_model_path else "No"
print(f"- Vision support: {vision}")
print(f"- LoRA adapter: {custom_chat_settings.lora_path}")

Environment-Based Configuration

import os
from llama_cpp.server.settings import Settings


def _int_env(name, fallback):
    """Read an integer setting from the environment, with a default."""
    return int(os.getenv(name, fallback))


# Every parameter falls back to a sane default when the variable is unset.
env_settings = Settings(
    model=os.getenv("LLAMA_MODEL_PATH", "./models/default.gguf"),
    host=os.getenv("LLAMA_HOST", "127.0.0.1"),
    port=_int_env("LLAMA_PORT", "8000"),
    n_ctx=_int_env("LLAMA_N_CTX", "2048"),
    n_gpu_layers=_int_env("LLAMA_N_GPU_LAYERS", "0"),
    n_threads=_int_env("LLAMA_N_THREADS", "4"),
    chat_format=os.getenv("LLAMA_CHAT_FORMAT", "llama-2"),
    verbose=os.getenv("LLAMA_VERBOSE", "false").lower() == "true",
)

print("Environment-based configuration:")
print(f"- Model: {env_settings.model}")
print(f"- Server: {env_settings.host}:{env_settings.port}")
print(f"- GPU layers: {env_settings.n_gpu_layers}")
print(f"- Chat format: {env_settings.chat_format}")

Health Check Configuration

# Server tuned for health monitoring: small batches, interruptible requests.
monitoring_settings = Settings(
    model="./models/model.gguf",
    host="0.0.0.0",
    port=8000,
    interrupt_requests=True,  # Health checks can preempt long requests
    n_ctx=1024,  # Optimized for responsiveness
    n_batch=128,
    verbose=False,  # Minimal logging for production
)

# Example health check endpoint configuration
health_check_config = dict(
    endpoint="/health",
    timeout=5.0,
    check_model_loaded=True,
    check_memory_usage=True,
    max_memory_percent=90,
)

print("Health monitoring configuration:")
print(f"- Health endpoint: {health_check_config['endpoint']}")
print(f"- Timeout: {health_check_config['timeout']}s")
print(f"- Memory limit: {health_check_config['max_memory_percent']}%")

Load Balancer Configuration

# Multiple server instances for load balancing
servers = []

base_port = 8000
for i in range(3):  # 3 server instances
    server_settings = Settings(
        model=f"./models/model-replica-{i}.gguf",
        host="127.0.0.1",
        port=base_port + i,

        # Distributed GPU usage
        main_gpu=i % 2,  # Alternate between GPUs
        n_gpu_layers=30,

        # Instance-specific settings
        n_ctx=2048,
        n_threads=8,

        # Consistent behavior
        seed=42,  # Fixed seed for reproducibility
        # NOTE: ``temperature`` was removed here — it is a per-request
        # sampling parameter (see temperature_field), not a server/model
        # setting, and is not accepted by Settings.
    )

    servers.append(server_settings)
    print(f"Server {i+1}: port {server_settings.port}, GPU {server_settings.main_gpu}")

# Load balancer would distribute requests across these instances
load_balancer_config = {
    "strategy": "round_robin",
    "health_check_interval": 30,
    "retry_attempts": 3,
    "timeout": 30.0,
}

print(f"Load balancer: {load_balancer_config['strategy']} across {len(servers)} instances")

Docker Deployment Configuration

# Configuration tuned for running inside a Docker container.
docker_settings = Settings(
    model="/app/models/model.gguf",  # Container path
    host="0.0.0.0",  # Bind to all interfaces
    port=8000,
    n_ctx=2048,  # Container resource limits
    n_threads=None,  # Auto-detect container CPU limits
    n_gpu_layers=40,  # Assume GPU availability
    use_mmap=True,  # Efficient memory usage
    verbose=False,  # Reduce log volume
    cache=True,  # Caching in container
    cache_type="ram",  # Avoid persistent storage issues
    cache_size=1 << 30,  # 1GB RAM cache
)

# Derive the container's environment variables from the settings object.
_env_pairs = [
    ("LLAMA_MODEL_PATH", docker_settings.model),
    ("LLAMA_HOST", docker_settings.host),
    ("LLAMA_PORT", docker_settings.port),
    ("LLAMA_N_CTX", docker_settings.n_ctx),
    ("LLAMA_N_GPU_LAYERS", docker_settings.n_gpu_layers),
    ("LLAMA_CACHE_SIZE", docker_settings.cache_size),
]
docker_env = {name: str(value) for name, value in _env_pairs}

print("Docker deployment configuration:")
for key, value in docker_env.items():
    print(f"- {key}={value}")

Install with Tessl CLI

npx tessl i tessl/pypi-llama-cpp-python

docs

caching.md

chat-completion.md

grammar.md

index.md

llama-model.md

low-level.md

server.md

tokenization.md

vision.md

tile.json