# Server Components

FastAPI-based web server with OpenAI-compatible endpoints, settings management, and multi-model configuration support for production deployments and REST API access.

## Capabilities

### Server Settings

Configure web server parameters and hosting options.

```python { .api }
class ServerSettings:
    host: str = "127.0.0.1"
    port: int = 8000
    interrupt_requests: bool = True

    def __init__(
        self,
        host: str = "127.0.0.1",
        port: int = 8000,
        interrupt_requests: bool = True,
        **kwargs
    ):
        """
        Initialize server configuration.

        Args:
            host: Server bind address
            port: Server port number
            interrupt_requests: Allow request interruption
        """
```

### Model Settings

Configure model parameters for server deployment.

```python { .api }
class ModelSettings:
    model: str
    model_alias: Optional[str] = None
    n_ctx: int = 2048
    n_threads: Optional[int] = None
    n_gpu_layers: int = 0
    main_gpu: int = 0
    tensor_split: Optional[List[float]] = None
    vocab_only: bool = False
    use_mmap: bool = True
    use_mlock: bool = False
    kv_overrides: Optional[Dict[str, Union[bool, int, float]]] = None
    seed: int = 0xFFFFFFFF
    n_batch: int = 512
    n_threads_batch: Optional[int] = None
    rope_scaling_type: int = -1
    rope_freq_base: float = 0.0
    rope_freq_scale: float = 0.0
    yarn_ext_factor: float = -1.0
    yarn_attn_factor: float = 1.0
    yarn_beta_fast: float = 32.0
    yarn_beta_slow: float = 1.0
    yarn_orig_ctx: int = 0
    mul_mat_q: bool = True
    f16_kv: bool = True
    logits_all: bool = False
    embedding: bool = False
    offload_kqv: bool = True
    flash_attn: bool = False
    last_n_tokens_size: int = 64
    lora_base: Optional[str] = None
    lora_scale: float = 1.0
    lora_path: Optional[str] = None
    numa: Union[bool, int] = False
    chat_format: Optional[str] = None
    chat_handler: Optional[object] = None
    draft_model: Optional[object] = None
    tokenizer: Optional[object] = None
    hf_pretrained_model_name_or_path: Optional[str] = None
    hf_model_repo_id: Optional[str] = None
    clip_model_path: Optional[str] = None
    cache: bool = False
    cache_type: str = "ram"
    cache_size: int = 2 << 30
    verbose: bool = True

    def __init__(
        self,
        model: str,
        **kwargs
    ):
        """
        Initialize model configuration.

        Args:
            model: Path to model file
            **kwargs: Additional model parameters
        """
```

### Combined Settings

Unified configuration combining server and model settings.

```python { .api }
class Settings(ServerSettings, ModelSettings):
    def __init__(
        self,
        model: str,
        **kwargs
    ):
        """
        Combined server and model settings.

        Args:
            model: Path to model file
            **kwargs: Server and model parameters
        """
```

### Multi-Model Configuration

Configuration from file for serving multiple models.

```python { .api }
class ConfigFileSettings:
    config_file: str
    models: List[ModelSettings]

    def __init__(
        self,
        config_file: str,
        **kwargs
    ):
        """
        Initialize configuration from file.

        Args:
            config_file: Path to configuration file
        """

    @classmethod
    def from_file(cls, config_file: str) -> "ConfigFileSettings":
        """
        Load configuration from file.

        Args:
            config_file: Path to YAML/JSON config file

        Returns:
            ConfigFileSettings instance
        """
```

### Request/Response Models

Type definitions for REST API endpoints.

```python { .api }
# Temperature field definition
temperature_field = Field(
    default=0.8,
    ge=0.0,
    le=2.0,
    description="Sampling temperature"
)

# Top-p field definition
top_p_field = Field(
    default=0.95,
    ge=0.0,
    le=1.0,
    description="Nucleus sampling parameter"
)

# Max tokens field definition
max_tokens_field = Field(
    default=16,
    ge=1,
    description="Maximum tokens to generate"
)

# Stream field definition
stream_field = Field(
    default=False,
    description="Enable streaming response"
)

# Stop field definition
stop_field = Field(
    default=None,
    description="Stop sequences for generation"
)

# Model field definition
model_field = Field(
    default=None,
    description="Model name for response metadata"
)

# Frequency penalty field definition
frequency_penalty_field = Field(
    default=0.0,
    ge=-2.0,
    le=2.0,
    description="Frequency penalty for token repetition"
)

# Presence penalty field definition
presence_penalty_field = Field(
    default=0.0,
    ge=-2.0,
    le=2.0,
    description="Presence penalty for new topics"
)
```

## Usage Examples

### Basic Server Setup

```python
from llama_cpp.server.settings import Settings
import uvicorn

# Create server configuration
settings = Settings(
    model="./models/llama-2-7b-chat.gguf",
    host="0.0.0.0",  # Allow external connections
    port=8000,
    n_ctx=2048,
    n_gpu_layers=35,  # Offload to GPU
    chat_format="llama-2",
)

# This would typically be handled by the server startup script
print(f"Server configured to run on {settings.host}:{settings.port}")
print(f"Model: {settings.model}")
print(f"Context size: {settings.n_ctx}")
print(f"GPU layers: {settings.n_gpu_layers}")
```

### Multi-Model Configuration

```python
import yaml
from llama_cpp.server.settings import ConfigFileSettings

# Create multi-model configuration file
config = {
    "models": [
        {
            "model": "./models/llama-2-7b-chat.gguf",
            "model_alias": "llama-7b",
            "n_ctx": 2048,
            "n_gpu_layers": 35,
            "chat_format": "llama-2",
        },
        {
            "model": "./models/mistral-7b-instruct.gguf",
            "model_alias": "mistral-7b",
            "n_ctx": 4096,
            "n_gpu_layers": 35,
            "chat_format": "mistral-instruct",
        },
        {
            "model": "./models/codellama-13b.gguf",
            "model_alias": "codellama-13b",
            "n_ctx": 2048,
            "n_gpu_layers": 40,
            "chat_format": "codellama-instruct",
        }
    ],
    "host": "0.0.0.0",
    "port": 8000,
    "interrupt_requests": True,
}

# Save configuration file
with open("server_config.yaml", "w") as f:
    yaml.dump(config, f)

# Load configuration
config_settings = ConfigFileSettings.from_file("server_config.yaml")
print(f"Loaded {len(config_settings.models)} model configurations")
```

### Production Server Configuration

```python
from llama_cpp.server.settings import Settings

# Production-ready configuration
production_settings = Settings(
    model="./models/production-model.gguf",
    host="0.0.0.0",
    port=8080,

    # Performance settings
    n_ctx=4096,
    n_threads=16,
    n_gpu_layers=50,
    n_batch=512,

    # Memory optimization
    use_mmap=True,
    use_mlock=True,
    f16_kv=True,

    # Caching
    cache=True,
    cache_type="disk",
    cache_size=4 << 30,  # 4GB cache

    # Security
    interrupt_requests=True,

    # Logging
    verbose=False,
)

print("Production server configuration:")
print(f"- Host: {production_settings.host}:{production_settings.port}")
print(f"- Context: {production_settings.n_ctx} tokens")
print(f"- GPU layers: {production_settings.n_gpu_layers}")
print(f"- Cache: {production_settings.cache_type} ({production_settings.cache_size // (1024**3)}GB)")
```

### Development Server Configuration

```python
# Development configuration with debugging
dev_settings = Settings(
    model="./models/small-model.gguf",
    host="127.0.0.1",  # Local only
    port=8000,

    # Smaller model for faster iteration
    n_ctx=1024,
    n_threads=4,
    n_gpu_layers=0,  # CPU only for debugging

    # Debug settings
    verbose=True,
    logits_all=True,  # For debugging token probabilities

    # No caching for development
    cache=False,
)

print("Development server configuration:")
print(f"- Local access only: {dev_settings.host}:{dev_settings.port}")
print(f"- CPU-only processing")
print(f"- Verbose logging enabled")
```

### Custom Chat Format Configuration

```python
# Server with custom chat format
custom_chat_settings = Settings(
    model="./models/custom-model.gguf",
    host="0.0.0.0",
    port=8000,
    n_ctx=2048,

    # Custom format
    chat_format="custom",  # Requires custom handler registration

    # Vision support
    clip_model_path="./models/vision-projector.gguf",

    # LoRA adapter
    lora_path="./adapters/domain-specific-lora.bin",
    lora_scale=0.8,
)

print("Custom model server configuration:")
print(f"- Chat format: {custom_chat_settings.chat_format}")
print(f"- Vision support: {'Yes' if custom_chat_settings.clip_model_path else 'No'}")
print(f"- LoRA adapter: {custom_chat_settings.lora_path}")
```

### Environment-Based Configuration

```python
import os
from llama_cpp.server.settings import Settings

# Configuration from environment variables
env_settings = Settings(
    model=os.getenv("LLAMA_MODEL_PATH", "./models/default.gguf"),
    host=os.getenv("LLAMA_HOST", "127.0.0.1"),
    port=int(os.getenv("LLAMA_PORT", "8000")),
    n_ctx=int(os.getenv("LLAMA_N_CTX", "2048")),
    n_gpu_layers=int(os.getenv("LLAMA_N_GPU_LAYERS", "0")),
    n_threads=int(os.getenv("LLAMA_N_THREADS", "4")),
    chat_format=os.getenv("LLAMA_CHAT_FORMAT", "llama-2"),
    verbose=os.getenv("LLAMA_VERBOSE", "false").lower() == "true",
)

print("Environment-based configuration:")
print(f"- Model: {env_settings.model}")
print(f"- Server: {env_settings.host}:{env_settings.port}")
print(f"- GPU layers: {env_settings.n_gpu_layers}")
print(f"- Chat format: {env_settings.chat_format}")
```

### Health Check Configuration

```python
# Server configuration with health monitoring
monitoring_settings = Settings(
    model="./models/model.gguf",
    host="0.0.0.0",
    port=8000,

    # Enable request interruption for health checks
    interrupt_requests=True,

    # Optimized for responsiveness
    n_ctx=1024,
    n_batch=128,

    # Minimal logging for production
    verbose=False,
)

# Example health check endpoint configuration
health_check_config = {
    "endpoint": "/health",
    "timeout": 5.0,
    "check_model_loaded": True,
    "check_memory_usage": True,
    "max_memory_percent": 90,
}

print("Health monitoring configuration:")
print(f"- Health endpoint: {health_check_config['endpoint']}")
print(f"- Timeout: {health_check_config['timeout']}s")
print(f"- Memory limit: {health_check_config['max_memory_percent']}%")
```

### Load Balancer Configuration

```python
# Multiple server instances for load balancing
servers = []

base_port = 8000
for i in range(3):  # 3 server instances
    server_settings = Settings(
        model=f"./models/model-replica-{i}.gguf",
        host="127.0.0.1",
        port=base_port + i,

        # Distributed GPU usage
        main_gpu=i % 2,  # Alternate between GPUs
        n_gpu_layers=30,

        # Instance-specific settings
        n_ctx=2048,
        n_threads=8,

        # Consistent behavior
        seed=42,  # Fixed seed for reproducibility
        temperature=0.7,
    )

    servers.append(server_settings)
    print(f"Server {i+1}: port {server_settings.port}, GPU {server_settings.main_gpu}")

# Load balancer would distribute requests across these instances
load_balancer_config = {
    "strategy": "round_robin",
    "health_check_interval": 30,
    "retry_attempts": 3,
    "timeout": 30.0,
}

print(f"Load balancer: {load_balancer_config['strategy']} across {len(servers)} instances")
```

### Docker Deployment Configuration

```python
# Configuration optimized for Docker deployment
docker_settings = Settings(
    model="/app/models/model.gguf",  # Container path
    host="0.0.0.0",  # Bind to all interfaces
    port=8000,

    # Container resource limits
    n_ctx=2048,
    n_threads=None,  # Auto-detect container CPU limits
    n_gpu_layers=40,  # Assume GPU availability

    # Container-friendly settings
    use_mmap=True,  # Efficient memory usage
    verbose=False,  # Reduce log volume

    # Caching in container
    cache=True,
    cache_type="ram",  # Avoid persistent storage issues
    cache_size=1 << 30,  # 1GB RAM cache
)

# Environment variables for Docker
docker_env = {
    "LLAMA_MODEL_PATH": docker_settings.model,
    "LLAMA_HOST": docker_settings.host,
    "LLAMA_PORT": str(docker_settings.port),
    "LLAMA_N_CTX": str(docker_settings.n_ctx),
    "LLAMA_N_GPU_LAYERS": str(docker_settings.n_gpu_layers),
    "LLAMA_CACHE_SIZE": str(docker_settings.cache_size),
}

print("Docker deployment configuration:")
for key, value in docker_env.items():
    print(f"- {key}={value}")
```