# Server Components

FastAPI-based web server with OpenAI-compatible endpoints, settings management, and multi-model configuration support for production deployments and REST API access.

## Capabilities

### Server Settings

Configure web server parameters and hosting options.

```python { .api }
class ServerSettings:
    host: str = "127.0.0.1"
    port: int = 8000
    interrupt_requests: bool = True

    def __init__(
        self,
        host: str = "127.0.0.1",
        port: int = 8000,
        interrupt_requests: bool = True,
        **kwargs
    ):
        """
        Initialize server configuration.

        Args:
            host: Server bind address
            port: Server port number
            interrupt_requests: Allow request interruption
        """
```

### Model Settings

Configure model parameters for server deployment.

```python { .api }
class ModelSettings:
    model: str
    model_alias: Optional[str] = None
    n_ctx: int = 2048
    n_threads: Optional[int] = None
    n_gpu_layers: int = 0
    main_gpu: int = 0
    tensor_split: Optional[List[float]] = None
    vocab_only: bool = False
    use_mmap: bool = True
    use_mlock: bool = False
    kv_overrides: Optional[Dict[str, Union[bool, int, float]]] = None
    seed: int = 0xFFFFFFFF
    n_batch: int = 512
    n_threads_batch: Optional[int] = None
    rope_scaling_type: int = -1
    rope_freq_base: float = 0.0
    rope_freq_scale: float = 0.0
    yarn_ext_factor: float = -1.0
    yarn_attn_factor: float = 1.0
    yarn_beta_fast: float = 32.0
    yarn_beta_slow: float = 1.0
    yarn_orig_ctx: int = 0
    mul_mat_q: bool = True
    f16_kv: bool = True
    logits_all: bool = False
    embedding: bool = False
    offload_kqv: bool = True
    flash_attn: bool = False
    last_n_tokens_size: int = 64
    lora_base: Optional[str] = None
    lora_scale: float = 1.0
    lora_path: Optional[str] = None
    numa: Union[bool, int] = False
    chat_format: Optional[str] = None
    chat_handler: Optional[object] = None
    draft_model: Optional[object] = None
    tokenizer: Optional[object] = None
    hf_pretrained_model_name_or_path: Optional[str] = None
    hf_model_repo_id: Optional[str] = None
    clip_model_path: Optional[str] = None
    cache: bool = False
    cache_type: str = "ram"
    cache_size: int = 2 << 30
    verbose: bool = True

    def __init__(
        self,
        model: str,
        **kwargs
    ):
        """
        Initialize model configuration.

        Args:
            model: Path to model file
            **kwargs: Additional model parameters
        """
```

### Combined Settings

Unified configuration combining server and model settings.

```python { .api }
class Settings(ServerSettings, ModelSettings):
    def __init__(
        self,
        model: str,
        **kwargs
    ):
        """
        Combined server and model settings.

        Args:
            model: Path to model file
            **kwargs: Server and model parameters
        """
```

### Multi-Model Configuration

Configuration from file for serving multiple models.

```python { .api }
class ConfigFileSettings:
    config_file: str
    models: List[ModelSettings]

    def __init__(
        self,
        config_file: str,
        **kwargs
    ):
        """
        Initialize configuration from file.

        Args:
            config_file: Path to configuration file
        """

    @classmethod
    def from_file(cls, config_file: str) -> "ConfigFileSettings":
        """
        Load configuration from file.

        Args:
            config_file: Path to YAML/JSON config file

        Returns:
            ConfigFileSettings instance
        """
```

### Request/Response Models

Type definitions for REST API endpoints.

```python { .api }
# Temperature field definition
temperature_field = Field(
    default=0.8,
    ge=0.0,
    le=2.0,
    description="Sampling temperature"
)

# Top-p field definition
top_p_field = Field(
    default=0.95,
    ge=0.0,
    le=1.0,
    description="Nucleus sampling parameter"
)

# Max tokens field definition
max_tokens_field = Field(
    default=16,
    ge=1,
    description="Maximum tokens to generate"
)

# Stream field definition
stream_field = Field(
    default=False,
    description="Enable streaming response"
)

# Stop field definition
stop_field = Field(
    default=None,
    description="Stop sequences for generation"
)

# Model field definition
model_field = Field(
    default=None,
    description="Model name for response metadata"
)

# Frequency penalty field definition
frequency_penalty_field = Field(
    default=0.0,
    ge=-2.0,
    le=2.0,
    description="Frequency penalty for token repetition"
)

# Presence penalty field definition
presence_penalty_field = Field(
    default=0.0,
    ge=-2.0,
    le=2.0,
    description="Presence penalty for new topics"
)
```

## Usage Examples

### Basic Server Setup

```python
from llama_cpp.server.settings import Settings
import uvicorn

# Create server configuration
settings = Settings(
    model="./models/llama-2-7b-chat.gguf",
    host="0.0.0.0",  # Allow external connections
    port=8000,
    n_ctx=2048,
    n_gpu_layers=35,  # Offload to GPU
    chat_format="llama-2",
)

# This would typically be handled by the server startup script
print(f"Server configured to run on {settings.host}:{settings.port}")
print(f"Model: {settings.model}")
print(f"Context size: {settings.n_ctx}")
print(f"GPU layers: {settings.n_gpu_layers}")
```

### Multi-Model Configuration

```python
import yaml
from llama_cpp.server.settings import ConfigFileSettings

# Create multi-model configuration file
config = {
    "models": [
        {
            "model": "./models/llama-2-7b-chat.gguf",
            "model_alias": "llama-7b",
            "n_ctx": 2048,
            "n_gpu_layers": 35,
            "chat_format": "llama-2",
        },
        {
            "model": "./models/mistral-7b-instruct.gguf",
            "model_alias": "mistral-7b",
            "n_ctx": 4096,
            "n_gpu_layers": 35,
            "chat_format": "mistral-instruct",
        },
        {
            "model": "./models/codellama-13b.gguf",
            "model_alias": "codellama-13b",
            "n_ctx": 2048,
            "n_gpu_layers": 40,
            "chat_format": "codellama-instruct",
        }
    ],
    "host": "0.0.0.0",
    "port": 8000,
    "interrupt_requests": True,
}

# Save configuration file
with open("server_config.yaml", "w") as f:
    yaml.dump(config, f)

# Load configuration
config_settings = ConfigFileSettings.from_file("server_config.yaml")
print(f"Loaded {len(config_settings.models)} model configurations")
```

### Production Server Configuration

```python
from llama_cpp.server.settings import Settings

# Production-ready configuration
production_settings = Settings(
    model="./models/production-model.gguf",
    host="0.0.0.0",
    port=8080,

    # Performance settings
    n_ctx=4096,
    n_threads=16,
    n_gpu_layers=50,
    n_batch=512,

    # Memory optimization
    use_mmap=True,
    use_mlock=True,
    f16_kv=True,

    # Caching
    cache=True,
    cache_type="disk",
    cache_size=4 << 30,  # 4GB cache

    # Security
    interrupt_requests=True,

    # Logging
    verbose=False,
)

print("Production server configuration:")
print(f"- Host: {production_settings.host}:{production_settings.port}")
print(f"- Context: {production_settings.n_ctx} tokens")
print(f"- GPU layers: {production_settings.n_gpu_layers}")
print(f"- Cache: {production_settings.cache_type} ({production_settings.cache_size // (1024**3)}GB)")
```

### Development Server Configuration

```python
# Development configuration with debugging
dev_settings = Settings(
    model="./models/small-model.gguf",
    host="127.0.0.1",  # Local only
    port=8000,

    # Smaller model for faster iteration
    n_ctx=1024,
    n_threads=4,
    n_gpu_layers=0,  # CPU only for debugging

    # Debug settings
    verbose=True,
    logits_all=True,  # For debugging token probabilities

    # No caching for development
    cache=False,
)

print("Development server configuration:")
print(f"- Local access only: {dev_settings.host}:{dev_settings.port}")
print(f"- CPU-only processing")
print(f"- Verbose logging enabled")
```

### Custom Chat Format Configuration

```python
# Server with custom chat format
custom_chat_settings = Settings(
    model="./models/custom-model.gguf",
    host="0.0.0.0",
    port=8000,
    n_ctx=2048,

    # Custom format
    chat_format="custom",  # Requires custom handler registration

    # Vision support
    clip_model_path="./models/vision-projector.gguf",

    # LoRA adapter
    lora_path="./adapters/domain-specific-lora.bin",
    lora_scale=0.8,
)

print("Custom model server configuration:")
print(f"- Chat format: {custom_chat_settings.chat_format}")
print(f"- Vision support: {'Yes' if custom_chat_settings.clip_model_path else 'No'}")
print(f"- LoRA adapter: {custom_chat_settings.lora_path}")
```

### Environment-Based Configuration

```python
import os
from llama_cpp.server.settings import Settings

# Configuration from environment variables
env_settings = Settings(
    model=os.getenv("LLAMA_MODEL_PATH", "./models/default.gguf"),
    host=os.getenv("LLAMA_HOST", "127.0.0.1"),
    port=int(os.getenv("LLAMA_PORT", "8000")),
    n_ctx=int(os.getenv("LLAMA_N_CTX", "2048")),
    n_gpu_layers=int(os.getenv("LLAMA_N_GPU_LAYERS", "0")),
    n_threads=int(os.getenv("LLAMA_N_THREADS", "4")),
    chat_format=os.getenv("LLAMA_CHAT_FORMAT", "llama-2"),
    verbose=os.getenv("LLAMA_VERBOSE", "false").lower() == "true",
)

print("Environment-based configuration:")
print(f"- Model: {env_settings.model}")
print(f"- Server: {env_settings.host}:{env_settings.port}")
print(f"- GPU layers: {env_settings.n_gpu_layers}")
print(f"- Chat format: {env_settings.chat_format}")
```

### Health Check Configuration

```python
# Server configuration with health monitoring
monitoring_settings = Settings(
    model="./models/model.gguf",
    host="0.0.0.0",
    port=8000,

    # Enable request interruption for health checks
    interrupt_requests=True,

    # Optimized for responsiveness
    n_ctx=1024,
    n_batch=128,

    # Minimal logging for production
    verbose=False,
)

# Example health check endpoint configuration
health_check_config = {
    "endpoint": "/health",
    "timeout": 5.0,
    "check_model_loaded": True,
    "check_memory_usage": True,
    "max_memory_percent": 90,
}

print("Health monitoring configuration:")
print(f"- Health endpoint: {health_check_config['endpoint']}")
print(f"- Timeout: {health_check_config['timeout']}s")
print(f"- Memory limit: {health_check_config['max_memory_percent']}%")
```

### Load Balancer Configuration

```python
# Multiple server instances for load balancing
servers = []

base_port = 8000
for i in range(3):  # 3 server instances
    server_settings = Settings(
        model=f"./models/model-replica-{i}.gguf",
        host="127.0.0.1",
        port=base_port + i,

        # Distributed GPU usage
        main_gpu=i % 2,  # Alternate between GPUs
        n_gpu_layers=30,

        # Instance-specific settings
        n_ctx=2048,
        n_threads=8,

        # Consistent behavior
        seed=42,  # Fixed seed for reproducibility
        temperature=0.7,
    )

    servers.append(server_settings)
    print(f"Server {i+1}: port {server_settings.port}, GPU {server_settings.main_gpu}")

# Load balancer would distribute requests across these instances
load_balancer_config = {
    "strategy": "round_robin",
    "health_check_interval": 30,
    "retry_attempts": 3,
    "timeout": 30.0,
}

print(f"Load balancer: {load_balancer_config['strategy']} across {len(servers)} instances")
```

### Docker Deployment Configuration

```python
# Configuration optimized for Docker deployment
docker_settings = Settings(
    model="/app/models/model.gguf",  # Container path
    host="0.0.0.0",  # Bind to all interfaces
    port=8000,

    # Container resource limits
    n_ctx=2048,
    n_threads=None,  # Auto-detect container CPU limits
    n_gpu_layers=40,  # Assume GPU availability

    # Container-friendly settings
    use_mmap=True,  # Efficient memory usage
    verbose=False,  # Reduce log volume

    # Caching in container
    cache=True,
    cache_type="ram",  # Avoid persistent storage issues
    cache_size=1 << 30,  # 1GB RAM cache
)

# Environment variables for Docker
docker_env = {
    "LLAMA_MODEL_PATH": docker_settings.model,
    "LLAMA_HOST": docker_settings.host,
    "LLAMA_PORT": str(docker_settings.port),
    "LLAMA_N_CTX": str(docker_settings.n_ctx),
    "LLAMA_N_GPU_LAYERS": str(docker_settings.n_gpu_layers),
    "LLAMA_CACHE_SIZE": str(docker_settings.cache_size),
}

print("Docker deployment configuration:")
for key, value in docker_env.items():
    print(f"- {key}={value}")
```