or run

npx @tessl/cli init
Log in

Version

Tile

Overview

Evals

Files

Files

docs

caching.md · chat-completion.md · grammar.md · index.md · llama-model.md · low-level.md · server.md · tokenization.md · vision.md

docs/server.md

0

# Server Components

1

2

FastAPI-based web server with OpenAI-compatible endpoints, settings management, and multi-model configuration support for production deployments and REST API access.

3

4

## Capabilities

5

6

### Server Settings

7

8

Configure web server parameters and hosting options.

9

10

```python { .api }

11

class ServerSettings:

12

host: str = "127.0.0.1"

13

port: int = 8000

14

interrupt_requests: bool = True

15

16

def __init__(

17

self,

18

host: str = "127.0.0.1",

19

port: int = 8000,

20

interrupt_requests: bool = True,

21

**kwargs

22

):

23

"""

24

Initialize server configuration.

25

26

Args:

27

host: Server bind address

28

port: Server port number

29

interrupt_requests: Allow request interruption

30

"""

31

```

32

33

### Model Settings

34

35

Configure model parameters for server deployment.

36

37

```python { .api }

38

class ModelSettings:

39

model: str

40

model_alias: Optional[str] = None

41

n_ctx: int = 2048

42

n_threads: Optional[int] = None

43

n_gpu_layers: int = 0

44

main_gpu: int = 0

45

tensor_split: Optional[List[float]] = None

46

vocab_only: bool = False

47

use_mmap: bool = True

48

use_mlock: bool = False

49

kv_overrides: Optional[Dict[str, Union[bool, int, float]]] = None

50

seed: int = 0xFFFFFFFF

51

n_batch: int = 512

52

n_threads_batch: Optional[int] = None

53

rope_scaling_type: int = -1

54

rope_freq_base: float = 0.0

55

rope_freq_scale: float = 0.0

56

yarn_ext_factor: float = -1.0

57

yarn_attn_factor: float = 1.0

58

yarn_beta_fast: float = 32.0

59

yarn_beta_slow: float = 1.0

60

yarn_orig_ctx: int = 0

61

mul_mat_q: bool = True

62

f16_kv: bool = True

63

logits_all: bool = False

64

embedding: bool = False

65

offload_kqv: bool = True

66

flash_attn: bool = False

67

last_n_tokens_size: int = 64

68

lora_base: Optional[str] = None

69

lora_scale: float = 1.0

70

lora_path: Optional[str] = None

71

numa: Union[bool, int] = False

72

chat_format: Optional[str] = None

73

chat_handler: Optional[object] = None

74

draft_model: Optional[object] = None

75

tokenizer: Optional[object] = None

76

hf_pretrained_model_name_or_path: Optional[str] = None

77

hf_model_repo_id: Optional[str] = None

78

clip_model_path: Optional[str] = None

79

cache: bool = False

80

cache_type: str = "ram"

81

cache_size: int = 2 << 30

82

verbose: bool = True

83

84

def __init__(

85

self,

86

model: str,

87

**kwargs

88

):

89

"""

90

Initialize model configuration.

91

92

Args:

93

model: Path to model file

94

**kwargs: Additional model parameters

95

"""

96

```

97

98

### Combined Settings

99

100

Unified configuration combining server and model settings.

101

102

```python { .api }

103

class Settings(ServerSettings, ModelSettings):

104

def __init__(

105

self,

106

model: str,

107

**kwargs

108

):

109

"""

110

Combined server and model settings.

111

112

Args:

113

model: Path to model file

114

**kwargs: Server and model parameters

115

"""

116

```

117

118

### Multi-Model Configuration

119

120

Configuration from file for serving multiple models.

121

122

```python { .api }

123

class ConfigFileSettings:

124

config_file: str

125

models: List[ModelSettings]

126

127

def __init__(

128

self,

129

config_file: str,

130

**kwargs

131

):

132

"""

133

Initialize configuration from file.

134

135

Args:

136

config_file: Path to configuration file

137

"""

138

139

@classmethod

140

def from_file(cls, config_file: str) -> "ConfigFileSettings":

141

"""

142

Load configuration from file.

143

144

Args:

145

config_file: Path to YAML/JSON config file

146

147

Returns:

148

ConfigFileSettings instance

149

"""

150

```

151

152

### Request/Response Models

153

154

Type definitions for REST API endpoints.

155

156

```python { .api }

157

# Temperature field definition

158

temperature_field = Field(

159

default=0.8,

160

ge=0.0,

161

le=2.0,

162

description="Sampling temperature"

163

)

164

165

# Top-p field definition

166

top_p_field = Field(

167

default=0.95,

168

ge=0.0,

169

le=1.0,

170

description="Nucleus sampling parameter"

171

)

172

173

# Max tokens field definition

174

max_tokens_field = Field(

175

default=16,

176

ge=1,

177

description="Maximum tokens to generate"

178

)

179

180

# Stream field definition

181

stream_field = Field(

182

default=False,

183

description="Enable streaming response"

184

)

185

186

# Stop field definition

187

stop_field = Field(

188

default=None,

189

description="Stop sequences for generation"

190

)

191

192

# Model field definition

193

model_field = Field(

194

default=None,

195

description="Model name for response metadata"

196

)

197

198

# Frequency penalty field definition

199

frequency_penalty_field = Field(

200

default=0.0,

201

ge=-2.0,

202

le=2.0,

203

description="Frequency penalty for token repetition"

204

)

205

206

# Presence penalty field definition

207

presence_penalty_field = Field(

208

default=0.0,

209

ge=-2.0,

210

le=2.0,

211

description="Presence penalty for new topics"

212

)

213

```

214

215

## Usage Examples

216

217

### Basic Server Setup

218

219

```python

220

from llama_cpp.server.settings import Settings

221

import uvicorn

222

223

# Create server configuration

224

settings = Settings(

225

model="./models/llama-2-7b-chat.gguf",

226

host="0.0.0.0", # Allow external connections

227

port=8000,

228

n_ctx=2048,

229

n_gpu_layers=35, # Offload to GPU

230

chat_format="llama-2",

231

)

232

233

# This would typically be handled by the server startup script

234

print(f"Server configured to run on {settings.host}:{settings.port}")

235

print(f"Model: {settings.model}")

236

print(f"Context size: {settings.n_ctx}")

237

print(f"GPU layers: {settings.n_gpu_layers}")

238

```

239

240

### Multi-Model Configuration

241

242

```python

243

import yaml

244

from llama_cpp.server.settings import ConfigFileSettings

245

246

# Create multi-model configuration file

247

config = {

248

"models": [

249

{

250

"model": "./models/llama-2-7b-chat.gguf",

251

"model_alias": "llama-7b",

252

"n_ctx": 2048,

253

"n_gpu_layers": 35,

254

"chat_format": "llama-2",

255

},

256

{

257

"model": "./models/mistral-7b-instruct.gguf",

258

"model_alias": "mistral-7b",

259

"n_ctx": 4096,

260

"n_gpu_layers": 35,

261

"chat_format": "mistral-instruct",

262

},

263

{

264

"model": "./models/codellama-13b.gguf",

265

"model_alias": "codellama-13b",

266

"n_ctx": 2048,

267

"n_gpu_layers": 40,

268

"chat_format": "codellama-instruct",

269

}

270

],

271

"host": "0.0.0.0",

272

"port": 8000,

273

"interrupt_requests": True,

274

}

275

276

# Save configuration file

277

with open("server_config.yaml", "w") as f:

278

yaml.dump(config, f)

279

280

# Load configuration

281

config_settings = ConfigFileSettings.from_file("server_config.yaml")

282

print(f"Loaded {len(config_settings.models)} model configurations")

283

```

284

285

### Production Server Configuration

286

287

```python

288

from llama_cpp.server.settings import Settings

289

290

# Production-ready configuration

291

production_settings = Settings(

292

model="./models/production-model.gguf",

293

host="0.0.0.0",

294

port=8080,

295

296

# Performance settings

297

n_ctx=4096,

298

n_threads=16,

299

n_gpu_layers=50,

300

n_batch=512,

301

302

# Memory optimization

303

use_mmap=True,

304

use_mlock=True,

305

f16_kv=True,

306

307

# Caching

308

cache=True,

309

cache_type="disk",

310

cache_size=4 << 30, # 4GB cache

311

312

# Security

313

interrupt_requests=True,

314

315

# Logging

316

verbose=False,

317

)

318

319

print("Production server configuration:")

320

print(f"- Host: {production_settings.host}:{production_settings.port}")

321

print(f"- Context: {production_settings.n_ctx} tokens")

322

print(f"- GPU layers: {production_settings.n_gpu_layers}")

323

print(f"- Cache: {production_settings.cache_type} ({production_settings.cache_size // (1024**3)}GB)")

324

```

325

326

### Development Server Configuration

327

328

```python

329

# Development configuration with debugging

330

dev_settings = Settings(

331

model="./models/small-model.gguf",

332

host="127.0.0.1", # Local only

333

port=8000,

334

335

# Smaller model for faster iteration

336

n_ctx=1024,

337

n_threads=4,

338

n_gpu_layers=0, # CPU only for debugging

339

340

# Debug settings

341

verbose=True,

342

logits_all=True, # For debugging token probabilities

343

344

# No caching for development

345

cache=False,

346

)

347

348

print("Development server configuration:")

349

print(f"- Local access only: {dev_settings.host}:{dev_settings.port}")

350

print(f"- CPU-only processing")

351

print(f"- Verbose logging enabled")

352

```

353

354

### Custom Chat Format Configuration

355

356

```python

357

# Server with custom chat format

358

custom_chat_settings = Settings(

359

model="./models/custom-model.gguf",

360

host="0.0.0.0",

361

port=8000,

362

n_ctx=2048,

363

364

# Custom format

365

chat_format="custom", # Requires custom handler registration

366

367

# Vision support

368

clip_model_path="./models/vision-projector.gguf",

369

370

# LoRA adapter

371

lora_path="./adapters/domain-specific-lora.bin",

372

lora_scale=0.8,

373

)

374

375

print("Custom model server configuration:")

376

print(f"- Chat format: {custom_chat_settings.chat_format}")

377

print(f"- Vision support: {'Yes' if custom_chat_settings.clip_model_path else 'No'}")

378

print(f"- LoRA adapter: {custom_chat_settings.lora_path}")

379

```

380

381

### Environment-Based Configuration

382

383

```python

384

import os

385

from llama_cpp.server.settings import Settings

386

387

# Configuration from environment variables

388

env_settings = Settings(

389

model=os.getenv("LLAMA_MODEL_PATH", "./models/default.gguf"),

390

host=os.getenv("LLAMA_HOST", "127.0.0.1"),

391

port=int(os.getenv("LLAMA_PORT", "8000")),

392

n_ctx=int(os.getenv("LLAMA_N_CTX", "2048")),

393

n_gpu_layers=int(os.getenv("LLAMA_N_GPU_LAYERS", "0")),

394

n_threads=int(os.getenv("LLAMA_N_THREADS", "4")),

395

chat_format=os.getenv("LLAMA_CHAT_FORMAT", "llama-2"),

396

verbose=os.getenv("LLAMA_VERBOSE", "false").lower() == "true",

397

)

398

399

print("Environment-based configuration:")

400

print(f"- Model: {env_settings.model}")

401

print(f"- Server: {env_settings.host}:{env_settings.port}")

402

print(f"- GPU layers: {env_settings.n_gpu_layers}")

403

print(f"- Chat format: {env_settings.chat_format}")

404

```

405

406

### Health Check Configuration

407

408

```python

409

# Server configuration with health monitoring

410

monitoring_settings = Settings(

411

model="./models/model.gguf",

412

host="0.0.0.0",

413

port=8000,

414

415

# Enable request interruption for health checks

416

interrupt_requests=True,

417

418

# Optimized for responsiveness

419

n_ctx=1024,

420

n_batch=128,

421

422

# Minimal logging for production

423

verbose=False,

424

)

425

426

# Example health check endpoint configuration

427

health_check_config = {

428

"endpoint": "/health",

429

"timeout": 5.0,

430

"check_model_loaded": True,

431

"check_memory_usage": True,

432

"max_memory_percent": 90,

433

}

434

435

print("Health monitoring configuration:")

436

print(f"- Health endpoint: {health_check_config['endpoint']}")

437

print(f"- Timeout: {health_check_config['timeout']}s")

438

print(f"- Memory limit: {health_check_config['max_memory_percent']}%")

439

```

440

441

### Load Balancer Configuration

442

443

```python

444

# Multiple server instances for load balancing

445

servers = []

446

447

base_port = 8000

448

for i in range(3): # 3 server instances

449

server_settings = Settings(

450

model=f"./models/model-replica-{i}.gguf",

451

host="127.0.0.1",

452

port=base_port + i,

453

454

# Distributed GPU usage

455

main_gpu=i % 2, # Alternate between GPUs

456

n_gpu_layers=30,

457

458

# Instance-specific settings

459

n_ctx=2048,

460

n_threads=8,

461

462

# Consistent behavior

463

seed=42, # Fixed seed for reproducibility

464

temperature=0.7,

465

)

466

467

servers.append(server_settings)

468

print(f"Server {i+1}: port {server_settings.port}, GPU {server_settings.main_gpu}")

469

470

# Load balancer would distribute requests across these instances

471

load_balancer_config = {

472

"strategy": "round_robin",

473

"health_check_interval": 30,

474

"retry_attempts": 3,

475

"timeout": 30.0,

476

}

477

478

print(f"Load balancer: {load_balancer_config['strategy']} across {len(servers)} instances")

479

```

480

481

### Docker Deployment Configuration

482

483

```python

484

# Configuration optimized for Docker deployment

485

docker_settings = Settings(

486

model="/app/models/model.gguf", # Container path

487

host="0.0.0.0", # Bind to all interfaces

488

port=8000,

489

490

# Container resource limits

491

n_ctx=2048,

492

n_threads=None, # Auto-detect container CPU limits

493

n_gpu_layers=40, # Assume GPU availability

494

495

# Container-friendly settings

496

use_mmap=True, # Efficient memory usage

497

verbose=False, # Reduce log volume

498

499

# Caching in container

500

cache=True,

501

cache_type="ram", # Avoid persistent storage issues

502

cache_size=1 << 30, # 1GB RAM cache

503

)

504

505

# Environment variables for Docker

506

docker_env = {

507

"LLAMA_MODEL_PATH": docker_settings.model,

508

"LLAMA_HOST": docker_settings.host,

509

"LLAMA_PORT": str(docker_settings.port),

510

"LLAMA_N_CTX": str(docker_settings.n_ctx),

511

"LLAMA_N_GPU_LAYERS": str(docker_settings.n_gpu_layers),

512

"LLAMA_CACHE_SIZE": str(docker_settings.cache_size),

513

}

514

515

print("Docker deployment configuration:")

516

for key, value in docker_env.items():

517

print(f"- {key}={value}")

518

```