docs/low-level.md

(See also: caching.md · chat-completion.md · grammar.md · index.md · llama-model.md · server.md · tokenization.md · vision.md)

# Low-Level API

Direct access to llama.cpp C functions through ctypes bindings, providing maximum control over model loading, context management, backend operations, and hardware-specific optimizations.

## Capabilities

### Backend Management

Initialize and manage the llama.cpp backend system.

```python { .api }
def llama_backend_init() -> None:
    """
    Initialize llama.cpp backend.
    Must be called before using any other functions.
    """

def llama_backend_free() -> None:
    """
    Free backend resources.
    Should be called when shutting down.
    """

def llama_numa_init(numa_strategy: int) -> None:
    """
    Initialize NUMA support.

    Args:
        numa_strategy: NUMA initialization strategy
    """
```

### Model Management

Low-level model loading, saving, and memory management.

```python { .api }
def llama_model_load_from_file(
    path_model: bytes,
    params
) -> llama_model_p:
    """
    Load model from file.

    Args:
        path_model: Path to model file (bytes)
        params: Model parameters structure

    Returns:
        Model pointer or null on failure
    """

def llama_model_free(model: llama_model_p) -> None:
    """
    Free model memory.

    Args:
        model: Model pointer to free
    """

def llama_model_save_to_file(
    model: llama_model_p,
    fname: bytes,
    **kwargs
) -> bool:
    """
    Save model to file.

    Args:
        model: Model pointer
        fname: Output filename (bytes)
        **kwargs: Additional save parameters

    Returns:
        True if successful
    """

def llama_model_default_params():
    """
    Get default model loading parameters.

    Returns:
        Default parameter structure
    """

def llama_model_quantize_default_params():
    """
    Get default quantization parameters.

    Returns:
        Default quantization structure
    """
```

### Context Management

Create and manage model contexts for inference.

```python { .api }
def llama_new_context_with_model(
    model: llama_model_p,
    params
) -> llama_context_p:
    """
    Create new context with model.

    Args:
        model: Model pointer
        params: Context parameters

    Returns:
        Context pointer or null on failure
    """

def llama_free(ctx: llama_context_p) -> None:
    """
    Free context memory.

    Args:
        ctx: Context pointer to free
    """

def llama_context_default_params():
    """
    Get default context parameters.

    Returns:
        Default parameter structure
    """
```

### System Information

Query system capabilities and model properties.

```python { .api }
def llama_supports_mmap() -> bool:
    """Check if memory mapping is supported."""

def llama_supports_mlock() -> bool:
    """Check if memory locking is supported."""

def llama_supports_gpu_offload() -> bool:
    """Check if GPU offload is supported."""

def llama_max_devices() -> int:
    """Get maximum number of devices."""

def llama_time_us() -> int:
    """Get current time in microseconds."""

def llama_n_ctx(ctx: llama_context_p) -> int:
    """
    Get context size.

    Args:
        ctx: Context pointer

    Returns:
        Context size in tokens
    """

def llama_n_embd(model: llama_model_p) -> int:
    """
    Get embedding dimensions.

    Args:
        model: Model pointer

    Returns:
        Embedding dimension count
    """
```

## Core Constants

### Default Values

```python { .api }
LLAMA_DEFAULT_SEED: int = 0xFFFFFFFF  # Default random seed
LLAMA_TOKEN_NULL: int = -1            # Null token value
LLAMA_MAX_DEVICES: int                # Maximum device count
```

### File Format Magic Numbers

```python { .api }
LLAMA_FILE_MAGIC_GGLA: int    # GGLA file format identifier
LLAMA_FILE_MAGIC_GGSN: int    # GGSN file format identifier
LLAMA_FILE_MAGIC_GGSQ: int    # GGSQ file format identifier
LLAMA_SESSION_MAGIC: int      # Session file magic number
LLAMA_SESSION_VERSION: int    # Session file version
LLAMA_STATE_SEQ_MAGIC: int    # State sequence magic number
LLAMA_STATE_SEQ_VERSION: int  # State sequence version
```

### Vocabulary Types

```python { .api }
LLAMA_VOCAB_TYPE_NONE: int = 0  # No vocabulary
LLAMA_VOCAB_TYPE_SPM: int = 1   # SentencePiece model
LLAMA_VOCAB_TYPE_BPE: int = 2   # Byte pair encoding
LLAMA_VOCAB_TYPE_WPM: int = 3   # WordPiece model
LLAMA_VOCAB_TYPE_UGM: int = 4   # Unigram model
LLAMA_VOCAB_TYPE_RWKV: int = 5  # RWKV tokenizer
```

### GGML Quantization Types

```python { .api }
# Float types
GGML_TYPE_F32: int  # 32-bit float
GGML_TYPE_F16: int  # 16-bit float

# Quantized types
GGML_TYPE_Q4_0: int  # 4-bit quantization, type 0
GGML_TYPE_Q4_1: int  # 4-bit quantization, type 1
GGML_TYPE_Q5_0: int  # 5-bit quantization, type 0
GGML_TYPE_Q5_1: int  # 5-bit quantization, type 1
GGML_TYPE_Q8_0: int  # 8-bit quantization, type 0
GGML_TYPE_Q8_1: int  # 8-bit quantization, type 1

# K-quantization types
GGML_TYPE_Q2_K: int  # 2-bit K-quantization
GGML_TYPE_Q3_K: int  # 3-bit K-quantization
GGML_TYPE_Q4_K: int  # 4-bit K-quantization
GGML_TYPE_Q5_K: int  # 5-bit K-quantization
GGML_TYPE_Q6_K: int  # 6-bit K-quantization
GGML_TYPE_Q8_K: int  # 8-bit K-quantization

# Integer quantization types
GGML_TYPE_IQ2_XXS: int  # Integer quantization 2-bit, XXS
GGML_TYPE_IQ2_XS: int   # Integer quantization 2-bit, XS
GGML_TYPE_IQ3_XXS: int  # Integer quantization 3-bit, XXS
GGML_TYPE_IQ1_S: int    # Integer quantization 1-bit, S
GGML_TYPE_IQ4_NL: int   # Integer quantization 4-bit, NL
GGML_TYPE_IQ3_S: int    # Integer quantization 3-bit, S
GGML_TYPE_IQ2_S: int    # Integer quantization 2-bit, S
GGML_TYPE_IQ4_XS: int   # Integer quantization 4-bit, XS
GGML_TYPE_IQ1_M: int    # Integer quantization 1-bit, M

# Standard integer types
GGML_TYPE_I8: int   # 8-bit signed integer
GGML_TYPE_I16: int  # 16-bit signed integer
GGML_TYPE_I32: int  # 32-bit signed integer
GGML_TYPE_I64: int  # 64-bit signed integer
```

## Pointer Types

```python { .api }
# Core pointer types
llama_model_p = ctypes.POINTER(ctypes.c_void_p)    # Model pointer
llama_context_p = ctypes.POINTER(ctypes.c_void_p)  # Context pointer
llama_token = ctypes.c_int32                       # Token type
```

## Usage Examples

### Basic Low-Level Setup

```python
import llama_cpp.llama_cpp as llama_cpp
import ctypes

# Initialize backend
llama_cpp.llama_backend_init()
print("Backend initialized")

try:
    # Get default parameters
    model_params = llama_cpp.llama_model_default_params()
    context_params = llama_cpp.llama_context_default_params()

    # Load model
    model_path = b"./models/llama-2-7b.gguf"
    model = llama_cpp.llama_model_load_from_file(model_path, model_params)

    if not model:
        raise Exception("Failed to load model")
    print("Model loaded successfully")

    # Create context
    context = llama_cpp.llama_new_context_with_model(model, context_params)

    if not context:
        raise Exception("Failed to create context")
    print("Context created successfully")

    # Get model information
    n_ctx = llama_cpp.llama_n_ctx(context)
    n_embd = llama_cpp.llama_n_embd(model)

    print(f"Context size: {n_ctx}")
    print(f"Embedding dimensions: {n_embd}")

finally:
    # Cleanup
    if 'context' in locals():
        llama_cpp.llama_free(context)
    if 'model' in locals():
        llama_cpp.llama_model_free(model)

    llama_cpp.llama_backend_free()
    print("Cleanup completed")
```

### System Capability Detection

```python
import llama_cpp.llama_cpp as llama_cpp

# Check system capabilities
capabilities = {
    "mmap_support": llama_cpp.llama_supports_mmap(),
    "mlock_support": llama_cpp.llama_supports_mlock(),
    "gpu_offload": llama_cpp.llama_supports_gpu_offload(),
    "max_devices": llama_cpp.llama_max_devices(),
}

print("System capabilities:")
for capability, supported in capabilities.items():
    status = "✓" if supported else "✗"
    print(f" {status} {capability}: {supported}")

# Timing utilities
start_time = llama_cpp.llama_time_us()
# ... some operation ...
end_time = llama_cpp.llama_time_us()
duration_ms = (end_time - start_time) / 1000
print(f"Operation took {duration_ms:.2f}ms")
```

### Custom Parameter Configuration

```python
import llama_cpp.llama_cpp as llama_cpp
import ctypes

# Initialize backend
llama_cpp.llama_backend_init()

# Get and modify default parameters
model_params = llama_cpp.llama_model_default_params()
context_params = llama_cpp.llama_context_default_params()

# Modify model parameters (example - actual field names depend on structure)
# model_params.n_gpu_layers = 35
# model_params.use_mmap = True
# model_params.use_mlock = False

# Modify context parameters
# context_params.n_ctx = 4096
# context_params.n_batch = 512
# context_params.n_threads = 8

print("Custom parameters configured")

try:
    # Load with custom parameters
    model = llama_cpp.llama_model_load_from_file(
        b"./models/model.gguf",
        model_params
    )

    if model:
        context = llama_cpp.llama_new_context_with_model(model, context_params)
        if context:
            print("Model and context created with custom parameters")

            # Get actual values
            actual_ctx = llama_cpp.llama_n_ctx(context)
            actual_embd = llama_cpp.llama_n_embd(model)
            print(f"Actual context size: {actual_ctx}")
            print(f"Actual embedding dimensions: {actual_embd}")

            llama_cpp.llama_free(context)
        llama_cpp.llama_model_free(model)

finally:
    llama_cpp.llama_backend_free()
```

### Memory Management Patterns

```python
import llama_cpp.llama_cpp as llama_cpp
import gc
import psutil
import os

def get_memory_usage():
    """Get current memory usage in MB."""
    process = psutil.Process(os.getpid())
    return process.memory_info().rss / 1024 / 1024

class LowLevelLlama:
    def __init__(self):
        self.model = None
        self.context = None
        self.backend_initialized = False

    def initialize_backend(self):
        """Initialize backend if not already done."""
        if not self.backend_initialized:
            llama_cpp.llama_backend_init()
            self.backend_initialized = True

    def load_model(self, model_path: str):
        """Load model with automatic cleanup."""
        self.initialize_backend()

        # Clean up existing model
        if self.model:
            self.free_model()

        initial_memory = get_memory_usage()

        model_params = llama_cpp.llama_model_default_params()
        self.model = llama_cpp.llama_model_load_from_file(
            model_path.encode('utf-8'),
            model_params
        )

        if not self.model:
            raise RuntimeError(f"Failed to load model: {model_path}")

        final_memory = get_memory_usage()
        memory_increase = final_memory - initial_memory

        print(f"Model loaded: {memory_increase:.1f}MB memory increase")
        return True

    def create_context(self):
        """Create context with automatic cleanup."""
        if not self.model:
            raise RuntimeError("Model must be loaded first")

        # Clean up existing context
        if self.context:
            self.free_context()

        context_params = llama_cpp.llama_context_default_params()
        self.context = llama_cpp.llama_new_context_with_model(
            self.model,
            context_params
        )

        if not self.context:
            raise RuntimeError("Failed to create context")

        print("Context created successfully")
        return True

    def free_context(self):
        """Free context memory."""
        if self.context:
            llama_cpp.llama_free(self.context)
            self.context = None
            gc.collect()  # Force garbage collection

    def free_model(self):
        """Free model memory."""
        if self.model:
            llama_cpp.llama_model_free(self.model)
            self.model = None
            gc.collect()

    def cleanup(self):
        """Full cleanup."""
        self.free_context()
        self.free_model()

        if self.backend_initialized:
            llama_cpp.llama_backend_free()
            self.backend_initialized = False

    def __del__(self):
        """Destructor for automatic cleanup."""
        self.cleanup()

# Usage example
llama = LowLevelLlama()

try:
    print(f"Initial memory: {get_memory_usage():.1f}MB")

    llama.load_model("./models/model.gguf")
    print(f"After model load: {get_memory_usage():.1f}MB")

    llama.create_context()
    print(f"After context creation: {get_memory_usage():.1f}MB")

    # Use model...

finally:
    llama.cleanup()
    print(f"After cleanup: {get_memory_usage():.1f}MB")
```

### Error Handling and Validation

```python
import llama_cpp.llama_cpp as llama_cpp
import ctypes

def validate_model_file(file_path: str) -> bool:
    """Validate model file before loading."""
    import os

    if not os.path.exists(file_path):
        print(f"Model file not found: {file_path}")
        return False

    file_size = os.path.getsize(file_path)
    if file_size < 1024:  # Less than 1KB is suspicious
        print(f"Model file too small: {file_size} bytes")
        return False

    # Check file extension
    if not file_path.lower().endswith(('.gguf', '.ggml', '.bin')):
        print(f"Unexpected file extension: {file_path}")
        return False

    return True

def safe_model_loading(model_path: str):
    """Demonstrate safe model loading with error handling."""

    if not validate_model_file(model_path):
        return None

    llama_cpp.llama_backend_init()

    try:
        # Check system capabilities first
        if not llama_cpp.llama_supports_mmap():
            print("Warning: Memory mapping not supported")

        # Get default parameters
        model_params = llama_cpp.llama_model_default_params()

        # Attempt to load model
        print(f"Loading model: {model_path}")
        model = llama_cpp.llama_model_load_from_file(
            model_path.encode('utf-8'),
            model_params
        )

        if not model:
            print("Model loading failed - check file format and permissions")
            return None

        # Validate model properties
        try:
            context_params = llama_cpp.llama_context_default_params()
            context = llama_cpp.llama_new_context_with_model(model, context_params)

            if context:
                n_ctx = llama_cpp.llama_n_ctx(context)
                n_embd = llama_cpp.llama_n_embd(model)

                print(f"Model validation successful:")
                print(f" Context size: {n_ctx}")
                print(f" Embeddings: {n_embd}")

                llama_cpp.llama_free(context)
                return model
            else:
                print("Context creation failed - insufficient memory?")
                llama_cpp.llama_model_free(model)
                return None

        except Exception as e:
            print(f"Model validation error: {e}")
            llama_cpp.llama_model_free(model)
            return None

    except Exception as e:
        print(f"Unexpected error during model loading: {e}")
        return None

    finally:
        # Backend cleanup handled by caller
        pass

# Usage
model = safe_model_loading("./models/test-model.gguf")
if model:
    print("Model ready for use")
    # Use model...
    llama_cpp.llama_model_free(model)

llama_cpp.llama_backend_free()
```

### Performance Monitoring

```python
import llama_cpp.llama_cpp as llama_cpp
import time
import contextlib

# NOTE: uses get_memory_usage() as defined in the Memory Management Patterns example above.

@contextlib.contextmanager
def performance_monitor(operation_name: str):
    """Context manager for performance monitoring."""
    start_time = llama_cpp.llama_time_us()
    start_memory = get_memory_usage()

    try:
        yield
    finally:
        end_time = llama_cpp.llama_time_us()
        end_memory = get_memory_usage()

        duration_ms = (end_time - start_time) / 1000
        memory_change = end_memory - start_memory

        print(f"{operation_name}:")
        print(f" Duration: {duration_ms:.2f}ms")
        print(f" Memory change: {memory_change:+.1f}MB")

# Usage example
llama_cpp.llama_backend_init()

try:
    with performance_monitor("Model Loading"):
        model_params = llama_cpp.llama_model_default_params()
        model = llama_cpp.llama_model_load_from_file(
            b"./models/model.gguf",
            model_params
        )

    if model:
        with performance_monitor("Context Creation"):
            context_params = llama_cpp.llama_context_default_params()
            context = llama_cpp.llama_new_context_with_model(model, context_params)

        if context:
            with performance_monitor("Model Info Retrieval"):
                n_ctx = llama_cpp.llama_n_ctx(context)
                n_embd = llama_cpp.llama_n_embd(model)
                print(f"Context: {n_ctx}, Embeddings: {n_embd}")

            llama_cpp.llama_free(context)

        llama_cpp.llama_model_free(model)

finally:
    llama_cpp.llama_backend_free()
```