# Low-Level API

Direct access to llama.cpp C functions through ctypes bindings, providing maximum control over model loading, context management, backend operations, and hardware-specific optimizations.

## Capabilities

### Backend Management

Initialize and manage the llama.cpp backend system.

```python { .api }
def llama_backend_init() -> None:
    """
    Initialize llama.cpp backend.
    Must be called before using any other functions.
    """

def llama_backend_free() -> None:
    """
    Free backend resources.
    Should be called when shutting down.
    """

def llama_numa_init(numa_strategy: int) -> None:
    """
    Initialize NUMA support.

    Args:
        numa_strategy: NUMA initialization strategy
    """
```

### Model Management

Low-level model loading, saving, and memory management.

```python { .api }
def llama_model_load_from_file(
    path_model: bytes,
    params
) -> llama_model_p:
    """
    Load model from file.

    Args:
        path_model: Path to model file (bytes)
        params: Model parameters structure

    Returns:
        Model pointer or null on failure
    """

def llama_model_free(model: llama_model_p) -> None:
    """
    Free model memory.

    Args:
        model: Model pointer to free
    """

def llama_model_save_to_file(
    model: llama_model_p,
    fname: bytes,
    **kwargs
) -> bool:
    """
    Save model to file.

    Args:
        model: Model pointer
        fname: Output filename (bytes)
        **kwargs: Additional save parameters

    Returns:
        True if successful
    """

def llama_model_default_params():
    """
    Get default model loading parameters.

    Returns:
        Default parameter structure
    """

def llama_model_quantize_default_params():
    """
    Get default quantization parameters.

    Returns:
        Default quantization structure
    """
```

### Context Management

Create and manage model contexts for inference.

```python { .api }
def llama_new_context_with_model(
    model: llama_model_p,
    params
) -> llama_context_p:
    """
    Create new context with model.

    Args:
        model: Model pointer
        params: Context parameters

    Returns:
        Context pointer or null on failure
    """

def llama_free(ctx: llama_context_p) -> None:
    """
    Free context memory.

    Args:
        ctx: Context pointer to free
    """

def llama_context_default_params():
    """
    Get default context parameters.

    Returns:
        Default parameter structure
    """
```

### System Information

Query system capabilities and model properties.

```python { .api }
def llama_supports_mmap() -> bool:
    """Check if memory mapping is supported."""

def llama_supports_mlock() -> bool:
    """Check if memory locking is supported."""

def llama_supports_gpu_offload() -> bool:
    """Check if GPU offload is supported."""

def llama_max_devices() -> int:
    """Get maximum number of devices."""

def llama_time_us() -> int:
    """Get current time in microseconds."""

def llama_n_ctx(ctx: llama_context_p) -> int:
    """
    Get context size.

    Args:
        ctx: Context pointer

    Returns:
        Context size in tokens
    """

def llama_n_embd(model: llama_model_p) -> int:
    """
    Get embedding dimensions.

    Args:
        model: Model pointer

    Returns:
        Embedding dimension count
    """
```

## Core Constants

### Default Values

```python { .api }
LLAMA_DEFAULT_SEED: int = 0xFFFFFFFF  # Default random seed
LLAMA_TOKEN_NULL: int = -1            # Null token value
LLAMA_MAX_DEVICES: int                # Maximum device count
```

### File Format Magic Numbers

```python { .api }
LLAMA_FILE_MAGIC_GGLA: int    # GGLA file format identifier
LLAMA_FILE_MAGIC_GGSN: int    # GGSN file format identifier
LLAMA_FILE_MAGIC_GGSQ: int    # GGSQ file format identifier
LLAMA_SESSION_MAGIC: int      # Session file magic number
LLAMA_SESSION_VERSION: int    # Session file version
LLAMA_STATE_SEQ_MAGIC: int    # State sequence magic number
LLAMA_STATE_SEQ_VERSION: int  # State sequence version
```

### Vocabulary Types

```python { .api }
LLAMA_VOCAB_TYPE_NONE: int = 0  # No vocabulary
LLAMA_VOCAB_TYPE_SPM: int = 1   # SentencePiece model
LLAMA_VOCAB_TYPE_BPE: int = 2   # Byte pair encoding
LLAMA_VOCAB_TYPE_WPM: int = 3   # WordPiece model
LLAMA_VOCAB_TYPE_UGM: int = 4   # Unigram model
LLAMA_VOCAB_TYPE_RWKV: int = 5  # RWKV tokenizer
```

### GGML Quantization Types

```python { .api }
# Float types
GGML_TYPE_F32: int  # 32-bit float
GGML_TYPE_F16: int  # 16-bit float

# Quantized types
GGML_TYPE_Q4_0: int  # 4-bit quantization, type 0
GGML_TYPE_Q4_1: int  # 4-bit quantization, type 1
GGML_TYPE_Q5_0: int  # 5-bit quantization, type 0
GGML_TYPE_Q5_1: int  # 5-bit quantization, type 1
GGML_TYPE_Q8_0: int  # 8-bit quantization, type 0
GGML_TYPE_Q8_1: int  # 8-bit quantization, type 1

# K-quantization types
GGML_TYPE_Q2_K: int  # 2-bit K-quantization
GGML_TYPE_Q3_K: int  # 3-bit K-quantization
GGML_TYPE_Q4_K: int  # 4-bit K-quantization
GGML_TYPE_Q5_K: int  # 5-bit K-quantization
GGML_TYPE_Q6_K: int  # 6-bit K-quantization
GGML_TYPE_Q8_K: int  # 8-bit K-quantization

# Integer quantization types
GGML_TYPE_IQ2_XXS: int  # Integer quantization 2-bit, XXS
GGML_TYPE_IQ2_XS: int   # Integer quantization 2-bit, XS
GGML_TYPE_IQ3_XXS: int  # Integer quantization 3-bit, XXS
GGML_TYPE_IQ1_S: int    # Integer quantization 1-bit, S
GGML_TYPE_IQ4_NL: int   # Integer quantization 4-bit, NL
GGML_TYPE_IQ3_S: int    # Integer quantization 3-bit, S
GGML_TYPE_IQ2_S: int    # Integer quantization 2-bit, S
GGML_TYPE_IQ4_XS: int   # Integer quantization 4-bit, XS
GGML_TYPE_IQ1_M: int    # Integer quantization 1-bit, M

# Standard integer types
GGML_TYPE_I8: int   # 8-bit signed integer
GGML_TYPE_I16: int  # 16-bit signed integer
GGML_TYPE_I32: int  # 32-bit signed integer
GGML_TYPE_I64: int  # 64-bit signed integer
```

## Pointer Types

```python { .api }
# Core pointer types
llama_model_p = ctypes.POINTER(ctypes.c_void_p)    # Model pointer
llama_context_p = ctypes.POINTER(ctypes.c_void_p)  # Context pointer
llama_token = ctypes.c_int32                       # Token type
```

## Usage Examples

### Basic Low-Level Setup

```python
import llama_cpp.llama_cpp as llama_cpp
import ctypes

# Initialize backend
llama_cpp.llama_backend_init()
print("Backend initialized")

try:
    # Get default parameters
    model_params = llama_cpp.llama_model_default_params()
    context_params = llama_cpp.llama_context_default_params()

    # Load model
    model_path = b"./models/llama-2-7b.gguf"
    model = llama_cpp.llama_model_load_from_file(model_path, model_params)

    if not model:
        raise Exception("Failed to load model")
    print("Model loaded successfully")

    # Create context
    context = llama_cpp.llama_new_context_with_model(model, context_params)

    if not context:
        raise Exception("Failed to create context")
    print("Context created successfully")

    # Get model information
    n_ctx = llama_cpp.llama_n_ctx(context)
    n_embd = llama_cpp.llama_n_embd(model)

    print(f"Context size: {n_ctx}")
    print(f"Embedding dimensions: {n_embd}")

finally:
    # Cleanup
    if 'context' in locals():
        llama_cpp.llama_free(context)
    if 'model' in locals():
        llama_cpp.llama_model_free(model)

    llama_cpp.llama_backend_free()
    print("Cleanup completed")
```

### System Capability Detection

```python
import llama_cpp.llama_cpp as llama_cpp

# Check system capabilities
capabilities = {
    "mmap_support": llama_cpp.llama_supports_mmap(),
    "mlock_support": llama_cpp.llama_supports_mlock(),
    "gpu_offload": llama_cpp.llama_supports_gpu_offload(),
    "max_devices": llama_cpp.llama_max_devices(),
}

print("System capabilities:")
for capability, supported in capabilities.items():
    status = "✓" if supported else "✗"
    print(f"  {status} {capability}: {supported}")

# Timing utilities
start_time = llama_cpp.llama_time_us()
# ... some operation ...
end_time = llama_cpp.llama_time_us()
duration_ms = (end_time - start_time) / 1000
print(f"Operation took {duration_ms:.2f}ms")
```

### Custom Parameter Configuration

```python
import llama_cpp.llama_cpp as llama_cpp
import ctypes

# Initialize backend
llama_cpp.llama_backend_init()

# Get and modify default parameters
model_params = llama_cpp.llama_model_default_params()
context_params = llama_cpp.llama_context_default_params()

# Modify model parameters (example - actual field names depend on structure)
# model_params.n_gpu_layers = 35
# model_params.use_mmap = True
# model_params.use_mlock = False

# Modify context parameters
# context_params.n_ctx = 4096
# context_params.n_batch = 512
# context_params.n_threads = 8

print("Custom parameters configured")

try:
    # Load with custom parameters
    model = llama_cpp.llama_model_load_from_file(
        b"./models/model.gguf",
        model_params
    )

    if model:
        context = llama_cpp.llama_new_context_with_model(model, context_params)
        if context:
            print("Model and context created with custom parameters")

            # Get actual values
            actual_ctx = llama_cpp.llama_n_ctx(context)
            actual_embd = llama_cpp.llama_n_embd(model)
            print(f"Actual context size: {actual_ctx}")
            print(f"Actual embedding dimensions: {actual_embd}")

            llama_cpp.llama_free(context)
        llama_cpp.llama_model_free(model)

finally:
    llama_cpp.llama_backend_free()
```

### Memory Management Patterns

```python
import llama_cpp.llama_cpp as llama_cpp
import gc
import psutil
import os

def get_memory_usage():
    """Get current memory usage in MB."""
    process = psutil.Process(os.getpid())
    return process.memory_info().rss / 1024 / 1024

class LowLevelLlama:
    def __init__(self):
        self.model = None
        self.context = None
        self.backend_initialized = False

    def initialize_backend(self):
        """Initialize backend if not already done."""
        if not self.backend_initialized:
            llama_cpp.llama_backend_init()
            self.backend_initialized = True

    def load_model(self, model_path: str):
        """Load model with automatic cleanup."""
        self.initialize_backend()

        # Clean up existing model
        if self.model:
            self.free_model()

        initial_memory = get_memory_usage()

        model_params = llama_cpp.llama_model_default_params()
        self.model = llama_cpp.llama_model_load_from_file(
            model_path.encode('utf-8'),
            model_params
        )

        if not self.model:
            raise RuntimeError(f"Failed to load model: {model_path}")

        final_memory = get_memory_usage()
        memory_increase = final_memory - initial_memory

        print(f"Model loaded: {memory_increase:.1f}MB memory increase")
        return True

    def create_context(self):
        """Create context with automatic cleanup."""
        if not self.model:
            raise RuntimeError("Model must be loaded first")

        # Clean up existing context
        if self.context:
            self.free_context()

        context_params = llama_cpp.llama_context_default_params()
        self.context = llama_cpp.llama_new_context_with_model(
            self.model,
            context_params
        )

        if not self.context:
            raise RuntimeError("Failed to create context")

        print("Context created successfully")
        return True

    def free_context(self):
        """Free context memory."""
        if self.context:
            llama_cpp.llama_free(self.context)
            self.context = None
            gc.collect()  # Force garbage collection

    def free_model(self):
        """Free model memory."""
        if self.model:
            llama_cpp.llama_model_free(self.model)
            self.model = None
            gc.collect()

    def cleanup(self):
        """Full cleanup."""
        self.free_context()
        self.free_model()

        if self.backend_initialized:
            llama_cpp.llama_backend_free()
            self.backend_initialized = False

    def __del__(self):
        """Destructor for automatic cleanup."""
        self.cleanup()

# Usage example
llama = LowLevelLlama()

try:
    print(f"Initial memory: {get_memory_usage():.1f}MB")

    llama.load_model("./models/model.gguf")
    print(f"After model load: {get_memory_usage():.1f}MB")

    llama.create_context()
    print(f"After context creation: {get_memory_usage():.1f}MB")

    # Use model...

finally:
    llama.cleanup()
    print(f"After cleanup: {get_memory_usage():.1f}MB")
```

### Error Handling and Validation

```python
import llama_cpp.llama_cpp as llama_cpp
import ctypes

def validate_model_file(file_path: str) -> bool:
    """Validate model file before loading."""
    import os

    if not os.path.exists(file_path):
        print(f"Model file not found: {file_path}")
        return False

    file_size = os.path.getsize(file_path)
    if file_size < 1024:  # Less than 1KB is suspicious
        print(f"Model file too small: {file_size} bytes")
        return False

    # Check file extension
    if not file_path.lower().endswith(('.gguf', '.ggml', '.bin')):
        print(f"Unexpected file extension: {file_path}")
        return False

    return True

def safe_model_loading(model_path: str):
    """Demonstrate safe model loading with error handling."""

    if not validate_model_file(model_path):
        return None

    llama_cpp.llama_backend_init()

    try:
        # Check system capabilities first
        if not llama_cpp.llama_supports_mmap():
            print("Warning: Memory mapping not supported")

        # Get default parameters
        model_params = llama_cpp.llama_model_default_params()

        # Attempt to load model
        print(f"Loading model: {model_path}")
        model = llama_cpp.llama_model_load_from_file(
            model_path.encode('utf-8'),
            model_params
        )

        if not model:
            print("Model loading failed - check file format and permissions")
            return None

        # Validate model properties
        try:
            context_params = llama_cpp.llama_context_default_params()
            context = llama_cpp.llama_new_context_with_model(model, context_params)

            if context:
                n_ctx = llama_cpp.llama_n_ctx(context)
                n_embd = llama_cpp.llama_n_embd(model)

                print(f"Model validation successful:")
                print(f"  Context size: {n_ctx}")
                print(f"  Embeddings: {n_embd}")

                llama_cpp.llama_free(context)
                return model
            else:
                print("Context creation failed - insufficient memory?")
                llama_cpp.llama_model_free(model)
                return None

        except Exception as e:
            print(f"Model validation error: {e}")
            llama_cpp.llama_model_free(model)
            return None

    except Exception as e:
        print(f"Unexpected error during model loading: {e}")
        return None

    finally:
        # Backend cleanup handled by caller
        pass

# Usage
model = safe_model_loading("./models/test-model.gguf")
if model:
    print("Model ready for use")
    # Use model...
    llama_cpp.llama_model_free(model)

llama_cpp.llama_backend_free()
```

### Performance Monitoring

```python
import llama_cpp.llama_cpp as llama_cpp
import time
import contextlib

@contextlib.contextmanager
def performance_monitor(operation_name: str):
    """Context manager for performance monitoring."""
    start_time = llama_cpp.llama_time_us()
    start_memory = get_memory_usage()

    try:
        yield
    finally:
        end_time = llama_cpp.llama_time_us()
        end_memory = get_memory_usage()

        duration_ms = (end_time - start_time) / 1000
        memory_change = end_memory - start_memory

        print(f"{operation_name}:")
        print(f"  Duration: {duration_ms:.2f}ms")
        print(f"  Memory change: {memory_change:+.1f}MB")

# Usage example
llama_cpp.llama_backend_init()

try:
    with performance_monitor("Model Loading"):
        model_params = llama_cpp.llama_model_default_params()
        model = llama_cpp.llama_model_load_from_file(
            b"./models/model.gguf",
            model_params
        )

    if model:
        with performance_monitor("Context Creation"):
            context_params = llama_cpp.llama_context_default_params()
            context = llama_cpp.llama_new_context_with_model(model, context_params)

        if context:
            with performance_monitor("Model Info Retrieval"):
                n_ctx = llama_cpp.llama_n_ctx(context)
                n_embd = llama_cpp.llama_n_embd(model)
                print(f"Context: {n_ctx}, Embeddings: {n_embd}")

            llama_cpp.llama_free(context)

        llama_cpp.llama_model_free(model)

finally:
    llama_cpp.llama_backend_free()
```