docs/low-level.md

(See also: caching.md · chat-completion.md · grammar.md · index.md · llama-model.md · server.md · tokenization.md · vision.md)

# Low-Level API

Direct access to llama.cpp C functions through ctypes bindings, providing maximum control over model loading, context management, backend operations, and hardware-specific optimizations.

## Capabilities

### Backend Management

Initialize and manage the llama.cpp backend system.

```python { .api }
def llama_backend_init() -> None:
    """
    Initialize llama.cpp backend.
    Must be called before using any other functions.
    """

def llama_backend_free() -> None:
    """
    Free backend resources.
    Should be called when shutting down.
    """

def llama_numa_init(numa_strategy: int) -> None:
    """
    Initialize NUMA support.

    Args:
        numa_strategy: NUMA initialization strategy
    """
```

### Model Management

Low-level model loading, saving, and memory management.

```python { .api }
def llama_model_load_from_file(
    path_model: bytes,
    params
) -> llama_model_p:
    """
    Load model from file.

    Args:
        path_model: Path to model file (bytes)
        params: Model parameters structure

    Returns:
        Model pointer or null on failure
    """

def llama_model_free(model: llama_model_p) -> None:
    """
    Free model memory.

    Args:
        model: Model pointer to free
    """

def llama_model_save_to_file(
    model: llama_model_p,
    fname: bytes,
    **kwargs
) -> bool:
    """
    Save model to file.

    Args:
        model: Model pointer
        fname: Output filename (bytes)
        **kwargs: Additional save parameters

    Returns:
        True if successful
    """

def llama_model_default_params():
    """
    Get default model loading parameters.

    Returns:
        Default parameter structure
    """

def llama_model_quantize_default_params():
    """
    Get default quantization parameters.

    Returns:
        Default quantization structure
    """
```

### Context Management

Create and manage model contexts for inference.

```python { .api }
def llama_new_context_with_model(
    model: llama_model_p,
    params
) -> llama_context_p:
    """
    Create new context with model.

    Args:
        model: Model pointer
        params: Context parameters

    Returns:
        Context pointer or null on failure
    """

def llama_free(ctx: llama_context_p) -> None:
    """
    Free context memory.

    Args:
        ctx: Context pointer to free
    """

def llama_context_default_params():
    """
    Get default context parameters.

    Returns:
        Default parameter structure
    """
```

### System Information

Query system capabilities and model properties.

```python { .api }
def llama_supports_mmap() -> bool:
    """Check if memory mapping is supported."""

def llama_supports_mlock() -> bool:
    """Check if memory locking is supported."""

def llama_supports_gpu_offload() -> bool:
    """Check if GPU offload is supported."""

def llama_max_devices() -> int:
    """Get maximum number of devices."""

def llama_time_us() -> int:
    """Get current time in microseconds."""

def llama_n_ctx(ctx: llama_context_p) -> int:
    """
    Get context size.

    Args:
        ctx: Context pointer

    Returns:
        Context size in tokens
    """

def llama_n_embd(model: llama_model_p) -> int:
    """
    Get embedding dimensions.

    Args:
        model: Model pointer

    Returns:
        Embedding dimension count
    """
```

## Core Constants

### Default Values

```python { .api }
LLAMA_DEFAULT_SEED: int = 0xFFFFFFFF  # Default random seed
LLAMA_TOKEN_NULL: int = -1            # Null token value
LLAMA_MAX_DEVICES: int                # Maximum device count
```

### File Format Magic Numbers

```python { .api }
LLAMA_FILE_MAGIC_GGLA: int    # GGLA file format identifier
LLAMA_FILE_MAGIC_GGSN: int    # GGSN file format identifier
LLAMA_FILE_MAGIC_GGSQ: int    # GGSQ file format identifier
LLAMA_SESSION_MAGIC: int      # Session file magic number
LLAMA_SESSION_VERSION: int    # Session file version
LLAMA_STATE_SEQ_MAGIC: int    # State sequence magic number
LLAMA_STATE_SEQ_VERSION: int  # State sequence version
```

### Vocabulary Types

```python { .api }
LLAMA_VOCAB_TYPE_NONE: int = 0  # No vocabulary
LLAMA_VOCAB_TYPE_SPM: int = 1   # SentencePiece model
LLAMA_VOCAB_TYPE_BPE: int = 2   # Byte pair encoding
LLAMA_VOCAB_TYPE_WPM: int = 3   # WordPiece model
LLAMA_VOCAB_TYPE_UGM: int = 4   # Unigram model
LLAMA_VOCAB_TYPE_RWKV: int = 5  # RWKV tokenizer
```

### GGML Quantization Types

```python { .api }
# Float types
GGML_TYPE_F32: int  # 32-bit float
GGML_TYPE_F16: int  # 16-bit float

# Quantized types
GGML_TYPE_Q4_0: int  # 4-bit quantization, type 0
GGML_TYPE_Q4_1: int  # 4-bit quantization, type 1
GGML_TYPE_Q5_0: int  # 5-bit quantization, type 0
GGML_TYPE_Q5_1: int  # 5-bit quantization, type 1
GGML_TYPE_Q8_0: int  # 8-bit quantization, type 0
GGML_TYPE_Q8_1: int  # 8-bit quantization, type 1

# K-quantization types
GGML_TYPE_Q2_K: int  # 2-bit K-quantization
GGML_TYPE_Q3_K: int  # 3-bit K-quantization
GGML_TYPE_Q4_K: int  # 4-bit K-quantization
GGML_TYPE_Q5_K: int  # 5-bit K-quantization
GGML_TYPE_Q6_K: int  # 6-bit K-quantization
GGML_TYPE_Q8_K: int  # 8-bit K-quantization

# Integer quantization types
GGML_TYPE_IQ2_XXS: int  # Integer quantization 2-bit, XXS
GGML_TYPE_IQ2_XS: int   # Integer quantization 2-bit, XS
GGML_TYPE_IQ3_XXS: int  # Integer quantization 3-bit, XXS
GGML_TYPE_IQ1_S: int    # Integer quantization 1-bit, S
GGML_TYPE_IQ4_NL: int   # Integer quantization 4-bit, NL
GGML_TYPE_IQ3_S: int    # Integer quantization 3-bit, S
GGML_TYPE_IQ2_S: int    # Integer quantization 2-bit, S
GGML_TYPE_IQ4_XS: int   # Integer quantization 4-bit, XS
GGML_TYPE_IQ1_M: int    # Integer quantization 1-bit, M

# Standard integer types
GGML_TYPE_I8: int   # 8-bit signed integer
GGML_TYPE_I16: int  # 16-bit signed integer
GGML_TYPE_I32: int  # 32-bit signed integer
GGML_TYPE_I64: int  # 64-bit signed integer
```

## Pointer Types

```python { .api }
# Core pointer types
llama_model_p = ctypes.POINTER(ctypes.c_void_p)    # Model pointer
llama_context_p = ctypes.POINTER(ctypes.c_void_p)  # Context pointer
llama_token = ctypes.c_int32                       # Token type
```

## Usage Examples

### Basic Low-Level Setup

```python
import llama_cpp.llama_cpp as llama_cpp
import ctypes

# Initialize backend
llama_cpp.llama_backend_init()
print("Backend initialized")

try:
    # Get default parameters
    model_params = llama_cpp.llama_model_default_params()
    context_params = llama_cpp.llama_context_default_params()

    # Load model
    model_path = b"./models/llama-2-7b.gguf"
    model = llama_cpp.llama_model_load_from_file(model_path, model_params)

    if not model:
        raise Exception("Failed to load model")
    print("Model loaded successfully")

    # Create context
    context = llama_cpp.llama_new_context_with_model(model, context_params)

    if not context:
        raise Exception("Failed to create context")
    print("Context created successfully")

    # Get model information
    n_ctx = llama_cpp.llama_n_ctx(context)
    n_embd = llama_cpp.llama_n_embd(model)

    print(f"Context size: {n_ctx}")
    print(f"Embedding dimensions: {n_embd}")

finally:
    # Cleanup
    if 'context' in locals():
        llama_cpp.llama_free(context)
    if 'model' in locals():
        llama_cpp.llama_model_free(model)

    llama_cpp.llama_backend_free()
    print("Cleanup completed")
```

### System Capability Detection

```python
import llama_cpp.llama_cpp as llama_cpp

# Check system capabilities
capabilities = {
    "mmap_support": llama_cpp.llama_supports_mmap(),
    "mlock_support": llama_cpp.llama_supports_mlock(),
    "gpu_offload": llama_cpp.llama_supports_gpu_offload(),
    "max_devices": llama_cpp.llama_max_devices(),
}

print("System capabilities:")
for capability, supported in capabilities.items():
    status = "✓" if supported else "✗"
    print(f" {status} {capability}: {supported}")

# Timing utilities
start_time = llama_cpp.llama_time_us()
# ... some operation ...
end_time = llama_cpp.llama_time_us()
duration_ms = (end_time - start_time) / 1000
print(f"Operation took {duration_ms:.2f}ms")
```

### Custom Parameter Configuration

```python
import llama_cpp.llama_cpp as llama_cpp
import ctypes

# Initialize backend
llama_cpp.llama_backend_init()

# Get and modify default parameters
model_params = llama_cpp.llama_model_default_params()
context_params = llama_cpp.llama_context_default_params()

# Modify model parameters (example - actual field names depend on structure)
# model_params.n_gpu_layers = 35
# model_params.use_mmap = True
# model_params.use_mlock = False

# Modify context parameters
# context_params.n_ctx = 4096
# context_params.n_batch = 512
# context_params.n_threads = 8

print("Custom parameters configured")

try:
    # Load with custom parameters
    model = llama_cpp.llama_model_load_from_file(
        b"./models/model.gguf",
        model_params
    )

    if model:
        context = llama_cpp.llama_new_context_with_model(model, context_params)
        if context:
            print("Model and context created with custom parameters")

            # Get actual values
            actual_ctx = llama_cpp.llama_n_ctx(context)
            actual_embd = llama_cpp.llama_n_embd(model)
            print(f"Actual context size: {actual_ctx}")
            print(f"Actual embedding dimensions: {actual_embd}")

            llama_cpp.llama_free(context)
        llama_cpp.llama_model_free(model)

finally:
    llama_cpp.llama_backend_free()
```

### Memory Management Patterns

```python
import llama_cpp.llama_cpp as llama_cpp
import gc
import psutil
import os

def get_memory_usage():
    """Get current memory usage in MB."""
    process = psutil.Process(os.getpid())
    return process.memory_info().rss / 1024 / 1024

class LowLevelLlama:
    def __init__(self):
        self.model = None
        self.context = None
        self.backend_initialized = False

    def initialize_backend(self):
        """Initialize backend if not already done."""
        if not self.backend_initialized:
            llama_cpp.llama_backend_init()
            self.backend_initialized = True

    def load_model(self, model_path: str):
        """Load model with automatic cleanup."""
        self.initialize_backend()

        # Clean up existing model
        if self.model:
            self.free_model()

        initial_memory = get_memory_usage()

        model_params = llama_cpp.llama_model_default_params()
        self.model = llama_cpp.llama_model_load_from_file(
            model_path.encode('utf-8'),
            model_params
        )

        if not self.model:
            raise RuntimeError(f"Failed to load model: {model_path}")

        final_memory = get_memory_usage()
        memory_increase = final_memory - initial_memory

        print(f"Model loaded: {memory_increase:.1f}MB memory increase")
        return True

    def create_context(self):
        """Create context with automatic cleanup."""
        if not self.model:
            raise RuntimeError("Model must be loaded first")

        # Clean up existing context
        if self.context:
            self.free_context()

        context_params = llama_cpp.llama_context_default_params()
        self.context = llama_cpp.llama_new_context_with_model(
            self.model,
            context_params
        )

        if not self.context:
            raise RuntimeError("Failed to create context")

        print("Context created successfully")
        return True

    def free_context(self):
        """Free context memory."""
        if self.context:
            llama_cpp.llama_free(self.context)
            self.context = None
            gc.collect()  # Force garbage collection

    def free_model(self):
        """Free model memory."""
        if self.model:
            llama_cpp.llama_model_free(self.model)
            self.model = None
            gc.collect()

    def cleanup(self):
        """Full cleanup."""
        self.free_context()
        self.free_model()

        if self.backend_initialized:
            llama_cpp.llama_backend_free()
            self.backend_initialized = False

    def __del__(self):
        """Destructor for automatic cleanup."""
        self.cleanup()

# Usage example
llama = LowLevelLlama()

try:
    print(f"Initial memory: {get_memory_usage():.1f}MB")

    llama.load_model("./models/model.gguf")
    print(f"After model load: {get_memory_usage():.1f}MB")

    llama.create_context()
    print(f"After context creation: {get_memory_usage():.1f}MB")

    # Use model...

finally:
    llama.cleanup()
    print(f"After cleanup: {get_memory_usage():.1f}MB")
```

### Error Handling and Validation

```python
import llama_cpp.llama_cpp as llama_cpp
import ctypes

def validate_model_file(file_path: str) -> bool:
    """Validate model file before loading."""
    import os

    if not os.path.exists(file_path):
        print(f"Model file not found: {file_path}")
        return False

    file_size = os.path.getsize(file_path)
    if file_size < 1024:  # Less than 1KB is suspicious
        print(f"Model file too small: {file_size} bytes")
        return False

    # Check file extension
    if not file_path.lower().endswith(('.gguf', '.ggml', '.bin')):
        print(f"Unexpected file extension: {file_path}")
        return False

    return True

def safe_model_loading(model_path: str):
    """Demonstrate safe model loading with error handling."""

    if not validate_model_file(model_path):
        return None

    llama_cpp.llama_backend_init()

    try:
        # Check system capabilities first
        if not llama_cpp.llama_supports_mmap():
            print("Warning: Memory mapping not supported")

        # Get default parameters
        model_params = llama_cpp.llama_model_default_params()

        # Attempt to load model
        print(f"Loading model: {model_path}")
        model = llama_cpp.llama_model_load_from_file(
            model_path.encode('utf-8'),
            model_params
        )

        if not model:
            print("Model loading failed - check file format and permissions")
            return None

        # Validate model properties
        try:
            context_params = llama_cpp.llama_context_default_params()
            context = llama_cpp.llama_new_context_with_model(model, context_params)

            if context:
                n_ctx = llama_cpp.llama_n_ctx(context)
                n_embd = llama_cpp.llama_n_embd(model)

                print(f"Model validation successful:")
                print(f" Context size: {n_ctx}")
                print(f" Embeddings: {n_embd}")

                llama_cpp.llama_free(context)
                return model
            else:
                print("Context creation failed - insufficient memory?")
                llama_cpp.llama_model_free(model)
                return None

        except Exception as e:
            print(f"Model validation error: {e}")
            llama_cpp.llama_model_free(model)
            return None

    except Exception as e:
        print(f"Unexpected error during model loading: {e}")
        return None

    finally:
        # Backend cleanup handled by caller
        pass

# Usage
model = safe_model_loading("./models/test-model.gguf")
if model:
    print("Model ready for use")
    # Use model...
    llama_cpp.llama_model_free(model)

llama_cpp.llama_backend_free()
```

### Performance Monitoring

```python
import llama_cpp.llama_cpp as llama_cpp
import time
import contextlib

# NOTE: uses get_memory_usage() as defined in the Memory Management Patterns example above.

@contextlib.contextmanager
def performance_monitor(operation_name: str):
    """Context manager for performance monitoring."""
    start_time = llama_cpp.llama_time_us()
    start_memory = get_memory_usage()

    try:
        yield
    finally:
        end_time = llama_cpp.llama_time_us()
        end_memory = get_memory_usage()

        duration_ms = (end_time - start_time) / 1000
        memory_change = end_memory - start_memory

        print(f"{operation_name}:")
        print(f" Duration: {duration_ms:.2f}ms")
        print(f" Memory change: {memory_change:+.1f}MB")

# Usage example
llama_cpp.llama_backend_init()

try:
    with performance_monitor("Model Loading"):
        model_params = llama_cpp.llama_model_default_params()
        model = llama_cpp.llama_model_load_from_file(
            b"./models/model.gguf",
            model_params
        )

    if model:
        with performance_monitor("Context Creation"):
            context_params = llama_cpp.llama_context_default_params()
            context = llama_cpp.llama_new_context_with_model(model, context_params)

        if context:
            with performance_monitor("Model Info Retrieval"):
                n_ctx = llama_cpp.llama_n_ctx(context)
                n_embd = llama_cpp.llama_n_embd(model)
                print(f"Context: {n_ctx}, Embeddings: {n_embd}")

            llama_cpp.llama_free(context)

        llama_cpp.llama_model_free(model)

finally:
    llama_cpp.llama_backend_free()
```