0
# Core Model and Inference
1
2
The `Llama` class is the primary high-level interface to llama.cpp, covering model loading, text generation, and inference operations.
3
4
## Capabilities
5
6
### Model Initialization
7
8
Load and configure language models with comprehensive parameter control for performance optimization and hardware acceleration.
9
10
```python { .api }
11
class Llama:
12
def __init__(
13
self,
14
model_path: str,
15
*,
16
n_gpu_layers: int = 0,
17
split_mode: int = 1,
18
main_gpu: int = 0,
19
tensor_split: Optional[List[float]] = None,
20
vocab_only: bool = False,
21
use_mmap: bool = True,
22
use_mlock: bool = False,
23
kv_overrides: Optional[Dict[str, Union[bool, int, float, str]]] = None,
24
seed: int = 0xFFFFFFFF,
25
n_ctx: int = 512,
26
n_batch: int = 512,
27
n_ubatch: int = 512,
28
n_threads: Optional[int] = None,
29
n_threads_batch: Optional[int] = None,
30
rope_scaling_type: Optional[int] = -1,
31
pooling_type: int = -1,
32
rope_freq_base: float = 0.0,
33
rope_freq_scale: float = 0.0,
34
yarn_ext_factor: float = -1.0,
35
yarn_attn_factor: float = 1.0,
36
yarn_beta_fast: float = 32.0,
37
yarn_beta_slow: float = 1.0,
38
yarn_orig_ctx: int = 0,
39
logits_all: bool = False,
40
embedding: bool = False,
41
offload_kqv: bool = True,
42
flash_attn: bool = False,
43
op_offload: Optional[bool] = None,
44
swa_full: Optional[bool] = None,
45
no_perf: bool = False,
46
last_n_tokens_size: int = 64,
47
lora_base: Optional[str] = None,
48
lora_scale: float = 1.0,
49
lora_path: Optional[str] = None,
50
numa: Union[bool, int] = False,
51
chat_format: Optional[str] = None,
52
chat_handler: Optional[object] = None,
53
draft_model: Optional[object] = None,
54
tokenizer: Optional[object] = None,
55
type_k: Optional[int] = None,
56
type_v: Optional[int] = None,
57
spm_infill: bool = False,
58
verbose: bool = True,
59
**kwargs
60
):
61
"""
62
Initialize a Llama model instance.
63
64
Args:
65
model_path: Path to the GGUF model file
66
n_gpu_layers: Number of layers to offload to GPU (0 = CPU only)
67
split_mode: GPU split mode (1 = layer-wise split)
68
main_gpu: Main GPU device ID for multi-GPU setups
69
tensor_split: List of GPU memory allocations for each device
70
vocab_only: Load vocabulary only, skip weights
71
use_mmap: Use memory mapping for model loading
72
use_mlock: Lock model in memory to prevent swapping
73
kv_overrides: Key-value metadata overrides for the model
74
seed: Random seed for sampling (the default 0xFFFFFFFF, i.e. -1 as an unsigned 32-bit value, selects a random seed)
75
n_ctx: Context window size in tokens
76
n_batch: Logical batch size: maximum number of prompt tokens submitted per evaluation call
77
n_ubatch: Physical batch size (must be <= n_batch)
78
n_threads: Number of CPU threads for computation
79
n_threads_batch: Number of CPU threads for batch processing
80
rope_scaling_type: RoPE scaling method (-1 = auto)
81
pooling_type: Pooling method for embeddings (-1 = unspecified)
82
rope_freq_base: Base frequency for RoPE
83
rope_freq_scale: Frequency scaling factor for RoPE
84
yarn_ext_factor: YaRN extension factor
85
yarn_attn_factor: YaRN attention factor
86
yarn_beta_fast: YaRN beta fast parameter
87
yarn_beta_slow: YaRN beta slow parameter
88
yarn_orig_ctx: YaRN original context size
89
logits_all: Return logits for all tokens
90
embedding: Enable embedding mode
91
offload_kqv: Offload key/value cache to GPU
92
flash_attn: Use Flash Attention optimization
93
op_offload: Offload operations to GPU (auto-detect if None)
94
swa_full: Use a full-size KV cache for sliding window attention (SWA) layers (library default if None)
95
no_perf: Disable collection of performance timing measurements
96
last_n_tokens_size: Size of last-n-tokens buffer for repetition penalty
97
lora_base: Path to LoRA base model
98
lora_scale: LoRA scaling factor
99
lora_path: Path to LoRA adapter
100
numa: NUMA optimization (False/True/strategy)
101
chat_format: Chat format template name
102
chat_handler: Custom chat completion handler
103
draft_model: Draft model for speculative decoding
104
tokenizer: Custom tokenizer instance
105
type_k: Key cache quantization type (None = auto)
106
type_v: Value cache quantization type (None = auto)
107
spm_infill: Use the Suffix/Prefix/Middle (SPM) ordering for infill prompts instead of Prefix/Suffix/Middle
108
verbose: Enable verbose logging
109
"""
110
111
@classmethod
112
def from_pretrained(
113
cls,
114
repo_id: str,
115
filename: Optional[str] = None,
116
*,
117
additional_files: Optional[List[str]] = None,
118
local_dir: Optional[str] = None,
119
local_dir_use_symlinks: bool = True,
120
cache_dir: Optional[str] = None,
121
**kwargs
122
) -> "Llama":
123
"""
124
Create a Llama model instance from a Hugging Face Hub repository.
125
126
Args:
127
repo_id: Repository identifier on Hugging Face Hub
128
filename: Specific model file to download (auto-detected if None)
129
additional_files: Additional files to download (e.g., tokenizer files)
130
local_dir: Local directory to save files (uses cache if None)
131
local_dir_use_symlinks: Use symlinks in local directory
132
cache_dir: Cache directory for downloaded files
133
**kwargs: Additional arguments passed to Llama.__init__()
134
135
Returns:
136
Initialized Llama model instance
137
138
Raises:
139
ImportError: If huggingface-hub package is not installed
140
FileNotFoundError: If specified file is not found in repository
141
"""
142
```
143
144
### Text Completion
145
146
Generate text completions with fine-grained control over sampling parameters and output format, compatible with OpenAI completion API.
147
148
```python { .api }
149
def create_completion(
150
self,
151
prompt: str,
152
suffix: Optional[str] = None,
153
max_tokens: Optional[int] = 16,
154
temperature: float = 0.8,
155
top_p: float = 0.95,
156
min_p: float = 0.05,
157
typical_p: float = 1.0,
158
logprobs: Optional[int] = None,
159
echo: bool = False,
160
stop: Optional[Union[str, List[str]]] = [],
161
frequency_penalty: float = 0.0,
162
presence_penalty: float = 0.0,
163
repeat_penalty: float = 1.0,
164
top_k: int = 40,
165
stream: bool = False,
166
seed: Optional[int] = None,
167
tfs_z: float = 1.0,
168
mirostat_mode: int = 0,
169
mirostat_tau: float = 5.0,
170
mirostat_eta: float = 0.1,
171
model: Optional[str] = None,
172
stopping_criteria: Optional[object] = None,
173
logits_processor: Optional[object] = None,
174
grammar: Optional[object] = None,
175
logit_bias: Optional[Dict[str, float]] = None,
176
**kwargs
177
) -> CreateCompletionResponse:
178
"""
179
Create a text completion.
180
181
Args:
182
prompt: Input text prompt
183
suffix: Text to append after completion
184
max_tokens: Maximum tokens to generate
185
temperature: Sampling temperature (0.0-2.0)
186
top_p: Nucleus sampling probability threshold
187
min_p: Minimum probability threshold
188
typical_p: Typical sampling parameter
189
logprobs: Number of log probabilities to return
190
echo: Include prompt in response
191
stop: Stop sequences (string or list)
192
frequency_penalty: Frequency penalty (-2.0 to 2.0)
193
presence_penalty: Presence penalty (-2.0 to 2.0)
194
repeat_penalty: Repetition penalty multiplier
195
top_k: Top-k sampling parameter
196
stream: Enable streaming response
197
seed: Random seed for sampling
198
tfs_z: Tail-free sampling parameter
199
mirostat_mode: Mirostat sampling mode (0/1/2)
200
mirostat_tau: Mirostat target entropy
201
mirostat_eta: Mirostat learning rate
202
model: Model name for response metadata
203
stopping_criteria: Custom stopping criteria
204
logits_processor: Custom logits processor
205
grammar: Grammar constraints
206
logit_bias: Token bias adjustments
207
208
Returns:
209
Completion response with generated text and metadata
210
"""
211
```
212
213
### Embeddings
214
215
Generate dense vector representations of text for semantic similarity, clustering, and retrieval applications.
216
217
```python { .api }
218
def create_embedding(
219
self,
220
input: Union[str, List[str]],
221
model: Optional[str] = None,
222
encoding_format: str = "float",
223
**kwargs
224
) -> CreateEmbeddingResponse:
225
"""
226
Create text embeddings.
227
228
Args:
229
input: Text string or list of strings to embed
230
model: Model name for response metadata
231
encoding_format: Output format ("float" or "base64")
232
233
Returns:
234
Embedding response with vector representations
235
"""
236
237
def embed(
238
self,
239
input: str,
240
normalize: bool = True
241
) -> List[float]:
242
"""
243
Generate embeddings for a single text input.
244
245
Args:
246
input: Text to embed
247
normalize: Normalize embedding vector to unit length
248
249
Returns:
250
List of embedding values
251
"""
252
```
253
254
### Tokenization
255
256
Convert between text and token representations using the model's native tokenizer.
257
258
```python { .api }
259
def tokenize(
260
self,
261
text: str,
262
add_bos: bool = True,
263
special: bool = False
264
) -> List[int]:
265
"""
266
Convert text to token IDs.
267
268
Args:
269
text: Input text to tokenize
270
add_bos: Add beginning-of-sequence token
271
special: Parse special/control tokens found in the text as tokens instead of plain text
272
273
Returns:
274
List of token IDs
275
"""
276
277
def detokenize(
278
self,
279
tokens: List[int],
280
decode: bool = True
281
) -> str:
282
"""
283
Convert token IDs to text.
284
285
Args:
286
tokens: List of token IDs
287
decode: Decode bytes to string
288
289
Returns:
290
Decoded text string
291
"""
292
```
293
294
### State Management
295
296
Save and restore model context states for efficient caching and continuation of conversations.
297
298
```python { .api }
299
def save_state(self) -> LlamaState:
300
"""
301
Save current model state.
302
303
Returns:
304
Serializable state object
305
"""
306
307
def load_state(self, state: LlamaState) -> None:
308
"""
309
Load previously saved model state.
310
311
Args:
312
state: State object from save_state()
313
"""
314
315
def reset(self) -> None:
316
"""
317
Reset model context to initial state.
318
"""
319
```
320
321
### Configuration and Properties
322
323
Access model metadata and configuration settings.
324
325
```python { .api }
326
@property
327
def n_ctx(self) -> int:
328
"""Context window size in tokens."""
329
330
@property
331
def n_embd(self) -> int:
332
"""Model embedding dimensions."""
333
334
@property
335
def n_vocab(self) -> int:
336
"""Vocabulary size."""
337
338
@property
339
def tokenizer(self) -> object:
340
"""Tokenizer instance."""
341
342
@property
343
def token_eos(self) -> int:
344
"""End-of-sequence token ID."""
345
346
@property
347
def token_bos(self) -> int:
348
"""Beginning-of-sequence token ID."""
349
350
@property
351
def token_nl(self) -> int:
352
"""Newline token ID."""
353
354
def set_seed(self, seed: int) -> None:
355
"""
356
Set random seed for sampling.
357
358
Args:
359
seed: Random seed value
360
"""
361
362
def set_cache(self, cache: object) -> None:
363
"""
364
Set caching implementation.
365
366
Args:
367
cache: Cache instance (LlamaRAMCache or LlamaDiskCache)
368
"""
369
```
370
371
### Low-Level Generation
372
373
Direct token-level generation and sampling for advanced use cases.
374
375
```python { .api }
376
def eval(self, tokens: List[int]) -> None:
377
"""
378
Evaluate tokens and update model context.
379
380
Args:
381
tokens: Token sequence to evaluate
382
"""
383
384
def sample(
385
self,
386
top_k: int = 40,
387
top_p: float = 0.95,
388
min_p: float = 0.05,
389
typical_p: float = 1.0,
390
temp: float = 0.80,
391
repeat_penalty: float = 1.0,
392
frequency_penalty: float = 0.0,
393
presence_penalty: float = 0.0,
394
tfs_z: float = 1.0,
395
mirostat_mode: int = 0,
396
mirostat_tau: float = 5.0,
397
mirostat_eta: float = 0.1,
398
penalize_nl: bool = True,
399
logits_processor: Optional[object] = None,
400
grammar: Optional[object] = None
401
) -> int:
402
"""
403
Sample next token from current context.
404
405
Args:
406
top_k: Top-k sampling parameter
407
top_p: Top-p (nucleus) sampling parameter
408
min_p: Minimum probability threshold
409
typical_p: Typical sampling parameter
410
temp: Sampling temperature
411
repeat_penalty: Repetition penalty multiplier
412
frequency_penalty: Frequency penalty
413
presence_penalty: Presence penalty
414
tfs_z: Tail-free sampling parameter
415
mirostat_mode: Mirostat sampling mode
416
mirostat_tau: Mirostat target entropy
417
mirostat_eta: Mirostat learning rate
418
penalize_nl: Apply penalty to newline tokens
419
logits_processor: Custom logits processor
420
grammar: Grammar constraints
421
422
Returns:
423
Sampled token ID
424
"""
425
426
def generate(
427
self,
428
tokens: List[int],
429
top_k: int = 40,
430
top_p: float = 0.95,
431
min_p: float = 0.05,
432
typical_p: float = 1.0,
433
temp: float = 0.80,
434
repeat_penalty: float = 1.0,
435
reset: bool = True,
436
frequency_penalty: float = 0.0,
437
presence_penalty: float = 0.0,
438
tfs_z: float = 1.0,
439
mirostat_mode: int = 0,
440
mirostat_tau: float = 5.0,
441
mirostat_eta: float = 0.1,
442
stopping_criteria: Optional[object] = None,
443
logits_processor: Optional[object] = None,
444
grammar: Optional[object] = None
445
) -> Generator[int, None, None]:
446
"""
447
Generate token sequence.
448
449
Args:
450
tokens: Initial token sequence
451
top_k: Top-k sampling parameter
452
top_p: Top-p sampling parameter
453
min_p: Minimum probability threshold
454
typical_p: Typical sampling parameter
455
temp: Temperature
456
repeat_penalty: Repetition penalty
457
reset: Reset context before generation
458
frequency_penalty: Frequency penalty
459
presence_penalty: Presence penalty
460
tfs_z: Tail-free sampling parameter
461
mirostat_mode: Mirostat mode
462
mirostat_tau: Mirostat tau
463
mirostat_eta: Mirostat eta
464
stopping_criteria: Custom stopping criteria
465
logits_processor: Custom logits processor
466
grammar: Grammar constraints
467
468
Yields:
469
Generated token IDs
470
"""
471
```
472
473
## Types
474
475
```python { .api }
476
class LlamaState:
477
"""Serializable model state for persistence."""
478
479
def __init__(self, llama_state): ...
480
481
# Logits processing
482
class LogitsProcessor:
483
"""Base class for logits processing."""
484
485
def __call__(self, input_ids: List[int], scores: List[float]) -> List[float]: ...
486
487
class LogitsProcessorList:
488
"""List of logits processors."""
489
490
def __init__(self, processors: List[LogitsProcessor]): ...
491
def __call__(self, input_ids: List[int], scores: List[float]) -> List[float]: ...
492
493
class MinTokensLogitsProcessor(LogitsProcessor):
494
"""Ensures minimum number of tokens are generated."""
495
496
def __init__(self, min_tokens: int, eos_token_id: int): ...
497
498
# Stopping criteria
499
class StoppingCriteria:
500
"""Base class for stopping criteria."""
501
502
def __call__(self, input_ids: List[int], scores: List[float]) -> bool: ...
503
504
class StoppingCriteriaList:
505
"""List of stopping criteria."""
506
507
def __init__(self, criteria: List[StoppingCriteria]): ...
508
def __call__(self, input_ids: List[int], scores: List[float]) -> bool: ...
509
```
510
511
## Usage Examples
512
513
### Basic Model Loading and Generation
514
515
```python
516
from llama_cpp import Llama
517
518
# Load model with basic configuration
519
llm = Llama(
520
model_path="./models/llama-2-7b-chat.gguf",
521
n_ctx=2048,
522
n_threads=8,
523
)
524
525
# Simple text completion
526
response = llm.create_completion(
527
prompt="The future of artificial intelligence is",
528
max_tokens=50,
529
temperature=0.7,
530
)
531
print(response['choices'][0]['text'])
532
```
533
534
### GPU Acceleration
535
536
```python
537
# Offload layers to GPU for faster inference
538
llm = Llama(
539
model_path="./models/llama-2-13b-chat.gguf",
540
n_gpu_layers=35, # Offload most layers to GPU
541
n_ctx=4096,
542
offload_kqv=True, # Keep the key/value cache on the GPU
543
)
544
```
545
546
### State Management
547
548
```python
549
# Save and restore conversation state
550
llm = Llama(model_path="./model.gguf")
551
552
# Generate some text
553
llm.create_completion(prompt="Hello, my name is")
554
555
# Save current state
556
state = llm.save_state()
557
558
# Continue conversation
559
llm.create_completion(prompt=" and I like")
560
561
# Restore to previous state
562
llm.load_state(state)
563
```
564
565
### Custom Sampling Parameters
566
567
```python
568
# Fine-tune generation with advanced sampling
569
response = llm.create_completion(
570
prompt="Write a creative story:",
571
max_tokens=200,
572
temperature=0.9, # High creativity
573
top_p=0.9, # Nucleus sampling
574
top_k=50, # Top-k sampling
575
repeat_penalty=1.15, # Reduce repetition
576
frequency_penalty=0.1,
577
presence_penalty=0.1,
578
)
579
```