Python bindings for the llama.cpp library providing high-performance LLM inference with OpenAI-compatible APIs.
—
High-level model loading, text generation, and inference operations providing the primary interface for llama.cpp functionality through the Llama class.
Load and configure language models with comprehensive parameter control for performance optimization and hardware acceleration.
class Llama:
def __init__(
self,
model_path: str,
*,
n_gpu_layers: int = 0,
split_mode: int = 1,
main_gpu: int = 0,
tensor_split: Optional[List[float]] = None,
vocab_only: bool = False,
use_mmap: bool = True,
use_mlock: bool = False,
kv_overrides: Optional[Dict[str, Union[bool, int, float, str]]] = None,
seed: int = 0xFFFFFFFF,
n_ctx: int = 512,
n_batch: int = 512,
n_ubatch: int = 512,
n_threads: Optional[int] = None,
n_threads_batch: Optional[int] = None,
rope_scaling_type: Optional[int] = -1,
pooling_type: int = -1,
rope_freq_base: float = 0.0,
rope_freq_scale: float = 0.0,
yarn_ext_factor: float = -1.0,
yarn_attn_factor: float = 1.0,
yarn_beta_fast: float = 32.0,
yarn_beta_slow: float = 1.0,
yarn_orig_ctx: int = 0,
logits_all: bool = False,
embedding: bool = False,
offload_kqv: bool = True,
flash_attn: bool = False,
op_offload: Optional[bool] = None,
swa_full: Optional[bool] = None,
no_perf: bool = False,
last_n_tokens_size: int = 64,
lora_base: Optional[str] = None,
lora_scale: float = 1.0,
lora_path: Optional[str] = None,
numa: Union[bool, int] = False,
chat_format: Optional[str] = None,
chat_handler: Optional[object] = None,
draft_model: Optional[object] = None,
tokenizer: Optional[object] = None,
type_k: Optional[int] = None,
type_v: Optional[int] = None,
spm_infill: bool = False,
verbose: bool = True,
**kwargs
):
"""
Initialize a Llama model instance.
Args:
model_path: Path to the GGUF model file
n_gpu_layers: Number of layers to offload to GPU (0 = CPU only)
split_mode: GPU split mode (1 = layer-wise split)
main_gpu: Main GPU device ID for multi-GPU setups
tensor_split: List of GPU memory allocations for each device
vocab_only: Load vocabulary only, skip weights
use_mmap: Use memory mapping for model loading
use_mlock: Lock model in memory to prevent swapping
kv_overrides: Key-value metadata overrides for the model
seed: Random seed for sampling (default 0xFFFFFFFF, i.e. -1 as an unsigned 32-bit value, selects a random seed)
n_ctx: Context window size in tokens
n_batch: Batch size for processing
n_ubatch: Physical batch size (must be <= n_batch)
n_threads: Number of CPU threads for computation
n_threads_batch: Number of CPU threads for batch processing
rope_scaling_type: RoPE scaling method (-1 = auto)
pooling_type: Pooling method for embeddings (-1 = unspecified)
rope_freq_base: Base frequency for RoPE
rope_freq_scale: Frequency scaling factor for RoPE
yarn_ext_factor: YaRN extension factor
yarn_attn_factor: YaRN attention factor
yarn_beta_fast: YaRN beta fast parameter
yarn_beta_slow: YaRN beta slow parameter
yarn_orig_ctx: YaRN original context size
logits_all: Return logits for all tokens
embedding: Enable embedding mode
offload_kqv: Offload key/value cache to GPU
flash_attn: Use Flash Attention optimization
op_offload: Offload operations to GPU (auto-detect if None)
swa_full: Use full sliding window attention (auto-detect if None)
no_perf: Disable internal performance timing measurements
last_n_tokens_size: Size of last-n-tokens buffer for repetition penalty
lora_base: Path to LoRA base model
lora_scale: LoRA scaling factor
lora_path: Path to LoRA adapter
numa: NUMA optimization (False/True/strategy)
chat_format: Chat format template name
chat_handler: Custom chat completion handler
draft_model: Draft model for speculative decoding
tokenizer: Custom tokenizer instance
type_k: Key cache quantization type (None = auto)
type_v: Value cache quantization type (None = auto)
spm_infill: Enable SentencePiece infill mode
verbose: Enable verbose logging
"""
@classmethod
def from_pretrained(
cls,
repo_id: str,
filename: Optional[str] = None,
*,
additional_files: Optional[List[str]] = None,
local_dir: Optional[str] = None,
local_dir_use_symlinks: bool = True,
cache_dir: Optional[str] = None,
**kwargs
) -> "Llama":
"""
Create a Llama model instance from a Hugging Face Hub repository.
Args:
repo_id: Repository identifier on Hugging Face Hub
filename: Specific model file to download (auto-detected if None)
additional_files: Additional files to download (e.g., tokenizer files)
local_dir: Local directory to save files (uses cache if None)
local_dir_use_symlinks: Use symlinks in local directory
cache_dir: Cache directory for downloaded files
**kwargs: Additional arguments passed to Llama.__init__()
Returns:
Initialized Llama model instance
Raises:
ImportError: If huggingface-hub package is not installed
FileNotFoundError: If specified file is not found in repository
"""Generate text completions with fine-grained control over sampling parameters and output format, compatible with OpenAI completion API.
def create_completion(
self,
prompt: str,
suffix: Optional[str] = None,
max_tokens: Optional[int] = 16,
temperature: float = 0.8,
top_p: float = 0.95,
min_p: float = 0.05,
typical_p: float = 1.0,
logprobs: Optional[int] = None,
echo: bool = False,
stop: Optional[Union[str, List[str]]] = [],
frequency_penalty: float = 0.0,
presence_penalty: float = 0.0,
repeat_penalty: float = 1.0,
top_k: int = 40,
stream: bool = False,
seed: Optional[int] = None,
tfs_z: float = 1.0,
mirostat_mode: int = 0,
mirostat_tau: float = 5.0,
mirostat_eta: float = 0.1,
model: Optional[str] = None,
stopping_criteria: Optional[object] = None,
logits_processor: Optional[object] = None,
grammar: Optional[object] = None,
logit_bias: Optional[Dict[str, float]] = None,
**kwargs
) -> CreateCompletionResponse:
"""
Create a text completion.
Args:
prompt: Input text prompt
suffix: Text to append after completion
max_tokens: Maximum tokens to generate
temperature: Sampling temperature (0.0-2.0)
top_p: Nucleus sampling probability threshold
min_p: Minimum probability threshold
typical_p: Typical sampling parameter
logprobs: Number of log probabilities to return
echo: Include prompt in response
stop: Stop sequences (string or list)
frequency_penalty: Frequency penalty (-2.0 to 2.0)
presence_penalty: Presence penalty (-2.0 to 2.0)
repeat_penalty: Repetition penalty multiplier
top_k: Top-k sampling parameter
stream: Enable streaming response
seed: Random seed for sampling
tfs_z: Tail-free sampling parameter
mirostat_mode: Mirostat sampling mode (0/1/2)
mirostat_tau: Mirostat target entropy
mirostat_eta: Mirostat learning rate
model: Model name for response metadata
stopping_criteria: Custom stopping criteria
logits_processor: Custom logits processor
grammar: Grammar constraints
logit_bias: Token bias adjustments
Returns:
Completion response with generated text and metadata
"""Generate dense vector representations of text for semantic similarity, clustering, and retrieval applications.
def create_embedding(
self,
input: Union[str, List[str]],
model: Optional[str] = None,
encoding_format: str = "float",
**kwargs
) -> CreateEmbeddingResponse:
"""
Create text embeddings.
Args:
input: Text string or list of strings to embed
model: Model name for response metadata
encoding_format: Output format ("float" or "base64")
Returns:
Embedding response with vector representations
"""
def embed(
self,
input: str,
normalize: bool = True
) -> List[float]:
"""
Generate embeddings for a single text input.
Args:
input: Text to embed
normalize: Normalize embedding vector to unit length
Returns:
List of embedding values
"""Convert between text and token representations using the model's native tokenizer.
def tokenize(
self,
text: str,
add_bos: bool = True,
special: bool = False
) -> List[int]:
"""
Convert text to token IDs.
Args:
text: Input text to tokenize
add_bos: Add beginning-of-sequence token
special: Allow special tokens in output
Returns:
List of token IDs
"""
def detokenize(
self,
tokens: List[int],
decode: bool = True
) -> str:
"""
Convert token IDs to text.
Args:
tokens: List of token IDs
decode: Decode bytes to string
Returns:
Decoded text string
"""Save and restore model context states for efficient caching and continuation of conversations.
def save_state(self) -> LlamaState:
"""
Save current model state.
Returns:
Serializable state object
"""
def load_state(self, state: LlamaState) -> None:
"""
Load previously saved model state.
Args:
state: State object from save_state()
"""
def reset(self) -> None:
"""
Reset model context to initial state.
"""Access model metadata and configuration settings.
@property
def n_ctx(self) -> int:
"""Context window size in tokens."""
@property
def n_embd(self) -> int:
"""Model embedding dimensions."""
@property
def n_vocab(self) -> int:
"""Vocabulary size."""
@property
def tokenizer(self) -> object:
"""Tokenizer instance."""
@property
def token_eos(self) -> int:
"""End-of-sequence token ID."""
@property
def token_bos(self) -> int:
"""Beginning-of-sequence token ID."""
@property
def token_nl(self) -> int:
"""Newline token ID."""
def set_seed(self, seed: int) -> None:
"""
Set random seed for sampling.
Args:
seed: Random seed value
"""
def set_cache(self, cache: object) -> None:
"""
Set caching implementation.
Args:
cache: Cache instance (LlamaRAMCache or LlamaDiskCache)
"""Direct token-level generation and sampling for advanced use cases.
def eval(self, tokens: List[int]) -> None:
"""
Evaluate tokens and update model context.
Args:
tokens: Token sequence to evaluate
"""
def sample(
self,
top_k: int = 40,
top_p: float = 0.95,
min_p: float = 0.05,
typical_p: float = 1.0,
temp: float = 0.80,
repeat_penalty: float = 1.0,
frequency_penalty: float = 0.0,
presence_penalty: float = 0.0,
tfs_z: float = 1.0,
mirostat_mode: int = 0,
mirostat_tau: float = 5.0,
mirostat_eta: float = 0.1,
penalize_nl: bool = True,
logits_processor: Optional[object] = None,
grammar: Optional[object] = None
) -> int:
"""
Sample next token from current context.
Args:
top_k: Top-k sampling parameter
top_p: Top-p (nucleus) sampling parameter
min_p: Minimum probability threshold
typical_p: Typical sampling parameter
temp: Sampling temperature
repeat_penalty: Repetition penalty multiplier
frequency_penalty: Frequency penalty
presence_penalty: Presence penalty
tfs_z: Tail-free sampling parameter
mirostat_mode: Mirostat sampling mode
mirostat_tau: Mirostat target entropy
mirostat_eta: Mirostat learning rate
penalize_nl: Apply penalty to newline tokens
logits_processor: Custom logits processor
grammar: Grammar constraints
Returns:
Sampled token ID
"""
def generate(
self,
tokens: List[int],
top_k: int = 40,
top_p: float = 0.95,
min_p: float = 0.05,
typical_p: float = 1.0,
temp: float = 0.80,
repeat_penalty: float = 1.0,
reset: bool = True,
frequency_penalty: float = 0.0,
presence_penalty: float = 0.0,
tfs_z: float = 1.0,
mirostat_mode: int = 0,
mirostat_tau: float = 5.0,
mirostat_eta: float = 0.1,
stopping_criteria: Optional[object] = None,
logits_processor: Optional[object] = None,
grammar: Optional[object] = None
) -> Generator[int, None, None]:
"""
Generate token sequence.
Args:
tokens: Initial token sequence
top_k: Top-k sampling parameter
top_p: Top-p sampling parameter
min_p: Minimum probability threshold
typical_p: Typical sampling parameter
temp: Temperature
repeat_penalty: Repetition penalty
reset: Reset context before generation
frequency_penalty: Frequency penalty
presence_penalty: Presence penalty
tfs_z: Tail-free sampling parameter
mirostat_mode: Mirostat mode
mirostat_tau: Mirostat tau
mirostat_eta: Mirostat eta
stopping_criteria: Custom stopping criteria
logits_processor: Custom logits processor
grammar: Grammar constraints
Yields:
Generated token IDs
"""class LlamaState:
"""Serializable model state for persistence."""
def __init__(self, llama_state): ...
# Logits processing
class LogitsProcessor:
"""Base class for logits processing."""
def __call__(self, input_ids: List[int], scores: List[float]) -> List[float]: ...
class LogitsProcessorList:
"""List of logits processors."""
def __init__(self, processors: List[LogitsProcessor]): ...
def __call__(self, input_ids: List[int], scores: List[float]) -> List[float]: ...
class MinTokensLogitsProcessor(LogitsProcessor):
"""Ensures minimum number of tokens are generated."""
def __init__(self, min_tokens: int, eos_token_id: int): ...
# Stopping criteria
class StoppingCriteria:
"""Base class for stopping criteria."""
def __call__(self, input_ids: List[int], scores: List[float]) -> bool: ...
class StoppingCriteriaList:
"""List of stopping criteria."""
def __init__(self, criteria: List[StoppingCriteria]): ...
def __call__(self, input_ids: List[int], scores: List[float]) -> bool: ...

from llama_cpp import Llama
# Load model with basic configuration
llm = Llama(
model_path="./models/llama-2-7b-chat.gguf",
n_ctx=2048,
n_threads=8,
)
# Simple text completion
response = llm.create_completion(
prompt="The future of artificial intelligence is",
max_tokens=50,
temperature=0.7,
)
print(response['choices'][0]['text'])

# Offload layers to GPU for faster inference
llm = Llama(
model_path="./models/llama-2-13b-chat.gguf",
n_gpu_layers=35, # Offload most layers to GPU
n_ctx=4096,
    flash_attn=True,     # Use Flash Attention optimization
)

# Save and restore conversation state
llm = Llama(model_path="./model.gguf")
# Generate some text
llm.create_completion(prompt="Hello, my name is")
# Save current state
state = llm.save_state()
# Continue conversation
llm.create_completion(prompt=" and I like")
# Restore to previous state
llm.load_state(state)

# Fine-tune generation with advanced sampling
response = llm.create_completion(
prompt="Write a creative story:",
max_tokens=200,
temperature=0.9, # High creativity
top_p=0.9, # Nucleus sampling
top_k=50, # Top-k sampling
repeat_penalty=1.15, # Reduce repetition
frequency_penalty=0.1,
presence_penalty=0.1,
)

Install with Tessl CLI
npx tessl i tessl/pypi-llama-cpp-python