CtrlK
Blog · Docs · Log in · Get started
Tessl Logo

tessl/pypi-llama-cpp-python

Python bindings for the llama.cpp library providing high-performance LLM inference with OpenAI-compatible APIs.

Pending
Overview
Eval results
Files

docs/llama-model.md

Core Model and Inference

High-level model loading, text generation, and inference operations providing the primary interface for llama.cpp functionality through the Llama class.

Capabilities

Model Initialization

Load and configure language models with comprehensive parameter control for performance optimization and hardware acceleration.

class Llama:
    def __init__(
        self,
        model_path: str,
        *,
        n_gpu_layers: int = 0,
        split_mode: int = 1,
        main_gpu: int = 0,
        tensor_split: Optional[List[float]] = None,
        vocab_only: bool = False,
        use_mmap: bool = True,
        use_mlock: bool = False,
        kv_overrides: Optional[Dict[str, Union[bool, int, float, str]]] = None,
        seed: int = 0xFFFFFFFF,
        n_ctx: int = 512,
        n_batch: int = 512,
        n_ubatch: int = 512,
        n_threads: Optional[int] = None,
        n_threads_batch: Optional[int] = None,
        rope_scaling_type: Optional[int] = -1,
        pooling_type: int = -1,
        rope_freq_base: float = 0.0,
        rope_freq_scale: float = 0.0,
        yarn_ext_factor: float = -1.0,
        yarn_attn_factor: float = 1.0,
        yarn_beta_fast: float = 32.0,
        yarn_beta_slow: float = 1.0,
        yarn_orig_ctx: int = 0,
        logits_all: bool = False,
        embedding: bool = False,
        offload_kqv: bool = True,
        flash_attn: bool = False,
        op_offload: Optional[bool] = None,
        swa_full: Optional[bool] = None,
        no_perf: bool = False,
        last_n_tokens_size: int = 64,
        lora_base: Optional[str] = None,
        lora_scale: float = 1.0,
        lora_path: Optional[str] = None,
        numa: Union[bool, int] = False,
        chat_format: Optional[str] = None,
        chat_handler: Optional[object] = None,
        draft_model: Optional[object] = None,
        tokenizer: Optional[object] = None,
        type_k: Optional[int] = None,
        type_v: Optional[int] = None,
        spm_infill: bool = False,
        verbose: bool = True,
        **kwargs
    ):
        """
        Initialize a Llama model instance.

        Args:
            model_path: Path to the GGUF model file
            n_gpu_layers: Number of layers to offload to GPU (0 = CPU only)
            split_mode: GPU split mode (1 = layer-wise split)
            main_gpu: Main GPU device ID for multi-GPU setups
            tensor_split: List of GPU memory allocations for each device
            vocab_only: Load vocabulary only, skip weights
            use_mmap: Use memory mapping for model loading
            use_mlock: Lock model in memory to prevent swapping
            kv_overrides: Key-value metadata overrides for the model
            seed: Random seed for sampling; the default 0xFFFFFFFF is -1 as an
                unsigned 32-bit value and requests a random seed
            n_ctx: Context window size in tokens
            n_batch: Batch size for processing
            n_ubatch: Physical batch size (must be <= n_batch)
            n_threads: Number of CPU threads for computation
            n_threads_batch: Number of CPU threads for batch processing
            rope_scaling_type: RoPE scaling method (-1 = auto)
            pooling_type: Pooling method for embeddings (-1 = unspecified)
            rope_freq_base: Base frequency for RoPE
            rope_freq_scale: Frequency scaling factor for RoPE
            yarn_ext_factor: YaRN extension factor
            yarn_attn_factor: YaRN attention factor
            yarn_beta_fast: YaRN beta fast parameter
            yarn_beta_slow: YaRN beta slow parameter
            yarn_orig_ctx: YaRN original context size
            logits_all: Return logits for all tokens
            embedding: Enable embedding mode
            offload_kqv: Offload key/value cache to GPU
            flash_attn: Use Flash Attention optimization
            op_offload: Offload operations to GPU (auto-detect if None)
            swa_full: Use full sliding window attention (auto-detect if None)
            no_perf: Disable collection of performance timing measurements
                (NOTE(review): upstream llama.cpp's no_perf disables timing
                stats, not model optimizations — confirm)
            last_n_tokens_size: Size of last-n-tokens buffer for repetition penalty
            lora_base: Path to LoRA base model
            lora_scale: LoRA scaling factor
            lora_path: Path to LoRA adapter
            numa: NUMA optimization (False/True/strategy)
            chat_format: Chat format template name
            chat_handler: Custom chat completion handler
            draft_model: Draft model for speculative decoding
            tokenizer: Custom tokenizer instance
            type_k: Key cache quantization type (None = auto)
            type_v: Value cache quantization type (None = auto)
            spm_infill: Enable SentencePiece infill mode
            verbose: Enable verbose logging
        """

    @classmethod
    def from_pretrained(
        cls,
        repo_id: str,
        filename: Optional[str] = None,
        *,
        additional_files: Optional[List[str]] = None,
        local_dir: Optional[str] = None,
        local_dir_use_symlinks: bool = True,
        cache_dir: Optional[str] = None,
        **kwargs
    ) -> "Llama":
        """
        Create a Llama model instance from a Hugging Face Hub repository.

        Args:
            repo_id: Repository identifier on Hugging Face Hub
            filename: Specific model file to download (auto-detected if None)
            additional_files: Additional files to download (e.g., tokenizer files)
            local_dir: Local directory to save files (uses cache if None)
            local_dir_use_symlinks: Use symlinks in local directory (passed
                through to the huggingface-hub download machinery)
            cache_dir: Cache directory for downloaded files
            **kwargs: Additional arguments passed to Llama.__init__()

        Returns:
            Initialized Llama model instance

        Raises:
            ImportError: If huggingface-hub package is not installed
            FileNotFoundError: If specified file is not found in repository
        """

Text Completion

Generate text completions with fine-grained control over sampling parameters and output format, compatible with OpenAI completion API.

def create_completion(
    self,
    prompt: str,
    suffix: Optional[str] = None,
    max_tokens: Optional[int] = 16,
    temperature: float = 0.8,
    top_p: float = 0.95,
    min_p: float = 0.05,
    typical_p: float = 1.0,
    logprobs: Optional[int] = None,
    echo: bool = False,
    stop: Optional[Union[str, List[str]]] = [],  # NOTE(review): mutable default mirrors the upstream llama-cpp-python signature; treated as read-only "no stop sequences"
    frequency_penalty: float = 0.0,
    presence_penalty: float = 0.0,
    repeat_penalty: float = 1.0,
    top_k: int = 40,
    stream: bool = False,
    seed: Optional[int] = None,
    tfs_z: float = 1.0,
    mirostat_mode: int = 0,
    mirostat_tau: float = 5.0,
    mirostat_eta: float = 0.1,
    model: Optional[str] = None,
    stopping_criteria: Optional[object] = None,
    logits_processor: Optional[object] = None,
    grammar: Optional[object] = None,
    logit_bias: Optional[Dict[str, float]] = None,
    **kwargs
) -> CreateCompletionResponse:
    """
    Create a text completion.

    Response shape is compatible with the OpenAI completion API.

    Args:
        prompt: Input text prompt
        suffix: Text to append after completion
        max_tokens: Maximum tokens to generate
        temperature: Sampling temperature (0.0-2.0)
        top_p: Nucleus sampling probability threshold
        min_p: Minimum probability threshold
        typical_p: Typical sampling parameter
        logprobs: Number of log probabilities to return
        echo: Include prompt in response
        stop: Stop sequences (single string or list; [] or None means no
            stop sequences)
        frequency_penalty: Frequency penalty (-2.0 to 2.0)
        presence_penalty: Presence penalty (-2.0 to 2.0)
        repeat_penalty: Repetition penalty multiplier
        top_k: Top-k sampling parameter
        stream: Enable streaming response (presumably yields chunks when True;
            the return annotation covers the non-streaming case — confirm)
        seed: Random seed for sampling
        tfs_z: Tail-free sampling parameter
        mirostat_mode: Mirostat sampling mode (0/1/2)
        mirostat_tau: Mirostat target entropy
        mirostat_eta: Mirostat learning rate
        model: Model name for response metadata
        stopping_criteria: Custom stopping criteria
        logits_processor: Custom logits processor
        grammar: Grammar constraints
        logit_bias: Token bias adjustments

    Returns:
        Completion response with generated text and metadata
    """

Embeddings

Generate dense vector representations of text for semantic similarity, clustering, and retrieval applications.

def create_embedding(
    self,
    input: Union[str, List[str]],  # shadows builtin input(); name kept for OpenAI API compatibility
    model: Optional[str] = None,
    encoding_format: str = "float",
    **kwargs
) -> CreateEmbeddingResponse:
    """
    Create text embeddings (OpenAI-compatible response shape).

    Args:
        input: Text string or list of strings to embed
        model: Model name for response metadata
        encoding_format: Output format ("float" for raw floats or "base64"
            for base64-encoded data)

    Returns:
        Embedding response with vector representations
    """

def embed(
    self,
    input: str,  # shadows builtin input(); name kept for symmetry with create_embedding
    normalize: bool = True
) -> List[float]:
    """
    Generate an embedding for a single text input.

    Args:
        input: Text to embed
        normalize: Normalize embedding vector to unit length

    Returns:
        List of embedding values (presumably of length n_embd — confirm)
    """

Tokenization

Convert between text and token representations using the model's native tokenizer.

def tokenize(
    self,
    text: str,
    add_bos: bool = True,
    special: bool = False
) -> List[int]:
    """
    Convert text to token IDs using the model's native tokenizer.

    Args:
        text: Input text to tokenize
        add_bos: Prepend the beginning-of-sequence (BOS) token
        special: Allow special/control tokens to appear in the output

    Returns:
        List of token IDs
    """

def detokenize(
    self,
    tokens: List[int],
    decode: bool = True
) -> str:
    """
    Convert token IDs back to text.

    Args:
        tokens: List of token IDs
        decode: Decode bytes to string (NOTE(review): the return annotation
            says str regardless; confirm what is returned when decode=False)

    Returns:
        Decoded text string
    """

State Management

Save and restore model context states for efficient caching and continuation of conversations.

def save_state(self) -> LlamaState:
    """
    Snapshot the current model context.

    Returns:
        Serializable LlamaState object suitable for load_state()
    """

def load_state(self, state: LlamaState) -> None:
    """
    Restore a previously saved model state.

    Args:
        state: State object produced by save_state()
    """

def reset(self) -> None:
    """
    Reset the model context to its initial state.
    """

Configuration and Properties

Access model metadata and configuration settings.

@property
def n_ctx(self) -> int:
    """Context window size, in tokens, of the active context."""

@property
def n_embd(self) -> int:
    """Dimensionality of the model's embedding vectors."""

@property
def n_vocab(self) -> int:
    """Number of tokens in the model's vocabulary."""

@property
def tokenizer(self) -> object:
    """Tokenizer instance used by this model."""

@property
def token_eos(self) -> int:
    """End-of-sequence (EOS) token ID."""

@property
def token_bos(self) -> int:
    """Beginning-of-sequence (BOS) token ID."""

@property
def token_nl(self) -> int:
    """Newline token ID."""

def set_seed(self, seed: int) -> None:
    """
    Set the random seed used for sampling.

    Args:
        seed: Random seed value
    """

def set_cache(self, cache: object) -> None:
    """
    Set the caching implementation used for prompt/state reuse.

    Args:
        cache: Cache instance (e.g. LlamaRAMCache or LlamaDiskCache)
    """

Low-Level Generation

Direct token-level generation and sampling for advanced use cases.

def eval(self, tokens: List[int]) -> None:
    """
    Evaluate tokens and update the model context.

    Note: intentionally shadows the builtin eval(); the name mirrors the
    upstream llama.cpp API.

    Args:
        tokens: Token sequence to evaluate
    """

def sample(
    self,
    top_k: int = 40,
    top_p: float = 0.95,
    min_p: float = 0.05,
    typical_p: float = 1.0,
    temp: float = 0.80,
    repeat_penalty: float = 1.0,
    frequency_penalty: float = 0.0,
    presence_penalty: float = 0.0,
    tfs_z: float = 1.0,
    mirostat_mode: int = 0,
    mirostat_tau: float = 5.0,
    mirostat_eta: float = 0.1,
    penalize_nl: bool = True,
    logits_processor: Optional[object] = None,
    grammar: Optional[object] = None
) -> int:
    """
    Sample the next token from the current context.

    Args:
        top_k: Top-k sampling parameter
        top_p: Top-p (nucleus) sampling parameter
        min_p: Minimum probability threshold
        typical_p: Typical sampling parameter
        temp: Sampling temperature
        repeat_penalty: Repetition penalty multiplier
        frequency_penalty: Frequency penalty
        presence_penalty: Presence penalty
        tfs_z: Tail-free sampling parameter
        mirostat_mode: Mirostat sampling mode
        mirostat_tau: Mirostat target entropy
        mirostat_eta: Mirostat learning rate
        penalize_nl: Apply the repetition penalty to newline tokens as well
        logits_processor: Custom logits processor
        grammar: Grammar constraints

    Returns:
        Sampled token ID
    """

def generate(
    self,
    tokens: List[int],
    top_k: int = 40,
    top_p: float = 0.95,
    min_p: float = 0.05,
    typical_p: float = 1.0,
    temp: float = 0.80,
    repeat_penalty: float = 1.0,
    reset: bool = True,
    frequency_penalty: float = 0.0,
    presence_penalty: float = 0.0,
    tfs_z: float = 1.0,
    mirostat_mode: int = 0,
    mirostat_tau: float = 5.0,
    mirostat_eta: float = 0.1,
    stopping_criteria: Optional[object] = None,
    logits_processor: Optional[object] = None,
    grammar: Optional[object] = None
) -> Generator[int, None, None]:
    """
    Generate a token sequence, yielding one token ID at a time.

    Args:
        tokens: Initial token sequence
        top_k: Top-k sampling parameter
        top_p: Top-p sampling parameter
        min_p: Minimum probability threshold
        typical_p: Typical sampling parameter
        temp: Temperature
        repeat_penalty: Repetition penalty
        reset: Reset the model context before generation starts
        frequency_penalty: Frequency penalty
        presence_penalty: Presence penalty
        tfs_z: Tail-free sampling parameter
        mirostat_mode: Mirostat mode
        mirostat_tau: Mirostat tau
        mirostat_eta: Mirostat eta
        stopping_criteria: Custom stopping criteria
        logits_processor: Custom logits processor
        grammar: Grammar constraints

    Yields:
        Generated token IDs
    """

Types

class LlamaState:
    """Serializable model state for persistence; produced by save_state() and consumed by load_state()."""

    def __init__(self, llama_state): ...  # llama_state: opaque low-level state payload

# Logits processing
class LogitsProcessor:
    """Base class for logits processors: callables mapping (input_ids, scores) to adjusted scores."""

    def __call__(self, input_ids: List[int], scores: List[float]) -> List[float]: ...

class LogitsProcessorList:
    """Container that applies a sequence of logits processors in order."""

    def __init__(self, processors: List[LogitsProcessor]): ...
    def __call__(self, input_ids: List[int], scores: List[float]) -> List[float]: ...

class MinTokensLogitsProcessor(LogitsProcessor):
    """Ensures a minimum number of tokens is generated (presumably by suppressing eos_token_id until min_tokens is reached — confirm)."""

    def __init__(self, min_tokens: int, eos_token_id: int): ...

# Stopping criteria
class StoppingCriteria:
    """Base class for stopping criteria: callables returning True when generation should stop."""

    def __call__(self, input_ids: List[int], scores: List[float]) -> bool: ...

class StoppingCriteriaList:
    """Container for multiple stopping criteria (presumably stops when any criterion fires — confirm)."""

    def __init__(self, criteria: List[StoppingCriteria]): ...
    def __call__(self, input_ids: List[int], scores: List[float]) -> bool: ...

Usage Examples

Basic Model Loading and Generation

from llama_cpp import Llama

# Load model with basic configuration
llm = Llama(
    model_path="./models/llama-2-7b-chat.gguf",
    n_ctx=2048,   # 2048-token context window
    n_threads=8,  # 8 CPU threads for computation
)

# Simple text completion
response = llm.create_completion(
    prompt="The future of artificial intelligence is",
    max_tokens=50,
    temperature=0.7,
)
# Response follows the OpenAI completion schema
print(response['choices'][0]['text'])

GPU Acceleration

# Offload layers to GPU for faster inference.
# NOTE: the deprecated f16_kv flag was removed from the Llama constructor;
# use offload_kqv (and type_k/type_v for cache quantization) instead.
llm = Llama(
    model_path="./models/llama-2-13b-chat.gguf",
    n_gpu_layers=35,   # Offload most layers to GPU
    n_ctx=4096,
    offload_kqv=True,  # Keep the key/value cache on the GPU as well
)

State Management

# Save and restore conversation state
llm = Llama(model_path="./model.gguf")

# Generate some text
llm.create_completion(prompt="Hello, my name is")

# Save current state (snapshot of the model context)
state = llm.save_state()

# Continue conversation
llm.create_completion(prompt=" and I like")

# Restore to the previous snapshot, discarding the continuation above
llm.load_state(state)

Custom Sampling Parameters

# Fine-tune generation with advanced sampling
response = llm.create_completion(
    prompt="Write a creative story:",
    max_tokens=200,
    temperature=0.9,      # High creativity
    top_p=0.9,           # Nucleus sampling
    top_k=50,            # Top-k sampling
    repeat_penalty=1.15,  # Reduce repetition
    frequency_penalty=0.1,  # Penalize tokens by how often they appear
    presence_penalty=0.1,   # Penalize tokens that have appeared at all
)

Install with Tessl CLI

npx tessl i tessl/pypi-llama-cpp-python

docs

caching.md

chat-completion.md

grammar.md

index.md

llama-model.md

low-level.md

server.md

tokenization.md

vision.md

tile.json