Python bindings for llama.cpp enabling efficient local language model inference without external API dependencies
—
Core functionality for loading models, generating text, and managing model state. The Model class provides the primary interface for interacting with GGML language models through both streaming and batch generation methods.
Initialize and configure a language model instance with extensive customization options for context size, GPU utilization, and model behavior.
class Model:
def __init__(
self,
model_path: str,
prompt_context: str = '',
prompt_prefix: str = '',
prompt_suffix: str = '',
log_level: int = logging.ERROR,
n_ctx: int = 512,
seed: int = 0,
n_gpu_layers: int = 0,
f16_kv: bool = False,
logits_all: bool = False,
vocab_only: bool = False,
use_mlock: bool = False,
embedding: bool = False
):
"""
Initialize a Model instance.
Parameters:
- model_path: str, path to the GGML model file
- prompt_context: str, global context for all interactions
- prompt_prefix: str, prefix added to each prompt
- prompt_suffix: str, suffix added to each prompt
- log_level: int, logging level (default: logging.ERROR)
- n_ctx: int, context window size in tokens (default: 512)
- seed: int, random seed for generation (default: 0)
- n_gpu_layers: int, number of layers to offload to GPU (default: 0)
- f16_kv: bool, use fp16 for key/value cache (default: False)
- logits_all: bool, compute all logits, not just last token (default: False)
- vocab_only: bool, only load vocabulary, no weights (default: False)
- use_mlock: bool, force system to keep model in RAM (default: False)
- embedding: bool, enable embedding mode (default: False)
"""

Example usage:
from pyllamacpp.model import Model
# Basic model loading
model = Model(model_path='./models/llama-7b.ggml')
# Advanced configuration
model = Model(
model_path='./models/llama-13b.ggml',
n_ctx=2048,
n_gpu_layers=32,
f16_kv=True,
prompt_context="You are a helpful AI assistant.",
prompt_prefix="\n\nHuman: ",
prompt_suffix="\n\nAssistant: "
)

Generate text tokens iteratively using a generator pattern, allowing real-time display of generated text with extensive parameter control for sampling strategies.
def generate(
self,
prompt: str,
n_predict: Union[None, int] = None,
n_threads: int = 4,
seed: Union[None, int] = None,
antiprompt: Union[None, str] = None,
n_batch: int = 512,
n_keep: int = 0,
top_k: int = 40,
top_p: float = 0.95,
tfs_z: float = 1.00,
typical_p: float = 1.00,
temp: float = 0.8,
repeat_penalty: float = 1.10,
repeat_last_n: int = 64,
frequency_penalty: float = 0.00,
presence_penalty: float = 0.00,
mirostat: int = 0,
mirostat_tau: float = 5.00,
mirostat_eta: float = 0.1,
infinite_generation: bool = False
) -> Generator:
"""
Generate text tokens iteratively.
Parameters:
- prompt: str, input prompt for generation
- n_predict: int or None, max tokens to generate (None for until EOS)
- n_threads: int, CPU threads to use (default: 4)
- seed: int or None, random seed (None for time-based seed)
- antiprompt: str or None, stop word that halts generation when encountered (default: None)
- n_batch: int, batch size for prompt processing (default: 512)
- n_keep: int, tokens to keep from initial prompt (default: 0)
- top_k: int, top-k sampling parameter (default: 40)
- top_p: float, top-p sampling parameter (default: 0.95)
- tfs_z: float, tail free sampling parameter (default: 1.00)
- typical_p: float, typical sampling parameter (default: 1.00)
- temp: float, temperature for sampling (default: 0.8)
- repeat_penalty: float, repetition penalty (default: 1.10)
- repeat_last_n: int, last n tokens to penalize (default: 64)
- frequency_penalty: float, frequency penalty (default: 0.00)
- presence_penalty: float, presence penalty (default: 0.00)
- mirostat: int, mirostat algorithm (0=disabled, 1=v1, 2=v2)
- mirostat_tau: float, mirostat target entropy (default: 5.00)
- mirostat_eta: float, mirostat learning rate (default: 0.1)
- infinite_generation: bool, generate infinitely (default: False)
Yields:
str: Individual tokens as they are generated
"""

Example usage:
# Basic streaming generation
for token in model.generate("What is machine learning?"):
print(token, end='', flush=True)
# Advanced parameter control
for token in model.generate(
"Explain quantum computing",
n_predict=200,
temp=0.7,
top_p=0.9,
repeat_penalty=1.15,
antiprompt="Human:"
):
    print(token, end='', flush=True)

Generate complete text responses using llama.cpp's native generation function with callback support for monitoring generation progress.
def cpp_generate(
self,
prompt: str,
n_predict: int = 128,
new_text_callback: Callable[[bytes], None] = None,
n_threads: int = 4,
top_k: int = 40,
top_p: float = 0.95,
tfs_z: float = 1.00,
typical_p: float = 1.00,
temp: float = 0.8,
repeat_penalty: float = 1.10,
repeat_last_n: int = 64,
frequency_penalty: float = 0.00,
presence_penalty: float = 0.00,
mirostat: int = 0,
mirostat_tau: float = 5.00,
mirostat_eta: float = 0.1,
n_batch: int = 8,
n_keep: int = 0,
interactive: bool = False,
antiprompt: List = [],
instruct: bool = False,
verbose_prompt: bool = False
) -> str:
"""
Generate text using llama.cpp's native generation function.
Parameters:
- prompt: str, input prompt
- n_predict: int, number of tokens to generate (default: 128)
- new_text_callback: callable, callback for new text generation
- n_threads: int, CPU threads (default: 4)
- top_k: int, top-k sampling (default: 40)
- top_p: float, top-p sampling (default: 0.95)
- tfs_z: float, tail free sampling (default: 1.00)
- typical_p: float, typical sampling (default: 1.00)
- temp: float, temperature (default: 0.8)
- repeat_penalty: float, repetition penalty (default: 1.10)
- repeat_last_n: int, penalty window (default: 64)
- frequency_penalty: float, frequency penalty (default: 0.00)
- presence_penalty: float, presence penalty (default: 0.00)
- mirostat: int, mirostat mode (default: 0)
- mirostat_tau: float, mirostat tau (default: 5.00)
- mirostat_eta: float, mirostat eta (default: 0.1)
- n_batch: int, batch size (default: 8)
- n_keep: int, tokens to keep (default: 0)
- interactive: bool, interactive mode (default: False)
- antiprompt: list, stop phrases (default: [])
- instruct: bool, instruction mode (default: False)
- verbose_prompt: bool, verbose prompting (default: False)
Returns:
str: Complete generated text
"""

Example usage:
# Basic batch generation
response = model.cpp_generate("Describe the solar system", n_predict=200)
print(response)
# With callback for progress monitoring
def progress_callback(text):
print("Generated:", text.decode('utf-8'), end='')
response = model.cpp_generate(
"Write a short poem",
n_predict=100,
new_text_callback=progress_callback,
temp=0.9
)

Convert between text and token representations, essential for understanding model input processing and implementing custom text handling.
def tokenize(self, text: str):
"""
Convert text to list of tokens.
Parameters:
- text: str, text to tokenize
Returns:
list: List of token integers
"""
def detokenize(self, tokens: list):
"""
Convert tokens back to text.
Parameters:
- tokens: list or array, token integers
Returns:
str: Decoded text string
"""

Example usage:
# Tokenize text
tokens = model.tokenize("Hello, world!")
print(f"Tokens: {tokens}")
# Convert back to text
text = model.detokenize(tokens)
print(f"Text: {text}")
# Analyze token count
prompt = "This is a test prompt for token counting"
token_count = len(model.tokenize(prompt))
print(f"Token count: {token_count}")

Reset and manage the model's conversational context, essential for multi-turn conversations and context window management.
def reset(self) -> None:
"""
Reset the model context and token history.
Clears conversation history and resets internal state
to initial conditions, useful for starting fresh conversations
or managing context window limitations.
"""

Example usage:
# Use model for one conversation
model.generate("Hello, how are you?")
# Reset for fresh conversation
model.reset()
# Start new conversation with clean context
model.generate("What's the weather like?")

Access performance metrics and system information for optimization and debugging purposes.
def llama_print_timings(self):
"""Print detailed performance timing information."""
@staticmethod
def llama_print_system_info():
"""Print system information relevant to model execution."""
@staticmethod
def get_params(params) -> dict:
"""
Convert parameter object to dictionary representation.
Parameters:
- params: parameter object
Returns:
dict: Dictionary representation of parameters
"""

Example usage:
# Print system information
Model.llama_print_system_info()
# Generate text and check performance
model.generate("Test prompt")
model.llama_print_timings()
# Inspect model parameters
params_dict = Model.get_params(model.llama_params)
print(params_dict)

Install with Tessl CLI
npx tessl i tessl/pypi-pyllamacpp