Python bindings for the llama.cpp library providing high-performance LLM inference with OpenAI-compatible APIs.
npx @tessl/cli install tessl/pypi-llama-cpp-python@0.3.0

Python bindings for the llama.cpp library providing high-performance large language model inference with comprehensive APIs for text completion, chat, embeddings, and multimodal processing. Offers both high-level Python interfaces and low-level C bindings with OpenAI-compatible endpoints.
pip install llama-cpp-python

import llama_cpp

Common high-level imports:

from llama_cpp import Llama, LlamaGrammar, LlamaCache

OpenAI-compatible types:
from llama_cpp.llama_types import (
    CreateCompletionResponse,
    CreateChatCompletionResponse,
    CreateEmbeddingResponse
)

Basic usage:

from llama_cpp import Llama
# Initialize model
llm = Llama(
    model_path="./models/llama-model.gguf",
    n_ctx=2048,     # Context window
    n_threads=8,    # CPU threads
)

# Generate text completion
output = llm.create_completion(
    prompt="The capital of France is",
    max_tokens=32,
    temperature=0.7,
    top_p=0.9,
)
print(output['choices'][0]['text'])

# Create chat completion
messages = [
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": "Hello! How are you?"}
]
response = llm.create_chat_completion(
    messages=messages,
    max_tokens=100,
    temperature=0.7,
)
print(response['choices'][0]['message']['content'])

# Generate embeddings (the model must be loaded with embedding=True for this)
embeddings = llm.create_embedding(
    input=["Hello world", "Python is great"],
)
print(embeddings['data'][0]['embedding'][:5])  # First 5 dimensions

The llama-cpp-python package provides multiple layers of abstraction:
the high-level Llama class offers convenient methods for common operations with sensible defaults, while the low-level ctypes bindings expose the underlying llama.cpp C API directly. The key capabilities are outlined below.
High-level model loading, text generation, and inference operations including completion, sampling, state management, and performance optimization.
class Llama:
    def __init__(self, model_path: str, **kwargs): ...
    def create_completion(self, prompt: str, **kwargs) -> CreateCompletionResponse: ...
    def create_chat_completion(self, messages: List[dict], **kwargs) -> CreateChatCompletionResponse: ...
    def create_embedding(self, input: Union[str, List[str]], **kwargs) -> CreateEmbeddingResponse: ...
    def tokenize(self, text: str, add_bos: bool = True, special: bool = False) -> List[int]: ...
    def detokenize(self, tokens: List[int], decode: bool = True) -> str: ...
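A minimal streaming sketch built on the methods above; the stream=True keyword and the per-chunk response shape are assumptions based on the OpenAI-style responses this API mirrors.

from llama_cpp import Llama

# Sketch: stream a completion token by token. stream=True is assumed to make
# create_completion yield OpenAI-style chunks with a 'text' field per choice.
llm = Llama(model_path="./models/llama-model.gguf", n_ctx=2048)

for chunk in llm.create_completion(
    prompt="The capital of France is",
    max_tokens=32,
    stream=True,    # yield partial results as they are generated
):
    print(chunk['choices'][0]['text'], end="", flush=True)
print()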
Chat Completions and Formatting

OpenAI-compatible chat completions with extensive formatting options, role-based conversations, function calling, and custom message templates for different model types.

def get_chat_completion_handler(chat_format: str) -> LlamaChatCompletionHandler: ...
def register_chat_completion_handler(chat_format: str, chat_handler: LlamaChatCompletionHandler): ...
class Jinja2ChatFormatter:
    def __init__(self, template: str, **kwargs): ...
    def format_messages(self, messages: List[dict]) -> ChatFormatterResponse: ...
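As a sketch, the chat format can be selected when the model is constructed and the reply streamed back; the chat_format and stream keywords, the "chatml" format name, and the per-chunk 'delta' field are assumptions rather than part of the stubs above.

from llama_cpp import Llama

# Sketch: pick a chat template at load time and stream the reply. Models that
# ship their own template in GGUF metadata may not need chat_format at all.
llm = Llama(model_path="./models/llama-model.gguf", chat_format="chatml")

stream = llm.create_chat_completion(
    messages=[
        {"role": "system", "content": "You are a terse assistant."},
        {"role": "user", "content": "Name three prime numbers."},
    ],
    stream=True,
)
for chunk in stream:
    delta = chunk['choices'][0]['delta']    # incremental piece of the message
    print(delta.get('content', ''), end="", flush=True)
print()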
Native llama.cpp tokenization and HuggingFace tokenizer integration with support for different vocabulary types, encoding/decoding, and model-specific preprocessing.
class LlamaTokenizer:
    def tokenize(self, text: str, add_bos: bool = True, special: bool = False) -> List[int]: ...
    def detokenize(self, tokens: List[int], decode: bool = True) -> str: ...

    @classmethod
    def from_ggml_file(cls, path: str) -> "LlamaTokenizer": ...

class LlamaHFTokenizer:
    @classmethod
    def from_pretrained(cls, pretrained_model_name_or_path: str) -> "LlamaHFTokenizer": ...
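A small round-trip sketch following the tokenize/detokenize signatures above; the vocab_only flag (to skip loading the full weights) is an assumption, and some versions operate on bytes rather than str for the text arguments.

from llama_cpp import Llama

# Sketch: tokenize text and decode it back. Token ids depend entirely on the
# model's vocabulary.
llm = Llama(model_path="./models/llama-model.gguf", vocab_only=True)

tokens = llm.tokenize(b"Hello world", add_bos=True)
print(tokens)                   # list of ints, model-specific
print(llm.detokenize(tokens))   # round-trips back to the original text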
Memory and disk-based caching systems for model states, context, and computed results to improve inference performance and enable state persistence.

class LlamaRAMCache:
    def __init__(self, capacity_bytes: int = 2 << 30): ...

class LlamaDiskCache:
    def __init__(self, cache_dir: str = ".cache/llama_cpp"): ...
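A sketch of attaching a cache to a model so completions that share a prompt prefix can reuse previously computed state; set_cache is the assumed attachment point on the Llama instance.

from llama_cpp import Llama, LlamaRAMCache

# Sketch: attach an in-RAM cache (capacity_bytes of 2 << 30 is roughly 2 GiB).
llm = Llama(model_path="./models/llama-model.gguf", n_ctx=2048)
llm.set_cache(LlamaRAMCache(capacity_bytes=2 << 30))

llm.create_completion(prompt="Q: What is the capital of France?\nA:", max_tokens=16)
# A later call sharing the same prefix can skip re-evaluating those tokens.
llm.create_completion(prompt="Q: What is the capital of France?\nA: Paris.\nQ:", max_tokens=16)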
Grammar and Structured Generation

Constrained text generation using formal grammars (GBNF), JSON Schema validation, and built-in templates for structured outputs like JSON, code, and domain-specific formats.

class LlamaGrammar:
    @classmethod
    def from_string(cls, grammar_str: str, verbose: bool = True) -> "LlamaGrammar": ...

    @classmethod
    def from_json_schema(cls, schema: dict, verbose: bool = True) -> "LlamaGrammar": ...

def json_schema_to_gbnf(schema: dict, **kwargs) -> str: ...
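A sketch of schema-constrained generation using from_json_schema from the stub above; the grammar= keyword on create_completion is an assumption, and whether the schema is accepted as a dict or as a JSON string may vary by version (json.dumps covers the latter).

import json
from llama_cpp import Llama, LlamaGrammar

# Sketch: constrain the output to a JSON object matching a schema.
schema = {
    "type": "object",
    "properties": {"city": {"type": "string"}, "country": {"type": "string"}},
    "required": ["city", "country"],
}
grammar = LlamaGrammar.from_json_schema(json.dumps(schema))

llm = Llama(model_path="./models/llama-model.gguf")
out = llm.create_completion(
    prompt="Return the capital of France as JSON: ",
    max_tokens=64,
    grammar=grammar,    # assumed keyword for passing the compiled grammar
)
print(json.loads(out['choices'][0]['text']))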
LLaVA vision model integration for processing images alongside text, supporting various image formats and multimodal conversation flows.
def llava_image_embed_make_with_filename(ctx_clip, image_path: str): ...
def llava_image_embed_make_with_bytes(ctx_clip, image_bytes: bytes, image_bytes_length: int): ...
def llava_validate_embed_size(n_embd: int, n_image_embd: int) -> bool: ...
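In practice multimodal chat usually goes through a chat handler rather than the raw llava_* functions; in this sketch the Llava15ChatHandler name, the clip_model_path and chat_handler arguments, and the image_url message shape are assumptions about that higher-level wrapper.

from llama_cpp import Llama
from llama_cpp.llama_chat_format import Llava15ChatHandler

# Sketch: pair a LLaVA language model with its CLIP projector and send an
# image plus a text question in one user message.
chat_handler = Llava15ChatHandler(clip_model_path="./models/mmproj.gguf")
llm = Llama(
    model_path="./models/llava-model.gguf",
    chat_handler=chat_handler,
    n_ctx=4096,    # leave room for the image embeddings
)

response = llm.create_chat_completion(
    messages=[
        {"role": "user", "content": [
            {"type": "image_url", "image_url": {"url": "file:///path/to/image.png"}},
            {"type": "text", "text": "Describe this image in one sentence."},
        ]},
    ],
)
print(response['choices'][0]['message']['content'])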
FastAPI-based web server with OpenAI-compatible endpoints, settings management, and multi-model configuration support for production deployments.

class ModelSettings:
    model: str
    n_ctx: int = 2048
    temperature: float = 0.7

class ServerSettings:
    host: str = "127.0.0.1"
    port: int = 8000
    interrupt_requests: bool = True
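A sketch of running the server and querying it over HTTP; the launch command is shown as a comment, the /v1/chat/completions route follows the OpenAI convention the server mirrors, and the "model" field value is an assumption about how the served model is named. The requests library is used here purely for illustration.

# Sketch: start the OpenAI-compatible server, then query it over plain HTTP.
#   python -m llama_cpp.server --model ./models/llama-model.gguf --port 8000
import requests

resp = requests.post(
    "http://127.0.0.1:8000/v1/chat/completions",
    json={
        "model": "llama-model",
        "messages": [{"role": "user", "content": "Hello!"}],
        "max_tokens": 50,
    },
    timeout=60,
)
print(resp.json()["choices"][0]["message"]["content"])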
Direct access to llama.cpp C functions through ctypes bindings, providing maximum control over model loading, context management, and backend operations.

def llama_model_load_from_file(path_model: bytes, params) -> llama_model_p: ...
def llama_new_context_with_model(model: llama_model_p, params) -> llama_context_p: ...
def llama_backend_init() -> None: ...
def llama_backend_free() -> None: ...
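A minimal lifecycle sketch with the low-level bindings listed above; the *_default_params helpers and the llama_free/llama_free_model teardown calls are assumptions that these bindings mirror their llama.cpp counterparts one-to-one.

import llama_cpp

# Sketch: init backend, load a model, create a context, then free everything.
llama_cpp.llama_backend_init()

model_params = llama_cpp.llama_model_default_params()
model = llama_cpp.llama_model_load_from_file(b"./models/llama-model.gguf", model_params)

ctx_params = llama_cpp.llama_context_default_params()
ctx_params.n_ctx = 2048
ctx = llama_cpp.llama_new_context_with_model(model, ctx_params)

# ... drive tokenization and decoding through the rest of the C API ...

llama_cpp.llama_free(ctx)
llama_cpp.llama_free_model(model)
llama_cpp.llama_backend_free()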
# Core response types
CreateCompletionResponse = TypedDict('CreateCompletionResponse', {
    'id': str,
    'object': str,
    'created': int,
    'model': str,
    'choices': List[CompletionChoice],
    'usage': CompletionUsage,
})

CreateChatCompletionResponse = TypedDict('CreateChatCompletionResponse', {
    'id': str,
    'object': str,
    'created': int,
    'model': str,
    'choices': List[ChatCompletionResponseChoice],
    'usage': CompletionUsage,
})

CreateEmbeddingResponse = TypedDict('CreateEmbeddingResponse', {
    'object': str,
    'data': List[Embedding],
    'model': str,
    'usage': EmbeddingUsage,
})
# Message types for chat
ChatCompletionRequestMessage = TypedDict('ChatCompletionRequestMessage', {
    'role': str,
    'content': Optional[str],
})

ChatCompletionRequestSystemMessage = TypedDict('ChatCompletionRequestSystemMessage', {
    'role': Literal['system'],
    'content': str,
})

ChatCompletionRequestUserMessage = TypedDict('ChatCompletionRequestUserMessage', {
    'role': Literal['user'],
    'content': str,
})

ChatCompletionRequestAssistantMessage = TypedDict('ChatCompletionRequestAssistantMessage', {
    'role': Literal['assistant'],
    'content': Optional[str],
})
# JSON serializable type
JsonType = Union[None, int, float, str, bool, List['JsonType'], Dict[str, 'JsonType']]
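Because the response types are TypedDicts, they can annotate application code directly; the helper function below is only a sketch of that usage, not part of the package API.

from llama_cpp import Llama
from llama_cpp.llama_types import CreateChatCompletionResponse

def ask(llm: Llama, question: str) -> str:
    # Non-streaming calls return a plain dict, so TypedDict annotations keep
    # static type checkers aware of the available keys.
    response: CreateChatCompletionResponse = llm.create_chat_completion(
        messages=[{"role": "user", "content": question}],
        max_tokens=100,
    )
    return response['choices'][0]['message']['content'] or ""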