Python bindings for the llama.cpp library, providing high-performance LLM inference with OpenAI-compatible APIs.

This module implements OpenAI-compatible chat completions with extensive formatting options, role-based conversations, function calling, and custom message templates for different model architectures. It generates contextual responses in multi-turn conversations with full OpenAI API compatibility.
def create_chat_completion(
    self,
    messages: List[dict],
    functions: Optional[List[dict]] = None,
    function_call: Optional[Union[str, dict]] = None,
    tools: Optional[List[dict]] = None,
    tool_choice: Optional[Union[str, dict]] = None,
    temperature: float = 0.2,
    top_p: float = 0.95,
    top_k: int = 40,
    min_p: float = 0.05,
    typical_p: float = 1.0,
    stream: bool = False,
    stop: Optional[Union[str, List[str]]] = None,
    seed: Optional[int] = None,
    response_format: Optional[dict] = None,
    max_tokens: Optional[int] = None,
    presence_penalty: float = 0.0,
    frequency_penalty: float = 0.0,
    repeat_penalty: float = 1.1,
    tfs_z: float = 1.0,
    mirostat_mode: int = 0,
    mirostat_tau: float = 5.0,
    mirostat_eta: float = 0.1,
    model: Optional[str] = None,
    logits_processor: Optional[object] = None,
    grammar: Optional[object] = None,
    logit_bias: Optional[Dict[str, float]] = None,
    **kwargs
) -> "CreateChatCompletionResponse":
    # NOTE: return annotation is a forward reference — CreateChatCompletionResponse
    # is declared later in this file, so it must be quoted.
    """
    Create a chat completion response.

    Args:
        messages: List of message objects with 'role' and 'content'.
        functions: Available functions for function calling (deprecated, use tools).
        function_call: Function call preference (deprecated, use tool_choice).
        tools: Available tools for the model to call.
        tool_choice: Tool usage preference ("none", "auto", or specific tool).
        temperature: Sampling temperature (0.0-2.0).
        top_p: Nucleus sampling threshold.
        top_k: Top-k sampling parameter.
        min_p: Minimum probability threshold.
        typical_p: Typical sampling parameter.
        stream: Enable streaming response.
        stop: Stop sequences.
        seed: Random seed.
        response_format: Output format specification.
        max_tokens: Maximum tokens to generate.
        presence_penalty: Presence penalty (-2.0 to 2.0).
        frequency_penalty: Frequency penalty (-2.0 to 2.0).
        repeat_penalty: Repetition penalty multiplier.
        tfs_z: Tail-free sampling parameter.
        mirostat_mode: Mirostat sampling mode.
        mirostat_tau: Mirostat target entropy.
        mirostat_eta: Mirostat learning rate.
        model: Model name for metadata.
        logits_processor: Custom logits processor.
        grammar: Grammar constraints.
        logit_bias: Token probability adjustments.

    Returns:
        Chat completion response with generated message.
    """

# Format conversations according to model-specific templates and requirements.
class Jinja2ChatFormatter:
    """Chat formatter that renders messages through a Jinja2 template."""

    def __init__(
        self,
        template: str,
        eos_token: str = "</s>",
        bos_token: str = "<s>",
        stop_token_ids: Optional[List[int]] = None,
        **kwargs
    ):
        """
        Initialize Jinja2-based chat formatter.

        Args:
            template: Jinja2 template string for message formatting.
            eos_token: End-of-sequence token.
            bos_token: Beginning-of-sequence token.
            stop_token_ids: List of token IDs that should stop generation.
        """

    def format_messages(self, messages: List[dict]) -> "ChatFormatterResponse":
        """
        Format messages according to the template.

        Args:
            messages: List of message dictionaries.

        Returns:
            Formatted response with prompt and stop sequences.
        """
class ChatFormatterResponse:
    """Container for a formatted chat prompt and its stop sequences."""

    def __init__(
        self,
        prompt: str,
        stop: Optional[List[str]] = None
    ):
        """
        Response container for formatted chat messages.

        Args:
            prompt: Formatted prompt text.
            stop: Stop sequences for generation.
        """
        self.prompt = prompt
        # Fixed: the assignment was garbled into `stopRegister and ...`
        # by the section heading fused onto this line.
        self.stop = stop

# Register and retrieve chat formatting handlers for different model types.
def get_chat_completion_handler(
    chat_format: str
) -> "LlamaChatCompletionHandler":
    """
    Get a registered chat completion handler by format name.

    Args:
        chat_format: Format identifier (e.g., "chatml", "llama-2", "mistral-instruct").

    Returns:
        Chat completion handler instance.
    """
def register_chat_completion_handler(
    chat_format: str,
    chat_handler: "LlamaChatCompletionHandler"
) -> None:
    """
    Register a new chat completion handler.

    Args:
        chat_format: Format identifier.
        chat_handler: Handler implementation.
    """
class LlamaChatCompletionHandlerRegistry:
    """Registry mapping chat format names to completion handler implementations."""

    def register_chat_completion_handler(
        self,
        chat_format: str,
        handler: "LlamaChatCompletionHandler"
    ) -> None: ...

    def get_chat_completion_handler(
        self,
        chat_format: str
    ) -> "LlamaChatCompletionHandler": ...

# Handle different message types and roles in conversations.
# Protocol definitions for chat completion handlers
class LlamaChatCompletionHandler:
    """Protocol for chat completion handlers.

    A handler is a callable taking the Llama instance and the message list,
    returning either a completed response dict or an iterator of stream chunks.
    """

    def __call__(
        self,
        llama: "Llama",
        messages: List[dict],
        **kwargs
    ) -> Union[dict, Iterator[dict]]: ...
class ChatFormatter:
    """Protocol for chat message formatters.

    A formatter is a callable that turns a message list into a
    ChatFormatterResponse (prompt text plus stop sequences).
    """

    def __call__(
        self,
        messages: List[dict],
        **kwargs
    ) -> "ChatFormatterResponse": ...

# Template constants for different model formats
# Jinja2 chat template strings, one per supported model format.
CHATML_CHAT_TEMPLATE: str
MISTRAL_INSTRUCT_CHAT_TEMPLATE: str
MIXTRAL_INSTRUCT_CHAT_TEMPLATE: str
LLAMA3_INSTRUCT_CHAT_TEMPLATE: str

# Associated end-of-sequence token constants
CHATML_EOS_TOKEN: str
MISTRAL_INSTRUCT_EOS_TOKEN: str
MIXTRAL_INSTRUCT_EOS_TOKEN: str
LLAMA3_INSTRUCT_EOS_TOKEN: str

# Associated beginning-of-sequence token constants
CHATML_BOS_TOKEN: str
MISTRAL_INSTRUCT_BOS_TOKEN: str
MIXTRAL_INSTRUCT_BOS_TOKEN: str
LLAMA3_INSTRUCT_BOS_TOKEN: str

# Message types for different roles
ChatCompletionRequestMessage = TypedDict('ChatCompletionRequestMessage', {
'role': str,
'content': Optional[str],
})
ChatCompletionRequestSystemMessage = TypedDict('ChatCompletionRequestSystemMessage', {
'role': Literal['system'],
'content': str,
'name': NotRequired[str],
})
ChatCompletionRequestUserMessage = TypedDict('ChatCompletionRequestUserMessage', {
'role': Literal['user'],
'content': str,
'name': NotRequired[str],
})
ChatCompletionRequestAssistantMessage = TypedDict('ChatCompletionRequestAssistantMessage', {
'role': Literal['assistant'],
'content': Optional[str],
'name': NotRequired[str],
'tool_calls': NotRequired[List[dict]],
'function_call': NotRequired[dict], # Deprecated
})
ChatCompletionRequestToolMessage = TypedDict('ChatCompletionRequestToolMessage', {
'role': Literal['tool'],
'content': str,
'tool_call_id': str,
})
ChatCompletionRequestFunctionMessage = TypedDict('ChatCompletionRequestFunctionMessage', {
'role': Literal['function'],
'content': str,
'name': str,
})
# Response types
CreateChatCompletionResponse = TypedDict('CreateChatCompletionResponse', {
'id': str,
'object': Literal['chat.completion'],
'created': int,
'model': str,
'choices': List["ChatCompletionResponseChoice"],
'usage': "CompletionUsage",
})
ChatCompletionResponseChoice = TypedDict('ChatCompletionResponseChoice', {
'index': int,
'message': "ChatCompletionResponseMessage",
'finish_reason': Optional[str],
'logprobs': Optional[dict],
})
ChatCompletionResponseMessage = TypedDict('ChatCompletionResponseMessage', {
'role': Literal['assistant'],
'content': Optional[str],
'function_call': NotRequired[dict],
'tool_calls': NotRequired[List[dict]],
})
# Streaming response types

class CreateChatCompletionStreamResponse(TypedDict):
    """One chunk of a streamed chat completion response."""
    id: str
    object: Literal['chat.completion.chunk']
    created: int
    model: str
    choices: List["ChatCompletionStreamResponseChoice"]


class ChatCompletionStreamResponseChoice(TypedDict):
    """One choice delta within a streamed chunk."""
    index: int
    delta: "ChatCompletionResponseMessage"
    finish_reason: Optional[str]
    logprobs: Optional[dict]
# Tool and function types

class ChatCompletionMessageToolCall(TypedDict):
    """A tool call emitted by the assistant."""
    id: str
    type: Literal['function']
    function: dict


class ChatCompletionTool(TypedDict):
    """A tool the model may call; currently only functions."""
    type: Literal['function']
    function: "ChatCompletionFunction"


class ChatCompletionFunction(TypedDict):
    """A callable function description (name, docs, JSON-schema parameters)."""
    name: str
    description: Optional[str]
    parameters: dict
# Response format specification
ChatCompletionRequestResponseFormat = TypedDict('ChatCompletionRequestResponseFormat', {
    # "text" for plain output, "json_object" to constrain output to JSON.
    'type': Literal['text', 'json_object'],
})

from llama_cpp import Llama
# --- Example: basic chat completion ----------------------------------------
llm = Llama(
    model_path="./models/llama-2-7b-chat.gguf",
    chat_format="llama-2"
)

messages = [
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": "Hello! Can you help me with Python?"},
]

response = llm.create_chat_completion(
    messages=messages,
    max_tokens=150,
    temperature=0.7,
)
print(response['choices'][0]['message']['content'])

# --- Example: multi-turn conversation --------------------------------------
messages = [
    {"role": "system", "content": "You are a coding tutor."},
    {"role": "user", "content": "How do I create a list in Python?"},
    {"role": "assistant", "content": "You can create a list using square brackets: my_list = [1, 2, 3]"},
    {"role": "user", "content": "How do I add items to it?"},
]

response = llm.create_chat_completion(
    messages=messages,
    max_tokens=100,
)

# Add assistant response to conversation
messages.append({
    "role": "assistant",
    "content": response['choices'][0]['message']['content']
})

# --- Example: function calling with tools ----------------------------------
tools = [
    {
        "type": "function",
        "function": {
            "name": "get_weather",
            "description": "Get current weather information",
            "parameters": {
                "type": "object",
                "properties": {
                    "location": {
                        "type": "string",
                        "description": "City name"
                    }
                },
                "required": ["location"]
            }
        }
    }
]

messages = [
    {"role": "user", "content": "What's the weather like in New York?"}
]

response = llm.create_chat_completion(
    messages=messages,
    tools=tools,
    tool_choice="auto",
)

# Check if model wants to call a function
if response['choices'][0]['message'].get('tool_calls'):
    tool_call = response['choices'][0]['message']['tool_calls'][0]
    print(f"Function: {tool_call['function']['name']}")
    print(f"Arguments: {tool_call['function']['arguments']}")

# --- Example: custom Jinja2 chat formatting --------------------------------
from llama_cpp.llama_chat_format import Jinja2ChatFormatter

# Create custom formatter
custom_template = """
{%- for message in messages %}
{%- if message['role'] == 'user' %}
User: {{ message['content'] }}
{%- elif message['role'] == 'assistant' %}
Assistant: {{ message['content'] }}
{%- elif message['role'] == 'system' %}
System: {{ message['content'] }}
{%- endif %}
{%- endfor %}
Assistant: """

formatter = Jinja2ChatFormatter(
    template=custom_template,
    eos_token="</s>",
    bos_token="<s>",
)

# Format messages manually
messages = [{"role": "user", "content": "Hello!"}]
formatted = formatter.format_messages(messages)
print(formatted.prompt)

# --- Example: streaming responses ------------------------------------------
messages = [
    {"role": "user", "content": "Write a short story about robots."}
]

stream = llm.create_chat_completion(
    messages=messages,
    max_tokens=200,
    stream=True,  # Enable streaming
)

# Process streaming response
for chunk in stream:
    if chunk['choices'][0]['delta'].get('content'):
        print(chunk['choices'][0]['delta']['content'], end='', flush=True)

# --- Example: JSON response format -----------------------------------------
# Request JSON response format
response = llm.create_chat_completion(
    messages=[
        {"role": "user", "content": "List 3 programming languages in JSON format"}
    ],
    response_format={"type": "json_object"},
    max_tokens=100,
)
print(response['choices'][0]['message']['content'])

# Install with Tessl CLI:
#   npx tessl i tessl/pypi-llama-cpp-python