CtrlK
Blog · Docs · Log in · Get started
Tessl Logo

tessl/pypi-llama-cpp-python

Python bindings for the llama.cpp library providing high-performance LLM inference with OpenAI-compatible APIs.

Pending
Overview
Eval results
Files

docs/chat-completion.md

Chat Completions and Formatting

OpenAI-compatible chat completions with extensive formatting options, role-based conversations, function calling, and custom message templates for different model architectures.

Capabilities

Chat Completion

Generate contextual responses in multi-turn conversations with full OpenAI API compatibility.

def create_chat_completion(
    self,
    messages: List[dict],
    functions: Optional[List[dict]] = None,
    function_call: Optional[Union[str, dict]] = None,
    tools: Optional[List[dict]] = None,
    tool_choice: Optional[Union[str, dict]] = None,
    temperature: float = 0.2,
    top_p: float = 0.95,
    top_k: int = 40,
    min_p: float = 0.05,
    typical_p: float = 1.0,
    stream: bool = False,
    stop: Optional[Union[str, List[str]]] = None,
    seed: Optional[int] = None,
    response_format: Optional[dict] = None,
    max_tokens: Optional[int] = None,
    presence_penalty: float = 0.0,
    frequency_penalty: float = 0.0,
    repeat_penalty: float = 1.1,
    tfs_z: float = 1.0,
    mirostat_mode: int = 0,
    mirostat_tau: float = 5.0,
    mirostat_eta: float = 0.1,
    model: Optional[str] = None,
    logits_processor: Optional[object] = None,
    grammar: Optional[object] = None,
    logit_bias: Optional[Dict[str, float]] = None,
    **kwargs
) -> CreateChatCompletionResponse:
    """
    Create a chat completion response.

    Generates an assistant reply for a multi-turn conversation with
    OpenAI-compatible semantics: role-based messages, optional tool /
    function calling, sampling controls, and optional streaming.

    Args:
        messages: List of message objects with 'role' and 'content'
        functions: Available functions for function calling (deprecated, use tools)
        function_call: Function call preference (deprecated, use tool_choice)
        tools: Available tools for the model to call
        tool_choice: Tool usage preference ("none", "auto", or specific tool)
        temperature: Sampling temperature (0.0-2.0)
        top_p: Nucleus sampling threshold
        top_k: Top-k sampling parameter
        min_p: Minimum probability threshold
        typical_p: Typical sampling parameter
        stream: Enable streaming response
        stop: Stop sequences
        seed: Random seed
        response_format: Output format specification (e.g. {"type": "json_object"})
        max_tokens: Maximum tokens to generate
        presence_penalty: Presence penalty (-2.0 to 2.0)
        frequency_penalty: Frequency penalty (-2.0 to 2.0)
        repeat_penalty: Repetition penalty multiplier
        tfs_z: Tail-free sampling parameter
        mirostat_mode: Mirostat sampling mode
        mirostat_tau: Mirostat target entropy
        mirostat_eta: Mirostat learning rate
        model: Model name for metadata
        logits_processor: Custom logits processor
        grammar: Grammar constraints
        logit_bias: Token probability adjustments

    Returns:
        Chat completion response with generated message.
        NOTE(review): the usage examples iterate the result as
        CreateChatCompletionStreamResponse chunks when stream=True, so
        presumably an iterator is returned in that case — confirm against
        the implementation.
    """

Chat Formatting

Format conversations according to model-specific templates and requirements.

class Jinja2ChatFormatter:
    """Chat formatter driven by a user-supplied Jinja2 template.

    Renders a list of role/content message dicts into a single prompt
    string, suitable for models without a built-in registered format
    (see the "Custom Chat Format" usage example).
    """

    def __init__(
        self,
        template: str,
        eos_token: str = "</s>",
        bos_token: str = "<s>",
        stop_token_ids: Optional[List[int]] = None,
        **kwargs
    ):
        """
        Initialize Jinja2-based chat formatter.

        Args:
            template: Jinja2 template string for message formatting
            eos_token: End-of-sequence token
            bos_token: Beginning-of-sequence token
            stop_token_ids: List of token IDs that should stop generation
        """

    def format_messages(self, messages: List[dict]) -> "ChatFormatterResponse":
        """
        Format messages according to template.

        Args:
            messages: List of message dictionaries

        Returns:
            Formatted response with prompt and stop sequences
        """

class ChatFormatterResponse:
    """Result container produced by chat formatters.

    Attributes:
        prompt: The fully rendered prompt text.
        stop: Optional list of stop sequences for generation, or None.
    """

    def __init__(
        self,
        prompt: str,
        stop: Optional[List[str]] = None
    ):
        """
        Store the rendered prompt together with its stop sequences.

        Args:
            prompt: Formatted prompt text
            stop: Stop sequences for generation
        """
        # Plain data holder: keep exactly what the formatter produced.
        self.stop = stop
        self.prompt = prompt

Chat Format Management

Register and retrieve chat formatting handlers for different model types.

def get_chat_completion_handler(
    chat_format: str
) -> "LlamaChatCompletionHandler":
    """
    Get registered chat completion handler by format name.

    Handlers are added via register_chat_completion_handler.

    Args:
        chat_format: Format identifier (e.g., "chatml", "llama-2", "mistral-instruct")

    Returns:
        Chat completion handler instance
    """

def register_chat_completion_handler(
    chat_format: str,
    chat_handler: "LlamaChatCompletionHandler"
) -> None:
    """
    Register new chat completion handler.

    Makes the handler retrievable by get_chat_completion_handler
    under the given format identifier.

    Args:
        chat_format: Format identifier
        chat_handler: Handler implementation
    """

class LlamaChatCompletionHandlerRegistry:
    """Registry mapping chat-format names to completion handlers.

    Mirrors the module-level register/get functions as methods.
    """

    def register_chat_completion_handler(
        self, 
        chat_format: str, 
        handler: "LlamaChatCompletionHandler"
    ) -> None: ...
    
    def get_chat_completion_handler(
        self, 
        chat_format: str
    ) -> "LlamaChatCompletionHandler": ...

Message Processing

Handle different message types and roles in conversations.

# Protocol definitions for chat completion handlers
class LlamaChatCompletionHandler:
    """Protocol for chat completion handlers.

    A handler is a callable taking the model instance and the message
    list; it returns either a complete response dict or an iterator of
    chunk dicts (streaming).
    """
    
    def __call__(
        self,
        llama: "Llama",
        messages: List[dict],
        **kwargs
    ) -> Union[dict, Iterator[dict]]: ...

class ChatFormatter:
    """Protocol for chat message formatters.

    A formatter is a callable turning a message list into a
    ChatFormatterResponse (prompt text plus stop sequences).
    """
    
    def __call__(
        self,
        messages: List[dict],
        **kwargs
    ) -> ChatFormatterResponse: ...

Pre-defined Chat Templates

# Template constants for different model formats.
# Each is a Jinja2 template string (usable with Jinja2ChatFormatter).
CHATML_CHAT_TEMPLATE: str
MISTRAL_INSTRUCT_CHAT_TEMPLATE: str
MIXTRAL_INSTRUCT_CHAT_TEMPLATE: str
LLAMA3_INSTRUCT_CHAT_TEMPLATE: str

# Associated token constants.
# End-of-sequence tokens paired with each template above.
CHATML_EOS_TOKEN: str
MISTRAL_INSTRUCT_EOS_TOKEN: str
MIXTRAL_INSTRUCT_EOS_TOKEN: str
LLAMA3_INSTRUCT_EOS_TOKEN: str

# Beginning-of-sequence tokens paired with each template above.
CHATML_BOS_TOKEN: str
MISTRAL_INSTRUCT_BOS_TOKEN: str
MIXTRAL_INSTRUCT_BOS_TOKEN: str
LLAMA3_INSTRUCT_BOS_TOKEN: str

Types

# Message types for different roles

# Generic request message: any role, content may be absent (None).
ChatCompletionRequestMessage = TypedDict('ChatCompletionRequestMessage', {
    'role': str,
    'content': Optional[str],
})

# System instruction message; optional participant name.
ChatCompletionRequestSystemMessage = TypedDict('ChatCompletionRequestSystemMessage', {
    'role': Literal['system'],
    'content': str,
    'name': NotRequired[str],
})

# End-user message; optional participant name.
ChatCompletionRequestUserMessage = TypedDict('ChatCompletionRequestUserMessage', {
    'role': Literal['user'],
    'content': str,
    'name': NotRequired[str],
})

# Prior assistant turn; content may be None when the turn was a tool call.
ChatCompletionRequestAssistantMessage = TypedDict('ChatCompletionRequestAssistantMessage', {
    'role': Literal['assistant'],
    'content': Optional[str],
    'name': NotRequired[str],
    'tool_calls': NotRequired[List[dict]],
    'function_call': NotRequired[dict],  # Deprecated
})

# Result of a tool invocation, keyed back to the call via tool_call_id.
ChatCompletionRequestToolMessage = TypedDict('ChatCompletionRequestToolMessage', {
    'role': Literal['tool'],
    'content': str,
    'tool_call_id': str,
})

# Result of a (deprecated) function invocation, keyed by function name.
ChatCompletionRequestFunctionMessage = TypedDict('ChatCompletionRequestFunctionMessage', {
    'role': Literal['function'],
    'content': str,
    'name': str,
})

# Response types

# Top-level non-streaming completion response.
CreateChatCompletionResponse = TypedDict('CreateChatCompletionResponse', {
    'id': str,
    'object': Literal['chat.completion'],
    'created': int,
    'model': str,
    'choices': List["ChatCompletionResponseChoice"],
    'usage': "CompletionUsage",
})

# One candidate completion within a response.
ChatCompletionResponseChoice = TypedDict('ChatCompletionResponseChoice', {
    'index': int,
    'message': "ChatCompletionResponseMessage",
    'finish_reason': Optional[str],
    'logprobs': Optional[dict],
})

# Generated assistant message; content is None for pure tool-call turns.
ChatCompletionResponseMessage = TypedDict('ChatCompletionResponseMessage', {
    'role': Literal['assistant'],
    'content': Optional[str],
    'function_call': NotRequired[dict],
    'tool_calls': NotRequired[List[dict]],
})

# Streaming response types

# One chunk of a streamed completion (object == 'chat.completion.chunk').
CreateChatCompletionStreamResponse = TypedDict('CreateChatCompletionStreamResponse', {
    'id': str,
    'object': Literal['chat.completion.chunk'],
    'created': int,
    'model': str,
    'choices': List["ChatCompletionStreamResponseChoice"],
})

# Streamed choice: 'delta' carries the incremental message fragment.
ChatCompletionStreamResponseChoice = TypedDict('ChatCompletionStreamResponseChoice', {
    'index': int,
    'delta': "ChatCompletionResponseMessage",
    'finish_reason': Optional[str],
    'logprobs': Optional[dict],
})

# Tool and function types

# A single tool call emitted by the model (currently function-typed only).
ChatCompletionMessageToolCall = TypedDict('ChatCompletionMessageToolCall', {
    'id': str,
    'type': Literal['function'],
    'function': dict,
})

# Tool definition supplied in a request's `tools` list.
ChatCompletionTool = TypedDict('ChatCompletionTool', {
    'type': Literal['function'],
    'function': "ChatCompletionFunction",
})

# Function schema: name, optional description, JSON-schema parameters.
ChatCompletionFunction = TypedDict('ChatCompletionFunction', {
    'name': str,
    'description': Optional[str],
    'parameters': dict,
})

# Response format specification ("text" or "json_object").
ChatCompletionRequestResponseFormat = TypedDict('ChatCompletionRequestResponseFormat', {
    'type': Literal['text', 'json_object'],
})

Usage Examples

Basic Chat Conversation

from llama_cpp import Llama

# Load a local GGUF model and select the built-in "llama-2" chat template.
llm = Llama(
    model_path="./models/llama-2-7b-chat.gguf",
    chat_format="llama-2"
)

# A conversation is a list of role/content dicts, OpenAI-style.
messages = [
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": "Hello! Can you help me with Python?"},
]

response = llm.create_chat_completion(
    messages=messages,
    max_tokens=150,
    temperature=0.7,
)

# The generated text lives in the first choice's message content.
print(response['choices'][0]['message']['content'])

Multi-turn Conversation

# Include prior assistant turns so the model sees the full conversation.
messages = [
    {"role": "system", "content": "You are a coding tutor."},
    {"role": "user", "content": "How do I create a list in Python?"},
    {"role": "assistant", "content": "You can create a list using square brackets: my_list = [1, 2, 3]"},
    {"role": "user", "content": "How do I add items to it?"},
]

response = llm.create_chat_completion(
    messages=messages,
    max_tokens=100,
)

# Add assistant response to conversation (keeps context for the next turn)
messages.append({
    "role": "assistant", 
    "content": response['choices'][0]['message']['content']
})

Function Calling

# Tool definitions follow the OpenAI schema: type "function" plus a
# JSON-schema description of the parameters.
tools = [
    {
        "type": "function",
        "function": {
            "name": "get_weather",
            "description": "Get current weather information",
            "parameters": {
                "type": "object",
                "properties": {
                    "location": {
                        "type": "string",
                        "description": "City name"
                    }
                },
                "required": ["location"]
            }
        }
    }
]

messages = [
    {"role": "user", "content": "What's the weather like in New York?"}
]

# tool_choice="auto" lets the model decide whether to call the tool.
response = llm.create_chat_completion(
    messages=messages,
    tools=tools,
    tool_choice="auto",
)

# Check if model wants to call a function
if response['choices'][0]['message'].get('tool_calls'):
    tool_call = response['choices'][0]['message']['tool_calls'][0]
    print(f"Function: {tool_call['function']['name']}")
    print(f"Arguments: {tool_call['function']['arguments']}")

Custom Chat Format

from llama_cpp.llama_chat_format import Jinja2ChatFormatter

# Create custom formatter: the Jinja2 template receives `messages` and
# renders each turn as a "Role: content" line, ending with an open
# "Assistant: " prompt for the model to complete.
custom_template = """
{%- for message in messages %}
    {%- if message['role'] == 'user' %}
User: {{ message['content'] }}
    {%- elif message['role'] == 'assistant' %}
Assistant: {{ message['content'] }}
    {%- elif message['role'] == 'system' %}
System: {{ message['content'] }}
    {%- endif %}
{%- endfor %}
Assistant: """

formatter = Jinja2ChatFormatter(
    template=custom_template,
    eos_token="</s>",
    bos_token="<s>",
)

# Format messages manually; the result carries the rendered prompt.
messages = [{"role": "user", "content": "Hello!"}]
formatted = formatter.format_messages(messages)
print(formatted.prompt)

Streaming Chat

messages = [
    {"role": "user", "content": "Write a short story about robots."}
]

stream = llm.create_chat_completion(
    messages=messages,
    max_tokens=200,
    stream=True,  # Enable streaming: the call yields chunks incrementally
)

# Process streaming response: each chunk's choice carries a 'delta'
# fragment; 'content' may be absent on role-only or final chunks.
for chunk in stream:
    if chunk['choices'][0]['delta'].get('content'):
        print(chunk['choices'][0]['delta']['content'], end='', flush=True)

Response Format Control

# Request JSON response format via response_format={"type": "json_object"}
response = llm.create_chat_completion(
    messages=[
        {"role": "user", "content": "List 3 programming languages in JSON format"}
    ],
    response_format={"type": "json_object"},
    max_tokens=100,
)

print(response['choices'][0]['message']['content'])

Install with Tessl CLI

npx tessl i tessl/pypi-llama-cpp-python

docs

caching.md

chat-completion.md

grammar.md

index.md

llama-model.md

low-level.md

server.md

tokenization.md

vision.md

tile.json