# Chat Completions and Formatting

OpenAI-compatible chat completions with extensive formatting options, role-based conversations, function calling, and custom message templates for different model architectures.

## Capabilities

### Chat Completion

Generate contextual responses in multi-turn conversations with full OpenAI API compatibility.

```python { .api }
def create_chat_completion(
    self,
    messages: List[dict],
    functions: Optional[List[dict]] = None,
    function_call: Optional[Union[str, dict]] = None,
    tools: Optional[List[dict]] = None,
    tool_choice: Optional[Union[str, dict]] = None,
    temperature: float = 0.2,
    top_p: float = 0.95,
    top_k: int = 40,
    min_p: float = 0.05,
    typical_p: float = 1.0,
    stream: bool = False,
    stop: Optional[Union[str, List[str]]] = None,
    seed: Optional[int] = None,
    response_format: Optional[dict] = None,
    max_tokens: Optional[int] = None,
    presence_penalty: float = 0.0,
    frequency_penalty: float = 0.0,
    repeat_penalty: float = 1.1,
    tfs_z: float = 1.0,
    mirostat_mode: int = 0,
    mirostat_tau: float = 5.0,
    mirostat_eta: float = 0.1,
    model: Optional[str] = None,
    logits_processor: Optional[object] = None,
    grammar: Optional[object] = None,
    logit_bias: Optional[Dict[str, float]] = None,
    **kwargs
) -> CreateChatCompletionResponse:
    """
    Create a chat completion response.

    Args:
        messages: List of message objects with 'role' and 'content'
        functions: Available functions for function calling (deprecated, use tools)
        function_call: Function call preference (deprecated, use tool_choice)
        tools: Available tools for the model to call
        tool_choice: Tool usage preference ("none", "auto", or specific tool)
        temperature: Sampling temperature (0.0-2.0)
        top_p: Nucleus sampling threshold
        top_k: Top-k sampling parameter
        min_p: Minimum probability threshold
        typical_p: Typical sampling parameter
        stream: Enable streaming response
        stop: Stop sequences
        seed: Random seed
        response_format: Output format specification
        max_tokens: Maximum tokens to generate
        presence_penalty: Presence penalty (-2.0 to 2.0)
        frequency_penalty: Frequency penalty (-2.0 to 2.0)
        repeat_penalty: Repetition penalty multiplier
        tfs_z: Tail-free sampling parameter
        mirostat_mode: Mirostat sampling mode
        mirostat_tau: Mirostat target entropy
        mirostat_eta: Mirostat learning rate
        model: Model name for metadata
        logits_processor: Custom logits processor
        grammar: Grammar constraints
        logit_bias: Token probability adjustments

    Returns:
        Chat completion response with generated message
    """
```

### Chat Formatting

Format conversations according to model-specific templates and requirements.

```python { .api }
class Jinja2ChatFormatter:
    def __init__(
        self,
        template: str,
        eos_token: str = "</s>",
        bos_token: str = "<s>",
        stop_token_ids: Optional[List[int]] = None,
        **kwargs
    ):
        """
        Initialize Jinja2-based chat formatter.

        Args:
            template: Jinja2 template string for message formatting
            eos_token: End-of-sequence token
            bos_token: Beginning-of-sequence token
            stop_token_ids: List of token IDs that should stop generation
        """

    def format_messages(self, messages: List[dict]) -> "ChatFormatterResponse":
        """
        Format messages according to template.

        Args:
            messages: List of message dictionaries

        Returns:
            Formatted response with prompt and stop sequences
        """

class ChatFormatterResponse:
    def __init__(
        self,
        prompt: str,
        stop: Optional[List[str]] = None
    ):
        """
        Response container for formatted chat messages.

        Args:
            prompt: Formatted prompt text
            stop: Stop sequences for generation
        """
        self.prompt = prompt
        self.stop = stop
```

### Chat Format Management

Register and retrieve chat formatting handlers for different model types.

```python { .api }
def get_chat_completion_handler(
    chat_format: str
) -> "LlamaChatCompletionHandler":
    """
    Get registered chat completion handler by format name.

    Args:
        chat_format: Format identifier (e.g., "chatml", "llama-2", "mistral-instruct")

    Returns:
        Chat completion handler instance
    """

def register_chat_completion_handler(
    chat_format: str,
    chat_handler: "LlamaChatCompletionHandler"
) -> None:
    """
    Register new chat completion handler.

    Args:
        chat_format: Format identifier
        chat_handler: Handler implementation
    """

class LlamaChatCompletionHandlerRegistry:
    def register_chat_completion_handler(
        self,
        chat_format: str,
        handler: "LlamaChatCompletionHandler"
    ) -> None: ...

    def get_chat_completion_handler(
        self,
        chat_format: str
    ) -> "LlamaChatCompletionHandler": ...
```

### Message Processing

Handle different message types and roles in conversations.

```python { .api }
# Protocol definitions for chat completion handlers
class LlamaChatCompletionHandler:
    """Protocol for chat completion handlers."""

    def __call__(
        self,
        llama: "Llama",
        messages: List[dict],
        **kwargs
    ) -> Union[dict, Iterator[dict]]: ...

class ChatFormatter:
    """Protocol for chat message formatters."""

    def __call__(
        self,
        messages: List[dict],
        **kwargs
    ) -> ChatFormatterResponse: ...
```

## Pre-defined Chat Templates

```python { .api }
# Template constants for different model formats
CHATML_CHAT_TEMPLATE: str
MISTRAL_INSTRUCT_CHAT_TEMPLATE: str
MIXTRAL_INSTRUCT_CHAT_TEMPLATE: str
LLAMA3_INSTRUCT_CHAT_TEMPLATE: str

# Associated token constants
CHATML_EOS_TOKEN: str
MISTRAL_INSTRUCT_EOS_TOKEN: str
MIXTRAL_INSTRUCT_EOS_TOKEN: str
LLAMA3_INSTRUCT_EOS_TOKEN: str

CHATML_BOS_TOKEN: str
MISTRAL_INSTRUCT_BOS_TOKEN: str
MIXTRAL_INSTRUCT_BOS_TOKEN: str
LLAMA3_INSTRUCT_BOS_TOKEN: str
```

## Types

```python { .api }
# Message types for different roles
ChatCompletionRequestMessage = TypedDict('ChatCompletionRequestMessage', {
    'role': str,
    'content': Optional[str],
})

ChatCompletionRequestSystemMessage = TypedDict('ChatCompletionRequestSystemMessage', {
    'role': Literal['system'],
    'content': str,
    'name': NotRequired[str],
})

ChatCompletionRequestUserMessage = TypedDict('ChatCompletionRequestUserMessage', {
    'role': Literal['user'],
    'content': str,
    'name': NotRequired[str],
})

ChatCompletionRequestAssistantMessage = TypedDict('ChatCompletionRequestAssistantMessage', {
    'role': Literal['assistant'],
    'content': Optional[str],
    'name': NotRequired[str],
    'tool_calls': NotRequired[List[dict]],
    'function_call': NotRequired[dict],  # Deprecated
})

ChatCompletionRequestToolMessage = TypedDict('ChatCompletionRequestToolMessage', {
    'role': Literal['tool'],
    'content': str,
    'tool_call_id': str,
})

ChatCompletionRequestFunctionMessage = TypedDict('ChatCompletionRequestFunctionMessage', {
    'role': Literal['function'],
    'content': str,
    'name': str,
})

# Response types
CreateChatCompletionResponse = TypedDict('CreateChatCompletionResponse', {
    'id': str,
    'object': Literal['chat.completion'],
    'created': int,
    'model': str,
    'choices': List["ChatCompletionResponseChoice"],
    'usage': "CompletionUsage",
})

ChatCompletionResponseChoice = TypedDict('ChatCompletionResponseChoice', {
    'index': int,
    'message': "ChatCompletionResponseMessage",
    'finish_reason': Optional[str],
    'logprobs': Optional[dict],
})

ChatCompletionResponseMessage = TypedDict('ChatCompletionResponseMessage', {
    'role': Literal['assistant'],
    'content': Optional[str],
    'function_call': NotRequired[dict],
    'tool_calls': NotRequired[List[dict]],
})

# Streaming response types
CreateChatCompletionStreamResponse = TypedDict('CreateChatCompletionStreamResponse', {
    'id': str,
    'object': Literal['chat.completion.chunk'],
    'created': int,
    'model': str,
    'choices': List["ChatCompletionStreamResponseChoice"],
})

ChatCompletionStreamResponseChoice = TypedDict('ChatCompletionStreamResponseChoice', {
    'index': int,
    'delta': "ChatCompletionResponseMessage",
    'finish_reason': Optional[str],
    'logprobs': Optional[dict],
})

# Tool and function types
ChatCompletionMessageToolCall = TypedDict('ChatCompletionMessageToolCall', {
    'id': str,
    'type': Literal['function'],
    'function': dict,
})

ChatCompletionTool = TypedDict('ChatCompletionTool', {
    'type': Literal['function'],
    'function': "ChatCompletionFunction",
})

ChatCompletionFunction = TypedDict('ChatCompletionFunction', {
    'name': str,
    'description': Optional[str],
    'parameters': dict,
})

# Response format specification
ChatCompletionRequestResponseFormat = TypedDict('ChatCompletionRequestResponseFormat', {
    'type': Literal['text', 'json_object'],
})
```

## Usage Examples

### Basic Chat Conversation

```python
from llama_cpp import Llama

llm = Llama(
    model_path="./models/llama-2-7b-chat.gguf",
    chat_format="llama-2"
)

messages = [
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": "Hello! Can you help me with Python?"},
]

response = llm.create_chat_completion(
    messages=messages,
    max_tokens=150,
    temperature=0.7,
)

print(response['choices'][0]['message']['content'])
```

### Multi-turn Conversation

```python
messages = [
    {"role": "system", "content": "You are a coding tutor."},
    {"role": "user", "content": "How do I create a list in Python?"},
    {"role": "assistant", "content": "You can create a list using square brackets: my_list = [1, 2, 3]"},
    {"role": "user", "content": "How do I add items to it?"},
]

response = llm.create_chat_completion(
    messages=messages,
    max_tokens=100,
)

# Add assistant response to conversation
messages.append({
    "role": "assistant",
    "content": response['choices'][0]['message']['content']
})
```

### Function Calling

```python
tools = [
    {
        "type": "function",
        "function": {
            "name": "get_weather",
            "description": "Get current weather information",
            "parameters": {
                "type": "object",
                "properties": {
                    "location": {
                        "type": "string",
                        "description": "City name"
                    }
                },
                "required": ["location"]
            }
        }
    }
]

messages = [
    {"role": "user", "content": "What's the weather like in New York?"}
]

response = llm.create_chat_completion(
    messages=messages,
    tools=tools,
    tool_choice="auto",
)

# Check if model wants to call a function
if response['choices'][0]['message'].get('tool_calls'):
    tool_call = response['choices'][0]['message']['tool_calls'][0]
    print(f"Function: {tool_call['function']['name']}")
    print(f"Arguments: {tool_call['function']['arguments']}")
```

### Custom Chat Format

```python
from llama_cpp.llama_chat_format import Jinja2ChatFormatter

# Create custom formatter
custom_template = """
{%- for message in messages %}
{%- if message['role'] == 'user' %}
User: {{ message['content'] }}
{%- elif message['role'] == 'assistant' %}
Assistant: {{ message['content'] }}
{%- elif message['role'] == 'system' %}
System: {{ message['content'] }}
{%- endif %}
{%- endfor %}
Assistant: """

formatter = Jinja2ChatFormatter(
    template=custom_template,
    eos_token="</s>",
    bos_token="<s>",
)

# Format messages manually
messages = [{"role": "user", "content": "Hello!"}]
formatted = formatter.format_messages(messages)
print(formatted.prompt)
```

### Streaming Chat

```python
messages = [
    {"role": "user", "content": "Write a short story about robots."}
]

stream = llm.create_chat_completion(
    messages=messages,
    max_tokens=200,
    stream=True,  # Enable streaming
)

# Process streaming response
for chunk in stream:
    if chunk['choices'][0]['delta'].get('content'):
        print(chunk['choices'][0]['delta']['content'], end='', flush=True)
```

### Response Format Control

```python
# Request JSON response format
response = llm.create_chat_completion(
    messages=[
        {"role": "user", "content": "List 3 programming languages in JSON format"}
    ],
    response_format={"type": "json_object"},
    max_tokens=100,
)

print(response['choices'][0]['message']['content'])
```