# Chat Completions and Formatting

OpenAI-compatible chat completions with extensive formatting options, role-based conversations, function calling, and custom message templates for different model architectures.

## Capabilities

### Chat Completion

Generate contextual responses in multi-turn conversations with full OpenAI API compatibility.

```python { .api }
def create_chat_completion(
    self,
    messages: List[dict],
    functions: Optional[List[dict]] = None,
    function_call: Optional[Union[str, dict]] = None,
    tools: Optional[List[dict]] = None,
    tool_choice: Optional[Union[str, dict]] = None,
    temperature: float = 0.2,
    top_p: float = 0.95,
    top_k: int = 40,
    min_p: float = 0.05,
    typical_p: float = 1.0,
    stream: bool = False,
    stop: Optional[Union[str, List[str]]] = None,
    seed: Optional[int] = None,
    response_format: Optional[dict] = None,
    max_tokens: Optional[int] = None,
    presence_penalty: float = 0.0,
    frequency_penalty: float = 0.0,
    repeat_penalty: float = 1.1,
    tfs_z: float = 1.0,
    mirostat_mode: int = 0,
    mirostat_tau: float = 5.0,
    mirostat_eta: float = 0.1,
    model: Optional[str] = None,
    logits_processor: Optional[object] = None,
    grammar: Optional[object] = None,
    logit_bias: Optional[Dict[str, float]] = None,
    **kwargs
) -> CreateChatCompletionResponse:
    """
    Create a chat completion response.

    Args:
        messages: List of message objects with 'role' and 'content'
        functions: Available functions for function calling (deprecated, use tools)
        function_call: Function call preference (deprecated, use tool_choice)
        tools: Available tools for the model to call
        tool_choice: Tool usage preference ("none", "auto", or specific tool)
        temperature: Sampling temperature (0.0-2.0)
        top_p: Nucleus sampling threshold
        top_k: Top-k sampling parameter
        min_p: Minimum probability threshold
        typical_p: Typical sampling parameter
        stream: Enable streaming response
        stop: Stop sequences
        seed: Random seed
        response_format: Output format specification
        max_tokens: Maximum tokens to generate
        presence_penalty: Presence penalty (-2.0 to 2.0)
        frequency_penalty: Frequency penalty (-2.0 to 2.0)
        repeat_penalty: Repetition penalty multiplier
        tfs_z: Tail-free sampling parameter
        mirostat_mode: Mirostat sampling mode
        mirostat_tau: Mirostat target entropy
        mirostat_eta: Mirostat learning rate
        model: Model name for metadata
        logits_processor: Custom logits processor
        grammar: Grammar constraints
        logit_bias: Token probability adjustments

    Returns:
        Chat completion response with generated message
    """
```

### Chat Formatting

Format conversations according to model-specific templates and requirements.

```python { .api }
class Jinja2ChatFormatter:
    def __init__(
        self,
        template: str,
        eos_token: str = "</s>",
        bos_token: str = "<s>",
        stop_token_ids: Optional[List[int]] = None,
        **kwargs
    ):
        """
        Initialize Jinja2-based chat formatter.

        Args:
            template: Jinja2 template string for message formatting
            eos_token: End-of-sequence token
            bos_token: Beginning-of-sequence token
            stop_token_ids: List of token IDs that should stop generation
        """

    def format_messages(self, messages: List[dict]) -> "ChatFormatterResponse":
        """
        Format messages according to template.

        Args:
            messages: List of message dictionaries

        Returns:
            Formatted response with prompt and stop sequences
        """

class ChatFormatterResponse:
    def __init__(
        self,
        prompt: str,
        stop: Optional[List[str]] = None
    ):
        """
        Response container for formatted chat messages.

        Args:
            prompt: Formatted prompt text
            stop: Stop sequences for generation
        """
        self.prompt = prompt
        self.stop = stop
```

### Chat Format Management

Register and retrieve chat formatting handlers for different model types.

```python { .api }
def get_chat_completion_handler(
    chat_format: str
) -> "LlamaChatCompletionHandler":
    """
    Get registered chat completion handler by format name.

    Args:
        chat_format: Format identifier (e.g., "chatml", "llama-2", "mistral-instruct")

    Returns:
        Chat completion handler instance
    """

def register_chat_completion_handler(
    chat_format: str,
    chat_handler: "LlamaChatCompletionHandler"
) -> None:
    """
    Register new chat completion handler.

    Args:
        chat_format: Format identifier
        chat_handler: Handler implementation
    """

class LlamaChatCompletionHandlerRegistry:
    def register_chat_completion_handler(
        self,
        chat_format: str,
        handler: "LlamaChatCompletionHandler"
    ) -> None: ...

    def get_chat_completion_handler(
        self,
        chat_format: str
    ) -> "LlamaChatCompletionHandler": ...
```

### Message Processing

Handle different message types and roles in conversations.

```python { .api }
# Protocol definitions for chat completion handlers
class LlamaChatCompletionHandler:
    """Protocol for chat completion handlers."""

    def __call__(
        self,
        llama: "Llama",
        messages: List[dict],
        **kwargs
    ) -> Union[dict, Iterator[dict]]: ...

class ChatFormatter:
    """Protocol for chat message formatters."""

    def __call__(
        self,
        messages: List[dict],
        **kwargs
    ) -> ChatFormatterResponse: ...
```

## Pre-defined Chat Templates

```python { .api }
# Template constants for different model formats
CHATML_CHAT_TEMPLATE: str
MISTRAL_INSTRUCT_CHAT_TEMPLATE: str
MIXTRAL_INSTRUCT_CHAT_TEMPLATE: str
LLAMA3_INSTRUCT_CHAT_TEMPLATE: str

# Associated token constants
CHATML_EOS_TOKEN: str
MISTRAL_INSTRUCT_EOS_TOKEN: str
MIXTRAL_INSTRUCT_EOS_TOKEN: str
LLAMA3_INSTRUCT_EOS_TOKEN: str

CHATML_BOS_TOKEN: str
MISTRAL_INSTRUCT_BOS_TOKEN: str
MIXTRAL_INSTRUCT_BOS_TOKEN: str
LLAMA3_INSTRUCT_BOS_TOKEN: str
```

## Types

```python { .api }
# Message types for different roles
ChatCompletionRequestMessage = TypedDict('ChatCompletionRequestMessage', {
    'role': str,
    'content': Optional[str],
})

ChatCompletionRequestSystemMessage = TypedDict('ChatCompletionRequestSystemMessage', {
    'role': Literal['system'],
    'content': str,
    'name': NotRequired[str],
})

ChatCompletionRequestUserMessage = TypedDict('ChatCompletionRequestUserMessage', {
    'role': Literal['user'],
    'content': str,
    'name': NotRequired[str],
})

ChatCompletionRequestAssistantMessage = TypedDict('ChatCompletionRequestAssistantMessage', {
    'role': Literal['assistant'],
    'content': Optional[str],
    'name': NotRequired[str],
    'tool_calls': NotRequired[List[dict]],
    'function_call': NotRequired[dict],  # Deprecated
})

ChatCompletionRequestToolMessage = TypedDict('ChatCompletionRequestToolMessage', {
    'role': Literal['tool'],
    'content': str,
    'tool_call_id': str,
})

ChatCompletionRequestFunctionMessage = TypedDict('ChatCompletionRequestFunctionMessage', {
    'role': Literal['function'],
    'content': str,
    'name': str,
})

# Response types
CreateChatCompletionResponse = TypedDict('CreateChatCompletionResponse', {
    'id': str,
    'object': Literal['chat.completion'],
    'created': int,
    'model': str,
    'choices': List["ChatCompletionResponseChoice"],
    'usage': "CompletionUsage",
})

ChatCompletionResponseChoice = TypedDict('ChatCompletionResponseChoice', {
    'index': int,
    'message': "ChatCompletionResponseMessage",
    'finish_reason': Optional[str],
    'logprobs': Optional[dict],
})

ChatCompletionResponseMessage = TypedDict('ChatCompletionResponseMessage', {
    'role': Literal['assistant'],
    'content': Optional[str],
    'function_call': NotRequired[dict],
    'tool_calls': NotRequired[List[dict]],
})

# Streaming response types
CreateChatCompletionStreamResponse = TypedDict('CreateChatCompletionStreamResponse', {
    'id': str,
    'object': Literal['chat.completion.chunk'],
    'created': int,
    'model': str,
    'choices': List["ChatCompletionStreamResponseChoice"],
})

ChatCompletionStreamResponseChoice = TypedDict('ChatCompletionStreamResponseChoice', {
    'index': int,
    'delta': "ChatCompletionResponseMessage",
    'finish_reason': Optional[str],
    'logprobs': Optional[dict],
})

# Tool and function types
ChatCompletionMessageToolCall = TypedDict('ChatCompletionMessageToolCall', {
    'id': str,
    'type': Literal['function'],
    'function': dict,
})

ChatCompletionTool = TypedDict('ChatCompletionTool', {
    'type': Literal['function'],
    'function': "ChatCompletionFunction",
})

ChatCompletionFunction = TypedDict('ChatCompletionFunction', {
    'name': str,
    'description': Optional[str],
    'parameters': dict,
})

# Response format specification
ChatCompletionRequestResponseFormat = TypedDict('ChatCompletionRequestResponseFormat', {
    'type': Literal['text', 'json_object'],
})
```

## Usage Examples

### Basic Chat Conversation

```python
from llama_cpp import Llama

llm = Llama(
    model_path="./models/llama-2-7b-chat.gguf",
    chat_format="llama-2"
)

messages = [
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": "Hello! Can you help me with Python?"},
]

response = llm.create_chat_completion(
    messages=messages,
    max_tokens=150,
    temperature=0.7,
)

print(response['choices'][0]['message']['content'])
```

### Multi-turn Conversation

```python
messages = [
    {"role": "system", "content": "You are a coding tutor."},
    {"role": "user", "content": "How do I create a list in Python?"},
    {"role": "assistant", "content": "You can create a list using square brackets: my_list = [1, 2, 3]"},
    {"role": "user", "content": "How do I add items to it?"},
]

response = llm.create_chat_completion(
    messages=messages,
    max_tokens=100,
)

# Add assistant response to conversation
messages.append({
    "role": "assistant",
    "content": response['choices'][0]['message']['content']
})
```

### Function Calling

```python
tools = [
    {
        "type": "function",
        "function": {
            "name": "get_weather",
            "description": "Get current weather information",
            "parameters": {
                "type": "object",
                "properties": {
                    "location": {
                        "type": "string",
                        "description": "City name"
                    }
                },
                "required": ["location"]
            }
        }
    }
]

messages = [
    {"role": "user", "content": "What's the weather like in New York?"}
]

response = llm.create_chat_completion(
    messages=messages,
    tools=tools,
    tool_choice="auto",
)

# Check if model wants to call a function
if response['choices'][0]['message'].get('tool_calls'):
    tool_call = response['choices'][0]['message']['tool_calls'][0]
    print(f"Function: {tool_call['function']['name']}")
    print(f"Arguments: {tool_call['function']['arguments']}")
```

### Custom Chat Format

```python
from llama_cpp.llama_chat_format import Jinja2ChatFormatter

# Create custom formatter
custom_template = """
{%- for message in messages %}
{%- if message['role'] == 'user' %}
User: {{ message['content'] }}
{%- elif message['role'] == 'assistant' %}
Assistant: {{ message['content'] }}
{%- elif message['role'] == 'system' %}
System: {{ message['content'] }}
{%- endif %}
{%- endfor %}
Assistant: """

formatter = Jinja2ChatFormatter(
    template=custom_template,
    eos_token="</s>",
    bos_token="<s>",
)

# Format messages manually
messages = [{"role": "user", "content": "Hello!"}]
formatted = formatter.format_messages(messages)
print(formatted.prompt)
```

### Streaming Chat

```python
messages = [
    {"role": "user", "content": "Write a short story about robots."}
]

stream = llm.create_chat_completion(
    messages=messages,
    max_tokens=200,
    stream=True,  # Enable streaming
)

# Process streaming response
for chunk in stream:
    if chunk['choices'][0]['delta'].get('content'):
        print(chunk['choices'][0]['delta']['content'], end='', flush=True)
```

### Response Format Control

```python
# Request JSON response format
response = llm.create_chat_completion(
    messages=[
        {"role": "user", "content": "List 3 programming languages in JSON format"}
    ],
    response_format={"type": "json_object"},
    max_tokens=100,
)

print(response['choices'][0]['message']['content'])
```