# Chat Completions and Formatting

OpenAI-compatible chat completions with extensive formatting options, role-based conversations, function calling, and custom message templates for different model architectures.

## Capabilities

### Chat Completion

Generate contextual responses in multi-turn conversations with full OpenAI API compatibility.

```python { .api }
def create_chat_completion(
    self,
    messages: List[dict],
    functions: Optional[List[dict]] = None,
    function_call: Optional[Union[str, dict]] = None,
    tools: Optional[List[dict]] = None,
    tool_choice: Optional[Union[str, dict]] = None,
    temperature: float = 0.2,
    top_p: float = 0.95,
    top_k: int = 40,
    min_p: float = 0.05,
    typical_p: float = 1.0,
    stream: bool = False,
    stop: Optional[Union[str, List[str]]] = None,
    seed: Optional[int] = None,
    response_format: Optional[dict] = None,
    max_tokens: Optional[int] = None,
    presence_penalty: float = 0.0,
    frequency_penalty: float = 0.0,
    repeat_penalty: float = 1.1,
    tfs_z: float = 1.0,
    mirostat_mode: int = 0,
    mirostat_tau: float = 5.0,
    mirostat_eta: float = 0.1,
    model: Optional[str] = None,
    logits_processor: Optional[object] = None,
    grammar: Optional[object] = None,
    logit_bias: Optional[Dict[str, float]] = None,
    **kwargs
) -> CreateChatCompletionResponse:
    """
    Create a chat completion response.

    Args:
        messages: List of message objects with 'role' and 'content'
        functions: Available functions for function calling (deprecated, use tools)
        function_call: Function call preference (deprecated, use tool_choice)
        tools: Available tools for the model to call
        tool_choice: Tool usage preference ("none", "auto", or specific tool)
        temperature: Sampling temperature (0.0-2.0)
        top_p: Nucleus sampling threshold
        top_k: Top-k sampling parameter
        min_p: Minimum probability threshold
        typical_p: Typical sampling parameter
        stream: Enable streaming response
        stop: Stop sequences
        seed: Random seed
        response_format: Output format specification
        max_tokens: Maximum tokens to generate
        presence_penalty: Presence penalty (-2.0 to 2.0)
        frequency_penalty: Frequency penalty (-2.0 to 2.0)
        repeat_penalty: Repetition penalty multiplier
        tfs_z: Tail-free sampling parameter
        mirostat_mode: Mirostat sampling mode
        mirostat_tau: Mirostat target entropy
        mirostat_eta: Mirostat learning rate
        model: Model name for metadata
        logits_processor: Custom logits processor
        grammar: Grammar constraints
        logit_bias: Token probability adjustments

    Returns:
        Chat completion response with generated message
    """
```

### Chat Formatting

Format conversations according to model-specific templates and requirements.

```python { .api }
class Jinja2ChatFormatter:
    def __init__(
        self,
        template: str,
        eos_token: str = "</s>",
        bos_token: str = "<s>",
        stop_token_ids: Optional[List[int]] = None,
        **kwargs
    ):
        """
        Initialize Jinja2-based chat formatter.

        Args:
            template: Jinja2 template string for message formatting
            eos_token: End-of-sequence token
            bos_token: Beginning-of-sequence token
            stop_token_ids: List of token IDs that should stop generation
        """

    def format_messages(self, messages: List[dict]) -> "ChatFormatterResponse":
        """
        Format messages according to template.

        Args:
            messages: List of message dictionaries

        Returns:
            Formatted response with prompt and stop sequences
        """

class ChatFormatterResponse:
    def __init__(
        self,
        prompt: str,
        stop: Optional[List[str]] = None
    ):
        """
        Response container for formatted chat messages.

        Args:
            prompt: Formatted prompt text
            stop: Stop sequences for generation
        """
        self.prompt = prompt
        self.stop = stop
```

### Chat Format Management

Register and retrieve chat formatting handlers for different model types.

```python { .api }
def get_chat_completion_handler(
    chat_format: str
) -> "LlamaChatCompletionHandler":
    """
    Get registered chat completion handler by format name.

    Args:
        chat_format: Format identifier (e.g., "chatml", "llama-2", "mistral-instruct")

    Returns:
        Chat completion handler instance
    """

def register_chat_completion_handler(
    chat_format: str,
    chat_handler: "LlamaChatCompletionHandler"
) -> None:
    """
    Register new chat completion handler.

    Args:
        chat_format: Format identifier
        chat_handler: Handler implementation
    """

class LlamaChatCompletionHandlerRegistry:
    def register_chat_completion_handler(
        self,
        chat_format: str,
        handler: "LlamaChatCompletionHandler"
    ) -> None: ...

    def get_chat_completion_handler(
        self,
        chat_format: str
    ) -> "LlamaChatCompletionHandler": ...
```

### Message Processing

Handle different message types and roles in conversations.

```python { .api }
# Protocol definitions for chat completion handlers
class LlamaChatCompletionHandler:
    """Protocol for chat completion handlers."""

    def __call__(
        self,
        llama: "Llama",
        messages: List[dict],
        **kwargs
    ) -> Union[dict, Iterator[dict]]: ...

class ChatFormatter:
    """Protocol for chat message formatters."""

    def __call__(
        self,
        messages: List[dict],
        **kwargs
    ) -> ChatFormatterResponse: ...
```

## Pre-defined Chat Templates

```python { .api }
# Template constants for different model formats
CHATML_CHAT_TEMPLATE: str
MISTRAL_INSTRUCT_CHAT_TEMPLATE: str
MIXTRAL_INSTRUCT_CHAT_TEMPLATE: str
LLAMA3_INSTRUCT_CHAT_TEMPLATE: str

# Associated token constants
CHATML_EOS_TOKEN: str
MISTRAL_INSTRUCT_EOS_TOKEN: str
MIXTRAL_INSTRUCT_EOS_TOKEN: str
LLAMA3_INSTRUCT_EOS_TOKEN: str

CHATML_BOS_TOKEN: str
MISTRAL_INSTRUCT_BOS_TOKEN: str
MIXTRAL_INSTRUCT_BOS_TOKEN: str
LLAMA3_INSTRUCT_BOS_TOKEN: str
```

## Types

```python { .api }
# Message types for different roles
ChatCompletionRequestMessage = TypedDict('ChatCompletionRequestMessage', {
    'role': str,
    'content': Optional[str],
})

ChatCompletionRequestSystemMessage = TypedDict('ChatCompletionRequestSystemMessage', {
    'role': Literal['system'],
    'content': str,
    'name': NotRequired[str],
})

ChatCompletionRequestUserMessage = TypedDict('ChatCompletionRequestUserMessage', {
    'role': Literal['user'],
    'content': str,
    'name': NotRequired[str],
})

ChatCompletionRequestAssistantMessage = TypedDict('ChatCompletionRequestAssistantMessage', {
    'role': Literal['assistant'],
    'content': Optional[str],
    'name': NotRequired[str],
    'tool_calls': NotRequired[List[dict]],
    'function_call': NotRequired[dict],  # Deprecated
})

ChatCompletionRequestToolMessage = TypedDict('ChatCompletionRequestToolMessage', {
    'role': Literal['tool'],
    'content': str,
    'tool_call_id': str,
})

ChatCompletionRequestFunctionMessage = TypedDict('ChatCompletionRequestFunctionMessage', {
    'role': Literal['function'],
    'content': str,
    'name': str,
})

# Response types
CreateChatCompletionResponse = TypedDict('CreateChatCompletionResponse', {
    'id': str,
    'object': Literal['chat.completion'],
    'created': int,
    'model': str,
    'choices': List["ChatCompletionResponseChoice"],
    'usage': "CompletionUsage",
})

ChatCompletionResponseChoice = TypedDict('ChatCompletionResponseChoice', {
    'index': int,
    'message': "ChatCompletionResponseMessage",
    'finish_reason': Optional[str],
    'logprobs': Optional[dict],
})

ChatCompletionResponseMessage = TypedDict('ChatCompletionResponseMessage', {
    'role': Literal['assistant'],
    'content': Optional[str],
    'function_call': NotRequired[dict],
    'tool_calls': NotRequired[List[dict]],
})

# Streaming response types
CreateChatCompletionStreamResponse = TypedDict('CreateChatCompletionStreamResponse', {
    'id': str,
    'object': Literal['chat.completion.chunk'],
    'created': int,
    'model': str,
    'choices': List["ChatCompletionStreamResponseChoice"],
})

ChatCompletionStreamResponseChoice = TypedDict('ChatCompletionStreamResponseChoice', {
    'index': int,
    'delta': "ChatCompletionResponseMessage",
    'finish_reason': Optional[str],
    'logprobs': Optional[dict],
})

# Tool and function types
ChatCompletionMessageToolCall = TypedDict('ChatCompletionMessageToolCall', {
    'id': str,
    'type': Literal['function'],
    'function': dict,
})

ChatCompletionTool = TypedDict('ChatCompletionTool', {
    'type': Literal['function'],
    'function': "ChatCompletionFunction",
})

ChatCompletionFunction = TypedDict('ChatCompletionFunction', {
    'name': str,
    'description': Optional[str],
    'parameters': dict,
})

# Response format specification
ChatCompletionRequestResponseFormat = TypedDict('ChatCompletionRequestResponseFormat', {
    'type': Literal['text', 'json_object'],
})
```

## Usage Examples

### Basic Chat Conversation

```python
from llama_cpp import Llama

llm = Llama(
    model_path="./models/llama-2-7b-chat.gguf",
    chat_format="llama-2"
)

messages = [
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": "Hello! Can you help me with Python?"},
]

response = llm.create_chat_completion(
    messages=messages,
    max_tokens=150,
    temperature=0.7,
)

print(response['choices'][0]['message']['content'])
```

### Multi-turn Conversation

```python
messages = [
    {"role": "system", "content": "You are a coding tutor."},
    {"role": "user", "content": "How do I create a list in Python?"},
    {"role": "assistant", "content": "You can create a list using square brackets: my_list = [1, 2, 3]"},
    {"role": "user", "content": "How do I add items to it?"},
]

response = llm.create_chat_completion(
    messages=messages,
    max_tokens=100,
)

# Add assistant response to conversation
messages.append({
    "role": "assistant",
    "content": response['choices'][0]['message']['content']
})
```

### Function Calling

```python
tools = [
    {
        "type": "function",
        "function": {
            "name": "get_weather",
            "description": "Get current weather information",
            "parameters": {
                "type": "object",
                "properties": {
                    "location": {
                        "type": "string",
                        "description": "City name"
                    }
                },
                "required": ["location"]
            }
        }
    }
]

messages = [
    {"role": "user", "content": "What's the weather like in New York?"}
]

response = llm.create_chat_completion(
    messages=messages,
    tools=tools,
    tool_choice="auto",
)

# Check if model wants to call a function
if response['choices'][0]['message'].get('tool_calls'):
    tool_call = response['choices'][0]['message']['tool_calls'][0]
    print(f"Function: {tool_call['function']['name']}")
    print(f"Arguments: {tool_call['function']['arguments']}")
```

### Custom Chat Format

```python
from llama_cpp.llama_chat_format import Jinja2ChatFormatter

# Create custom formatter
custom_template = """
{%- for message in messages %}
{%- if message['role'] == 'user' %}
User: {{ message['content'] }}
{%- elif message['role'] == 'assistant' %}
Assistant: {{ message['content'] }}
{%- elif message['role'] == 'system' %}
System: {{ message['content'] }}
{%- endif %}
{%- endfor %}
Assistant: """

formatter = Jinja2ChatFormatter(
    template=custom_template,
    eos_token="</s>",
    bos_token="<s>",
)

# Format messages manually
messages = [{"role": "user", "content": "Hello!"}]
formatted = formatter.format_messages(messages)
print(formatted.prompt)
```

### Streaming Chat

```python
messages = [
    {"role": "user", "content": "Write a short story about robots."}
]

stream = llm.create_chat_completion(
    messages=messages,
    max_tokens=200,
    stream=True,  # Enable streaming
)

# Process streaming response
for chunk in stream:
    if chunk['choices'][0]['delta'].get('content'):
        print(chunk['choices'][0]['delta']['content'], end='', flush=True)
```

### Response Format Control

```python
# Request JSON response format
response = llm.create_chat_completion(
    messages=[
        {"role": "user", "content": "List 3 programming languages in JSON format"}
    ],
    response_format={"type": "json_object"},
    max_tokens=100,
)

print(response['choices'][0]['message']['content'])
```