or run

npx @tessl/cli init
Log in

Version

Tile

Overview

Evals

Files

Files

docs

caching.md chat-completion.md grammar.md index.md llama-model.md low-level.md server.md tokenization.md vision.md

llama-model.mddocs/

0

# Core Model and Inference

1

2

High-level model loading, text generation, and inference operations providing the primary interface for llama.cpp functionality through the `Llama` class.

3

4

## Capabilities

5

6

### Model Initialization

7

8

Load and configure language models with comprehensive parameter control for performance optimization and hardware acceleration.

9

10

```python { .api }

11

class Llama:

12

def __init__(

13

self,

14

model_path: str,

15

*,

16

n_gpu_layers: int = 0,

17

split_mode: int = 1,

18

main_gpu: int = 0,

19

tensor_split: Optional[List[float]] = None,

20

vocab_only: bool = False,

21

use_mmap: bool = True,

22

use_mlock: bool = False,

23

kv_overrides: Optional[Dict[str, Union[bool, int, float, str]]] = None,

24

seed: int = 0xFFFFFFFF,

25

n_ctx: int = 512,

26

n_batch: int = 512,

27

n_ubatch: int = 512,

28

n_threads: Optional[int] = None,

29

n_threads_batch: Optional[int] = None,

30

rope_scaling_type: Optional[int] = -1,

31

pooling_type: int = -1,

32

rope_freq_base: float = 0.0,

33

rope_freq_scale: float = 0.0,

34

yarn_ext_factor: float = -1.0,

35

yarn_attn_factor: float = 1.0,

36

yarn_beta_fast: float = 32.0,

37

yarn_beta_slow: float = 1.0,

38

yarn_orig_ctx: int = 0,

39

logits_all: bool = False,

40

embedding: bool = False,

41

offload_kqv: bool = True,

42

flash_attn: bool = False,

43

op_offload: Optional[bool] = None,

44

swa_full: Optional[bool] = None,

45

no_perf: bool = False,

46

last_n_tokens_size: int = 64,

47

lora_base: Optional[str] = None,

48

lora_scale: float = 1.0,

49

lora_path: Optional[str] = None,

50

numa: Union[bool, int] = False,

51

chat_format: Optional[str] = None,

52

chat_handler: Optional[object] = None,

53

draft_model: Optional[object] = None,

54

tokenizer: Optional[object] = None,

55

type_k: Optional[int] = None,

56

type_v: Optional[int] = None,

57

spm_infill: bool = False,

58

verbose: bool = True,

59

**kwargs

60

):

61

"""

62

Initialize a Llama model instance.

63

64

Args:

65

model_path: Path to the GGUF model file

66

n_gpu_layers: Number of layers to offload to GPU (0 = CPU only)

67

split_mode: GPU split mode (1 = layer-wise split)

68

main_gpu: Main GPU device ID for multi-GPU setups

69

tensor_split: List of GPU memory allocations for each device

70

vocab_only: Load vocabulary only, skip weights

71

use_mmap: Use memory mapping for model loading

72

use_mlock: Lock model in memory to prevent swapping

73

kv_overrides: Key-value metadata overrides for the model

74

seed: Random seed for sampling (the default 0xFFFFFFFF, i.e. -1 as an unsigned 32-bit value, selects a random seed)

75

n_ctx: Context window size in tokens

76

n_batch: Batch size for processing

77

n_ubatch: Physical batch size (must be <= n_batch)

78

n_threads: Number of CPU threads for computation

79

n_threads_batch: Number of CPU threads for batch processing

80

rope_scaling_type: RoPE scaling method (-1 = auto)

81

pooling_type: Pooling method for embeddings (-1 = unspecified)

82

rope_freq_base: Base frequency for RoPE

83

rope_freq_scale: Frequency scaling factor for RoPE

84

yarn_ext_factor: YaRN extension factor

85

yarn_attn_factor: YaRN attention factor

86

yarn_beta_fast: YaRN beta fast parameter

87

yarn_beta_slow: YaRN beta slow parameter

88

yarn_orig_ctx: YaRN original context size

89

logits_all: Return logits for all tokens

90

embedding: Enable embedding mode

91

offload_kqv: Offload key/value cache to GPU

92

flash_attn: Use Flash Attention optimization

93

op_offload: Offload operations to GPU (auto-detect if None)

94

swa_full: Use a full-size sliding window attention (SWA) cache (auto-detect if None)

95

no_perf: Disable performance timing measurements

96

last_n_tokens_size: Size of last-n-tokens buffer for repetition penalty

97

lora_base: Path to LoRA base model

98

lora_scale: LoRA scaling factor

99

lora_path: Path to LoRA adapter

100

numa: NUMA optimization (False/True/strategy)

101

chat_format: Chat format template name

102

chat_handler: Custom chat completion handler

103

draft_model: Draft model for speculative decoding

104

tokenizer: Custom tokenizer instance

105

type_k: Key cache quantization type (None = auto)

106

type_v: Value cache quantization type (None = auto)

107

spm_infill: Enable SentencePiece infill mode

108

verbose: Enable verbose logging

109

"""

110

111

@classmethod

112

def from_pretrained(

113

cls,

114

repo_id: str,

115

filename: Optional[str] = None,

116

*,

117

additional_files: Optional[List[str]] = None,

118

local_dir: Optional[str] = None,

119

local_dir_use_symlinks: bool = True,

120

cache_dir: Optional[str] = None,

121

**kwargs

122

) -> "Llama":

123

"""

124

Create a Llama model instance from a Hugging Face Hub repository.

125

126

Args:

127

repo_id: Repository identifier on Hugging Face Hub

128

filename: Specific model file to download (auto-detected if None)

129

additional_files: Additional files to download (e.g., tokenizer files)

130

local_dir: Local directory to save files (uses cache if None)

131

local_dir_use_symlinks: Use symlinks in local directory

132

cache_dir: Cache directory for downloaded files

133

**kwargs: Additional arguments passed to Llama.__init__()

134

135

Returns:

136

Initialized Llama model instance

137

138

Raises:

139

ImportError: If huggingface-hub package is not installed

140

FileNotFoundError: If specified file is not found in repository

141

"""

142

```

143

144

### Text Completion

145

146

Generate text completions with fine-grained control over sampling parameters and output format, compatible with OpenAI completion API.

147

148

```python { .api }

149

def create_completion(

150

self,

151

prompt: str,

152

suffix: Optional[str] = None,

153

max_tokens: Optional[int] = 16,

154

temperature: float = 0.8,

155

top_p: float = 0.95,

156

min_p: float = 0.05,

157

typical_p: float = 1.0,

158

logprobs: Optional[int] = None,

159

echo: bool = False,

160

stop: Optional[Union[str, List[str]]] = [],

161

frequency_penalty: float = 0.0,

162

presence_penalty: float = 0.0,

163

repeat_penalty: float = 1.0,

164

top_k: int = 40,

165

stream: bool = False,

166

seed: Optional[int] = None,

167

tfs_z: float = 1.0,

168

mirostat_mode: int = 0,

169

mirostat_tau: float = 5.0,

170

mirostat_eta: float = 0.1,

171

model: Optional[str] = None,

172

stopping_criteria: Optional[object] = None,

173

logits_processor: Optional[object] = None,

174

grammar: Optional[object] = None,

175

logit_bias: Optional[Dict[str, float]] = None,

176

**kwargs

177

) -> CreateCompletionResponse:

178

"""

179

Create a text completion.

180

181

Args:

182

prompt: Input text prompt

183

suffix: Text to append after completion

184

max_tokens: Maximum tokens to generate

185

temperature: Sampling temperature (0.0-2.0)

186

top_p: Nucleus sampling probability threshold

187

min_p: Minimum probability threshold

188

typical_p: Typical sampling parameter

189

logprobs: Number of log probabilities to return

190

echo: Include prompt in response

191

stop: Stop sequences (string or list)

192

frequency_penalty: Frequency penalty (-2.0 to 2.0)

193

presence_penalty: Presence penalty (-2.0 to 2.0)

194

repeat_penalty: Repetition penalty multiplier

195

top_k: Top-k sampling parameter

196

stream: Enable streaming response

197

seed: Random seed for sampling

198

tfs_z: Tail-free sampling parameter

199

mirostat_mode: Mirostat sampling mode (0/1/2)

200

mirostat_tau: Mirostat target entropy

201

mirostat_eta: Mirostat learning rate

202

model: Model name for response metadata

203

stopping_criteria: Custom stopping criteria

204

logits_processor: Custom logits processor

205

grammar: Grammar constraints

206

logit_bias: Token bias adjustments

207

208

Returns:

209

Completion response with generated text and metadata

210

"""

211

```

212

213

### Embeddings

214

215

Generate dense vector representations of text for semantic similarity, clustering, and retrieval applications.

216

217

```python { .api }

218

def create_embedding(

219

self,

220

input: Union[str, List[str]],

221

model: Optional[str] = None,

222

encoding_format: str = "float",

223

**kwargs

224

) -> CreateEmbeddingResponse:

225

"""

226

Create text embeddings.

227

228

Args:

229

input: Text string or list of strings to embed

230

model: Model name for response metadata

231

encoding_format: Output format ("float" or "base64")

232

233

Returns:

234

Embedding response with vector representations

235

"""

236

237

def embed(

238

self,

239

input: str,

240

normalize: bool = True

241

) -> List[float]:

242

"""

243

Generate embeddings for a single text input.

244

245

Args:

246

input: Text to embed

247

normalize: Normalize embedding vector to unit length

248

249

Returns:

250

List of embedding values

251

"""

252

```

253

254

### Tokenization

255

256

Convert between text and token representations using the model's native tokenizer.

257

258

```python { .api }

259

def tokenize(

260

self,

261

text: str,

262

add_bos: bool = True,

263

special: bool = False

264

) -> List[int]:

265

"""

266

Convert text to token IDs.

267

268

Args:

269

text: Input text to tokenize

270

add_bos: Add beginning-of-sequence token

271

special: Allow special tokens in output

272

273

Returns:

274

List of token IDs

275

"""

276

277

def detokenize(

278

self,

279

tokens: List[int],

280

decode: bool = True

281

) -> str:

282

"""

283

Convert token IDs to text.

284

285

Args:

286

tokens: List of token IDs

287

decode: Decode bytes to string

288

289

Returns:

290

Decoded text string

291

"""

292

```

293

294

### State Management

295

296

Save and restore model context states for efficient caching and continuation of conversations.

297

298

```python { .api }

299

def save_state(self) -> LlamaState:

300

"""

301

Save current model state.

302

303

Returns:

304

Serializable state object

305

"""

306

307

def load_state(self, state: LlamaState) -> None:

308

"""

309

Load previously saved model state.

310

311

Args:

312

state: State object from save_state()

313

"""

314

315

def reset(self) -> None:

316

"""

317

Reset model context to initial state.

318

"""

319

```

320

321

### Configuration and Properties

322

323

Access model metadata and configuration settings.

324

325

```python { .api }

326

@property

327

def n_ctx(self) -> int:

328

"""Context window size in tokens."""

329

330

@property

331

def n_embd(self) -> int:

332

"""Model embedding dimensions."""

333

334

@property

335

def n_vocab(self) -> int:

336

"""Vocabulary size."""

337

338

@property

339

def tokenizer(self) -> object:

340

"""Tokenizer instance."""

341

342

@property

343

def token_eos(self) -> int:

344

"""End-of-sequence token ID."""

345

346

@property

347

def token_bos(self) -> int:

348

"""Beginning-of-sequence token ID."""

349

350

@property

351

def token_nl(self) -> int:

352

"""Newline token ID."""

353

354

def set_seed(self, seed: int) -> None:

355

"""

356

Set random seed for sampling.

357

358

Args:

359

seed: Random seed value

360

"""

361

362

def set_cache(self, cache: object) -> None:

363

"""

364

Set caching implementation.

365

366

Args:

367

cache: Cache instance (LlamaRAMCache or LlamaDiskCache)

368

"""

369

```

370

371

### Low-Level Generation

372

373

Direct token-level generation and sampling for advanced use cases.

374

375

```python { .api }

376

def eval(self, tokens: List[int]) -> None:

377

"""

378

Evaluate tokens and update model context.

379

380

Args:

381

tokens: Token sequence to evaluate

382

"""

383

384

def sample(

385

self,

386

top_k: int = 40,

387

top_p: float = 0.95,

388

min_p: float = 0.05,

389

typical_p: float = 1.0,

390

temp: float = 0.80,

391

repeat_penalty: float = 1.0,

392

frequency_penalty: float = 0.0,

393

presence_penalty: float = 0.0,

394

tfs_z: float = 1.0,

395

mirostat_mode: int = 0,

396

mirostat_tau: float = 5.0,

397

mirostat_eta: float = 0.1,

398

penalize_nl: bool = True,

399

logits_processor: Optional[object] = None,

400

grammar: Optional[object] = None

401

) -> int:

402

"""

403

Sample next token from current context.

404

405

Args:

406

top_k: Top-k sampling parameter

407

top_p: Top-p (nucleus) sampling parameter

408

min_p: Minimum probability threshold

409

typical_p: Typical sampling parameter

410

temp: Sampling temperature

411

repeat_penalty: Repetition penalty multiplier

412

frequency_penalty: Frequency penalty

413

presence_penalty: Presence penalty

414

tfs_z: Tail-free sampling parameter

415

mirostat_mode: Mirostat sampling mode

416

mirostat_tau: Mirostat target entropy

417

mirostat_eta: Mirostat learning rate

418

penalize_nl: Apply penalty to newline tokens

419

logits_processor: Custom logits processor

420

grammar: Grammar constraints

421

422

Returns:

423

Sampled token ID

424

"""

425

426

def generate(

427

self,

428

tokens: List[int],

429

top_k: int = 40,

430

top_p: float = 0.95,

431

min_p: float = 0.05,

432

typical_p: float = 1.0,

433

temp: float = 0.80,

434

repeat_penalty: float = 1.0,

435

reset: bool = True,

436

frequency_penalty: float = 0.0,

437

presence_penalty: float = 0.0,

438

tfs_z: float = 1.0,

439

mirostat_mode: int = 0,

440

mirostat_tau: float = 5.0,

441

mirostat_eta: float = 0.1,

442

stopping_criteria: Optional[object] = None,

443

logits_processor: Optional[object] = None,

444

grammar: Optional[object] = None

445

) -> Generator[int, None, None]:

446

"""

447

Generate token sequence.

448

449

Args:

450

tokens: Initial token sequence

451

top_k: Top-k sampling parameter

452

top_p: Top-p sampling parameter

453

min_p: Minimum probability threshold

454

typical_p: Typical sampling parameter

455

temp: Temperature

456

repeat_penalty: Repetition penalty

457

reset: Reset context before generation

458

frequency_penalty: Frequency penalty

459

presence_penalty: Presence penalty

460

tfs_z: Tail-free sampling parameter

461

mirostat_mode: Mirostat mode

462

mirostat_tau: Mirostat tau

463

mirostat_eta: Mirostat eta

464

stopping_criteria: Custom stopping criteria

465

logits_processor: Custom logits processor

466

grammar: Grammar constraints

467

468

Yields:

469

Generated token IDs

470

"""

471

```

472

473

## Types

474

475

```python { .api }

476

class LlamaState:

477

"""Serializable model state for persistence."""

478

479

def __init__(self, llama_state): ...

480

481

# Logits processing

482

class LogitsProcessor:

483

"""Base class for logits processing."""

484

485

def __call__(self, input_ids: List[int], scores: List[float]) -> List[float]: ...

486

487

class LogitsProcessorList:

488

"""List of logits processors."""

489

490

def __init__(self, processors: List[LogitsProcessor]): ...

491

def __call__(self, input_ids: List[int], scores: List[float]) -> List[float]: ...

492

493

class MinTokensLogitsProcessor(LogitsProcessor):

494

"""Ensures minimum number of tokens are generated."""

495

496

def __init__(self, min_tokens: int, eos_token_id: int): ...

497

498

# Stopping criteria

499

class StoppingCriteria:

500

"""Base class for stopping criteria."""

501

502

def __call__(self, input_ids: List[int], scores: List[float]) -> bool: ...

503

504

class StoppingCriteriaList:

505

"""List of stopping criteria."""

506

507

def __init__(self, criteria: List[StoppingCriteria]): ...

508

def __call__(self, input_ids: List[int], scores: List[float]) -> bool: ...

509

```

510

511

## Usage Examples

512

513

### Basic Model Loading and Generation

514

515

```python

516

from llama_cpp import Llama

517

518

# Load model with basic configuration

519

llm = Llama(

520

model_path="./models/llama-2-7b-chat.gguf",

521

n_ctx=2048,

522

n_threads=8,

523

)

524

525

# Simple text completion

526

response = llm.create_completion(

527

prompt="The future of artificial intelligence is",

528

max_tokens=50,

529

temperature=0.7,

530

)

531

print(response['choices'][0]['text'])

532

```

533

534

### GPU Acceleration

535

536

```python

537

# Offload layers to GPU for faster inference

538

llm = Llama(

539

model_path="./models/llama-2-13b-chat.gguf",

540

n_gpu_layers=35, # Offload most layers to GPU

541

n_ctx=4096,

542

offload_kqv=True, # Keep the key/value cache on the GPU

543

)

544

```

545

546

### State Management

547

548

```python

549

# Save and restore conversation state

550

llm = Llama(model_path="./model.gguf")

551

552

# Generate some text

553

llm.create_completion(prompt="Hello, my name is")

554

555

# Save current state

556

state = llm.save_state()

557

558

# Continue conversation

559

llm.create_completion(prompt=" and I like")

560

561

# Restore to previous state

562

llm.load_state(state)

563

```

564

565

### Custom Sampling Parameters

566

567

```python

568

# Fine-tune generation with advanced sampling

569

response = llm.create_completion(

570

prompt="Write a creative story:",

571

max_tokens=200,

572

temperature=0.9, # High creativity

573

top_p=0.9, # Nucleus sampling

574

top_k=50, # Top-k sampling

575

repeat_penalty=1.15, # Reduce repetition

576

frequency_penalty=0.1,

577

presence_penalty=0.1,

578

)

579

```