docs/tokenization.md

# Tokenization

Native llama.cpp tokenization and HuggingFace tokenizer integration supporting different vocabulary types, encoding/decoding operations, and model-specific preprocessing.

## Capabilities

### Native Tokenization

Use the model's built-in tokenizer for consistent text processing.

9

10

```python { .api }
class LlamaTokenizer:
    def __init__(self, llama: "Llama"):
        """
        Initialize tokenizer with Llama model instance.

        Args:
            llama: Llama model instance
        """

    def tokenize(
        self,
        text: str,
        add_bos: bool = True,
        special: bool = False
    ) -> List[int]:
        """
        Convert text to token IDs.

        Args:
            text: Input text to tokenize
            add_bos: Add beginning-of-sequence token
            special: Allow special tokens in output

        Returns:
            List of token IDs
        """

    def detokenize(
        self,
        tokens: List[int],
        decode: bool = True
    ) -> str:
        """
        Convert token IDs to text.

        Args:
            tokens: List of token IDs to convert
            decode: Decode bytes to string

        Returns:
            Decoded text string
        """

    def encode(
        self,
        text: str,
        add_bos: bool = True,
        special: bool = False
    ) -> List[int]:
        """
        Encode text to tokens (alias for tokenize).

        Args:
            text: Text to encode
            add_bos: Add beginning-of-sequence token
            special: Allow special tokens

        Returns:
            List of token IDs
        """

    def decode(
        self,
        tokens: List[int],
        **kwargs
    ) -> str:
        """
        Decode tokens to text (alias for detokenize).

        Args:
            tokens: Token IDs to decode
            **kwargs: Additional decoding parameters

        Returns:
            Decoded text
        """

    @classmethod
    def from_ggml_file(cls, path: str) -> "LlamaTokenizer":
        """
        Create tokenizer from GGML tokenizer file.

        Args:
            path: Path to GGML tokenizer file

        Returns:
            LlamaTokenizer instance
        """
```

100

101

### HuggingFace Tokenizer Integration

Use HuggingFace tokenizers for compatibility with Transformers ecosystem.

104

105

```python { .api }
class LlamaHFTokenizer:
    def __init__(self, hf_tokenizer):
        """
        Initialize with HuggingFace tokenizer.

        Args:
            hf_tokenizer: HuggingFace tokenizer instance
        """

    def tokenize(
        self,
        text: str,
        add_bos: bool = True,
        special: bool = False
    ) -> List[int]:
        """
        Tokenize text using HuggingFace tokenizer.

        Args:
            text: Input text
            add_bos: Add beginning-of-sequence token
            special: Allow special tokens

        Returns:
            List of token IDs
        """

    def detokenize(
        self,
        tokens: List[int],
        decode: bool = True
    ) -> str:
        """
        Detokenize using HuggingFace tokenizer.

        Args:
            tokens: Token IDs to decode
            decode: Decode to string

        Returns:
            Decoded text
        """

    @classmethod
    def from_pretrained(
        cls,
        pretrained_model_name_or_path: str,
        **kwargs
    ) -> "LlamaHFTokenizer":
        """
        Load tokenizer from HuggingFace model.

        Args:
            pretrained_model_name_or_path: Model name or path
            **kwargs: Additional tokenizer arguments

        Returns:
            LlamaHFTokenizer instance
        """
```

166

167

### Base Tokenizer Interface

Abstract base class for tokenizer implementations.

170

171

```python { .api }
class BaseLlamaTokenizer:
    """Abstract base class for tokenizer implementations."""

    def tokenize(
        self,
        text: str,
        add_bos: bool = True,
        special: bool = False
    ) -> List[int]:
        """Convert text to tokens."""

    def detokenize(
        self,
        tokens: List[int],
        decode: bool = True
    ) -> str:
        """Convert tokens to text."""

    def encode(
        self,
        text: str,
        add_bos: bool = True,
        special: bool = False
    ) -> List[int]:
        """Encode text (alias for tokenize)."""

    def decode(self, tokens: List[int], **kwargs) -> str:
        """Decode tokens (alias for detokenize)."""
```

201

202

## Vocabulary Type Constants

```python { .api }
# Vocabulary types supported by llama.cpp
LLAMA_VOCAB_TYPE_NONE: int  # No vocabulary
LLAMA_VOCAB_TYPE_SPM: int   # SentencePiece model
LLAMA_VOCAB_TYPE_BPE: int   # Byte pair encoding
LLAMA_VOCAB_TYPE_WPM: int   # WordPiece model
LLAMA_VOCAB_TYPE_UGM: int   # Unigram model
LLAMA_VOCAB_TYPE_RWKV: int  # RWKV tokenizer
```

213

214

## Preprocessing Type Constants

```python { .api }
# Text preprocessing types for different models
LLAMA_VOCAB_PRE_TYPE_DEFAULT: int         # Default preprocessing
LLAMA_VOCAB_PRE_TYPE_LLAMA3: int          # Llama 3 preprocessing
LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_LLM: int    # DeepSeek preprocessing
LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_CODER: int  # DeepSeek Coder preprocessing
LLAMA_VOCAB_PRE_TYPE_FALCON: int          # Falcon preprocessing
LLAMA_VOCAB_PRE_TYPE_MPT: int             # MPT preprocessing
LLAMA_VOCAB_PRE_TYPE_STARCODER: int       # StarCoder preprocessing
LLAMA_VOCAB_PRE_TYPE_GPT2: int            # GPT-2 preprocessing
LLAMA_VOCAB_PRE_TYPE_REFACT: int          # Refact preprocessing
LLAMA_VOCAB_PRE_TYPE_COMMAND_R: int       # Command-R preprocessing
LLAMA_VOCAB_PRE_TYPE_QWEN2: int           # Qwen2 preprocessing
LLAMA_VOCAB_PRE_TYPE_OLMO: int            # OLMo preprocessing
LLAMA_VOCAB_PRE_TYPE_DBRX: int            # DBRX preprocessing
LLAMA_VOCAB_PRE_TYPE_SMAUG: int           # Smaug preprocessing
LLAMA_VOCAB_PRE_TYPE_PORO: int            # Poro preprocessing
LLAMA_VOCAB_PRE_TYPE_CHATGLM3: int        # ChatGLM3 preprocessing
LLAMA_VOCAB_PRE_TYPE_CHATGLM4: int        # ChatGLM4 preprocessing
LLAMA_VOCAB_PRE_TYPE_VIKING: int          # Viking preprocessing
LLAMA_VOCAB_PRE_TYPE_JAIS: int            # Jais preprocessing
LLAMA_VOCAB_PRE_TYPE_TEKKEN: int          # Tekken preprocessing
LLAMA_VOCAB_PRE_TYPE_SMOLLM: int          # SmolLM preprocessing
LLAMA_VOCAB_PRE_TYPE_CODESHELL: int       # CodeShell preprocessing
LLAMA_VOCAB_PRE_TYPE_BLOOM: int           # BLOOM preprocessing
LLAMA_VOCAB_PRE_TYPE_GPT3_FINNISH: int    # GPT-3 Finnish preprocessing
LLAMA_VOCAB_PRE_TYPE_EXAONE: int          # EXAONE preprocessing
```

244

245

## Usage Examples

### Basic Tokenization

```python
from llama_cpp import Llama

# Initialize model with tokenizer access
llm = Llama(model_path="./models/llama-2-7b.gguf")

# Tokenize text
text = "Hello, world! How are you today?"
tokens = llm.tokenize(text, add_bos=True)
print(f"Tokens: {tokens}")
print(f"Token count: {len(tokens)}")

# Detokenize back to text
decoded = llm.detokenize(tokens)
print(f"Decoded: {decoded}")
```

265

266

### Native Tokenizer Usage

```python
from llama_cpp.llama_tokenizer import LlamaTokenizer

# Create standalone tokenizer
tokenizer = LlamaTokenizer.from_ggml_file("./tokenizer.ggml")

# Tokenize without BOS token
tokens = tokenizer.tokenize("Python is awesome", add_bos=False)
print(f"Without BOS: {tokens}")

# Tokenize with BOS token
tokens_bos = tokenizer.tokenize("Python is awesome", add_bos=True)
print(f"With BOS: {tokens_bos}")

# Handle special tokens
tokens_special = tokenizer.tokenize(
    "<|im_start|>user\nHello<|im_end|>",
    special=True
)
print(f"Special tokens: {tokens_special}")
```

289

290

### HuggingFace Integration

```python
from llama_cpp.llama_tokenizer import LlamaHFTokenizer

# Load HuggingFace tokenizer
hf_tokenizer = LlamaHFTokenizer.from_pretrained(
    "microsoft/DialoGPT-medium",
    use_fast=True
)

# Use with consistent interface
text = "Tell me a joke about programming"
tokens = hf_tokenizer.tokenize(text)
decoded = hf_tokenizer.detokenize(tokens)

print(f"Original: {text}")
print(f"Tokens: {tokens}")
print(f"Decoded: {decoded}")
```

310

311

### Token Analysis

```python
# Analyze tokenization behavior
texts = [
    "Hello world",
    "Hello, world!",
    "Hello world.",
    "HelloWorld",
    "HELLO WORLD",
]

for text in texts:
    tokens = llm.tokenize(text, add_bos=False)
    print(f"'{text}' -> {len(tokens)} tokens: {tokens}")
```

327

328

### Batch Processing

```python
# Process multiple texts efficiently
texts = [
    "First example text",
    "Second example with more words",
    "Third text for processing",
]

# Tokenize all texts
all_tokens = []
for text in texts:
    tokens = llm.tokenize(text, add_bos=True)
    all_tokens.append(tokens)
    print(f"'{text}' -> {len(tokens)} tokens")

# Find maximum length for padding
max_length = max(len(tokens) for tokens in all_tokens)
print(f"Maximum token length: {max_length}")
```

349

350

### Special Token Handling

```python
# Check special token IDs
print(f"BOS token: {llm.token_bos}")
print(f"EOS token: {llm.token_eos}")
print(f"Newline token: {llm.token_nl}")

# Create text with explicit special tokens
text_with_special = "<|begin_of_text|>Hello<|end_of_text|>"
tokens = llm.tokenize(text_with_special, special=True)
print(f"With special tokens: {tokens}")

# Compare with normal tokenization
tokens_normal = llm.tokenize(text_with_special, special=False)
print(f"Normal tokenization: {tokens_normal}")
```

367

368

### Vocabulary Analysis

```python
# Get vocabulary information
print(f"Vocabulary size: {llm.n_vocab}")
print(f"Context size: {llm.n_ctx}")

# Sample some token IDs and their text representations
import random

sample_ids = random.sample(range(min(1000, llm.n_vocab)), 10)
for token_id in sample_ids:
    try:
        text = llm.detokenize([token_id])
        print(f"Token {token_id}: '{text}'")
    except Exception:
        print(f"Token {token_id}: <unable to decode>")
```

386

387

### Custom Tokenizer Integration

```python
from llama_cpp.llama_tokenizer import BaseLlamaTokenizer

class CustomTokenizer(BaseLlamaTokenizer):
    def __init__(self, base_tokenizer):
        self.base_tokenizer = base_tokenizer

    def tokenize(self, text, add_bos=True, special=False):
        # Add custom preprocessing
        processed_text = text.lower().strip()
        return self.base_tokenizer.tokenize(processed_text, add_bos, special)

    def detokenize(self, tokens, decode=True):
        return self.base_tokenizer.detokenize(tokens, decode)

# Use custom tokenizer
custom_tokenizer = CustomTokenizer(llm)
tokens = custom_tokenizer.tokenize("HELLO WORLD!")
print(f"Custom tokenized: {tokens}")
```