# Tokenization

Native llama.cpp tokenization and HuggingFace tokenizer integration supporting different vocabulary types, encoding/decoding operations, and model-specific preprocessing.

## Capabilities

### Native Tokenization

Use the model's built-in tokenizer for consistent text processing.
```python { .api }
class LlamaTokenizer:
    def __init__(self, llama: "Llama"):
        """
        Initialize tokenizer with Llama model instance.

        Args:
            llama: Llama model instance
        """

    def tokenize(
        self,
        text: str,
        add_bos: bool = True,
        special: bool = False
    ) -> List[int]:
        """
        Convert text to token IDs.

        Args:
            text: Input text to tokenize
            add_bos: Add beginning-of-sequence token
            special: Allow special tokens in output

        Returns:
            List of token IDs
        """

    def detokenize(
        self,
        tokens: List[int],
        decode: bool = True
    ) -> str:
        """
        Convert token IDs to text.

        Args:
            tokens: List of token IDs to convert
            decode: Decode bytes to string

        Returns:
            Decoded text string
        """

    def encode(
        self,
        text: str,
        add_bos: bool = True,
        special: bool = False
    ) -> List[int]:
        """
        Encode text to tokens (alias for tokenize).

        Args:
            text: Text to encode
            add_bos: Add beginning-of-sequence token
            special: Allow special tokens

        Returns:
            List of token IDs
        """

    def decode(
        self,
        tokens: List[int],
        **kwargs
    ) -> str:
        """
        Decode tokens to text (alias for detokenize).

        Args:
            tokens: Token IDs to decode
            **kwargs: Additional decoding parameters

        Returns:
            Decoded text
        """

    @classmethod
    def from_ggml_file(cls, path: str) -> "LlamaTokenizer":
        """
        Create tokenizer from GGML tokenizer file.

        Args:
            path: Path to GGML tokenizer file

        Returns:
            LlamaTokenizer instance
        """
```
### HuggingFace Tokenizer Integration

Use HuggingFace tokenizers for compatibility with Transformers ecosystem.

```python { .api }
class LlamaHFTokenizer:
    def __init__(self, hf_tokenizer):
        """
        Initialize with HuggingFace tokenizer.

        Args:
            hf_tokenizer: HuggingFace tokenizer instance
        """

    def tokenize(
        self,
        text: str,
        add_bos: bool = True,
        special: bool = False
    ) -> List[int]:
        """
        Tokenize text using HuggingFace tokenizer.

        Args:
            text: Input text
            add_bos: Add beginning-of-sequence token
            special: Allow special tokens

        Returns:
            List of token IDs
        """

    def detokenize(
        self,
        tokens: List[int],
        decode: bool = True
    ) -> str:
        """
        Detokenize using HuggingFace tokenizer.

        Args:
            tokens: Token IDs to decode
            decode: Decode to string

        Returns:
            Decoded text
        """

    @classmethod
    def from_pretrained(
        cls,
        pretrained_model_name_or_path: str,
        **kwargs
    ) -> "LlamaHFTokenizer":
        """
        Load tokenizer from HuggingFace model.

        Args:
            pretrained_model_name_or_path: Model name or path
            **kwargs: Additional tokenizer arguments

        Returns:
            LlamaHFTokenizer instance
        """
```
### Base Tokenizer Interface

Abstract base class for tokenizer implementations.

```python { .api }
class BaseLlamaTokenizer:
    """Abstract base class for tokenizer implementations."""

    def tokenize(
        self,
        text: str,
        add_bos: bool = True,
        special: bool = False
    ) -> List[int]:
        """Convert text to tokens."""

    def detokenize(
        self,
        tokens: List[int],
        decode: bool = True
    ) -> str:
        """Convert tokens to text."""

    def encode(
        self,
        text: str,
        add_bos: bool = True,
        special: bool = False
    ) -> List[int]:
        """Encode text (alias for tokenize)."""

    def decode(self, tokens: List[int], **kwargs) -> str:
        """Decode tokens (alias for detokenize)."""
```
## Vocabulary Type Constants

```python { .api }
# Vocabulary types supported by llama.cpp
LLAMA_VOCAB_TYPE_NONE: int  # No vocabulary
LLAMA_VOCAB_TYPE_SPM: int   # SentencePiece model
LLAMA_VOCAB_TYPE_BPE: int   # Byte pair encoding
LLAMA_VOCAB_TYPE_WPM: int   # WordPiece model
LLAMA_VOCAB_TYPE_UGM: int   # Unigram model
LLAMA_VOCAB_TYPE_RWKV: int  # RWKV tokenizer
```

## Preprocessing Type Constants

```python { .api }
# Text preprocessing types for different models
LLAMA_VOCAB_PRE_TYPE_DEFAULT: int         # Default preprocessing
LLAMA_VOCAB_PRE_TYPE_LLAMA3: int          # Llama 3 preprocessing
LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_LLM: int    # DeepSeek preprocessing
LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_CODER: int  # DeepSeek Coder preprocessing
LLAMA_VOCAB_PRE_TYPE_FALCON: int          # Falcon preprocessing
LLAMA_VOCAB_PRE_TYPE_MPT: int             # MPT preprocessing
LLAMA_VOCAB_PRE_TYPE_STARCODER: int       # StarCoder preprocessing
LLAMA_VOCAB_PRE_TYPE_GPT2: int            # GPT-2 preprocessing
LLAMA_VOCAB_PRE_TYPE_REFACT: int          # Refact preprocessing
LLAMA_VOCAB_PRE_TYPE_COMMAND_R: int       # Command-R preprocessing
LLAMA_VOCAB_PRE_TYPE_QWEN2: int           # Qwen2 preprocessing
LLAMA_VOCAB_PRE_TYPE_OLMO: int            # OLMo preprocessing
LLAMA_VOCAB_PRE_TYPE_DBRX: int            # DBRX preprocessing
LLAMA_VOCAB_PRE_TYPE_SMAUG: int           # Smaug preprocessing
LLAMA_VOCAB_PRE_TYPE_PORO: int            # Poro preprocessing
LLAMA_VOCAB_PRE_TYPE_CHATGLM3: int        # ChatGLM3 preprocessing
LLAMA_VOCAB_PRE_TYPE_CHATGLM4: int        # ChatGLM4 preprocessing
LLAMA_VOCAB_PRE_TYPE_VIKING: int          # Viking preprocessing
LLAMA_VOCAB_PRE_TYPE_JAIS: int            # Jais preprocessing
LLAMA_VOCAB_PRE_TYPE_TEKKEN: int          # Tekken preprocessing
LLAMA_VOCAB_PRE_TYPE_SMOLLM: int          # SmolLM preprocessing
LLAMA_VOCAB_PRE_TYPE_CODESHELL: int       # CodeShell preprocessing
LLAMA_VOCAB_PRE_TYPE_BLOOM: int           # BLOOM preprocessing
LLAMA_VOCAB_PRE_TYPE_GPT3_FINNISH: int    # GPT-3 Finnish preprocessing
LLAMA_VOCAB_PRE_TYPE_EXAONE: int          # EXAONE preprocessing
```
## Usage Examples

### Basic Tokenization

```python
from llama_cpp import Llama

# Initialize model with tokenizer access
llm = Llama(model_path="./models/llama-2-7b.gguf")

# Tokenize text
text = "Hello, world! How are you today?"
tokens = llm.tokenize(text, add_bos=True)
print(f"Tokens: {tokens}")
print(f"Token count: {len(tokens)}")

# Detokenize back to text
decoded = llm.detokenize(tokens)
print(f"Decoded: {decoded}")
```

### Native Tokenizer Usage

```python
from llama_cpp.llama_tokenizer import LlamaTokenizer

# Create standalone tokenizer
tokenizer = LlamaTokenizer.from_ggml_file("./tokenizer.ggml")

# Tokenize without BOS token
tokens = tokenizer.tokenize("Python is awesome", add_bos=False)
print(f"Without BOS: {tokens}")

# Tokenize with BOS token
tokens_bos = tokenizer.tokenize("Python is awesome", add_bos=True)
print(f"With BOS: {tokens_bos}")

# Handle special tokens
tokens_special = tokenizer.tokenize(
    "<|im_start|>user\nHello<|im_end|>",
    special=True
)
print(f"Special tokens: {tokens_special}")
```

### HuggingFace Integration

```python
from llama_cpp.llama_tokenizer import LlamaHFTokenizer

# Load HuggingFace tokenizer
hf_tokenizer = LlamaHFTokenizer.from_pretrained(
    "microsoft/DialoGPT-medium",
    use_fast=True
)

# Use with consistent interface
text = "Tell me a joke about programming"
tokens = hf_tokenizer.tokenize(text)
decoded = hf_tokenizer.detokenize(tokens)

print(f"Original: {text}")
print(f"Tokens: {tokens}")
print(f"Decoded: {decoded}")
```

### Token Analysis

```python
# Analyze tokenization behavior
texts = [
    "Hello world",
    "Hello, world!",
    "Hello world.",
    "HelloWorld",
    "HELLO WORLD",
]

for text in texts:
    tokens = llm.tokenize(text, add_bos=False)
    print(f"'{text}' -> {len(tokens)} tokens: {tokens}")
```

### Batch Processing

```python
# Process multiple texts efficiently
texts = [
    "First example text",
    "Second example with more words",
    "Third text for processing",
]

# Tokenize all texts
all_tokens = []
for text in texts:
    tokens = llm.tokenize(text, add_bos=True)
    all_tokens.append(tokens)
    print(f"'{text}' -> {len(tokens)} tokens")

# Find maximum length for padding
max_length = max(len(tokens) for tokens in all_tokens)
print(f"Maximum token length: {max_length}")
```

### Special Token Handling

```python
# Check special token IDs
print(f"BOS token: {llm.token_bos}")
print(f"EOS token: {llm.token_eos}")
print(f"Newline token: {llm.token_nl}")

# Create text with explicit special tokens
text_with_special = "<|begin_of_text|>Hello<|end_of_text|>"
tokens = llm.tokenize(text_with_special, special=True)
print(f"With special tokens: {tokens}")

# Compare with normal tokenization
tokens_normal = llm.tokenize(text_with_special, special=False)
print(f"Normal tokenization: {tokens_normal}")
```

### Vocabulary Analysis

```python
# Get vocabulary information
print(f"Vocabulary size: {llm.n_vocab}")
print(f"Context size: {llm.n_ctx}")

# Sample some token IDs and their text representations
import random

sample_ids = random.sample(range(min(1000, llm.n_vocab)), 10)
for token_id in sample_ids:
    try:
        text = llm.detokenize([token_id])
        print(f"Token {token_id}: '{text}'")
    except Exception:
        print(f"Token {token_id}: <unable to decode>")
```

### Custom Tokenizer Integration

```python
from llama_cpp.llama_tokenizer import BaseLlamaTokenizer

class CustomTokenizer(BaseLlamaTokenizer):
    def __init__(self, base_tokenizer):
        self.base_tokenizer = base_tokenizer

    def tokenize(self, text, add_bos=True, special=False):
        # Add custom preprocessing
        processed_text = text.lower().strip()
        return self.base_tokenizer.tokenize(processed_text, add_bos, special)

    def detokenize(self, tokens, decode=True):
        return self.base_tokenizer.detokenize(tokens, decode)

# Use custom tokenizer
custom_tokenizer = CustomTokenizer(llm)
tokens = custom_tokenizer.tokenize("HELLO WORLD!")
print(f"Custom tokenized: {tokens}")
```