# Tokenization

Native llama.cpp tokenization and HuggingFace tokenizer integration supporting different vocabulary types, encoding/decoding operations, and model-specific preprocessing.

## Capabilities

### Native Tokenization

Use the model's built-in tokenizer for consistent text processing.
```python { .api }
class LlamaTokenizer:
    def __init__(self, llama: "Llama"):
        """
        Initialize tokenizer with Llama model instance.

        Args:
            llama: Llama model instance
        """

    def tokenize(
        self,
        text: str,
        add_bos: bool = True,
        special: bool = False
    ) -> List[int]:
        """
        Convert text to token IDs.

        Args:
            text: Input text to tokenize
            add_bos: Add beginning-of-sequence token
            special: Allow special tokens in output

        Returns:
            List of token IDs
        """

    def detokenize(
        self,
        tokens: List[int],
        decode: bool = True
    ) -> str:
        """
        Convert token IDs to text.

        Args:
            tokens: List of token IDs to convert
            decode: Decode bytes to string

        Returns:
            Decoded text string
        """

    def encode(
        self,
        text: str,
        add_bos: bool = True,
        special: bool = False
    ) -> List[int]:
        """
        Encode text to tokens (alias for tokenize).

        Args:
            text: Text to encode
            add_bos: Add beginning-of-sequence token
            special: Allow special tokens

        Returns:
            List of token IDs
        """

    def decode(
        self,
        tokens: List[int],
        **kwargs
    ) -> str:
        """
        Decode tokens to text (alias for detokenize).

        Args:
            tokens: Token IDs to decode
            **kwargs: Additional decoding parameters

        Returns:
            Decoded text
        """

    @classmethod
    def from_ggml_file(cls, path: str) -> "LlamaTokenizer":
        """
        Create tokenizer from GGML tokenizer file.

        Args:
            path: Path to GGML tokenizer file

        Returns:
            LlamaTokenizer instance
        """
```
### HuggingFace Tokenizer Integration

Use HuggingFace tokenizers for compatibility with Transformers ecosystem.

```python { .api }
class LlamaHFTokenizer:
    def __init__(self, hf_tokenizer):
        """
        Initialize with HuggingFace tokenizer.

        Args:
            hf_tokenizer: HuggingFace tokenizer instance
        """

    def tokenize(
        self,
        text: str,
        add_bos: bool = True,
        special: bool = False
    ) -> List[int]:
        """
        Tokenize text using HuggingFace tokenizer.

        Args:
            text: Input text
            add_bos: Add beginning-of-sequence token
            special: Allow special tokens

        Returns:
            List of token IDs
        """

    def detokenize(
        self,
        tokens: List[int],
        decode: bool = True
    ) -> str:
        """
        Detokenize using HuggingFace tokenizer.

        Args:
            tokens: Token IDs to decode
            decode: Decode to string

        Returns:
            Decoded text
        """

    @classmethod
    def from_pretrained(
        cls,
        pretrained_model_name_or_path: str,
        **kwargs
    ) -> "LlamaHFTokenizer":
        """
        Load tokenizer from HuggingFace model.

        Args:
            pretrained_model_name_or_path: Model name or path
            **kwargs: Additional tokenizer arguments

        Returns:
            LlamaHFTokenizer instance
        """
```
### Base Tokenizer Interface

Abstract base class for tokenizer implementations.

```python { .api }
class BaseLlamaTokenizer:
    """Abstract base class for tokenizer implementations."""

    def tokenize(
        self,
        text: str,
        add_bos: bool = True,
        special: bool = False
    ) -> List[int]:
        """Convert text to tokens."""

    def detokenize(
        self,
        tokens: List[int],
        decode: bool = True
    ) -> str:
        """Convert tokens to text."""

    def encode(
        self,
        text: str,
        add_bos: bool = True,
        special: bool = False
    ) -> List[int]:
        """Encode text (alias for tokenize)."""

    def decode(self, tokens: List[int], **kwargs) -> str:
        """Decode tokens (alias for detokenize)."""
```
## Vocabulary Type Constants

```python { .api }
# Vocabulary types supported by llama.cpp
LLAMA_VOCAB_TYPE_NONE: int  # No vocabulary
LLAMA_VOCAB_TYPE_SPM: int   # SentencePiece model
LLAMA_VOCAB_TYPE_BPE: int   # Byte pair encoding
LLAMA_VOCAB_TYPE_WPM: int   # WordPiece model
LLAMA_VOCAB_TYPE_UGM: int   # Unigram model
LLAMA_VOCAB_TYPE_RWKV: int  # RWKV tokenizer
```

## Preprocessing Type Constants

```python { .api }
# Text preprocessing types for different models
LLAMA_VOCAB_PRE_TYPE_DEFAULT: int         # Default preprocessing
LLAMA_VOCAB_PRE_TYPE_LLAMA3: int          # Llama 3 preprocessing
LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_LLM: int    # DeepSeek preprocessing
LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_CODER: int  # DeepSeek Coder preprocessing
LLAMA_VOCAB_PRE_TYPE_FALCON: int          # Falcon preprocessing
LLAMA_VOCAB_PRE_TYPE_MPT: int             # MPT preprocessing
LLAMA_VOCAB_PRE_TYPE_STARCODER: int       # StarCoder preprocessing
LLAMA_VOCAB_PRE_TYPE_GPT2: int            # GPT-2 preprocessing
LLAMA_VOCAB_PRE_TYPE_REFACT: int          # Refact preprocessing
LLAMA_VOCAB_PRE_TYPE_COMMAND_R: int       # Command-R preprocessing
LLAMA_VOCAB_PRE_TYPE_QWEN2: int           # Qwen2 preprocessing
LLAMA_VOCAB_PRE_TYPE_OLMO: int            # OLMo preprocessing
LLAMA_VOCAB_PRE_TYPE_DBRX: int            # DBRX preprocessing
LLAMA_VOCAB_PRE_TYPE_SMAUG: int           # Smaug preprocessing
LLAMA_VOCAB_PRE_TYPE_PORO: int            # Poro preprocessing
LLAMA_VOCAB_PRE_TYPE_CHATGLM3: int        # ChatGLM3 preprocessing
LLAMA_VOCAB_PRE_TYPE_CHATGLM4: int        # ChatGLM4 preprocessing
LLAMA_VOCAB_PRE_TYPE_VIKING: int          # Viking preprocessing
LLAMA_VOCAB_PRE_TYPE_JAIS: int            # Jais preprocessing
LLAMA_VOCAB_PRE_TYPE_TEKKEN: int          # Tekken preprocessing
LLAMA_VOCAB_PRE_TYPE_SMOLLM: int          # SmolLM preprocessing
LLAMA_VOCAB_PRE_TYPE_CODESHELL: int       # CodeShell preprocessing
LLAMA_VOCAB_PRE_TYPE_BLOOM: int           # BLOOM preprocessing
LLAMA_VOCAB_PRE_TYPE_GPT3_FINNISH: int    # GPT-3 Finnish preprocessing
LLAMA_VOCAB_PRE_TYPE_EXAONE: int          # EXAONE preprocessing
```
## Usage Examples

### Basic Tokenization

```python
from llama_cpp import Llama

# Initialize model with tokenizer access
llm = Llama(model_path="./models/llama-2-7b.gguf")

# Tokenize text
text = "Hello, world! How are you today?"
tokens = llm.tokenize(text, add_bos=True)
print(f"Tokens: {tokens}")
print(f"Token count: {len(tokens)}")

# Detokenize back to text
decoded = llm.detokenize(tokens)
print(f"Decoded: {decoded}")
```

### Native Tokenizer Usage

```python
from llama_cpp.llama_tokenizer import LlamaTokenizer

# Create standalone tokenizer
tokenizer = LlamaTokenizer.from_ggml_file("./tokenizer.ggml")

# Tokenize without BOS token
tokens = tokenizer.tokenize("Python is awesome", add_bos=False)
print(f"Without BOS: {tokens}")

# Tokenize with BOS token
tokens_bos = tokenizer.tokenize("Python is awesome", add_bos=True)
print(f"With BOS: {tokens_bos}")

# Handle special tokens
tokens_special = tokenizer.tokenize(
    "<|im_start|>user\nHello<|im_end|>",
    special=True
)
print(f"Special tokens: {tokens_special}")
```

### HuggingFace Integration

```python
from llama_cpp.llama_tokenizer import LlamaHFTokenizer

# Load HuggingFace tokenizer
hf_tokenizer = LlamaHFTokenizer.from_pretrained(
    "microsoft/DialoGPT-medium",
    use_fast=True
)

# Use with consistent interface
text = "Tell me a joke about programming"
tokens = hf_tokenizer.tokenize(text)
decoded = hf_tokenizer.detokenize(tokens)

print(f"Original: {text}")
print(f"Tokens: {tokens}")
print(f"Decoded: {decoded}")
```

### Token Analysis

```python
# Analyze tokenization behavior
texts = [
    "Hello world",
    "Hello, world!",
    "Hello world.",
    "HelloWorld",
    "HELLO WORLD",
]

for text in texts:
    tokens = llm.tokenize(text, add_bos=False)
    print(f"'{text}' -> {len(tokens)} tokens: {tokens}")
```

### Batch Processing

```python
# Process multiple texts efficiently
texts = [
    "First example text",
    "Second example with more words",
    "Third text for processing",
]

# Tokenize all texts
all_tokens = []
for text in texts:
    tokens = llm.tokenize(text, add_bos=True)
    all_tokens.append(tokens)
    print(f"'{text}' -> {len(tokens)} tokens")

# Find maximum length for padding
max_length = max(len(tokens) for tokens in all_tokens)
print(f"Maximum token length: {max_length}")
```

### Special Token Handling

```python
# Check special token IDs
print(f"BOS token: {llm.token_bos}")
print(f"EOS token: {llm.token_eos}")
print(f"Newline token: {llm.token_nl}")

# Create text with explicit special tokens
text_with_special = "<|begin_of_text|>Hello<|end_of_text|>"
tokens = llm.tokenize(text_with_special, special=True)
print(f"With special tokens: {tokens}")

# Compare with normal tokenization
tokens_normal = llm.tokenize(text_with_special, special=False)
print(f"Normal tokenization: {tokens_normal}")
```

### Vocabulary Analysis

```python
# Get vocabulary information
print(f"Vocabulary size: {llm.n_vocab}")
print(f"Context size: {llm.n_ctx}")

# Sample some token IDs and their text representations
import random

sample_ids = random.sample(range(min(1000, llm.n_vocab)), 10)
for token_id in sample_ids:
    try:
        text = llm.detokenize([token_id])
        print(f"Token {token_id}: '{text}'")
    except Exception:
        print(f"Token {token_id}: <unable to decode>")
```

### Custom Tokenizer Integration

```python
from llama_cpp.llama_tokenizer import BaseLlamaTokenizer

class CustomTokenizer(BaseLlamaTokenizer):
    def __init__(self, base_tokenizer):
        self.base_tokenizer = base_tokenizer

    def tokenize(self, text, add_bos=True, special=False):
        # Add custom preprocessing
        processed_text = text.lower().strip()
        return self.base_tokenizer.tokenize(processed_text, add_bos, special)

    def detokenize(self, tokens, decode=True):
        return self.base_tokenizer.detokenize(tokens, decode)

# Use custom tokenizer
custom_tokenizer = CustomTokenizer(llm)
tokens = custom_tokenizer.tokenize("HELLO WORLD!")
print(f"Custom tokenized: {tokens}")
```