# Tokenizers

Text tokenization utilities supporting various algorithms including byte-pair encoding, WordPiece, and SentencePiece. Keras Hub provides both general-purpose tokenizers and model-specific implementations.

## Capabilities

### Base Classes

Foundation classes for text tokenization.

```python { .api }
class Tokenizer:
    """Base class for all tokenizers."""
    def __init__(self, **kwargs): ...

    def __call__(self, inputs): ...
    def tokenize(self, inputs): ...
    def detokenize(self, inputs): ...

    @classmethod
    def from_preset(cls, preset: str, **kwargs): ...

    @property
    def vocabulary_size(self) -> int: ...

    @property
    def vocabulary(self) -> dict: ...
```

### General-Purpose Tokenizers

Tokenizers that can be used with various models and trained on custom datasets.

```python { .api }
class BytePairTokenizer(Tokenizer):
    """Byte Pair Encoding (BPE) tokenizer."""
    def __init__(
        self,
        vocabulary: dict = None,
        merges: list = None,
        unseen_token: str = "<unk>",
        **kwargs
    ): ...

class WordPieceTokenizer(Tokenizer):
    """WordPiece tokenizer as used in BERT."""
    def __init__(
        self,
        vocabulary: dict = None,
        unseen_token: str = "[UNK]",
        max_input_chars_per_word: int = 100,
        **kwargs
    ): ...

class SentencePieceTokenizer(Tokenizer):
    """SentencePiece tokenizer."""
    def __init__(
        self,
        proto: bytes = None,
        **kwargs
    ): ...

class ByteTokenizer(Tokenizer):
    """Byte-level tokenizer."""
    def __init__(
        self,
        vocabulary_size: int = 256,
        **kwargs
    ): ...

class UnicodeCodepointTokenizer(Tokenizer):
    """Unicode codepoint tokenizer."""
    def __init__(
        self,
        vocabulary_size: int = 1000000,
        lowercase: bool = False,
        **kwargs
    ): ...
```

### Tokenizer Training Utilities

Utilities for training custom tokenizers on your data.

```python { .api }
def compute_word_piece_vocabulary(
    data: list,
    vocabulary_size: int,
    reserved_tokens: list = None,
    **kwargs
) -> dict:
    """
    Compute WordPiece vocabulary from training data.

    Args:
        data: List of text strings for training
        vocabulary_size: Target vocabulary size
        reserved_tokens: Special tokens to include in vocabulary

    Returns:
        Dictionary mapping tokens to IDs
    """
    ...

def compute_sentence_piece_proto(
    data: list,
    vocabulary_size: int,
    model_type: str = "unigram",
    **kwargs
) -> bytes:
    """
    Compute SentencePiece model proto from training data.

    Args:
        data: List of text strings for training
        vocabulary_size: Target vocabulary size
        model_type: SentencePiece model type ("unigram", "bpe", "word", "char")

    Returns:
        Serialized SentencePiece model proto
    """
    ...
```

### Model-Specific Tokenizers

Tokenizers specifically designed for particular model architectures.

```python { .api }
# BERT Family
class BertTokenizer(Tokenizer):
    """BERT tokenizer using WordPiece."""
    def __init__(
        self,
        vocabulary: dict = None,
        lowercase: bool = True,
        **kwargs
    ): ...

class AlbertTokenizer(Tokenizer):
    """ALBERT tokenizer."""
    def __init__(
        self,
        vocabulary: dict = None,
        **kwargs
    ): ...

class DistilBertTokenizer(Tokenizer):
    """DistilBERT tokenizer."""
    def __init__(
        self,
        vocabulary: dict = None,
        lowercase: bool = True,
        **kwargs
    ): ...

class ElectraTokenizer(Tokenizer):
    """ELECTRA tokenizer."""
    def __init__(
        self,
        vocabulary: dict = None,
        **kwargs
    ): ...

class RobertaTokenizer(Tokenizer):
    """RoBERTa tokenizer using BPE."""
    def __init__(
        self,
        vocabulary: dict = None,
        merges: list = None,
        **kwargs
    ): ...

class DebertaV3Tokenizer(Tokenizer):
    """DeBERTa V3 tokenizer."""
    def __init__(
        self,
        vocabulary: dict = None,
        **kwargs
    ): ...

class XLMRobertaTokenizer(Tokenizer):
    """XLM-RoBERTa tokenizer."""
    def __init__(
        self,
        vocabulary: dict = None,
        **kwargs
    ): ...

# GPT Family
class GPT2Tokenizer(Tokenizer):
    """GPT-2 tokenizer using BPE."""
    def __init__(
        self,
        vocabulary: dict = None,
        merges: list = None,
        **kwargs
    ): ...

class GPTNeoXTokenizer(Tokenizer):
    """GPT-NeoX tokenizer."""
    def __init__(
        self,
        vocabulary: dict = None,
        **kwargs
    ): ...

# Large Language Models
class LlamaTokenizer(Tokenizer):
    """Llama tokenizer using SentencePiece."""
    def __init__(
        self,
        proto: bytes = None,
        **kwargs
    ): ...

class Llama3Tokenizer(Tokenizer):
    """Llama 3 tokenizer."""
    def __init__(
        self,
        vocabulary: dict = None,
        **kwargs
    ): ...

class MistralTokenizer(Tokenizer):
    """Mistral tokenizer."""
    def __init__(
        self,
        vocabulary: dict = None,
        **kwargs
    ): ...

class MixtralTokenizer(Tokenizer):
    """Mixtral tokenizer."""
    def __init__(
        self,
        vocabulary: dict = None,
        **kwargs
    ): ...

class GemmaTokenizer(Tokenizer):
    """Gemma tokenizer."""
    def __init__(
        self,
        vocabulary: dict = None,
        **kwargs
    ): ...

class Gemma3Tokenizer(Tokenizer):
    """Gemma 3 tokenizer."""
    def __init__(
        self,
        vocabulary: dict = None,
        **kwargs
    ): ...

class BloomTokenizer(Tokenizer):
    """BLOOM tokenizer."""
    def __init__(
        self,
        vocabulary: dict = None,
        **kwargs
    ): ...

class OPTTokenizer(Tokenizer):
    """OPT tokenizer."""
    def __init__(
        self,
        vocabulary: dict = None,
        **kwargs
    ): ...

class FalconTokenizer(Tokenizer):
    """Falcon tokenizer."""
    def __init__(
        self,
        vocabulary: dict = None,
        **kwargs
    ): ...

class Phi3Tokenizer(Tokenizer):
    """Phi-3 tokenizer."""
    def __init__(
        self,
        vocabulary: dict = None,
        **kwargs
    ): ...

class QwenTokenizer(Tokenizer):
    """Qwen tokenizer."""
    def __init__(
        self,
        vocabulary: dict = None,
        **kwargs
    ): ...

class QwenMoeTokenizer(Tokenizer):
    """Qwen MoE tokenizer."""
    def __init__(
        self,
        vocabulary: dict = None,
        **kwargs
    ): ...

class Qwen3Tokenizer(Tokenizer):
    """Qwen 3 tokenizer."""
    def __init__(
        self,
        vocabulary: dict = None,
        **kwargs
    ): ...

# Aliases
Qwen2Tokenizer = QwenTokenizer

# Sequence-to-Sequence Models
class BartTokenizer(Tokenizer):
    """BART tokenizer."""
    def __init__(
        self,
        vocabulary: dict = None,
        **kwargs
    ): ...

class T5Tokenizer(Tokenizer):
    """T5 tokenizer using SentencePiece."""
    def __init__(
        self,
        proto: bytes = None,
        **kwargs
    ): ...

# Specialized Models
class FNetTokenizer(Tokenizer):
    """F-Net tokenizer."""
    def __init__(
        self,
        vocabulary: dict = None,
        **kwargs
    ): ...

class RoformerV2Tokenizer(Tokenizer):
    """RoFormer V2 tokenizer."""
    def __init__(
        self,
        vocabulary: dict = None,
        **kwargs
    ): ...

class ESMTokenizer(Tokenizer):
    """ESM (protein) tokenizer."""
    def __init__(
        self,
        vocabulary: dict = None,
        **kwargs
    ): ...

# Multimodal Models
class CLIPTokenizer(Tokenizer):
    """CLIP tokenizer."""
    def __init__(
        self,
        vocabulary: dict = None,
        **kwargs
    ): ...

class SigLIPTokenizer(Tokenizer):
    """SigLIP tokenizer."""
    def __init__(
        self,
        vocabulary: dict = None,
        **kwargs
    ): ...

class PaliGemmaTokenizer(Tokenizer):
    """PaliGemma tokenizer."""
    def __init__(
        self,
        vocabulary: dict = None,
        **kwargs
    ): ...

# Audio Models
class WhisperTokenizer(Tokenizer):
    """Whisper tokenizer."""
    def __init__(
        self,
        vocabulary: dict = None,
        **kwargs
    ): ...

class MoonshineTokenizer(Tokenizer):
    """Moonshine tokenizer."""
    def __init__(
        self,
        vocabulary: dict = None,
        **kwargs
    ): ...
```

## Usage Examples

### Using Pretrained Tokenizers

```python
import keras_hub

# Load a pretrained tokenizer
tokenizer = keras_hub.tokenizers.BertTokenizer.from_preset("bert_base_en")

# Tokenize text
text = ["Hello world!", "How are you today?"]
tokens = tokenizer(text)
print("Tokens:", tokens)

# Get vocabulary information
print("Vocabulary size:", tokenizer.vocabulary_size)
print("Sample vocabulary:", list(tokenizer.vocabulary.items())[:10])
```

### Creating Custom Tokenizers

```python
import keras_hub

# Create a custom WordPiece tokenizer
tokenizer = keras_hub.tokenizers.WordPieceTokenizer(
    vocabulary={"[UNK]": 0, "[PAD]": 1, "hello": 2, "world": 3, "##ing": 4},
    unseen_token="[UNK]"
)

# Use the tokenizer
tokens = tokenizer(["hello world", "testing"])
print("Custom tokens:", tokens)
```

### Training Custom Vocabularies

```python
import keras_hub

# Training data
training_texts = [
    "This is a sample text for training tokenizer.",
    "Another example sentence for vocabulary building.",
    "More text data for better tokenization results."
]

# Train WordPiece vocabulary
vocabulary = keras_hub.tokenizers.compute_word_piece_vocabulary(
    data=training_texts,
    vocabulary_size=1000,
    reserved_tokens=["[UNK]", "[PAD]", "[CLS]", "[SEP]"]
)

# Create tokenizer with trained vocabulary
tokenizer = keras_hub.tokenizers.WordPieceTokenizer(vocabulary=vocabulary)

# Use the trained tokenizer
tokens = tokenizer(["New text to tokenize"])
print("Trained tokenizer output:", tokens)
```

### SentencePiece Training

```python
import keras_hub

# Train SentencePiece model
training_data = ["Large corpus of text for training", "More text data..."]

proto = keras_hub.tokenizers.compute_sentence_piece_proto(
    data=training_data,
    vocabulary_size=8000,
    model_type="unigram"
)

# Create SentencePiece tokenizer
tokenizer = keras_hub.tokenizers.SentencePieceTokenizer(proto=proto)

# Use the tokenizer
tokens = tokenizer(["Text to tokenize with SentencePiece"])
print("SentencePiece tokens:", tokens)
```

### Working with Different Tokenization Algorithms

```python
import keras_hub

# BPE tokenizer
bpe_tokenizer = keras_hub.tokenizers.BytePairTokenizer.from_preset("gpt2_base_en")
bpe_tokens = bpe_tokenizer(["Example text"])

# WordPiece tokenizer
wordpiece_tokenizer = keras_hub.tokenizers.WordPieceTokenizer.from_preset("bert_base_en")
wordpiece_tokens = wordpiece_tokenizer(["Example text"])

# SentencePiece tokenizer
sentencepiece_tokenizer = keras_hub.tokenizers.SentencePieceTokenizer.from_preset("t5_base_en")
sp_tokens = sentencepiece_tokenizer(["Example text"])

print("BPE tokens:", bpe_tokens)
print("WordPiece tokens:", wordpiece_tokens)
print("SentencePiece tokens:", sp_tokens)
```

### Tokenization and Detokenization

```python
import keras_hub

# Load tokenizer
tokenizer = keras_hub.tokenizers.GPT2Tokenizer.from_preset("gpt2_base_en")

# Original text
text = "Hello, how are you doing today?"

# Tokenize
tokens = tokenizer.tokenize(text)
print("Tokens:", tokens)

# Detokenize back to text
reconstructed = tokenizer.detokenize(tokens)
print("Reconstructed:", reconstructed)
```

### Batch Processing

```python
import keras_hub

# Load tokenizer
tokenizer = keras_hub.tokenizers.BertTokenizer.from_preset("bert_base_en")

# Batch of texts
texts = [
    "First document to tokenize",
    "Second document with different content",
    "Third document for batch processing"
]

# Batch tokenization
batch_tokens = tokenizer(texts)
print("Batch tokens shape:", batch_tokens.shape)
print("Batch tokens:", batch_tokens)
```