# Tokenizers

Text tokenization utilities supporting various algorithms including byte-pair encoding, WordPiece, and SentencePiece. Keras Hub provides both general-purpose tokenizers and model-specific implementations.

## Capabilities

### Base Classes

Foundation classes for text tokenization.

```python { .api }
class Tokenizer:
    """Base class for all tokenizers."""
    def __init__(self, **kwargs): ...

    def __call__(self, inputs): ...
    def tokenize(self, inputs): ...
    def detokenize(self, inputs): ...

    @classmethod
    def from_preset(cls, preset: str, **kwargs): ...

    @property
    def vocabulary_size(self) -> int: ...

    @property
    def vocabulary(self) -> dict: ...
```

### General-Purpose Tokenizers

Tokenizers that can be used with various models and trained on custom datasets.

```python { .api }
class BytePairTokenizer(Tokenizer):
    """Byte Pair Encoding (BPE) tokenizer."""
    def __init__(
        self,
        vocabulary: dict = None,
        merges: list = None,
        unseen_token: str = "<unk>",
        **kwargs
    ): ...

class WordPieceTokenizer(Tokenizer):
    """WordPiece tokenizer as used in BERT."""
    def __init__(
        self,
        vocabulary: dict = None,
        unseen_token: str = "[UNK]",
        max_input_chars_per_word: int = 100,
        **kwargs
    ): ...

class SentencePieceTokenizer(Tokenizer):
    """SentencePiece tokenizer."""
    def __init__(
        self,
        proto: bytes = None,
        **kwargs
    ): ...

class ByteTokenizer(Tokenizer):
    """Byte-level tokenizer."""
    def __init__(
        self,
        vocabulary_size: int = 256,
        **kwargs
    ): ...

class UnicodeCodepointTokenizer(Tokenizer):
    """Unicode codepoint tokenizer."""
    def __init__(
        self,
        vocabulary_size: int = 1000000,
        lowercase: bool = False,
        **kwargs
    ): ...
```

### Tokenizer Training Utilities

Utilities for training custom tokenizers on your data.

```python { .api }
def compute_word_piece_vocabulary(
    data: list,
    vocabulary_size: int,
    reserved_tokens: list = None,
    **kwargs
) -> dict:
    """
    Compute WordPiece vocabulary from training data.

    Args:
        data: List of text strings for training
        vocabulary_size: Target vocabulary size
        reserved_tokens: Special tokens to include in vocabulary

    Returns:
        Dictionary mapping tokens to IDs
    """
    ...

def compute_sentence_piece_proto(
    data: list,
    vocabulary_size: int,
    model_type: str = "unigram",
    **kwargs
) -> bytes:
    """
    Compute SentencePiece model proto from training data.

    Args:
        data: List of text strings for training
        vocabulary_size: Target vocabulary size
        model_type: SentencePiece model type ("unigram", "bpe", "word", "char")

    Returns:
        Serialized SentencePiece model proto
    """
    ...
```

### Model-Specific Tokenizers

Tokenizers specifically designed for particular model architectures.

```python { .api }
# BERT Family
class BertTokenizer(Tokenizer):
    """BERT tokenizer using WordPiece."""
    def __init__(
        self,
        vocabulary: dict = None,
        lowercase: bool = True,
        **kwargs
    ): ...

class AlbertTokenizer(Tokenizer):
    """ALBERT tokenizer."""
    def __init__(
        self,
        vocabulary: dict = None,
        **kwargs
    ): ...

class DistilBertTokenizer(Tokenizer):
    """DistilBERT tokenizer."""
    def __init__(
        self,
        vocabulary: dict = None,
        lowercase: bool = True,
        **kwargs
    ): ...

class ElectraTokenizer(Tokenizer):
    """ELECTRA tokenizer."""
    def __init__(
        self,
        vocabulary: dict = None,
        **kwargs
    ): ...

class RobertaTokenizer(Tokenizer):
    """RoBERTa tokenizer using BPE."""
    def __init__(
        self,
        vocabulary: dict = None,
        merges: list = None,
        **kwargs
    ): ...

class DebertaV3Tokenizer(Tokenizer):
    """DeBERTa V3 tokenizer."""
    def __init__(
        self,
        vocabulary: dict = None,
        **kwargs
    ): ...

class XLMRobertaTokenizer(Tokenizer):
    """XLM-RoBERTa tokenizer."""
    def __init__(
        self,
        vocabulary: dict = None,
        **kwargs
    ): ...

# GPT Family
class GPT2Tokenizer(Tokenizer):
    """GPT-2 tokenizer using BPE."""
    def __init__(
        self,
        vocabulary: dict = None,
        merges: list = None,
        **kwargs
    ): ...

class GPTNeoXTokenizer(Tokenizer):
    """GPT-NeoX tokenizer."""
    def __init__(
        self,
        vocabulary: dict = None,
        **kwargs
    ): ...

# Large Language Models
class LlamaTokenizer(Tokenizer):
    """Llama tokenizer using SentencePiece."""
    def __init__(
        self,
        proto: bytes = None,
        **kwargs
    ): ...

class Llama3Tokenizer(Tokenizer):
    """Llama 3 tokenizer."""
    def __init__(
        self,
        vocabulary: dict = None,
        **kwargs
    ): ...

class MistralTokenizer(Tokenizer):
    """Mistral tokenizer."""
    def __init__(
        self,
        vocabulary: dict = None,
        **kwargs
    ): ...

class MixtralTokenizer(Tokenizer):
    """Mixtral tokenizer."""
    def __init__(
        self,
        vocabulary: dict = None,
        **kwargs
    ): ...

class GemmaTokenizer(Tokenizer):
    """Gemma tokenizer."""
    def __init__(
        self,
        vocabulary: dict = None,
        **kwargs
    ): ...

class Gemma3Tokenizer(Tokenizer):
    """Gemma 3 tokenizer."""
    def __init__(
        self,
        vocabulary: dict = None,
        **kwargs
    ): ...

class BloomTokenizer(Tokenizer):
    """BLOOM tokenizer."""
    def __init__(
        self,
        vocabulary: dict = None,
        **kwargs
    ): ...

class OPTTokenizer(Tokenizer):
    """OPT tokenizer."""
    def __init__(
        self,
        vocabulary: dict = None,
        **kwargs
    ): ...

class FalconTokenizer(Tokenizer):
    """Falcon tokenizer."""
    def __init__(
        self,
        vocabulary: dict = None,
        **kwargs
    ): ...

class Phi3Tokenizer(Tokenizer):
    """Phi-3 tokenizer."""
    def __init__(
        self,
        vocabulary: dict = None,
        **kwargs
    ): ...

class QwenTokenizer(Tokenizer):
    """Qwen tokenizer."""
    def __init__(
        self,
        vocabulary: dict = None,
        **kwargs
    ): ...

class QwenMoeTokenizer(Tokenizer):
    """Qwen MoE tokenizer."""
    def __init__(
        self,
        vocabulary: dict = None,
        **kwargs
    ): ...

class Qwen3Tokenizer(Tokenizer):
    """Qwen 3 tokenizer."""
    def __init__(
        self,
        vocabulary: dict = None,
        **kwargs
    ): ...

# Aliases
Qwen2Tokenizer = QwenTokenizer

# Sequence-to-Sequence Models
class BartTokenizer(Tokenizer):
    """BART tokenizer."""
    def __init__(
        self,
        vocabulary: dict = None,
        **kwargs
    ): ...

class T5Tokenizer(Tokenizer):
    """T5 tokenizer using SentencePiece."""
    def __init__(
        self,
        proto: bytes = None,
        **kwargs
    ): ...

# Specialized Models
class FNetTokenizer(Tokenizer):
    """F-Net tokenizer."""
    def __init__(
        self,
        vocabulary: dict = None,
        **kwargs
    ): ...

class RoformerV2Tokenizer(Tokenizer):
    """RoFormer V2 tokenizer."""
    def __init__(
        self,
        vocabulary: dict = None,
        **kwargs
    ): ...

class ESMTokenizer(Tokenizer):
    """ESM (protein) tokenizer."""
    def __init__(
        self,
        vocabulary: dict = None,
        **kwargs
    ): ...

# Multimodal Models
class CLIPTokenizer(Tokenizer):
    """CLIP tokenizer."""
    def __init__(
        self,
        vocabulary: dict = None,
        **kwargs
    ): ...

class SigLIPTokenizer(Tokenizer):
    """SigLIP tokenizer."""
    def __init__(
        self,
        vocabulary: dict = None,
        **kwargs
    ): ...

class PaliGemmaTokenizer(Tokenizer):
    """PaliGemma tokenizer."""
    def __init__(
        self,
        vocabulary: dict = None,
        **kwargs
    ): ...

# Audio Models
class WhisperTokenizer(Tokenizer):
    """Whisper tokenizer."""
    def __init__(
        self,
        vocabulary: dict = None,
        **kwargs
    ): ...

class MoonshineTokenizer(Tokenizer):
    """Moonshine tokenizer."""
    def __init__(
        self,
        vocabulary: dict = None,
        **kwargs
    ): ...
```

## Usage Examples

### Using Pretrained Tokenizers

```python
import keras_hub

# Load a pretrained tokenizer
tokenizer = keras_hub.tokenizers.BertTokenizer.from_preset("bert_base_en")

# Tokenize text
text = ["Hello world!", "How are you today?"]
tokens = tokenizer(text)
print("Tokens:", tokens)

# Get vocabulary information
print("Vocabulary size:", tokenizer.vocabulary_size)
print("Sample vocabulary:", list(tokenizer.vocabulary.items())[:10])
```

### Creating Custom Tokenizers

```python
import keras_hub

# Create a custom WordPiece tokenizer
tokenizer = keras_hub.tokenizers.WordPieceTokenizer(
    vocabulary={"[UNK]": 0, "[PAD]": 1, "hello": 2, "world": 3, "##ing": 4},
    unseen_token="[UNK]"
)

# Use the tokenizer
tokens = tokenizer(["hello world", "testing"])
print("Custom tokens:", tokens)
```

### Training Custom Vocabularies

```python
import keras_hub

# Training data
training_texts = [
    "This is a sample text for training tokenizer.",
    "Another example sentence for vocabulary building.",
    "More text data for better tokenization results."
]

# Train WordPiece vocabulary
vocabulary = keras_hub.tokenizers.compute_word_piece_vocabulary(
    data=training_texts,
    vocabulary_size=1000,
    reserved_tokens=["[UNK]", "[PAD]", "[CLS]", "[SEP]"]
)

# Create tokenizer with trained vocabulary
tokenizer = keras_hub.tokenizers.WordPieceTokenizer(vocabulary=vocabulary)

# Use the trained tokenizer
tokens = tokenizer(["New text to tokenize"])
print("Trained tokenizer output:", tokens)
```

### SentencePiece Training

```python
import keras_hub

# Train SentencePiece model
training_data = ["Large corpus of text for training", "More text data..."]

proto = keras_hub.tokenizers.compute_sentence_piece_proto(
    data=training_data,
    vocabulary_size=8000,
    model_type="unigram"
)

# Create SentencePiece tokenizer
tokenizer = keras_hub.tokenizers.SentencePieceTokenizer(proto=proto)

# Use the tokenizer
tokens = tokenizer(["Text to tokenize with SentencePiece"])
print("SentencePiece tokens:", tokens)
```

### Working with Different Tokenization Algorithms

```python
import keras_hub

# BPE tokenizer
bpe_tokenizer = keras_hub.tokenizers.BytePairTokenizer.from_preset("gpt2_base_en")
bpe_tokens = bpe_tokenizer(["Example text"])

# WordPiece tokenizer
wordpiece_tokenizer = keras_hub.tokenizers.WordPieceTokenizer.from_preset("bert_base_en")
wordpiece_tokens = wordpiece_tokenizer(["Example text"])

# SentencePiece tokenizer
sentencepiece_tokenizer = keras_hub.tokenizers.SentencePieceTokenizer.from_preset("t5_base_en")
sp_tokens = sentencepiece_tokenizer(["Example text"])

print("BPE tokens:", bpe_tokens)
print("WordPiece tokens:", wordpiece_tokens)
print("SentencePiece tokens:", sp_tokens)
```

### Tokenization and Detokenization

```python
import keras_hub

# Load tokenizer
tokenizer = keras_hub.tokenizers.GPT2Tokenizer.from_preset("gpt2_base_en")

# Original text
text = "Hello, how are you doing today?"

# Tokenize
tokens = tokenizer.tokenize(text)
print("Tokens:", tokens)

# Detokenize back to text
reconstructed = tokenizer.detokenize(tokens)
print("Reconstructed:", reconstructed)
```

### Batch Processing

```python
import keras_hub

# Load tokenizer
tokenizer = keras_hub.tokenizers.BertTokenizer.from_preset("bert_base_en")

# Batch of texts
texts = [
    "First document to tokenize",
    "Second document with different content",
    "Third document for batch processing"
]

# Batch tokenization
batch_tokens = tokenizer(texts)
print("Batch tokens shape:", batch_tokens.shape)
print("Batch tokens:", batch_tokens)
```