# Tokenizers

Text tokenization utilities supporting various algorithms, including byte-pair encoding (BPE), WordPiece, and SentencePiece. KerasHub provides both general-purpose tokenizers and model-specific implementations.

## Capabilities

### Base Classes

Foundation classes for text tokenization.

```python { .api }
class Tokenizer:
    """Base class for all tokenizers."""
    def __init__(self, **kwargs): ...

    def __call__(self, inputs): ...
    def tokenize(self, inputs): ...
    def detokenize(self, inputs): ...

    @classmethod
    def from_preset(cls, preset: str, **kwargs): ...

    @property
    def vocabulary_size(self) -> int: ...

    @property
    def vocabulary(self) -> dict: ...
```
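
Custom tokenizers can be built by subclassing the base class and overriding `tokenize` (and optionally `detokenize`). A minimal whitespace-splitting sketch, assuming a TensorFlow backend for the string ops:

```python
import tensorflow as tf
import keras_hub

class WhitespaceTokenizer(keras_hub.tokenizers.Tokenizer):
    """Toy tokenizer that splits input strings on whitespace."""

    def tokenize(self, inputs):
        # Each string becomes a ragged row of whitespace-separated tokens.
        return tf.strings.split(inputs)

    def detokenize(self, inputs):
        # Rejoin tokens with single spaces.
        return tf.strings.reduce_join(inputs, separator=" ", axis=-1)

tokenizer = WhitespaceTokenizer()
tokens = tokenizer(["the quick brown fox"])
print(tokenizer.detokenize(tokens))  # [b'the quick brown fox']
```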

### General-Purpose Tokenizers

Tokenizers that can be used with various models and trained on custom datasets.

```python { .api }
class BytePairTokenizer(Tokenizer):
    """Byte Pair Encoding (BPE) tokenizer."""
    def __init__(
        self,
        vocabulary: dict = None,
        merges: list = None,
        unseen_token: str = "<unk>",
        **kwargs
    ): ...

class WordPieceTokenizer(Tokenizer):
    """WordPiece tokenizer as used in BERT."""
    def __init__(
        self,
        vocabulary: dict = None,
        oov_token: str = "[UNK]",
        max_input_chars_per_word: int = 100,
        **kwargs
    ): ...

class SentencePieceTokenizer(Tokenizer):
    """SentencePiece tokenizer."""
    def __init__(
        self,
        proto: bytes = None,
        **kwargs
    ): ...

class ByteTokenizer(Tokenizer):
    """Byte-level tokenizer."""
    def __init__(
        self,
        vocabulary_size: int = 256,
        **kwargs
    ): ...

class UnicodeCodepointTokenizer(Tokenizer):
    """Unicode codepoint tokenizer."""
    def __init__(
        self,
        vocabulary_size: int = 1000000,
        lowercase: bool = False,
        **kwargs
    ): ...
```
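
The byte and codepoint tokenizers are vocabulary-free, so they can be used without any training step. A minimal sketch (the token values in the comments are illustrative of UTF-8 bytes):

```python
import keras_hub

# Byte-level: each UTF-8 byte maps to an ID in [0, 256).
byte_tokenizer = keras_hub.tokenizers.ByteTokenizer()
byte_tokens = byte_tokenizer(["hello"])
print("Byte tokens:", byte_tokens)  # [[104, 101, 108, 108, 111]]

# Codepoint-level: each Unicode codepoint maps to its integer value.
codepoint_tokenizer = keras_hub.tokenizers.UnicodeCodepointTokenizer()
codepoint_tokens = codepoint_tokenizer(["héllo"])
print("Codepoint tokens:", codepoint_tokens)

# Both tokenizers support lossless round trips.
print("Round trip:", byte_tokenizer.detokenize(byte_tokens))
```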

### Tokenizer Training Utilities

Utilities for training custom tokenizers on your data.

```python { .api }
def compute_word_piece_vocabulary(
    data: list,
    vocabulary_size: int,
    reserved_tokens: list = None,
    **kwargs
) -> dict:
    """
    Compute WordPiece vocabulary from training data.

    Args:
        data: List of text strings for training.
        vocabulary_size: Target vocabulary size.
        reserved_tokens: Special tokens to include in the vocabulary.

    Returns:
        Dictionary mapping tokens to IDs.
    """
    ...

def compute_sentence_piece_proto(
    data: list,
    vocabulary_size: int,
    model_type: str = "unigram",
    **kwargs
) -> bytes:
    """
    Compute SentencePiece model proto from training data.

    Args:
        data: List of text strings for training.
        vocabulary_size: Target vocabulary size.
        model_type: SentencePiece model type ("unigram", "bpe", "word", "char").

    Returns:
        Serialized SentencePiece model proto.
    """
    ...
```

### Model-Specific Tokenizers

Tokenizers designed for particular model architectures.

```python { .api }
# BERT Family
class BertTokenizer(Tokenizer):
    """BERT tokenizer using WordPiece."""
    def __init__(
        self,
        vocabulary: dict = None,
        lowercase: bool = True,
        **kwargs
    ): ...

class AlbertTokenizer(Tokenizer):
    """ALBERT tokenizer."""
    def __init__(
        self,
        vocabulary: dict = None,
        **kwargs
    ): ...

class DistilBertTokenizer(Tokenizer):
    """DistilBERT tokenizer."""
    def __init__(
        self,
        vocabulary: dict = None,
        lowercase: bool = True,
        **kwargs
    ): ...

class ElectraTokenizer(Tokenizer):
    """ELECTRA tokenizer."""
    def __init__(
        self,
        vocabulary: dict = None,
        **kwargs
    ): ...

class RobertaTokenizer(Tokenizer):
    """RoBERTa tokenizer using BPE."""
    def __init__(
        self,
        vocabulary: dict = None,
        merges: list = None,
        **kwargs
    ): ...

class DebertaV3Tokenizer(Tokenizer):
    """DeBERTa V3 tokenizer."""
    def __init__(
        self,
        vocabulary: dict = None,
        **kwargs
    ): ...

class XLMRobertaTokenizer(Tokenizer):
    """XLM-RoBERTa tokenizer."""
    def __init__(
        self,
        vocabulary: dict = None,
        **kwargs
    ): ...

# GPT Family
class GPT2Tokenizer(Tokenizer):
    """GPT-2 tokenizer using BPE."""
    def __init__(
        self,
        vocabulary: dict = None,
        merges: list = None,
        **kwargs
    ): ...

class GPTNeoXTokenizer(Tokenizer):
    """GPT-NeoX tokenizer."""
    def __init__(
        self,
        vocabulary: dict = None,
        **kwargs
    ): ...

# Large Language Models
class LlamaTokenizer(Tokenizer):
    """Llama tokenizer using SentencePiece."""
    def __init__(
        self,
        proto: bytes = None,
        **kwargs
    ): ...

class Llama3Tokenizer(Tokenizer):
    """Llama 3 tokenizer."""
    def __init__(
        self,
        vocabulary: dict = None,
        **kwargs
    ): ...

class MistralTokenizer(Tokenizer):
    """Mistral tokenizer."""
    def __init__(
        self,
        vocabulary: dict = None,
        **kwargs
    ): ...

class MixtralTokenizer(Tokenizer):
    """Mixtral tokenizer."""
    def __init__(
        self,
        vocabulary: dict = None,
        **kwargs
    ): ...

class GemmaTokenizer(Tokenizer):
    """Gemma tokenizer."""
    def __init__(
        self,
        vocabulary: dict = None,
        **kwargs
    ): ...

class Gemma3Tokenizer(Tokenizer):
    """Gemma 3 tokenizer."""
    def __init__(
        self,
        vocabulary: dict = None,
        **kwargs
    ): ...

class BloomTokenizer(Tokenizer):
    """BLOOM tokenizer."""
    def __init__(
        self,
        vocabulary: dict = None,
        **kwargs
    ): ...

class OPTTokenizer(Tokenizer):
    """OPT tokenizer."""
    def __init__(
        self,
        vocabulary: dict = None,
        **kwargs
    ): ...

class FalconTokenizer(Tokenizer):
    """Falcon tokenizer."""
    def __init__(
        self,
        vocabulary: dict = None,
        **kwargs
    ): ...

class Phi3Tokenizer(Tokenizer):
    """Phi-3 tokenizer."""
    def __init__(
        self,
        vocabulary: dict = None,
        **kwargs
    ): ...

class QwenTokenizer(Tokenizer):
    """Qwen tokenizer."""
    def __init__(
        self,
        vocabulary: dict = None,
        **kwargs
    ): ...

class QwenMoeTokenizer(Tokenizer):
    """Qwen MoE tokenizer."""
    def __init__(
        self,
        vocabulary: dict = None,
        **kwargs
    ): ...

class Qwen3Tokenizer(Tokenizer):
    """Qwen 3 tokenizer."""
    def __init__(
        self,
        vocabulary: dict = None,
        **kwargs
    ): ...

# Aliases
Qwen2Tokenizer = QwenTokenizer

# Sequence-to-Sequence Models
class BartTokenizer(Tokenizer):
    """BART tokenizer."""
    def __init__(
        self,
        vocabulary: dict = None,
        **kwargs
    ): ...

class T5Tokenizer(Tokenizer):
    """T5 tokenizer using SentencePiece."""
    def __init__(
        self,
        proto: bytes = None,
        **kwargs
    ): ...

# Specialized Models
class FNetTokenizer(Tokenizer):
    """FNet tokenizer."""
    def __init__(
        self,
        vocabulary: dict = None,
        **kwargs
    ): ...

class RoformerV2Tokenizer(Tokenizer):
    """RoFormer V2 tokenizer."""
    def __init__(
        self,
        vocabulary: dict = None,
        **kwargs
    ): ...

class ESMTokenizer(Tokenizer):
    """ESM (protein) tokenizer."""
    def __init__(
        self,
        vocabulary: dict = None,
        **kwargs
    ): ...

# Multimodal Models
class CLIPTokenizer(Tokenizer):
    """CLIP tokenizer."""
    def __init__(
        self,
        vocabulary: dict = None,
        **kwargs
    ): ...

class SigLIPTokenizer(Tokenizer):
    """SigLIP tokenizer."""
    def __init__(
        self,
        vocabulary: dict = None,
        **kwargs
    ): ...

class PaliGemmaTokenizer(Tokenizer):
    """PaliGemma tokenizer."""
    def __init__(
        self,
        vocabulary: dict = None,
        **kwargs
    ): ...

# Audio Models
class WhisperTokenizer(Tokenizer):
    """Whisper tokenizer."""
    def __init__(
        self,
        vocabulary: dict = None,
        **kwargs
    ): ...

class MoonshineTokenizer(Tokenizer):
    """Moonshine tokenizer."""
    def __init__(
        self,
        vocabulary: dict = None,
        **kwargs
    ): ...
```
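
Every model-specific tokenizer is normally loaded through `from_preset`. The base `Tokenizer.from_preset` can also be used without naming the family up front; it should return the matching subclass for the given preset. A brief sketch using preset names that appear elsewhere in this document:

```python
import keras_hub

# Let the base class dispatch to the right subclass per preset.
for preset in ["bert_base_en", "gpt2_base_en"]:
    tokenizer = keras_hub.tokenizers.Tokenizer.from_preset(preset)
    print(preset, "->", type(tokenizer).__name__)
# Expected: BertTokenizer and GPT2Tokenizer, respectively.
```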

## Usage Examples

### Using Pretrained Tokenizers

```python
import keras_hub

# Load a pretrained tokenizer
tokenizer = keras_hub.tokenizers.BertTokenizer.from_preset("bert_base_en")

# Tokenize text
text = ["Hello world!", "How are you today?"]
tokens = tokenizer(text)
print("Tokens:", tokens)

# Get vocabulary information
print("Vocabulary size:", tokenizer.vocabulary_size)
print("Sample vocabulary:", list(tokenizer.vocabulary.items())[:10])
```

### Creating Custom Tokenizers

```python
import keras_hub

# Create a custom WordPiece tokenizer
tokenizer = keras_hub.tokenizers.WordPieceTokenizer(
    vocabulary={"[UNK]": 0, "[PAD]": 1, "hello": 2, "world": 3, "##ing": 4},
    oov_token="[UNK]"
)

# Use the tokenizer
tokens = tokenizer(["hello world", "testing"])
print("Custom tokens:", tokens)
```

### Training Custom Vocabularies

```python
import keras_hub

# Training data
training_texts = [
    "This is a sample text for training a tokenizer.",
    "Another example sentence for vocabulary building.",
    "More text data for better tokenization results."
]

# Train a WordPiece vocabulary
vocabulary = keras_hub.tokenizers.compute_word_piece_vocabulary(
    data=training_texts,
    vocabulary_size=1000,
    reserved_tokens=["[UNK]", "[PAD]", "[CLS]", "[SEP]"]
)

# Create a tokenizer with the trained vocabulary
tokenizer = keras_hub.tokenizers.WordPieceTokenizer(vocabulary=vocabulary)

# Use the trained tokenizer
tokens = tokenizer(["New text to tokenize"])
print("Trained tokenizer output:", tokens)
```

### SentencePiece Training

```python
import keras_hub

# Train a SentencePiece model
training_data = ["Large corpus of text for training", "More text data..."]

proto = keras_hub.tokenizers.compute_sentence_piece_proto(
    data=training_data,
    vocabulary_size=8000,
    model_type="unigram"
)

# Create a SentencePiece tokenizer
tokenizer = keras_hub.tokenizers.SentencePieceTokenizer(proto=proto)

# Use the tokenizer
tokens = tokenizer(["Text to tokenize with SentencePiece"])
print("SentencePiece tokens:", tokens)
```

### Working with Different Tokenization Algorithms

```python
import keras_hub

# BPE tokenizer
bpe_tokenizer = keras_hub.tokenizers.BytePairTokenizer.from_preset("gpt2_base_en")
bpe_tokens = bpe_tokenizer(["Example text"])

# WordPiece tokenizer
wordpiece_tokenizer = keras_hub.tokenizers.WordPieceTokenizer.from_preset("bert_base_en")
wordpiece_tokens = wordpiece_tokenizer(["Example text"])

# SentencePiece tokenizer
sentencepiece_tokenizer = keras_hub.tokenizers.SentencePieceTokenizer.from_preset("t5_base_en")
sp_tokens = sentencepiece_tokenizer(["Example text"])

print("BPE tokens:", bpe_tokens)
print("WordPiece tokens:", wordpiece_tokens)
print("SentencePiece tokens:", sp_tokens)
```

### Tokenization and Detokenization

```python
import keras_hub

# Load tokenizer
tokenizer = keras_hub.tokenizers.GPT2Tokenizer.from_preset("gpt2_base_en")

# Original text
text = "Hello, how are you doing today?"

# Tokenize
tokens = tokenizer.tokenize(text)
print("Tokens:", tokens)

# Detokenize back to text
reconstructed = tokenizer.detokenize(tokens)
print("Reconstructed:", reconstructed)
```

### Batch Processing

```python
import keras_hub

# Load tokenizer
tokenizer = keras_hub.tokenizers.BertTokenizer.from_preset("bert_base_en")

# Batch of texts
texts = [
    "First document to tokenize",
    "Second document with different content",
    "Third document for batch processing"
]

# Batch tokenization
batch_tokens = tokenizer(texts)
print("Batch tokens shape:", batch_tokens.shape)
print("Batch tokens:", batch_tokens)
```
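
Because documents tokenize to different lengths, batch outputs are ragged by default. A minimal sketch of producing dense, fixed-length batches, assuming the constructor `**kwargs` accept a `sequence_length` argument (as in KerasHub's tokenizers) and that `from_preset` forwards it:

```python
import keras_hub

# Pad or truncate every row to a fixed length for dense batching.
tokenizer = keras_hub.tokenizers.BertTokenizer.from_preset(
    "bert_base_en",
    sequence_length=16,  # assumption: forwarded to the tokenizer constructor
)

dense_tokens = tokenizer([
    "First document to tokenize",
    "Second document with different content",
])
print("Dense shape:", dense_tokens.shape)  # (2, 16) under the assumption above
```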