Pretrained models for Keras with multi-framework compatibility.
Text tokenization utilities supporting various algorithms including byte-pair encoding, WordPiece, and SentencePiece. Keras Hub provides both general-purpose tokenizers and model-specific implementations.
Foundation classes for text tokenization.
class Tokenizer:
"""Base class for all tokenizers."""
def __init__(self, **kwargs): ...
def __call__(self, inputs): ...
def tokenize(self, inputs): ...
def detokenize(self, inputs): ...
@classmethod
def from_preset(cls, preset: str, **kwargs): ...
def vocabulary_size(self) -> int: ...
def get_vocabulary(self) -> list: ...

Tokenizers that can be used with various models and trained on custom datasets.
class BytePairTokenizer(Tokenizer):
"""Byte Pair Encoding (BPE) tokenizer."""
def __init__(
self,
vocabulary: dict = None,
merges: list = None,
sequence_length: int = None,
add_prefix_space: bool = False,
**kwargs
): ...
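For illustration, a BPE tokenizer can be built directly from an in-memory vocabulary and merge list. A minimal sketch, assuming the "a b" merge-rule strings used by keras_hub's BPE implementation:
import keras_hub
# Tiny illustrative vocabulary (token -> id) and merge rules.
vocab = {"butter": 1, "fly": 2}
merges = ["b u", "t t", "e r", "bu tt", "butt er", "f l", "fl y"]
tokenizer = keras_hub.tokenizers.BytePairTokenizer(vocabulary=vocab, merges=merges)
print(tokenizer("butterfly"))  # expected token ids: [1, 2]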
class WordPieceTokenizer(Tokenizer):
"""WordPiece tokenizer as used in BERT."""
def __init__(
self,
vocabulary: list = None,
lowercase: bool = False,
oov_token: str = "[UNK]",
**kwargs
): ...
class SentencePieceTokenizer(Tokenizer):
"""SentencePiece tokenizer."""
def __init__(
self,
proto: bytes = None,
**kwargs
): ...
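Besides raw bytes, the proto argument can point at a serialized SentencePiece model on disk. A minimal sketch, assuming file-path support ("model.spm" is a placeholder):
import keras_hub
# Load a SentencePiece model trained elsewhere ("model.spm" is hypothetical).
tokenizer = keras_hub.tokenizers.SentencePieceTokenizer(proto="model.spm")
tokens = tokenizer(["Text to tokenize"])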
class ByteTokenizer(Tokenizer):
"""Byte-level tokenizer."""
def __init__(
self,
vocabulary_size: int = 256,
**kwargs
): ...
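Byte-level tokenization needs no vocabulary: every string maps to its raw byte values. A minimal sketch:
import keras_hub
# Each character of an ASCII string maps to one byte id.
tokenizer = keras_hub.tokenizers.ByteTokenizer()
print(tokenizer(["hello"]))  # [[104, 101, 108, 108, 111]]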
class UnicodeCodepointTokenizer(Tokenizer):
"""Unicode codepoint tokenizer."""
def __init__(
self,
vocabulary_size: int = None,
lowercase: bool = True,
**kwargs
): ...
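Codepoint tokenization likewise needs no trained vocabulary; each character maps to its Unicode codepoint. A minimal sketch:
import keras_hub
# Non-ASCII characters map to their codepoints (é is 233).
tokenizer = keras_hub.tokenizers.UnicodeCodepointTokenizer()
print(tokenizer(["héllo"]))  # [[104, 233, 108, 108, 111]]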
Utilities for training custom tokenizers on your data.

def compute_word_piece_vocabulary(
data,
vocabulary_size: int,
reserved_tokens: list = None,
**kwargs
) -> list:
"""
Compute a WordPiece vocabulary from training data.
Args:
data: A tf.data.Dataset of strings, or a list of filenames
vocabulary_size: Target vocabulary size
reserved_tokens: Special tokens to include in the vocabulary
Returns:
List of vocabulary terms (token ID equals list index)
"""
...
def compute_sentence_piece_proto(
data,
vocabulary_size: int,
model_type: str = "unigram",
**kwargs
) -> bytes:
"""
Compute a SentencePiece model proto from training data.
Args:
data: A tf.data.Dataset of strings, or a list of filenames
vocabulary_size: Target vocabulary size
model_type: SentencePiece model type ("unigram", "bpe", "word", "char")
Returns:
Serialized SentencePiece model proto
"""
...

Tokenizers specifically designed for particular model architectures.
# BERT Family
class BertTokenizer(Tokenizer):
"""BERT tokenizer using WordPiece."""
def __init__(
self,
vocabulary: dict = None,
lowercase: bool = False,
**kwargs
): ...
class AlbertTokenizer(Tokenizer):
"""ALBERT tokenizer."""
def __init__(
self,
proto: bytes = None,
**kwargs
): ...
class DistilBertTokenizer(Tokenizer):
"""DistilBERT tokenizer."""
def __init__(
self,
vocabulary: dict = None,
lowercase: bool = False,
**kwargs
): ...
class ElectraTokenizer(Tokenizer):
"""ELECTRA tokenizer."""
def __init__(
self,
vocabulary: dict = None,
**kwargs
): ...
class RobertaTokenizer(Tokenizer):
"""RoBERTa tokenizer using BPE."""
def __init__(
self,
vocabulary: dict = None,
merges: list = None,
**kwargs
): ...
class DebertaV3Tokenizer(Tokenizer):
"""DeBERTa V3 tokenizer."""
def __init__(
self,
proto: bytes = None,
**kwargs
): ...
class XLMRobertaTokenizer(Tokenizer):
"""XLM-RoBERTa tokenizer."""
def __init__(
self,
proto: bytes = None,
**kwargs
): ...
# GPT Family
class GPT2Tokenizer(Tokenizer):
"""GPT-2 tokenizer using BPE."""
def __init__(
self,
vocabulary: dict = None,
merges: list = None,
**kwargs
): ...
class GPTNeoXTokenizer(Tokenizer):
"""GPT-NeoX tokenizer."""
def __init__(
self,
vocabulary: dict = None,
merges: list = None,
**kwargs
): ...
# Large Language Models
class LlamaTokenizer(Tokenizer):
"""Llama tokenizer using SentencePiece."""
def __init__(
self,
proto: bytes = None,
**kwargs
): ...
class Llama3Tokenizer(Tokenizer):
"""Llama 3 tokenizer."""
def __init__(
self,
vocabulary: dict = None,
merges: list = None,
**kwargs
): ...
class MistralTokenizer(Tokenizer):
"""Mistral tokenizer."""
def __init__(
self,
proto: bytes = None,
**kwargs
): ...
class MixtralTokenizer(Tokenizer):
"""Mixtral tokenizer."""
def __init__(
self,
proto: bytes = None,
**kwargs
): ...
class GemmaTokenizer(Tokenizer):
"""Gemma tokenizer."""
def __init__(
self,
proto: bytes = None,
**kwargs
): ...
class Gemma3Tokenizer(Tokenizer):
"""Gemma 3 tokenizer."""
def __init__(
self,
proto: bytes = None,
**kwargs
): ...
class BloomTokenizer(Tokenizer):
"""BLOOM tokenizer."""
def __init__(
self,
vocabulary: dict = None,
merges: list = None,
**kwargs
): ...
class OPTTokenizer(Tokenizer):
"""OPT tokenizer."""
def __init__(
self,
vocabulary: dict = None,
merges: list = None,
**kwargs
): ...
class FalconTokenizer(Tokenizer):
"""Falcon tokenizer."""
def __init__(
self,
vocabulary: dict = None,
merges: list = None,
**kwargs
): ...
class Phi3Tokenizer(Tokenizer):
"""Phi-3 tokenizer."""
def __init__(
self,
proto: bytes = None,
**kwargs
): ...
class QwenTokenizer(Tokenizer):
"""Qwen tokenizer."""
def __init__(
self,
vocabulary: dict = None,
merges: list = None,
**kwargs
): ...
class QwenMoeTokenizer(Tokenizer):
"""Qwen MoE tokenizer."""
def __init__(
self,
vocabulary: dict = None,
merges: list = None,
**kwargs
): ...
class Qwen3Tokenizer(Tokenizer):
"""Qwen 3 tokenizer."""
def __init__(
self,
vocabulary: dict = None,
merges: list = None,
**kwargs
): ...
# Aliases
Qwen2Tokenizer = QwenTokenizer
# Sequence-to-Sequence Models
class BartTokenizer(Tokenizer):
"""BART tokenizer."""
def __init__(
self,
vocabulary: dict = None,
merges: list = None,
**kwargs
): ...
class T5Tokenizer(Tokenizer):
"""T5 tokenizer using SentencePiece."""
def __init__(
self,
proto: bytes = None,
**kwargs
): ...
# Specialized Models
class FNetTokenizer(Tokenizer):
"""F-Net tokenizer."""
def __init__(
self,
proto: bytes = None,
**kwargs
): ...
class RoformerV2Tokenizer(Tokenizer):
"""RoFormer V2 tokenizer."""
def __init__(
self,
vocabulary: dict = None,
**kwargs
): ...
class ESMTokenizer(Tokenizer):
"""ESM (protein) tokenizer."""
def __init__(
self,
vocabulary: dict = None,
**kwargs
): ...
# Multimodal Models
class CLIPTokenizer(Tokenizer):
"""CLIP tokenizer."""
def __init__(
self,
vocabulary: dict = None,
merges: list = None,
**kwargs
): ...
class SigLIPTokenizer(Tokenizer):
"""SigLIP tokenizer."""
def __init__(
self,
proto: bytes = None,
**kwargs
): ...
class PaliGemmaTokenizer(Tokenizer):
"""PaliGemma tokenizer."""
def __init__(
self,
proto: bytes = None,
**kwargs
): ...
# Audio Models
class WhisperTokenizer(Tokenizer):
"""Whisper tokenizer."""
def __init__(
self,
vocabulary: dict = None,
merges: list = None,
**kwargs
): ...
class MoonshineTokenizer(Tokenizer):
"""Moonshine tokenizer."""
def __init__(
self,
proto: bytes = None,
**kwargs
): ...

Loading a pretrained tokenizer and inspecting its vocabulary:
import keras_hub
# Load a pretrained tokenizer
tokenizer = keras_hub.tokenizers.BertTokenizer.from_preset("bert_base_en")
# Tokenize text
text = ["Hello world!", "How are you today?"]
tokens = tokenizer(text)
print("Tokens:", tokens)
# Get vocabulary information
print("Vocabulary size:", tokenizer.vocabulary_size)
print("Sample vocabulary:", list(tokenizer.vocabulary.items())[:10])import keras_hub
# Create a custom WordPiece tokenizer
tokenizer = keras_hub.tokenizers.WordPieceTokenizer(
vocabulary=["[UNK]", "[PAD]", "hello", "world", "##ing"],
oov_token="[UNK]"
)
# Use the tokenizer
tokens = tokenizer(["hello world", "testing"])
print("Custom tokens:", tokens)import keras_hub
Training a WordPiece vocabulary on your own data:
import keras_hub
import tensorflow as tf
# Training data
training_texts = [
"This is a sample text for training tokenizer.",
"Another example sentence for vocabulary building.",
"More text data for better tokenization results."
]
# Train WordPiece vocabulary
vocabulary = keras_hub.tokenizers.compute_word_piece_vocabulary(
data=tf.data.Dataset.from_tensor_slices(training_texts),
vocabulary_size=1000,
reserved_tokens=["[UNK]", "[PAD]", "[CLS]", "[SEP]"]
)
# Create tokenizer with trained vocabulary
tokenizer = keras_hub.tokenizers.WordPieceTokenizer(vocabulary=vocabulary)
# Use the trained tokenizer
tokens = tokenizer(["New text to tokenize"])
print("Trained tokenizer output:", tokens)import keras_hub
# Train SentencePiece model
training_data = ["Large corpus of text for training", "More text data..."]
proto = keras_hub.tokenizers.compute_sentence_piece_proto(
data=tf.data.Dataset.from_tensor_slices(training_data),
vocabulary_size=30,  # keep small for this toy corpus; use e.g. 8000 on real data
model_type="unigram"
)
# Create SentencePiece tokenizer
tokenizer = keras_hub.tokenizers.SentencePieceTokenizer(proto=proto)
# Use the tokenizer
tokens = tokenizer(["Text to tokenize with SentencePiece"])
print("SentencePiece tokens:", tokens)import keras_hub
# BPE tokenizer
bpe_tokenizer = keras_hub.tokenizers.BytePairTokenizer.from_preset("gpt2_base_en")
bpe_tokens = bpe_tokenizer(["Example text"])
# WordPiece tokenizer
wordpiece_tokenizer = keras_hub.tokenizers.WordPieceTokenizer.from_preset("bert_base_en")
wordpiece_tokens = wordpiece_tokenizer(["Example text"])
# SentencePiece tokenizer
sentencepiece_tokenizer = keras_hub.tokenizers.SentencePieceTokenizer.from_preset("t5_base_en")
sp_tokens = sentencepiece_tokenizer(["Example text"])
print("BPE tokens:", bpe_tokens)
print("WordPiece tokens:", wordpiece_tokens)
print("SentencePiece tokens:", sp_tokens)import keras_hub
# Load tokenizer
tokenizer = keras_hub.tokenizers.GPT2Tokenizer.from_preset("gpt2_base_en")
# Original text
text = "Hello, how are you doing today?"
# Tokenize
tokens = tokenizer.tokenize(text)
print("Tokens:", tokens)
# Detokenize back to text
reconstructed = tokenizer.detokenize(tokens)
print("Reconstructed:", reconstructed)import keras_hub
# Load tokenizer
tokenizer = keras_hub.tokenizers.BertTokenizer.from_preset("bert_base_en")
# Batch of texts
texts = [
"First document to tokenize",
"Second document with different content",
"Third document for batch processing"
]
# Batch tokenization
batch_tokens = tokenizer(texts)
print("Batch tokens shape:", batch_tokens.shape)
print("Batch tokens:", batch_tokens)Install with Tessl CLI
npx tessl i tessl/pypi-keras-hub