CtrlK
Blog · Docs · Log in · Get started
Tessl Logo

tessl/pypi-keras-hub

Pretrained models for Keras with multi-framework compatibility.

Pending
Overview
Eval results
Files

docs/tokenizers.md

Tokenizers

Text tokenization utilities supporting various algorithms including byte-pair encoding, WordPiece, and SentencePiece. Keras Hub provides both general-purpose tokenizers and model-specific implementations.

Capabilities

Base Classes

Foundation classes for text tokenization.

class Tokenizer:
    """Base class for all tokenizers."""
    def __init__(self, **kwargs): ...
    
    def __call__(self, inputs): ...
    def tokenize(self, inputs): ...
    def detokenize(self, inputs): ...
    
    @classmethod
    def from_preset(cls, preset: str, **kwargs): ...
    
    @property
    def vocabulary_size(self) -> int: ...
    
    @property
    def vocabulary(self) -> dict: ...

General-Purpose Tokenizers

Tokenizers that can be used with various models and can be trained on custom datasets.

class BytePairTokenizer(Tokenizer):
    """Byte Pair Encoding (BPE) tokenizer."""
    def __init__(
        self,
        vocabulary: dict = None,
        merges: list = None,
        unseen_token: str = "<unk>",
        **kwargs
    ): ...

class WordPieceTokenizer(Tokenizer):
    """WordPiece tokenizer as used in BERT."""
    def __init__(
        self,
        vocabulary: dict = None,
        unseen_token: str = "[UNK]",
        max_input_chars_per_word: int = 100,
        **kwargs
    ): ...

class SentencePieceTokenizer(Tokenizer):
    """SentencePiece tokenizer."""
    def __init__(
        self,
        proto: bytes = None,
        **kwargs
    ): ...

class ByteTokenizer(Tokenizer):
    """Byte-level tokenizer."""
    def __init__(
        self,
        vocabulary_size: int = 256,
        **kwargs
    ): ...

class UnicodeCodepointTokenizer(Tokenizer):
    """Unicode codepoint tokenizer."""
    def __init__(
        self,
        vocabulary_size: int = 1000000,
        lowercase: bool = False,
        **kwargs
    ): ...

Tokenizer Training Utilities

Utilities for training custom tokenizers on your data.

def compute_word_piece_vocabulary(
    data: list,
    vocabulary_size: int,
    reserved_tokens: list = None,
    **kwargs
) -> dict:
    """
    Compute WordPiece vocabulary from training data.
    
    Args:
        data: List of text strings for training
        vocabulary_size: Target vocabulary size
        reserved_tokens: Special tokens to include in vocabulary
        
    Returns:
        Dictionary mapping tokens to IDs
    """
    ...

def compute_sentence_piece_proto(
    data: list,
    vocabulary_size: int,
    model_type: str = "unigram",
    **kwargs
) -> bytes:
    """
    Compute SentencePiece model proto from training data.
    
    Args:
        data: List of text strings for training
        vocabulary_size: Target vocabulary size
        model_type: SentencePiece model type ("unigram", "bpe", "word", "char")
        
    Returns:
        Serialized SentencePiece model proto
    """
    ...

Model-Specific Tokenizers

Tokenizers specifically designed for particular model architectures.

# BERT Family
class BertTokenizer(Tokenizer):
    """BERT tokenizer using WordPiece."""
    def __init__(
        self,
        vocabulary: dict = None,
        lowercase: bool = True,
        **kwargs
    ): ...

class AlbertTokenizer(Tokenizer):
    """ALBERT tokenizer."""
    def __init__(
        self,
        vocabulary: dict = None,
        **kwargs
    ): ...

class DistilBertTokenizer(Tokenizer):
    """DistilBERT tokenizer."""
    def __init__(
        self,
        vocabulary: dict = None,
        lowercase: bool = True,
        **kwargs
    ): ...

class ElectraTokenizer(Tokenizer):
    """ELECTRA tokenizer."""
    def __init__(
        self,
        vocabulary: dict = None,
        **kwargs
    ): ...

class RobertaTokenizer(Tokenizer):
    """RoBERTa tokenizer using BPE."""
    def __init__(
        self,
        vocabulary: dict = None,
        merges: list = None,
        **kwargs
    ): ...

class DebertaV3Tokenizer(Tokenizer):
    """DeBERTa V3 tokenizer."""
    def __init__(
        self,
        vocabulary: dict = None,
        **kwargs
    ): ...

class XLMRobertaTokenizer(Tokenizer):
    """XLM-RoBERTa tokenizer."""
    def __init__(
        self,
        vocabulary: dict = None,
        **kwargs
    ): ...

# GPT Family
class GPT2Tokenizer(Tokenizer):
    """GPT-2 tokenizer using BPE."""
    def __init__(
        self,
        vocabulary: dict = None,
        merges: list = None,
        **kwargs
    ): ...

class GPTNeoXTokenizer(Tokenizer):
    """GPT-NeoX tokenizer."""
    def __init__(
        self,
        vocabulary: dict = None,
        **kwargs
    ): ...

# Large Language Models
class LlamaTokenizer(Tokenizer):
    """Llama tokenizer using SentencePiece."""
    def __init__(
        self,
        proto: bytes = None,
        **kwargs
    ): ...

class Llama3Tokenizer(Tokenizer):
    """Llama 3 tokenizer."""
    def __init__(
        self,
        vocabulary: dict = None,
        **kwargs
    ): ...

class MistralTokenizer(Tokenizer):
    """Mistral tokenizer."""
    def __init__(
        self,
        vocabulary: dict = None,
        **kwargs
    ): ...

class MixtralTokenizer(Tokenizer):
    """Mixtral tokenizer."""
    def __init__(
        self,
        vocabulary: dict = None,
        **kwargs
    ): ...

class GemmaTokenizer(Tokenizer):
    """Gemma tokenizer."""
    def __init__(
        self,
        vocabulary: dict = None,
        **kwargs
    ): ...

class Gemma3Tokenizer(Tokenizer):
    """Gemma 3 tokenizer."""
    def __init__(
        self,
        vocabulary: dict = None,
        **kwargs
    ): ...

class BloomTokenizer(Tokenizer):
    """BLOOM tokenizer."""
    def __init__(
        self,
        vocabulary: dict = None,
        **kwargs
    ): ...

class OPTTokenizer(Tokenizer):
    """OPT tokenizer."""
    def __init__(
        self,
        vocabulary: dict = None,
        **kwargs
    ): ...

class FalconTokenizer(Tokenizer):
    """Falcon tokenizer."""
    def __init__(
        self,
        vocabulary: dict = None,
        **kwargs
    ): ...

class Phi3Tokenizer(Tokenizer):
    """Phi-3 tokenizer."""
    def __init__(
        self,
        vocabulary: dict = None,
        **kwargs
    ): ...

class QwenTokenizer(Tokenizer):
    """Qwen tokenizer."""
    def __init__(
        self,
        vocabulary: dict = None,
        **kwargs
    ): ...

class QwenMoeTokenizer(Tokenizer):
    """Qwen MoE tokenizer."""
    def __init__(
        self,
        vocabulary: dict = None,
        **kwargs
    ): ...

class Qwen3Tokenizer(Tokenizer):
    """Qwen 3 tokenizer."""
    def __init__(
        self,
        vocabulary: dict = None,
        **kwargs
    ): ...

# Aliases
Qwen2Tokenizer = QwenTokenizer

# Sequence-to-Sequence Models
class BartTokenizer(Tokenizer):
    """BART tokenizer."""
    def __init__(
        self,
        vocabulary: dict = None,
        **kwargs
    ): ...

class T5Tokenizer(Tokenizer):
    """T5 tokenizer using SentencePiece."""
    def __init__(
        self,
        proto: bytes = None,
        **kwargs
    ): ...

# Specialized Models
class FNetTokenizer(Tokenizer):
    """F-Net tokenizer."""
    def __init__(
        self,
        vocabulary: dict = None,
        **kwargs
    ): ...

class RoformerV2Tokenizer(Tokenizer):
    """RoFormer V2 tokenizer."""
    def __init__(
        self,
        vocabulary: dict = None,
        **kwargs
    ): ...

class ESMTokenizer(Tokenizer):
    """ESM (protein) tokenizer."""
    def __init__(
        self,
        vocabulary: dict = None,
        **kwargs
    ): ...

# Multimodal Models
class CLIPTokenizer(Tokenizer):
    """CLIP tokenizer."""
    def __init__(
        self,
        vocabulary: dict = None,
        **kwargs
    ): ...

class SigLIPTokenizer(Tokenizer):
    """SigLIP tokenizer."""
    def __init__(
        self,
        vocabulary: dict = None,
        **kwargs
    ): ...

class PaliGemmaTokenizer(Tokenizer):
    """PaliGemma tokenizer."""
    def __init__(
        self,
        vocabulary: dict = None,
        **kwargs
    ): ...

# Audio Models
class WhisperTokenizer(Tokenizer):
    """Whisper tokenizer."""
    def __init__(
        self,
        vocabulary: dict = None,
        **kwargs
    ): ...

class MoonshineTokenizer(Tokenizer):
    """Moonshine tokenizer."""
    def __init__(
        self,
        vocabulary: dict = None,
        **kwargs
    ): ...

Usage Examples

Using Pretrained Tokenizers

import keras_hub

# Load a pretrained tokenizer
tokenizer = keras_hub.tokenizers.BertTokenizer.from_preset("bert_base_en")

# Tokenize text
text = ["Hello world!", "How are you today?"]
tokens = tokenizer(text)
print("Tokens:", tokens)

# Get vocabulary information
print("Vocabulary size:", tokenizer.vocabulary_size)
print("Sample vocabulary:", list(tokenizer.vocabulary.items())[:10])

Creating Custom Tokenizers

import keras_hub

# Create a custom WordPiece tokenizer
tokenizer = keras_hub.tokenizers.WordPieceTokenizer(
    vocabulary={"[UNK]": 0, "[PAD]": 1, "hello": 2, "world": 3, "##ing": 4},
    unseen_token="[UNK]"
)

# Use the tokenizer
tokens = tokenizer(["hello world", "testing"])
print("Custom tokens:", tokens)

Training Custom Vocabularies

import keras_hub

# Training data
training_texts = [
    "This is a sample text for training tokenizer.",
    "Another example sentence for vocabulary building.",
    "More text data for better tokenization results."
]

# Train WordPiece vocabulary
vocabulary = keras_hub.tokenizers.compute_word_piece_vocabulary(
    data=training_texts,
    vocabulary_size=1000,
    reserved_tokens=["[UNK]", "[PAD]", "[CLS]", "[SEP]"]
)

# Create tokenizer with trained vocabulary
tokenizer = keras_hub.tokenizers.WordPieceTokenizer(vocabulary=vocabulary)

# Use the trained tokenizer
tokens = tokenizer(["New text to tokenize"])
print("Trained tokenizer output:", tokens)

SentencePiece Training

import keras_hub

# Train SentencePiece model
training_data = ["Large corpus of text for training", "More text data..."]

proto = keras_hub.tokenizers.compute_sentence_piece_proto(
    data=training_data,
    vocabulary_size=8000,
    model_type="unigram"
)

# Create SentencePiece tokenizer
tokenizer = keras_hub.tokenizers.SentencePieceTokenizer(proto=proto)

# Use the tokenizer
tokens = tokenizer(["Text to tokenize with SentencePiece"])
print("SentencePiece tokens:", tokens)

Working with Different Tokenization Algorithms

import keras_hub

# BPE tokenizer
bpe_tokenizer = keras_hub.tokenizers.BytePairTokenizer.from_preset("gpt2_base_en")
bpe_tokens = bpe_tokenizer(["Example text"])

# WordPiece tokenizer
wordpiece_tokenizer = keras_hub.tokenizers.WordPieceTokenizer.from_preset("bert_base_en")
wordpiece_tokens = wordpiece_tokenizer(["Example text"])

# SentencePiece tokenizer
sentencepiece_tokenizer = keras_hub.tokenizers.SentencePieceTokenizer.from_preset("t5_base_en")
sp_tokens = sentencepiece_tokenizer(["Example text"])

print("BPE tokens:", bpe_tokens)
print("WordPiece tokens:", wordpiece_tokens)
print("SentencePiece tokens:", sp_tokens)

Tokenization and Detokenization

import keras_hub

# Load tokenizer
tokenizer = keras_hub.tokenizers.GPT2Tokenizer.from_preset("gpt2_base_en")

# Original text
text = "Hello, how are you doing today?"

# Tokenize
tokens = tokenizer.tokenize(text)
print("Tokens:", tokens)

# Detokenize back to text
reconstructed = tokenizer.detokenize(tokens)
print("Reconstructed:", reconstructed)

Batch Processing

import keras_hub

# Load tokenizer
tokenizer = keras_hub.tokenizers.BertTokenizer.from_preset("bert_base_en")

# Batch of texts
texts = [
    "First document to tokenize",
    "Second document with different content",
    "Third document for batch processing"
]

# Batch tokenization
batch_tokens = tokenizer(texts)
print("Batch tokens shape:", batch_tokens.shape)
print("Batch tokens:", batch_tokens)

Install with Tessl CLI

npx tessl i tessl/pypi-keras-hub

docs

audio-models.md

evaluation-metrics.md

generative-models.md

image-models.md

index.md

layers-components.md

multimodal-models.md

text-generation-sampling.md

text-models.md

tokenizers.md

utilities-helpers.md

tile.json