Transformers: state-of-the-art machine learning for JAX, PyTorch and TensorFlow.
Comprehensive tokenization with support for 100+ different tokenizers, handling subword tokenization, special tokens, efficient batch processing, and cross-framework compatibility. The tokenization system provides consistent APIs across different architectures while optimizing for speed and memory efficiency.
Automatic tokenizer selection based on model names or configurations.
class AutoTokenizer:
    """Factory that loads the appropriate tokenizer class for a model."""

    @classmethod
    def from_pretrained(
        cls,
        pretrained_model_name_or_path: Union[str, os.PathLike],
        *inputs,
        cache_dir: Optional[Union[str, os.PathLike]] = None,
        force_download: bool = False,
        local_files_only: bool = False,
        token: Optional[Union[str, bool]] = None,
        revision: str = "main",
        use_fast: bool = True,
        tokenizer_type: Optional[str] = None,
        trust_remote_code: bool = False,
        **kwargs
    ) -> "PreTrainedTokenizer":
        """
        Load a tokenizer, automatically detecting its type.

        Args:
            pretrained_model_name_or_path: Model name or local path.
            *inputs: Positional arguments forwarded to the tokenizer class.
            cache_dir: Custom cache directory.
            force_download: Force fresh download even if cached.
            local_files_only: Only use local files.
            token: Authentication token.
            revision: Model revision/branch (default "main").
            use_fast: Use fast (Rust-based) tokenizer when available.
            tokenizer_type: Override auto-detected tokenizer type.
            trust_remote_code: Allow custom tokenizer code.

        Returns:
            Loaded tokenizer instance.
        """


# Foundation classes for all tokenizer implementations.
class PreTrainedTokenizer:
    """Base class for all Python ("slow") tokenizers."""

    def __init__(
        self,
        model_max_length: Optional[int] = None,
        padding_side: str = "right",
        truncation_side: str = "right",
        chat_template: Optional[str] = None,
        model_input_names: Optional[List[str]] = None,
        bos_token: "Optional[Union[str, AddedToken]]" = None,
        eos_token: "Optional[Union[str, AddedToken]]" = None,
        unk_token: "Optional[Union[str, AddedToken]]" = None,
        sep_token: "Optional[Union[str, AddedToken]]" = None,
        pad_token: "Optional[Union[str, AddedToken]]" = None,
        cls_token: "Optional[Union[str, AddedToken]]" = None,
        mask_token: "Optional[Union[str, AddedToken]]" = None,
        additional_special_tokens: "Optional[List[Union[str, AddedToken]]]" = None,
        **kwargs
    ):
        """
        Configure common tokenizer attributes.

        Args:
            model_max_length: Maximum sequence length the model accepts.
            padding_side: Side on which padding is applied ("left"/"right").
            truncation_side: Side from which sequences are truncated.
            chat_template: Template used to format chat conversations.
            model_input_names: Names of the inputs the model expects.
            bos_token: Beginning-of-sequence special token.
            eos_token: End-of-sequence special token.
            unk_token: Unknown-token special token.
            sep_token: Separator special token.
            pad_token: Padding special token.
            cls_token: Classification special token.
            mask_token: Mask special token.
            additional_special_tokens: Extra special tokens to register.
        """

    def __call__(
        self,
        text: Optional[Union[str, List[str], List[List[str]]]] = None,
        text_pair: Optional[Union[str, List[str], List[List[str]]]] = None,
        text_target: Optional[Union[str, List[str], List[List[str]]]] = None,
        text_pair_target: Optional[Union[str, List[str], List[List[str]]]] = None,
        add_special_tokens: bool = True,
        padding: Union[bool, str] = False,
        truncation: Optional[Union[bool, str]] = None,
        max_length: Optional[int] = None,
        stride: int = 0,
        is_split_into_words: bool = False,
        pad_to_multiple_of: Optional[int] = None,
        return_tensors: "Optional[Union[str, TensorType]]" = None,
        return_token_type_ids: Optional[bool] = None,
        return_attention_mask: Optional[bool] = None,
        return_overflowing_tokens: bool = False,
        return_special_tokens_mask: bool = False,
        return_offsets_mapping: bool = False,
        return_length: bool = False,
        verbose: bool = True,
        **kwargs
    ) -> "BatchEncoding":
        """
        Main tokenization method with extensive options.

        Args:
            text: Input text(s) to tokenize.
            text_pair: Paired text for sequence-pair tasks.
            text_target: Target text(s) for sequence-to-sequence tasks.
            text_pair_target: Paired target text(s).
            add_special_tokens: Add model-specific special tokens.
            padding: Padding strategy ("longest", "max_length", True, False).
            truncation: Truncation strategy (True, False, "longest_first", etc.).
            max_length: Maximum sequence length.
            stride: Stride for overlapping windows.
            is_split_into_words: Whether input is pre-tokenized.
            pad_to_multiple_of: Pad length to a multiple of this value.
            return_tensors: Format of returned tensors ("pt", "tf", "np").
            return_token_type_ids: Include token type IDs.
            return_attention_mask: Include attention mask.
            return_overflowing_tokens: Return overflowing tokens.
            return_special_tokens_mask: Mark special tokens.
            return_offsets_mapping: Include character-to-token mapping.
            return_length: Include sequence lengths.
            verbose: Emit warnings about unexpected settings.

        Returns:
            BatchEncoding with tokenized inputs.
        """

    def encode(
        self,
        text: Union[str, List[str], List[int]],
        text_pair: Optional[Union[str, List[str]]] = None,
        add_special_tokens: bool = True,
        padding: Union[bool, str] = False,
        truncation: Optional[Union[bool, str]] = None,
        max_length: Optional[int] = None,
        stride: int = 0,
        return_tensors: "Optional[Union[str, TensorType]]" = None,
        **kwargs
    ) -> List[int]:
        """
        Encode text to token IDs.

        Args:
            text: Text to encode.
            text_pair: Paired text for sequence pairs.
            add_special_tokens: Add special tokens.
            padding: Padding strategy.
            truncation: Truncation strategy.
            max_length: Maximum sequence length.
            stride: Stride for overlapping windows.
            return_tensors: Format of returned tensors.

        Returns:
            List of token IDs.
        """

    def decode(
        self,
        token_ids: "Union[int, List[int], torch.Tensor, tf.Tensor, np.ndarray]",
        skip_special_tokens: bool = False,
        clean_up_tokenization_spaces: Optional[bool] = None,
        **kwargs
    ) -> str:
        """
        Decode token IDs back to text.

        Args:
            token_ids: Token IDs to decode.
            skip_special_tokens: Skip special tokens in the output.
            clean_up_tokenization_spaces: Clean tokenization artifacts.

        Returns:
            Decoded text string.
        """

    def tokenize(
        self,
        text: str,
        pair: Optional[str] = None,
        add_special_tokens: bool = False,
        **kwargs
    ) -> List[str]:
        """
        Tokenize text into token strings (not IDs).

        Args:
            text: Text to tokenize.
            pair: Paired text for sequence pairs.
            add_special_tokens: Add special tokens.

        Returns:
            List of token strings.
        """

    def convert_tokens_to_ids(
        self,
        tokens: Union[str, List[str]]
    ) -> Union[int, List[int]]:
        """Convert token string(s) to the corresponding ID(s)."""

    def convert_ids_to_tokens(
        self,
        ids: Union[int, List[int]],
        skip_special_tokens: bool = False
    ) -> Union[str, List[str]]:
        """Convert ID(s) to the corresponding token string(s)."""

    def add_special_tokens(
        self,
        special_tokens_dict: "Dict[str, Union[str, AddedToken]]"
    ) -> int:
        """
        Add special tokens to the vocabulary.

        Args:
            special_tokens_dict: Dictionary of special tokens to add.

        Returns:
            Number of tokens added.
        """

    def save_pretrained(
        self,
        save_directory: Union[str, os.PathLike],
        legacy_format: Optional[bool] = None,
        filename_prefix: Optional[str] = None,
        push_to_hub: bool = False,
        **kwargs
    ) -> Tuple[str]:
        """
        Save the tokenizer files to a directory.

        Args:
            save_directory: Destination directory.
            legacy_format: Save in the legacy single-vocab-file format.
            filename_prefix: Prefix added to saved file names.
            push_to_hub: Also upload the saved files to the Hub.

        Returns:
            Tuple of paths to the files written.
        """
class PreTrainedTokenizerFast:
    """Base class for fast (Rust-based) tokenizers.

    Inherits most methods from PreTrainedTokenizer with optimized
    implementations.
    """

    def __init__(
        self,
        tokenizer_object: Optional["Tokenizer"] = None,
        tokenizer_file: Optional[str] = None,
        **kwargs
    ):
        """
        Wrap an underlying Rust tokenizer.

        Args:
            tokenizer_object: Existing low-level tokenizer object to wrap.
            tokenizer_file: Path to a serialized tokenizer file.
        """

    def train_new_from_iterator(
        self,
        text_iterator: Iterator[str],
        vocab_size: int,
        length: Optional[int] = None,
        new_special_tokens: Optional[List[str]] = None,
        special_tokens_map: Optional[Dict[str, str]] = None,
        **kwargs
    ) -> "PreTrainedTokenizerFast":
        """
        Train a new tokenizer from a text iterator.

        Args:
            text_iterator: Iterator yielding training texts.
            vocab_size: Target vocabulary size.
            length: Number of texts in the iterator, if known.
            new_special_tokens: Special tokens for the new tokenizer.
            special_tokens_map: Mapping from old to new special tokens.

        Returns:
            The newly trained tokenizer.
        """

    def push_to_hub(
        self,
        repo_id: str,
        use_temp_dir: Optional[bool] = None,
        commit_message: Optional[str] = None,
        private: Optional[bool] = None,
        token: Optional[Union[bool, str]] = None,
        **kwargs
    ) -> str:
        """Upload tokenizer to Hugging Face Hub."""


# Container for tokenizer outputs with tensor conversion capabilities.
class BatchEncoding:
    """Container for tokenized inputs with convenient methods."""

    def __init__(
        self,
        data: Optional[Dict[str, Any]] = None,
        encoding: Optional[List["EncodingFast"]] = None,
        tensor_type: "Union[None, str, TensorType]" = None,
        prepend_batch_axis: bool = False,
        n_sequences: Optional[int] = None
    ):
        """
        Wrap tokenizer output.

        Args:
            data: Mapping of input names (e.g. "input_ids") to values.
            encoding: Low-level fast-tokenizer encodings, when available.
            tensor_type: Tensor format to convert values to.
            prepend_batch_axis: Add a leading batch dimension on conversion.
            n_sequences: Number of sequences encoded per sample.
        """

    def __getitem__(self, item: Union[str, int]) -> Union[Any, List[Any]]:
        """Access tokenized data by key or index."""

    def __setitem__(self, key: str, value: Any) -> None:
        """Set tokenized data value."""

    def keys(self) -> List[str]:
        """Get all available keys."""

    def values(self) -> List[Any]:
        """Get all values."""

    def items(self) -> List[Tuple[str, Any]]:
        """Get key-value pairs."""

    def to(
        self,
        device: "Union[str, torch.device, int]"
    ) -> "BatchEncoding":
        """Move tensors to the specified device."""

    def convert_to_tensors(
        self,
        tensor_type: "Optional[Union[str, TensorType]]" = None,
        prepend_batch_axis: bool = False
    ) -> "BatchEncoding":
        """Convert to the specified tensor format."""

    @property
    def input_ids(self) -> Optional[List[List[int]]]:
        """Token IDs for input sequences."""

    @property
    def attention_mask(self) -> Optional[List[List[int]]]:
        """Attention mask (1 for real tokens, 0 for padding)."""

    @property
    def token_type_ids(self) -> Optional[List[List[int]]]:
        """Token type IDs for sequence pairs."""

    def char_to_token(
        self,
        batch_or_char_index: int,
        char_index: Optional[int] = None,
        sequence_index: int = 0
    ) -> Optional[int]:
        """Convert a character index to the corresponding token index."""

    def token_to_chars(
        self,
        batch_or_token_index: int,
        token_index: Optional[int] = None,
        sequence_index: int = 0
    ) -> Optional[Tuple[int, int]]:
        """Convert a token index to its character span."""

    def word_to_tokens(
        self,
        batch_or_word_index: int,
        word_index: Optional[int] = None,
        sequence_index: int = 0
    ) -> Optional[Tuple[int, int]]:
        """Convert a word index to its token span."""


class BertTokenizer(PreTrainedTokenizer):
    """BERT WordPiece tokenizer."""


class BertTokenizerFast(PreTrainedTokenizerFast):
    """Fast BERT tokenizer."""


class GPT2Tokenizer(PreTrainedTokenizer):
    """GPT-2 BPE tokenizer."""


class GPT2TokenizerFast(PreTrainedTokenizerFast):
    """Fast GPT-2 tokenizer."""


class T5Tokenizer(PreTrainedTokenizer):
    """T5 SentencePiece tokenizer."""


class T5TokenizerFast(PreTrainedTokenizerFast):
    """Fast T5 tokenizer."""


class RobertaTokenizer(PreTrainedTokenizer):
    """RoBERTa BPE tokenizer."""


class RobertaTokenizerFast(PreTrainedTokenizerFast):
    """Fast RoBERTa tokenizer."""


class AddedToken:
    """Represents a token that was added to the vocabulary."""

    def __init__(
        self,
        content: str,
        single_word: bool = False,
        lstrip: bool = False,
        rstrip: bool = False,
        normalized: bool = True,
        special: bool = False
    ):
        """
        Create an added token.

        Args:
            content: Token content.
            single_word: Whether the token represents a single word.
            lstrip: Remove leading whitespace when matching.
            rstrip: Remove trailing whitespace when matching.
            normalized: Whether the token is normalized.
            special: Whether this is a special token.
        """


# Helper functions for common tokenization tasks.
def is_tokenizers_available() -> bool:
    """Check whether the `tokenizers` (Rust) library is available."""


def clean_up_tokenization(text: str) -> str:
    """Clean up tokenization artifacts in text."""


def get_pairs(word: Tuple[str, ...]) -> Set[Tuple[str, str]]:
    """Get all adjacent symbol pairs in a word (used for BPE merges)."""


# Common tokenization patterns and use cases:
from transformers import AutoTokenizer
# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
# Basic tokenization
text = "Hello, world!"
tokens = tokenizer.tokenize(text)
# Output: ['hello', ',', 'world', '!']
# Encode to IDs
token_ids = tokenizer.encode(text)
# Output: [101, 7592, 1010, 2088, 999, 102] # [CLS] + tokens + [SEP]
# Decode back to text
decoded = tokenizer.decode(token_ids)
# Output: "[CLS] hello, world! [SEP]"
# Skip special tokens
decoded_clean = tokenizer.decode(token_ids, skip_special_tokens=True)
# Output: "hello, world!"
# Batch processing with padding
texts = ["Short text", "This is a much longer text that will be truncated"]
batch = tokenizer(
texts,
padding=True,
truncation=True,
max_length=10,
return_tensors="pt"
)
# Returns BatchEncoding with padded/truncated sequences
# Sequence pairs (for tasks like similarity, NLI)
result = tokenizer(
"What is AI?",
"Artificial Intelligence is machine learning.",
padding=True,
return_tensors="pt"
)
# Add custom special tokens
num_added = tokenizer.add_special_tokens({
"additional_special_tokens": ["[CUSTOM]", "[SPECIAL]"]
})
# Character-to-token mapping
encoding = tokenizer("Hello world", return_offsets_mapping=True)
char_to_token = encoding.char_to_token(6) # Character at position 6 -> token index
# The library provides both Python-based ("slow") and Rust-based ("fast") tokenizers:
Fast tokenizers (recommended): Rust-backed implementations (subclasses of PreTrainedTokenizerFast) offering faster tokenization, especially for batches.
Slow tokenizers: pure-Python implementations (subclasses of PreTrainedTokenizer), available for every model.
Use use_fast=True (the default) to automatically select a fast tokenizer when one is available.
Install with Tessl CLI
npx tessl i tessl/pypi-transformers