Search (Ctrl+K)
Blog · Docs · Log in · Get started
Tessl Logo

tessl/pypi-transformers

State-of-the-art Machine Learning for JAX, PyTorch and TensorFlow

Overview
Eval results
Files

docs/tokenization.md

Tokenization

Comprehensive tokenization with support for 100+ different tokenizers, handling subword tokenization, special tokens, efficient batch processing, and cross-framework compatibility. The tokenization system provides consistent APIs across different architectures while optimizing for speed and memory efficiency.

Capabilities

Auto Tokenizer

Automatic tokenizer selection based on model names or configurations.

class AutoTokenizer:
    @classmethod
    def from_pretrained(
        cls,
        pretrained_model_name_or_path: Union[str, os.PathLike],
        *inputs,
        cache_dir: Union[str, os.PathLike] = None,
        force_download: bool = False,
        local_files_only: bool = False,
        token: Union[str, bool] = None,
        revision: str = "main",
        use_fast: bool = True,
        tokenizer_type: Optional[str] = None,
        trust_remote_code: bool = False,
        **kwargs
    ) -> PreTrainedTokenizer:
        """
        Load tokenizer automatically detecting the type.
        
        Args:
            pretrained_model_name_or_path: Model name or path
            cache_dir: Custom cache directory
            force_download: Force fresh download
            local_files_only: Only use local files
            token: Authentication token
            revision: Model revision/branch
            use_fast: Use fast (Rust-based) tokenizer when available
            tokenizer_type: Override auto-detected tokenizer type
            trust_remote_code: Allow custom tokenizer code
        
        Returns:
            Loaded tokenizer instance
        """

Base Tokenizer Classes

Foundation classes for all tokenizer implementations.

class PreTrainedTokenizer:
    """Base class for all Python tokenizers."""
    
    def __init__(
        self,
        model_max_length: int = None,
        padding_side: str = "right",
        truncation_side: str = "right",
        chat_template: str = None,
        model_input_names: List[str] = None,
        bos_token: Union[str, AddedToken] = None,
        eos_token: Union[str, AddedToken] = None,
        unk_token: Union[str, AddedToken] = None,
        sep_token: Union[str, AddedToken] = None,
        pad_token: Union[str, AddedToken] = None,
        cls_token: Union[str, AddedToken] = None,
        mask_token: Union[str, AddedToken] = None,
        additional_special_tokens: List[Union[str, AddedToken]] = None,
        **kwargs
    )
    
    def __call__(
        self,
        text: Union[str, List[str], List[List[str]]] = None,
        text_pair: Union[str, List[str], List[List[str]]] = None,
        text_target: Union[str, List[str], List[List[str]]] = None,
        text_pair_target: Union[str, List[str], List[List[str]]] = None,
        add_special_tokens: bool = True,
        padding: Union[bool, str] = False,
        truncation: Union[bool, str] = None,
        max_length: Optional[int] = None,
        stride: int = 0,
        is_split_into_words: bool = False,
        pad_to_multiple_of: Optional[int] = None,
        return_tensors: Optional[Union[str, TensorType]] = None,
        return_token_type_ids: Optional[bool] = None,
        return_attention_mask: Optional[bool] = None,
        return_overflowing_tokens: bool = False,
        return_special_tokens_mask: bool = False,
        return_offsets_mapping: bool = False,
        return_length: bool = False,
        verbose: bool = True,
        **kwargs
    ) -> BatchEncoding:
        """
        Main tokenization method with extensive options.
        
        Args:
            text: Input text(s) to tokenize
            text_pair: Paired text for sequence pair tasks
            add_special_tokens: Add model-specific special tokens
            padding: Padding strategy ("longest", "max_length", True, False)
            truncation: Truncation strategy (True, False, "longest_first", etc.)
            max_length: Maximum sequence length
            stride: Stride for overlapping windows
            is_split_into_words: Whether input is pre-tokenized
            pad_to_multiple_of: Pad length to multiple of this value
            return_tensors: Format of returned tensors ("pt", "tf", "np")
            return_token_type_ids: Include token type IDs
            return_attention_mask: Include attention mask
            return_overflowing_tokens: Return overflowing tokens
            return_special_tokens_mask: Mark special tokens
            return_offsets_mapping: Include character-to-token mapping
            return_length: Include sequence lengths
        
        Returns:
            BatchEncoding with tokenized inputs
        """
    
    def encode(
        self,
        text: Union[str, List[str], List[int]],
        text_pair: Optional[Union[str, List[str]]] = None,
        add_special_tokens: bool = True,
        padding: Union[bool, str] = False,
        truncation: Union[bool, str] = None,
        max_length: Optional[int] = None,
        stride: int = 0,
        return_tensors: Optional[Union[str, TensorType]] = None,
        **kwargs
    ) -> List[int]:
        """
        Encode text to token IDs.
        
        Args:
            text: Text to encode
            text_pair: Paired text for sequence pairs
            add_special_tokens: Add special tokens
            padding: Padding strategy
            truncation: Truncation strategy
            max_length: Maximum sequence length
            stride: Stride for overlapping windows
            return_tensors: Format of returned tensors
        
        Returns:
            List of token IDs
        """
    
    def decode(
        self,
        token_ids: Union[int, List[int], torch.Tensor, tf.Tensor, np.ndarray],
        skip_special_tokens: bool = False,
        clean_up_tokenization_spaces: bool = None,
        **kwargs
    ) -> str:
        """
        Decode token IDs back to text.
        
        Args:
            token_ids: Token IDs to decode
            skip_special_tokens: Skip special tokens in output
            clean_up_tokenization_spaces: Clean tokenization artifacts
        
        Returns:
            Decoded text string
        """
    
    def tokenize(
        self,
        text: str,
        pair: Optional[str] = None,
        add_special_tokens: bool = False,
        **kwargs
    ) -> List[str]:
        """
        Tokenize text into tokens (not IDs).
        
        Args:
            text: Text to tokenize
            pair: Paired text for sequence pairs
            add_special_tokens: Add special tokens
        
        Returns:
            List of token strings
        """
    
    def convert_tokens_to_ids(
        self, 
        tokens: Union[str, List[str]]
    ) -> Union[int, List[int]]:
        """Convert tokens to corresponding IDs."""
    
    def convert_ids_to_tokens(
        self,
        ids: Union[int, List[int]],
        skip_special_tokens: bool = False
    ) -> Union[str, List[str]]:
        """Convert IDs to corresponding tokens."""
    
    def add_special_tokens(
        self,
        special_tokens_dict: Dict[str, Union[str, AddedToken]]
    ) -> int:
        """
        Add special tokens to vocabulary.
        
        Args:
            special_tokens_dict: Dictionary of special tokens
        
        Returns:
            Number of tokens added
        """
    
    def save_pretrained(
        self,
        save_directory: Union[str, os.PathLike],
        legacy_format: Optional[bool] = None,
        filename_prefix: Optional[str] = None,
        push_to_hub: bool = False,
        **kwargs
    ) -> Tuple[str]:
        """Save tokenizer to directory."""

class PreTrainedTokenizerFast:
    """Base class for fast (Rust-based) tokenizers."""
    
    def __init__(
        self,
        tokenizer_object: Optional["Tokenizer"] = None,
        tokenizer_file: Optional[str] = None,
        **kwargs
    )
    
    # Inherits most methods from PreTrainedTokenizer with optimized implementations
    
    def train_new_from_iterator(
        self,
        text_iterator: Iterator[str],
        vocab_size: int,
        length: Optional[int] = None,
        new_special_tokens: Optional[List[str]] = None,
        special_tokens_map: Optional[Dict[str, str]] = None,
        **kwargs
    ) -> "PreTrainedTokenizerFast":
        """Train new tokenizer from text iterator."""
    
    def push_to_hub(
        self,
        repo_id: str,
        use_temp_dir: Optional[bool] = None,
        commit_message: Optional[str] = None,
        private: Optional[bool] = None,
        token: Union[bool, str] = None,
        **kwargs
    ) -> str:
        """Upload tokenizer to Hugging Face Hub."""

Batch Encoding

Container for tokenizer outputs with tensor conversion capabilities.

class BatchEncoding:
    """Container for tokenized inputs with convenient methods."""
    
    def __init__(
        self,
        data: Optional[Dict[str, Any]] = None,
        encoding: Optional[List["EncodingFast"]] = None,
        tensor_type: Union[None, str, TensorType] = None,
        prepend_batch_axis: bool = False,
        n_sequences: Optional[int] = None
    )
    
    def __getitem__(self, item: Union[str, int]) -> Union[Any, List[Any]]:
        """Access tokenized data by key or index."""
    
    def __setitem__(self, key: str, value: Any) -> None:
        """Set tokenized data value."""
    
    def keys(self) -> List[str]:
        """Get all available keys."""
    
    def values(self) -> List[Any]:
        """Get all values."""
    
    def items(self) -> List[Tuple[str, Any]]:
        """Get key-value pairs."""
    
    def to(
        self,
        device: Union[str, torch.device, int]
    ) -> "BatchEncoding":
        """Move tensors to specified device."""
    
    def convert_to_tensors(
        self,
        tensor_type: Optional[Union[str, TensorType]] = None,
        prepend_batch_axis: bool = False
    ) -> "BatchEncoding":
        """Convert to specified tensor format."""
    
    @property 
    def input_ids(self) -> Optional[List[List[int]]]:
        """Token IDs for input sequences."""
    
    @property
    def attention_mask(self) -> Optional[List[List[int]]]:
        """Attention mask (1 for real tokens, 0 for padding)."""
    
    @property  
    def token_type_ids(self) -> Optional[List[List[int]]]:
        """Token type IDs for sequence pairs."""
    
    def char_to_token(
        self,
        batch_or_char_index: int,
        char_index: Optional[int] = None,
        sequence_index: int = 0
    ) -> Optional[int]:
        """Convert character index to token index."""
    
    def token_to_chars(
        self,
        batch_or_token_index: int,
        token_index: Optional[int] = None,
        sequence_index: int = 0
    ) -> Optional[Tuple[int, int]]:
        """Convert token index to character span."""
    
    def word_to_tokens(
        self,
        batch_or_word_index: int,
        word_index: Optional[int] = None,
        sequence_index: int = 0
    ) -> Optional[Tuple[int, int]]:
        """Convert word index to token span."""

Popular Tokenizer Implementations

BERT Tokenizers

class BertTokenizer(PreTrainedTokenizer):
    """BERT WordPiece tokenizer."""

class BertTokenizerFast(PreTrainedTokenizerFast):
    """Fast BERT tokenizer."""

GPT Tokenizers

class GPT2Tokenizer(PreTrainedTokenizer):
    """GPT-2 BPE tokenizer."""

class GPT2TokenizerFast(PreTrainedTokenizerFast):
    """Fast GPT-2 tokenizer."""

T5 Tokenizers

class T5Tokenizer(PreTrainedTokenizer):
    """T5 SentencePiece tokenizer."""

class T5TokenizerFast(PreTrainedTokenizerFast):
    """Fast T5 tokenizer."""

RoBERTa Tokenizers

class RobertaTokenizer(PreTrainedTokenizer):
    """RoBERTa BPE tokenizer."""

class RobertaTokenizerFast(PreTrainedTokenizerFast):  
    """Fast RoBERTa tokenizer."""

Special Token Handling

class AddedToken:
    """Represents a token that was added to the vocabulary."""
    
    def __init__(
        self,
        content: str,
        single_word: bool = False,
        lstrip: bool = False,
        rstrip: bool = False,
        normalized: bool = True,
        special: bool = False
    ):
        """
        Create an added token.
        
        Args:
            content: Token content
            single_word: Whether token represents a single word
            lstrip: Remove leading whitespace
            rstrip: Remove trailing whitespace
            normalized: Whether token is normalized
            special: Whether this is a special token
        """

Tokenization Utilities

Helper functions for common tokenization tasks.

def is_tokenizers_available() -> bool:
    """Check if tokenizers library is available."""

def clean_up_tokenization(text: str) -> str:
    """Clean up tokenization artifacts in text."""

def get_pairs(word: Tuple[str, ...]) -> Set[Tuple[str, str]]:
    """Get all character pairs in a word (for BPE)."""

Tokenization Examples

Common tokenization patterns and use cases:

from transformers import AutoTokenizer

# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

# Basic tokenization
text = "Hello, world!"
tokens = tokenizer.tokenize(text)
# Output: ['hello', ',', 'world', '!']

# Encode to IDs
token_ids = tokenizer.encode(text)
# Output: [101, 7592, 1010, 2088, 999, 102]  # [CLS] + tokens + [SEP]

# Decode back to text
decoded = tokenizer.decode(token_ids)
# Output: "[CLS] hello, world! [SEP]"

# Skip special tokens
decoded_clean = tokenizer.decode(token_ids, skip_special_tokens=True)
# Output: "hello, world!"

# Batch processing with padding
texts = ["Short text", "This is a much longer text that will be truncated"]
batch = tokenizer(
    texts,
    padding=True,
    truncation=True,
    max_length=10,
    return_tensors="pt"
)
# Returns BatchEncoding with padded/truncated sequences

# Sequence pairs (for tasks like similarity, NLI)
result = tokenizer(
    "What is AI?",
    "Artificial Intelligence is machine learning.",
    padding=True,
    return_tensors="pt"
)

# Add custom special tokens
num_added = tokenizer.add_special_tokens({
    "additional_special_tokens": ["[CUSTOM]", "[SPECIAL]"]
})

# Character-to-token mapping
encoding = tokenizer("Hello world", return_offsets_mapping=True)
char_to_token = encoding.char_to_token(6)  # Character at position 6 -> token index

Fast vs Slow Tokenizers

The library provides both Python-based ("slow") and Rust-based ("fast") tokenizers:

Fast Tokenizers (Recommended):

  • Rust-based implementation for superior speed
  • Better memory efficiency
  • Additional features like offset mapping
  • Parallel processing capabilities
  • Available for most popular models

Slow Tokenizers:

  • Pure Python implementation
  • Full compatibility and customization
  • Fallback when fast tokenizer unavailable
  • Better for research and custom modifications

Use use_fast=True (default) to automatically select fast tokenizers when available.

Install with Tessl CLI

npx tessl i tessl/pypi-transformers

docs

feature-extraction.md

generation.md

index.md

models.md

optimization.md

pipelines.md

tokenization.md

training.md

tile.json