PyTorch implementations of transformer-based language models including BERT, OpenAI GPT, GPT-2, and Transformer-XL with pre-trained models, tokenizers, and utilities for NLP tasks
—
Comprehensive tokenization utilities for all supported transformer models, handling text preprocessing, encoding, decoding, and vocabulary management with model-specific tokenization strategies including WordPiece, BPE, and adaptive tokenization.
End-to-end BERT tokenizer combining punctuation splitting, lowercasing, and WordPiece tokenization for bidirectional language models.
class BertTokenizer:
    """End-to-end BERT tokenizer combining punctuation splitting, lowercasing,
    and WordPiece tokenization for bidirectional language models."""

    def __init__(
        self,
        vocab_file,
        do_lower_case=True,
        max_len=None,
        do_basic_tokenize=True,
        never_split=("[UNK]", "[SEP]", "[PAD]", "[CLS]", "[MASK]"),
    ):
        """
        Initialize BERT tokenizer.

        Args:
            vocab_file (str): Path to vocabulary file
            do_lower_case (bool): Whether to lowercase input text
            max_len (int, optional): Maximum sequence length
            do_basic_tokenize (bool): Whether to do basic tokenization before WordPiece
            never_split (tuple, optional): Tokens that should never be split
        """

    def tokenize(self, text):
        """
        Tokenize text into subword tokens.

        Args:
            text (str): Input text to tokenize

        Returns:
            list: List of subword tokens
        """

    def convert_tokens_to_ids(self, tokens):
        """
        Convert tokens to vocabulary IDs.

        Args:
            tokens (list): List of tokens

        Returns:
            list: List of token IDs
        """

    def convert_ids_to_tokens(self, ids):
        """
        Convert vocabulary IDs back to tokens.

        Args:
            ids (list): List of token IDs

        Returns:
            list: List of tokens
        """

    def save_vocabulary(self, vocab_path):
        """
        Save vocabulary to file.

        Args:
            vocab_path (str): Directory path to save vocabulary

        Returns:
            str: Path to saved vocabulary file
        """

    @classmethod
    def from_pretrained(
        cls,
        pretrained_model_name_or_path,
        cache_dir=None,
        do_lower_case=True,
        **kwargs
    ):
        """
        Load pre-trained BERT tokenizer.

        Args:
            pretrained_model_name_or_path (str): Model name or path
            cache_dir (str, optional): Cache directory
            do_lower_case (bool): Whether to lowercase

        Returns:
            BertTokenizer: Initialized tokenizer
        """


# Basic text tokenization handling punctuation splitting, accent stripping, and lowercasing.
class BasicTokenizer:
    """Basic text tokenization handling punctuation splitting, accent
    stripping, and lowercasing."""

    def __init__(self, do_lower_case=True, never_split=None):
        """
        Initialize basic tokenizer.

        Args:
            do_lower_case (bool): Whether to lowercase text
            never_split (list, optional): Tokens never to split
        """

    def tokenize(self, text):
        """
        Perform basic tokenization on text.

        Args:
            text (str): Input text

        Returns:
            list: List of basic tokens
        """


# WordPiece subword tokenization using greedy longest-match-first algorithm for handling out-of-vocabulary tokens.
class WordpieceTokenizer:
    """WordPiece subword tokenization using greedy longest-match-first
    algorithm for handling out-of-vocabulary tokens."""

    def __init__(self, vocab, unk_token="[UNK]", max_input_chars_per_word=100):
        """
        Initialize WordPiece tokenizer.

        Args:
            vocab (dict): Vocabulary mapping tokens to IDs
            unk_token (str): Unknown token symbol
            max_input_chars_per_word (int): Maximum characters per word
        """

    def tokenize(self, text):
        """
        Perform WordPiece tokenization.

        Args:
            text (str): Input text

        Returns:
            list: List of WordPiece tokens
        """


# Byte-pair encoding (BPE) tokenizer for OpenAI GPT models with special token support and text standardization.
class OpenAIGPTTokenizer:
    """Byte-pair encoding (BPE) tokenizer for OpenAI GPT models with special
    token support and text standardization."""

    def __init__(
        self,
        vocab_file,
        merges_file,
        special_tokens=None,
        max_len=None
    ):
        """
        Initialize OpenAI GPT tokenizer.

        Args:
            vocab_file (str): Path to vocabulary JSON file
            merges_file (str): Path to BPE merges file
            special_tokens (list, optional): List of special tokens
            max_len (int, optional): Maximum sequence length
        """

    def tokenize(self, text):
        """
        Perform BPE tokenization.

        Args:
            text (str): Input text

        Returns:
            list: List of BPE tokens
        """

    def convert_tokens_to_ids(self, tokens):
        """Convert tokens to IDs."""

    def convert_ids_to_tokens(self, ids, skip_special_tokens=False):
        """
        Convert IDs to tokens.

        Args:
            ids (list): Token IDs
            skip_special_tokens (bool): Whether to skip special tokens

        Returns:
            list: List of tokens
        """

    def encode(self, text):
        """
        Tokenize and convert to IDs in one step.

        Args:
            text (str): Input text

        Returns:
            list: List of token IDs
        """

    def decode(self, ids, skip_special_tokens=False, clean_up_tokenization_spaces=True):
        """
        Decode token IDs back to text.

        Args:
            ids (list): Token IDs
            skip_special_tokens (bool): Whether to skip special tokens
            clean_up_tokenization_spaces (bool): Whether to clean up spaces

        Returns:
            str: Decoded text
        """

    def set_special_tokens(self, special_tokens):
        """
        Add special tokens to vocabulary.

        Args:
            special_tokens (list): List of special tokens to add
        """

    def save_vocabulary(self, vocab_path):
        """Save tokenizer vocabulary and merges files."""

    @classmethod
    def from_pretrained(
        cls,
        pretrained_model_name_or_path,
        cache_dir=None,
        **kwargs
    ):
        """Load pre-trained OpenAI GPT tokenizer."""


# Byte-level BPE tokenizer for GPT-2 models with improved Unicode handling and robustness.
class GPT2Tokenizer:
    """Byte-level BPE tokenizer for GPT-2 models with improved Unicode
    handling and robustness."""

    def __init__(
        self,
        vocab_file,
        merges_file,
        errors='replace',
        special_tokens=None,
        max_len=None
    ):
        """
        Initialize GPT-2 tokenizer.

        Args:
            vocab_file (str): Path to vocabulary JSON file
            merges_file (str): Path to BPE merges file
            errors (str): Error handling for byte decoding
            special_tokens (list, optional): Special tokens
            max_len (int, optional): Maximum sequence length
        """

    def tokenize(self, text):
        """Perform byte-level BPE tokenization."""

    def convert_tokens_to_ids(self, tokens):
        """Convert tokens to IDs."""

    def convert_ids_to_tokens(self, ids, skip_special_tokens=False):
        """Convert IDs to tokens."""

    def encode(self, text):
        """Encode text to token IDs."""

    def decode(self, tokens):
        """
        Decode token IDs using byte-level encoding.

        Args:
            tokens (list): Token IDs or tokens

        Returns:
            str: Decoded text
        """

    def save_vocabulary(self, vocab_path):
        """Save vocabulary files."""

    @classmethod
    def from_pretrained(
        cls,
        pretrained_model_name_or_path,
        cache_dir=None,
        **kwargs
    ):
        """Load pre-trained GPT-2 tokenizer."""


# Adaptive tokenizer for Transformer-XL with vocabulary building, corpus management, and flexible tokenization options.
class TransfoXLTokenizer:
    """Adaptive tokenizer for Transformer-XL with vocabulary building, corpus
    management, and flexible tokenization options."""

    def __init__(
        self,
        special=None,
        min_freq=0,
        max_size=None,
        lower_case=False,
        delimiter=None,
        vocab_file=None,
        never_split=None
    ):
        """
        Initialize Transformer-XL tokenizer.

        Args:
            special (list, optional): Special tokens
            min_freq (int): Minimum frequency for vocabulary inclusion
            max_size (int, optional): Maximum vocabulary size
            lower_case (bool): Whether to lowercase text
            delimiter (str, optional): Token delimiter
            vocab_file (str, optional): Pre-built vocabulary file
            never_split (list, optional): Tokens never to split
        """

    def build_vocab(self):
        """Build vocabulary from counted tokens."""

    def tokenize(self, line, add_eos=False, add_double_eos=False):
        """
        Tokenize text line.

        Args:
            line (str): Input text line
            add_eos (bool): Whether to add end-of-sequence token
            add_double_eos (bool): Whether to add double EOS tokens

        Returns:
            list: List of tokens
        """

    def encode_file(self, path, ordered=False, verbose=False):
        """
        Encode entire file to token IDs.

        Args:
            path (str): File path
            ordered (bool): Whether to maintain order
            verbose (bool): Whether to show progress

        Returns:
            torch.Tensor: Encoded token IDs
        """

    def convert_tokens_to_ids(self, symbols):
        """Convert tokens to vocabulary IDs."""

    def convert_ids_to_tokens(self, indices):
        """Convert IDs to tokens."""

    @classmethod
    def from_pretrained(
        cls,
        pretrained_model_name_or_path,
        cache_dir=None,
        **kwargs
    ):
        """Load pre-trained Transformer-XL tokenizer."""


# Corpus management class for Transformer-XL providing dataset loading, vocabulary building, and data iteration.
class TransfoXLCorpus:
    """Corpus management class for Transformer-XL providing dataset loading,
    vocabulary building, and data iteration."""

    def __init__(self, path, dataset, *args, **kwargs):
        """
        Initialize corpus manager.

        Args:
            path (str): Dataset path
            dataset (str): Dataset name
        """

    def build_corpus(self, path, dataset):
        """
        Build corpus from dataset.

        Args:
            path (str): Dataset path
            dataset (str): Dataset name
        """

    def get_iterator(self, split, *args, **kwargs):
        """
        Get data iterator for specified split.

        Args:
            split (str): Dataset split ('train', 'valid', 'test')

        Returns:
            Iterator: Data iterator
        """


def load_vocab(vocab_file):
    """
    Load vocabulary file into ordered dictionary.

    Args:
        vocab_file (str): Path to vocabulary file

    Returns:
        collections.OrderedDict: Token to ID mapping
    """
def whitespace_tokenize(text):
    """Split a piece of text on runs of whitespace.

    Args:
        text (str): The string to split.

    Returns:
        list: The whitespace-delimited tokens of ``text``.
    """
def get_pairs(word):
    """Collect all adjacent symbol pairs occurring in a word, as used by the
    BPE merge step.

    Args:
        word (tuple): The word, represented as a tuple of symbols.

    Returns:
        set: Every pair of neighboring symbols found in ``word``.
    """
def text_standardize(text):
    """Normalize punctuation and whitespace in a string before tokenization.

    Args:
        text (str): The raw input string.

    Returns:
        str: The cleaned-up string with consistent punctuation and spacing.
    """
def bytes_to_unicode():
    """Build the reversible byte-to-unicode lookup table used by GPT-2's
    byte-level BPE.

    Returns:
        dict: Mapping from each UTF-8 byte value to a unicode string.
    """
def get_lm_corpus(datadir, dataset):
    """
    Get language model corpus for Transformer-XL.

    Args:
        datadir (str): Data directory
        dataset (str): Dataset name

    Returns:
        TransfoXLCorpus: Corpus instance
    """


# ---------------------------------------------------------------------------
# Usage example: BERT tokenization
# ---------------------------------------------------------------------------
# from pytorch_pretrained_bert import BertTokenizer
#
# # Load pre-trained tokenizer
# tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
# # Tokenize text
# text = "Hello world! This is BERT tokenization."
# tokens = tokenizer.tokenize(text)
# print(tokens)  # ['hello', 'world', '!', 'this', 'is', 'bert', 'token', '##ization', '.']
# # Convert to IDs
# input_ids = tokenizer.convert_tokens_to_ids(tokens)
# print(input_ids)  # [7592, 2088, 999, 2023, 2003, 14324, 19204, 6851, 1012]
# # Convert back to tokens
# recovered_tokens = tokenizer.convert_ids_to_tokens(input_ids)
# print(recovered_tokens)

# ---------------------------------------------------------------------------
# Usage example: GPT-2 encoding and decoding
# ---------------------------------------------------------------------------
# from pytorch_pretrained_bert import GPT2Tokenizer
#
# # Load tokenizer
# tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
# # Direct encoding and decoding
# text = "The future of AI is bright."
# encoded = tokenizer.encode(text)
# print(encoded)  # [464, 2003, 286, 9552, 318, 6016, 13]
# decoded = tokenizer.decode(encoded)
# print(decoded)  # "The future of AI is bright."

# ---------------------------------------------------------------------------
# Usage example: Transformer-XL tokenizer with custom settings
# ---------------------------------------------------------------------------
# from pytorch_pretrained_bert import TransfoXLTokenizer
#
# # Initialize tokenizer with custom settings
# tokenizer = TransfoXLTokenizer(
#     special=['<eos>', '<unk>'],
#     min_freq=3,
#     lower_case=True
# )
# # Tokenize with special tokens
# text = "This is a sample sentence."
# tokens = tokenizer.tokenize(text, add_eos=True)
# print(tokens)  # ['this', 'is', 'a', 'sample', 'sentence', '.', '<eos>']

# ---------------------------------------------------------------------------
# Usage example: OpenAI GPT tokenizer with special tokens
# ---------------------------------------------------------------------------
# from pytorch_pretrained_bert import OpenAIGPTTokenizer
#
# # Load tokenizer
# tokenizer = OpenAIGPTTokenizer.from_pretrained('openai-gpt')
# # Add special tokens
# special_tokens = ['<start>', '<end>']
# tokenizer.set_special_tokens(special_tokens)
# # Use special tokens
# text = "<start> Generate some text <end>"
# tokens = tokenizer.tokenize(text)
# print(tokens)

# Install with Tessl CLI:
#   npx tessl i tessl/pypi-pytorch-pretrained-bert