PyTorch implementations of transformer-based language models including BERT, OpenAI GPT, GPT-2, and Transformer-XL with pre-trained models, tokenizers, and utilities for NLP tasks
—
Comprehensive tokenization utilities for all supported transformer models, handling text preprocessing, encoding, decoding, and vocabulary management with model-specific tokenization strategies including WordPiece, BPE, and adaptive tokenization.
End-to-end BERT tokenizer combining punctuation splitting, lowercasing, and WordPiece tokenization for bidirectional language models.
class BertTokenizer:
    """End-to-end BERT tokenizer combining punctuation splitting, lowercasing,
    and WordPiece tokenization for bidirectional language models."""

    def __init__(
        self,
        vocab_file,
        do_lower_case=True,
        max_len=None,
        do_basic_tokenize=True,
        never_split=("[UNK]", "[SEP]", "[PAD]", "[CLS]", "[MASK]"),
    ):
        """
        Initialize BERT tokenizer.

        Args:
            vocab_file (str): Path to vocabulary file
            do_lower_case (bool): Whether to lowercase input text
            max_len (int, optional): Maximum sequence length
            do_basic_tokenize (bool): Whether to do basic tokenization before WordPiece
            never_split (tuple, optional): Tokens that should never be split
        """

    def tokenize(self, text):
        """
        Tokenize text into subword tokens.

        Args:
            text (str): Input text to tokenize

        Returns:
            list: List of subword tokens
        """

    def convert_tokens_to_ids(self, tokens):
        """
        Convert tokens to vocabulary IDs.

        Args:
            tokens (list): List of tokens

        Returns:
            list: List of token IDs
        """

    def convert_ids_to_tokens(self, ids):
        """
        Convert vocabulary IDs back to tokens.

        Args:
            ids (list): List of token IDs

        Returns:
            list: List of tokens
        """

    def save_vocabulary(self, vocab_path):
        """
        Save vocabulary to file.

        Args:
            vocab_path (str): Directory path to save vocabulary

        Returns:
            str: Path to saved vocabulary file
        """

    @classmethod
    def from_pretrained(
        cls,
        pretrained_model_name_or_path,
        cache_dir=None,
        do_lower_case=True,
        **kwargs
    ):
        """
        Load pre-trained BERT tokenizer.

        Args:
            pretrained_model_name_or_path (str): Model name or path
            cache_dir (str, optional): Cache directory
            do_lower_case (bool): Whether to lowercase

        Returns:
            BertTokenizer: Initialized tokenizer
        """


# Basic text tokenization handling punctuation splitting, accent stripping, and lowercasing.
class BasicTokenizer:
    """Basic text tokenization handling punctuation splitting, accent
    stripping, and lowercasing."""

    def __init__(self, do_lower_case=True, never_split=None):
        """
        Initialize basic tokenizer.

        Args:
            do_lower_case (bool): Whether to lowercase text
            never_split (list, optional): Tokens never to split
        """

    def tokenize(self, text):
        """
        Perform basic tokenization on text.

        Args:
            text (str): Input text

        Returns:
            list: List of basic tokens
        """


# WordPiece subword tokenization using greedy longest-match-first algorithm for handling out-of-vocabulary tokens.
class WordpieceTokenizer:
    """WordPiece subword tokenization using greedy longest-match-first
    algorithm for handling out-of-vocabulary tokens."""

    def __init__(self, vocab, unk_token="[UNK]", max_input_chars_per_word=100):
        """
        Initialize WordPiece tokenizer.

        Args:
            vocab (dict): Vocabulary mapping tokens to IDs
            unk_token (str): Unknown token symbol
            max_input_chars_per_word (int): Maximum characters per word
        """

    def tokenize(self, text):
        """
        Perform WordPiece tokenization.

        Args:
            text (str): Input text

        Returns:
            list: List of WordPiece tokens
        """


# Byte-pair encoding (BPE) tokenizer for OpenAI GPT models with special token support and text standardization.
class OpenAIGPTTokenizer:
    """Byte-pair encoding (BPE) tokenizer for OpenAI GPT models with special
    token support and text standardization."""

    def __init__(
        self,
        vocab_file,
        merges_file,
        special_tokens=None,
        max_len=None
    ):
        """
        Initialize OpenAI GPT tokenizer.

        Args:
            vocab_file (str): Path to vocabulary JSON file
            merges_file (str): Path to BPE merges file
            special_tokens (list, optional): List of special tokens
            max_len (int, optional): Maximum sequence length
        """

    def tokenize(self, text):
        """
        Perform BPE tokenization.

        Args:
            text (str): Input text

        Returns:
            list: List of BPE tokens
        """

    def convert_tokens_to_ids(self, tokens):
        """Convert tokens to IDs."""

    def convert_ids_to_tokens(self, ids, skip_special_tokens=False):
        """
        Convert IDs to tokens.

        Args:
            ids (list): Token IDs
            skip_special_tokens (bool): Whether to skip special tokens

        Returns:
            list: List of tokens
        """

    def encode(self, text):
        """
        Tokenize and convert to IDs in one step.

        Args:
            text (str): Input text

        Returns:
            list: List of token IDs
        """

    def decode(self, ids, skip_special_tokens=False, clean_up_tokenization_spaces=True):
        """
        Decode token IDs back to text.

        Args:
            ids (list): Token IDs
            skip_special_tokens (bool): Whether to skip special tokens
            clean_up_tokenization_spaces (bool): Whether to clean up spaces

        Returns:
            str: Decoded text
        """

    def set_special_tokens(self, special_tokens):
        """
        Add special tokens to vocabulary.

        Args:
            special_tokens (list): List of special tokens to add
        """

    def save_vocabulary(self, vocab_path):
        """Save tokenizer vocabulary and merges files."""

    @classmethod
    def from_pretrained(
        cls,
        pretrained_model_name_or_path,
        cache_dir=None,
        **kwargs
    ):
        """Load pre-trained OpenAI GPT tokenizer."""


# Byte-level BPE tokenizer for GPT-2 models with improved Unicode handling and robustness.
class GPT2Tokenizer:
    """Byte-level BPE tokenizer for GPT-2 models with improved Unicode
    handling and robustness."""

    def __init__(
        self,
        vocab_file,
        merges_file,
        errors='replace',
        special_tokens=None,
        max_len=None
    ):
        """
        Initialize GPT-2 tokenizer.

        Args:
            vocab_file (str): Path to vocabulary JSON file
            merges_file (str): Path to BPE merges file
            errors (str): Error handling for byte decoding
            special_tokens (list, optional): Special tokens
            max_len (int, optional): Maximum sequence length
        """

    def tokenize(self, text):
        """Perform byte-level BPE tokenization."""

    def convert_tokens_to_ids(self, tokens):
        """Convert tokens to IDs."""

    def convert_ids_to_tokens(self, ids, skip_special_tokens=False):
        """Convert IDs to tokens."""

    def encode(self, text):
        """Encode text to token IDs."""

    def decode(self, tokens):
        """
        Decode token IDs using byte-level encoding.

        Args:
            tokens (list): Token IDs or tokens

        Returns:
            str: Decoded text
        """

    def save_vocabulary(self, vocab_path):
        """Save vocabulary files."""

    @classmethod
    def from_pretrained(
        cls,
        pretrained_model_name_or_path,
        cache_dir=None,
        **kwargs
    ):
        """Load pre-trained GPT-2 tokenizer."""


# Adaptive tokenizer for Transformer-XL with vocabulary building, corpus management, and flexible tokenization options.
class TransfoXLTokenizer:
    """Adaptive tokenizer for Transformer-XL with vocabulary building, corpus
    management, and flexible tokenization options."""

    def __init__(
        self,
        special=None,
        min_freq=0,
        max_size=None,
        lower_case=False,
        delimiter=None,
        vocab_file=None,
        never_split=None
    ):
        """
        Initialize Transformer-XL tokenizer.

        Args:
            special (list, optional): Special tokens
            min_freq (int): Minimum frequency for vocabulary inclusion
            max_size (int, optional): Maximum vocabulary size
            lower_case (bool): Whether to lowercase text
            delimiter (str, optional): Token delimiter
            vocab_file (str, optional): Pre-built vocabulary file
            never_split (list, optional): Tokens never to split
        """

    def build_vocab(self):
        """Build vocabulary from counted tokens."""

    def tokenize(self, line, add_eos=False, add_double_eos=False):
        """
        Tokenize text line.

        Args:
            line (str): Input text line
            add_eos (bool): Whether to add end-of-sequence token
            add_double_eos (bool): Whether to add double EOS tokens

        Returns:
            list: List of tokens
        """

    def encode_file(self, path, ordered=False, verbose=False):
        """
        Encode entire file to token IDs.

        Args:
            path (str): File path
            ordered (bool): Whether to maintain order
            verbose (bool): Whether to show progress

        Returns:
            torch.Tensor: Encoded token IDs
        """

    def convert_tokens_to_ids(self, symbols):
        """Convert tokens to vocabulary IDs."""

    def convert_ids_to_tokens(self, indices):
        """Convert IDs to tokens."""

    @classmethod
    def from_pretrained(
        cls,
        pretrained_model_name_or_path,
        cache_dir=None,
        **kwargs
    ):
        """Load pre-trained Transformer-XL tokenizer."""


# Corpus management class for Transformer-XL providing dataset loading, vocabulary building, and data iteration.
class TransfoXLCorpus:
    """Corpus management class for Transformer-XL providing dataset loading,
    vocabulary building, and data iteration."""

    def __init__(self, path, dataset, *args, **kwargs):
        """
        Initialize corpus manager.

        Args:
            path (str): Dataset path
            dataset (str): Dataset name
        """

    def build_corpus(self, path, dataset):
        """
        Build corpus from dataset.

        Args:
            path (str): Dataset path
            dataset (str): Dataset name
        """

    def get_iterator(self, split, *args, **kwargs):
        """
        Get data iterator for specified split.

        Args:
            split (str): Dataset split ('train', 'valid', 'test')

        Returns:
            Iterator: Data iterator
        """


def load_vocab(vocab_file):
    """
    Load vocabulary file into ordered dictionary.

    Args:
        vocab_file (str): Path to vocabulary file

    Returns:
        collections.OrderedDict: Token to ID mapping
    """
def whitespace_tokenize(text):
    """Split a piece of text on runs of whitespace.

    Args:
        text (str): The string to split.

    Returns:
        list: The whitespace-delimited tokens of ``text``.
    """
def get_pairs(word):
    """Collect all adjacent symbol pairs occurring in a word, as used by the
    BPE merge step.

    Args:
        word (tuple): The word, represented as a tuple of symbols.

    Returns:
        set: Every pair of neighboring symbols found in ``word``.
    """
def text_standardize(text):
    """Normalize punctuation and whitespace in a string before tokenization.

    Args:
        text (str): The raw input string.

    Returns:
        str: The cleaned-up string with consistent punctuation and spacing.
    """
def bytes_to_unicode():
    """Build the reversible byte-to-unicode lookup table used by GPT-2's
    byte-level BPE.

    Returns:
        dict: Mapping from each UTF-8 byte value to a unicode string.
    """
def get_lm_corpus(datadir, dataset):
    """
    Get language model corpus for Transformer-XL.

    Args:
        datadir (str): Data directory
        dataset (str): Dataset name

    Returns:
        TransfoXLCorpus: Corpus instance
    """


# ---------------------------------------------------------------------------
# Usage example: BERT tokenization
# ---------------------------------------------------------------------------
# from pytorch_pretrained_bert import BertTokenizer
#
# # Load pre-trained tokenizer
# tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
# # Tokenize text
# text = "Hello world! This is BERT tokenization."
# tokens = tokenizer.tokenize(text)
# print(tokens)  # ['hello', 'world', '!', 'this', 'is', 'bert', 'token', '##ization', '.']
# # Convert to IDs
# input_ids = tokenizer.convert_tokens_to_ids(tokens)
# print(input_ids)  # [7592, 2088, 999, 2023, 2003, 14324, 19204, 6851, 1012]
# # Convert back to tokens
# recovered_tokens = tokenizer.convert_ids_to_tokens(input_ids)
# print(recovered_tokens)

# ---------------------------------------------------------------------------
# Usage example: GPT-2 encoding and decoding
# ---------------------------------------------------------------------------
# from pytorch_pretrained_bert import GPT2Tokenizer
#
# # Load tokenizer
# tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
# # Direct encoding and decoding
# text = "The future of AI is bright."
# encoded = tokenizer.encode(text)
# print(encoded)  # [464, 2003, 286, 9552, 318, 6016, 13]
# decoded = tokenizer.decode(encoded)
# print(decoded)  # "The future of AI is bright."

# ---------------------------------------------------------------------------
# Usage example: Transformer-XL tokenizer with custom settings
# ---------------------------------------------------------------------------
# from pytorch_pretrained_bert import TransfoXLTokenizer
#
# # Initialize tokenizer with custom settings
# tokenizer = TransfoXLTokenizer(
#     special=['<eos>', '<unk>'],
#     min_freq=3,
#     lower_case=True
# )
# # Tokenize with special tokens
# text = "This is a sample sentence."
# tokens = tokenizer.tokenize(text, add_eos=True)
# print(tokens)  # ['this', 'is', 'a', 'sample', 'sentence', '.', '<eos>']

# ---------------------------------------------------------------------------
# Usage example: OpenAI GPT tokenizer with special tokens
# ---------------------------------------------------------------------------
# from pytorch_pretrained_bert import OpenAIGPTTokenizer
#
# # Load tokenizer
# tokenizer = OpenAIGPTTokenizer.from_pretrained('openai-gpt')
# # Add special tokens
# special_tokens = ['<start>', '<end>']
# tokenizer.set_special_tokens(special_tokens)
# # Use special tokens
# text = "<start> Generate some text <end>"
# tokens = tokenizer.tokenize(text)
# print(tokens)

# Install with Tessl CLI:
#   npx tessl i tessl/pypi-pytorch-pretrained-bert