PyTorch implementations of transformer-based language models including BERT, OpenAI GPT, GPT-2, and Transformer-XL with pre-trained models, tokenizers, and utilities for NLP tasks
npx @tessl/cli install tessl/pypi-pytorch-pretrained-bert@0.6.0

PyTorch implementations of transformer-based language models including Google's BERT, OpenAI's GPT and GPT-2, and Google/CMU's Transformer-XL. This library provides pre-trained models, fine-tuning examples, tokenizers, and model architectures that match the performance of their original TensorFlow implementations, designed for researchers and practitioners working with state-of-the-art language models.
pip install pytorch_pretrained_bert

import pytorch_pretrained_bert

Common imports for specific functionality:
# BERT models and tokenizer
from pytorch_pretrained_bert import (
    BertTokenizer, BertModel, BertForSequenceClassification,
    BertConfig, BertAdam
)

# OpenAI GPT models
from pytorch_pretrained_bert import (
    OpenAIGPTTokenizer, OpenAIGPTLMHeadModel, OpenAIGPTConfig
)

# GPT-2 models
from pytorch_pretrained_bert import (
    GPT2Tokenizer, GPT2LMHeadModel, GPT2Config
)

# Transformer-XL models
from pytorch_pretrained_bert import (
    TransfoXLTokenizer, TransfoXLLMHeadModel, TransfoXLConfig
)
# Utilities
from pytorch_pretrained_bert import cached_path, WEIGHTS_NAME, CONFIG_NAME

Basic usage: sequence classification with a pre-trained BERT model:

import torch
from pytorch_pretrained_bert import BertTokenizer, BertForSequenceClassification, BertConfig
# Load pre-trained model and tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)
# Tokenize input text and add the special tokens BERT expects
text = "Hello, my dog is cute"
tokens = ['[CLS]'] + tokenizer.tokenize(text) + ['[SEP]']
input_ids = tokenizer.convert_tokens_to_ids(tokens)
input_ids = torch.tensor([input_ids])
# Forward pass
with torch.no_grad():
    logits = model(input_ids)  # BertForSequenceClassification returns logits when no labels are passed

predictions = torch.nn.functional.softmax(logits, dim=-1)
print(f"Predictions: {predictions}")from pytorch_pretrained_bert import GPT2Tokenizer, GPT2LMHeadModel
# Load pre-trained GPT-2
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
model = GPT2LMHeadModel.from_pretrained('gpt2')
# Prepare input
input_text = "The future of artificial intelligence"
input_ids = tokenizer.encode(input_text)
input_ids = torch.tensor([input_ids])
# Forward pass to get next token predictions
with torch.no_grad():
    outputs = model(input_ids)

predictions = outputs[0]  # Language modeling logits
# Get next token probabilities
next_token_logits = predictions[0, -1, :]
next_token_probs = torch.softmax(next_token_logits, dim=-1)
# Sample next token
next_token_id = torch.multinomial(next_token_probs, 1).item()
next_token = tokenizer.decode([next_token_id])
print(f"Input: {input_text}")
print(f"Next token: {next_token}")The library is organized around four main transformer architectures:
The library is organized around four main transformer architectures: BERT, OpenAI GPT, GPT-2, and Transformer-XL. Each model family includes its model classes, a configuration class, and a tokenizer. All models support the from_pretrained() class method for loading pre-trained weights with automatic download and caching.
Complete BERT model family including base model, task-specific variants, configuration, and tokenization for bidirectional language understanding tasks.
class BertModel: ...
class BertForSequenceClassification: ...
class BertForQuestionAnswering: ...
class BertTokenizer: ...
class BertConfig: ...
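As a sketch of how the task-specific variants are used, BertForQuestionAnswering produces start and end logits over the input tokens. The example below is illustrative only: the question-answering head is randomly initialized until the model is fine-tuned (for example on SQuAD), and the question/context strings are made up.

import torch
from pytorch_pretrained_bert import BertTokenizer, BertForQuestionAnswering

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForQuestionAnswering.from_pretrained('bert-base-uncased')
model.eval()

question = "Who wrote the book?"
context = "The book was written by Jane Austen."

# Build the standard [CLS] question [SEP] context [SEP] input with segment ids
q_tokens = tokenizer.tokenize(question)
c_tokens = tokenizer.tokenize(context)
tokens = ['[CLS]'] + q_tokens + ['[SEP]'] + c_tokens + ['[SEP]']
segment_ids = [0] * (len(q_tokens) + 2) + [1] * (len(c_tokens) + 1)

input_ids = torch.tensor([tokenizer.convert_tokens_to_ids(tokens)])
segment_ids = torch.tensor([segment_ids])

with torch.no_grad():
    start_logits, end_logits = model(input_ids, segment_ids)

# Take the most likely start/end positions and read off the answer span
start = torch.argmax(start_logits, dim=1).item()
end = torch.argmax(end_logits, dim=1).item()
print("Answer tokens:", tokens[start:end + 1])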
Tokenization utilities for all supported model types, handling text preprocessing, encoding, decoding, and vocabulary management with model-specific tokenization strategies.

class BertTokenizer: ...
class BasicTokenizer: ...
class WordpieceTokenizer: ...
class OpenAIGPTTokenizer: ...
class GPT2Tokenizer: ...
class TransfoXLTokenizer: ...
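To illustrate the model-specific strategies, a short sketch comparing BERT's WordPiece tokenizer with GPT-2's byte-level BPE tokenizer; the sample sentence is arbitrary:

from pytorch_pretrained_bert import BertTokenizer, GPT2Tokenizer

bert_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
gpt2_tokenizer = GPT2Tokenizer.from_pretrained('gpt2')

text = "Tokenization strategies differ between models"

# BERT: WordPiece splits rare words into '##'-prefixed subword pieces
bert_tokens = bert_tokenizer.tokenize(text)
bert_ids = bert_tokenizer.convert_tokens_to_ids(bert_tokens)
print(bert_tokens)

# GPT-2: byte-level BPE, with encode()/decode() working on ids directly
gpt2_ids = gpt2_tokenizer.encode(text)
print(gpt2_ids)
print(gpt2_tokenizer.decode(gpt2_ids))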
OpenAI GPT, GPT-2, and Transformer-XL model families with their configurations and tokenizers for autoregressive language modeling and text generation tasks.

class OpenAIGPTLMHeadModel: ...
class GPT2LMHeadModel: ...
class TransfoXLLMHeadModel: ...
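The GPT-2 quick-start pattern above carries over to the other autoregressive families. A hedged sketch for OpenAI GPT, assuming (as with GPT-2) that the LM head returns vocabulary logits when no labels are passed; the prompt text is arbitrary:

import torch
from pytorch_pretrained_bert import OpenAIGPTTokenizer, OpenAIGPTLMHeadModel

tokenizer = OpenAIGPTTokenizer.from_pretrained('openai-gpt')
model = OpenAIGPTLMHeadModel.from_pretrained('openai-gpt')
model.eval()

tokens = tokenizer.tokenize("the weather today is")
input_ids = torch.tensor([tokenizer.convert_tokens_to_ids(tokens)])

with torch.no_grad():
    lm_logits = model(input_ids)  # assumed: logits over the vocabulary for each position

next_token_id = torch.argmax(lm_logits[0, -1, :]).item()
print(tokenizer.convert_ids_to_tokens([next_token_id]))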
Specialized optimizers with learning rate scheduling designed for transformer training, including BERT-specific and OpenAI-specific Adam variants.

class BertAdam: ...
class OpenAIAdam: ...

File handling, caching, and model loading utilities for automatic download, caching of pre-trained models, and conversion from TensorFlow checkpoints.
def cached_path(url_or_filename, cache_dir=None): ...
def load_tf_weights_in_bert(model, tf_checkpoint_path): ...
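cached_path resolves either a local path or a URL to a local file, downloading and caching remote files on first use. A brief sketch; both the URL and the paths below are placeholders:

from pytorch_pretrained_bert import cached_path

# Remote files are downloaded once into the cache and reused on later calls
local_file = cached_path('https://example.com/some-vocab-file.txt',
                         cache_dir='./cache/')
print(local_file)

# Local paths that already exist are returned as-is
same_path = cached_path('./my_local_vocab.txt')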
All model classes support the standard from_pretrained() pattern:

# Load model with default configuration
model = BertModel.from_pretrained('bert-base-uncased')
# Load with custom cache directory
model = BertModel.from_pretrained('bert-base-uncased', cache_dir='./models/')
# Load tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
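Models loaded this way can also be saved and restored manually; the WEIGHTS_NAME and CONFIG_NAME constants hold the library's default filenames. A minimal sketch, assuming the files are written to the current directory:

import torch
from pytorch_pretrained_bert import BertModel, BertConfig, WEIGHTS_NAME, CONFIG_NAME

model = BertModel.from_pretrained('bert-base-uncased')

# Save the weights and the JSON configuration under the default filenames
torch.save(model.state_dict(), WEIGHTS_NAME)
with open(CONFIG_NAME, 'w') as f:
    f.write(model.config.to_json_string())

# Reload: rebuild the architecture from the config, then load the weights
config = BertConfig.from_json_file(CONFIG_NAME)
model = BertModel(config)
model.load_state_dict(torch.load(WEIGHTS_NAME))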
Fine-tuning a BERT classifier with the BertAdam optimizer:

from pytorch_pretrained_bert import BertForSequenceClassification, BertAdam

# Load model for fine-tuning
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=3)
# Setup optimizer with learning rate scheduling
param_optimizer = list(model.named_parameters())
no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
optimizer_grouped_parameters = [
    {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
    {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
]
num_train_steps = 1000  # total number of optimization steps planned for fine-tuning
optimizer = BertAdam(optimizer_grouped_parameters,
                     lr=2e-5,
                     warmup=0.1,
                     t_total=num_train_steps)
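Continuing the setup above, a single illustrative training step. The batch tensors are random placeholders standing in for a real fine-tuning batch; the shapes (batch of 8, sequence length 128, 3 labels) and the 30522-token vocabulary size of bert-base-uncased are the only assumptions:

import torch

# Dummy batch: token ids, segment ids, attention mask, and gold labels
input_ids = torch.randint(0, 30522, (8, 128))
segment_ids = torch.zeros_like(input_ids)
input_mask = torch.ones_like(input_ids)
labels = torch.randint(0, 3, (8,))

model.train()
loss = model(input_ids, segment_ids, input_mask, labels)  # returns the loss when labels are given
loss.backward()
optimizer.step()
optimizer.zero_grad()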
Converting a TensorFlow BERT checkpoint to PyTorch:

from pytorch_pretrained_bert import BertConfig, BertModel, load_tf_weights_in_bert

# Create PyTorch model from the configuration that matches the TF checkpoint
config = BertConfig.from_json_file('bert_config.json')  # placeholder path to the checkpoint's config
model = BertModel(config)

# Load TensorFlow weights
load_tf_weights_in_bert(model, 'bert_model.ckpt')  # placeholder path to the TF checkpoint