PyTorch implementations of transformer-based language models including BERT, OpenAI GPT, GPT-2, and Transformer-XL with pre-trained models, tokenizers, and utilities for NLP tasks
—
Complete BERT model family including configuration, base model, and task-specific variants for bidirectional language understanding tasks such as sequence classification, question answering, token classification, and masked language modeling.
Stores BERT model configuration parameters including architecture dimensions, layer counts, and training hyperparameters.
class BertConfig:
def __init__(
self,
vocab_size_or_config_json_file,
hidden_size=768,
num_hidden_layers=12,
num_attention_heads=12,
intermediate_size=3072,
hidden_act="gelu",
hidden_dropout_prob=0.1,
attention_probs_dropout_prob=0.1,
max_position_embeddings=512,
type_vocab_size=2,
initializer_range=0.02
):
"""
Initialize BERT configuration.
Args:
vocab_size_or_config_json_file (int or str): Vocabulary size or path to config JSON (required)
hidden_size (int): Hidden layer dimension
num_hidden_layers (int): Number of transformer layers
num_attention_heads (int): Number of attention heads
intermediate_size (int): Feed-forward layer dimension
hidden_act (str): Activation function ('gelu', 'relu', 'swish')
hidden_dropout_prob (float): Dropout probability for hidden layers
attention_probs_dropout_prob (float): Dropout probability for attention
max_position_embeddings (int): Maximum sequence length
type_vocab_size (int): Token type vocabulary size
initializer_range (float): Weight initialization range
"""
@classmethod
def from_dict(cls, json_object):
"""Create configuration from dictionary."""
@classmethod
def from_json_file(cls, json_file):
"""Create configuration from JSON file."""
def to_dict(self):
"""Convert configuration to dictionary."""
def to_json_string(self):
"""Convert configuration to JSON string."""

The core BERT transformer model outputting raw hidden states without task-specific heads.
class BertModel:
def __init__(self, config, output_attentions=False):
"""
Initialize BERT base model.
Args:
config (BertConfig): Model configuration
output_attentions (bool): Whether to output attention weights
"""
def forward(
self,
input_ids,
token_type_ids=None,
attention_mask=None,
output_all_encoded_layers=True
):
"""
Forward pass through BERT model.
Args:
input_ids (torch.Tensor): Token IDs of shape [batch_size, seq_len]
token_type_ids (torch.Tensor, optional): Segment IDs of shape [batch_size, seq_len]
attention_mask (torch.Tensor, optional): Attention mask of shape [batch_size, seq_len]
output_all_encoded_layers (bool): Whether to output all layer states
Returns:
tuple: (encoded_layers, pooled_output) where:
- encoded_layers (list): Hidden states from each layer
- pooled_output (torch.Tensor): Pooled representation for classification
"""
@classmethod
def from_pretrained(
cls,
pretrained_model_name_or_path,
cache_dir=None,
output_attentions=False,
**kwargs
):
"""Load pre-trained BERT model."""

BERT model with both masked language modeling and next sentence prediction heads for pre-training tasks.
class BertForPreTraining:
def __init__(self, config):
"""
Initialize BERT for pre-training.
Args:
config (BertConfig): Model configuration
"""
def forward(
self,
input_ids,
token_type_ids=None,
attention_mask=None,
masked_lm_labels=None,
next_sentence_label=None
):
"""
Forward pass with pre-training heads.
Args:
input_ids (torch.Tensor): Token IDs
token_type_ids (torch.Tensor, optional): Segment IDs
attention_mask (torch.Tensor, optional): Attention mask
masked_lm_labels (torch.Tensor, optional): MLM labels for loss computation
next_sentence_label (torch.Tensor, optional): NSP labels for loss computation
Returns:
tuple: (prediction_scores, seq_relationship_score) or loss if labels provided
"""
@classmethod
def from_pretrained(cls, pretrained_model_name_or_path, cache_dir=None, **kwargs):
"""Load pre-trained model."""

BERT model with only the masked language modeling head for MLM fine-tuning.
class BertForMaskedLM:
def __init__(self, config):
"""
Initialize BERT for masked language modeling.
Args:
config (BertConfig): Model configuration
"""
def forward(
self,
input_ids,
token_type_ids=None,
attention_mask=None,
masked_lm_labels=None
):
"""
Forward pass with MLM head.
Returns:
torch.Tensor: Prediction scores for vocabulary tokens or loss if labels provided
"""
@classmethod
def from_pretrained(cls, pretrained_model_name_or_path, cache_dir=None, **kwargs):
"""Load pre-trained model."""

BERT model with only the next sentence prediction head for NSP tasks.
class BertForNextSentencePrediction:
def __init__(self, config):
"""
Initialize BERT for next sentence prediction.
Args:
config (BertConfig): Model configuration
"""
def forward(
self,
input_ids,
token_type_ids=None,
attention_mask=None,
next_sentence_label=None
):
"""
Forward pass with NSP head.
Returns:
torch.Tensor: Next sentence prediction scores or loss if labels provided
"""
@classmethod
def from_pretrained(cls, pretrained_model_name_or_path, cache_dir=None, **kwargs):
"""Load pre-trained model."""

BERT model with a classification head for sequence-level tasks like sentiment analysis, text classification, and natural language inference.
class BertForSequenceClassification:
def __init__(self, config, num_labels):
"""
Initialize BERT for sequence classification.
Args:
config (BertConfig): Model configuration
num_labels (int): Number of classification labels
"""
def forward(
self,
input_ids,
token_type_ids=None,
attention_mask=None,
labels=None
):
"""
Forward pass with classification head.
Args:
input_ids (torch.Tensor): Token IDs
token_type_ids (torch.Tensor, optional): Segment IDs
attention_mask (torch.Tensor, optional): Attention mask
labels (torch.Tensor, optional): Classification labels for loss computation
Returns:
torch.Tensor: Classification logits or loss if labels provided
"""
@classmethod
def from_pretrained(
cls,
pretrained_model_name_or_path,
cache_dir=None,
num_labels=2,
**kwargs
):
"""Load pre-trained model."""

BERT model for multiple choice tasks where each example consists of multiple candidate choices.
class BertForMultipleChoice:
def __init__(self, config, num_choices):
"""
Initialize BERT for multiple choice.
Args:
config (BertConfig): Model configuration
num_choices (int): Number of choices per example
"""
def forward(
self,
input_ids,
token_type_ids=None,
attention_mask=None,
labels=None
):
"""
Forward pass with multiple choice head.
Args:
input_ids (torch.Tensor): Token IDs of shape [batch_size, num_choices, seq_len]
token_type_ids (torch.Tensor, optional): Segment IDs
attention_mask (torch.Tensor, optional): Attention mask
labels (torch.Tensor, optional): Choice labels for loss computation
Returns:
torch.Tensor: Choice scores or loss if labels provided
"""
@classmethod
def from_pretrained(
cls,
pretrained_model_name_or_path,
cache_dir=None,
num_choices=2,
**kwargs
):
"""Load pre-trained model."""

BERT model with a token-level classification head for tasks like named entity recognition and part-of-speech tagging.
class BertForTokenClassification:
def __init__(self, config, num_labels):
"""
Initialize BERT for token classification.
Args:
config (BertConfig): Model configuration
num_labels (int): Number of token classification labels
"""
def forward(
self,
input_ids,
token_type_ids=None,
attention_mask=None,
labels=None
):
"""
Forward pass with token classification head.
Args:
input_ids (torch.Tensor): Token IDs
token_type_ids (torch.Tensor, optional): Segment IDs
attention_mask (torch.Tensor, optional): Attention mask
labels (torch.Tensor, optional): Token labels for loss computation
Returns:
torch.Tensor: Token classification logits or loss if labels provided
"""
@classmethod
def from_pretrained(
cls,
pretrained_model_name_or_path,
cache_dir=None,
num_labels=2,
**kwargs
):
"""Load pre-trained model."""

BERT model with span-based question answering head for extractive QA tasks like SQuAD.
class BertForQuestionAnswering:
def __init__(self, config):
"""
Initialize BERT for question answering.
Args:
config (BertConfig): Model configuration
"""
def forward(
self,
input_ids,
token_type_ids=None,
attention_mask=None,
start_positions=None,
end_positions=None
):
"""
Forward pass with QA head.
Args:
input_ids (torch.Tensor): Token IDs
token_type_ids (torch.Tensor, optional): Segment IDs
attention_mask (torch.Tensor, optional): Attention mask
start_positions (torch.Tensor, optional): Start positions for loss computation
end_positions (torch.Tensor, optional): End positions for loss computation
Returns:
tuple: (start_scores, end_scores) or loss if positions provided
"""
@classmethod
def from_pretrained(cls, pretrained_model_name_or_path, cache_dir=None, **kwargs):
"""Load pre-trained model."""

Function to load TensorFlow BERT checkpoint weights into PyTorch model.
def load_tf_weights_in_bert(model, tf_checkpoint_path):
"""
Load TensorFlow BERT checkpoint into PyTorch model.
Args:
model: PyTorch BERT model instance
tf_checkpoint_path (str): Path to TensorFlow checkpoint
Returns:
PyTorch model with loaded weights
"""

from pytorch_pretrained_bert import BertModel, BertConfig
# Create model from configuration
config = BertConfig(vocab_size_or_config_json_file=30522, hidden_size=768)
model = BertModel(config)
# Or load pre-trained model
model = BertModel.from_pretrained('bert-base-uncased')

from pytorch_pretrained_bert import BertForSequenceClassification
import torch
# Load for 3-class classification
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=3)
# Forward pass
input_ids = torch.tensor([[101, 2023, 2003, 102]]) # [CLS] this is [SEP]
outputs = model(input_ids)
logits = outputs[0]  # Classification scores

from pytorch_pretrained_bert import BertForQuestionAnswering
import torch
# Load QA model
model = BertForQuestionAnswering.from_pretrained('bert-base-uncased')
# Forward pass with question and passage
input_ids = torch.tensor([[101, 2054, 2003, 102, 2023, 2003, 1996, 3437, 102]])
token_type_ids = torch.tensor([[0, 0, 0, 0, 1, 1, 1, 1, 1]]) # 0=question, 1=passage
outputs = model(input_ids, token_type_ids=token_type_ids)
start_scores, end_scores = outputs

Install with Tessl CLI
npx tessl i tessl/pypi-pytorch-pretrained-bert