Repository of pre-trained NLP Transformer models: BERT & RoBERTa, GPT & GPT-2, Transformer-XL, XLNet and XLM
—
BERT (Bidirectional Encoder Representations from Transformers) models for various NLP tasks. BERT uses bidirectional attention to understand context from both directions, making it highly effective for understanding-based tasks like classification, question answering, and token-level predictions.
Configuration class for BERT models containing all hyperparameters and architecture specifications.
class BertConfig(PretrainedConfig):
def __init__(
self,
vocab_size=30522,
hidden_size=768,
num_hidden_layers=12,
num_attention_heads=12,
intermediate_size=3072,
hidden_act="gelu",
hidden_dropout_prob=0.1,
attention_probs_dropout_prob=0.1,
max_position_embeddings=512,
type_vocab_size=2,
initializer_range=0.02,
layer_norm_eps=1e-12,
**kwargs
):
"""
Configuration for BERT models.
Parameters:
- vocab_size (int): Vocabulary size
- hidden_size (int): Hidden layer dimensionality
- num_hidden_layers (int): Number of transformer layers
- num_attention_heads (int): Number of attention heads per layer
- intermediate_size (int): Feed-forward layer dimensionality
- hidden_act (str): Activation function ("gelu", "relu", "swish")
- hidden_dropout_prob (float): Dropout probability for hidden layers
- attention_probs_dropout_prob (float): Dropout for attention probabilities
- max_position_embeddings (int): Maximum sequence length
- type_vocab_size (int): Number of token type embeddings
- initializer_range (float): Weight initialization range
- layer_norm_eps (float): Layer normalization epsilon
"""Base BERT model for encoding sequences into contextualized representations.
class BertModel(BertPreTrainedModel):
def __init__(self, config):
"""
Initialize BERT base model.
Parameters:
- config (BertConfig): Model configuration
"""
def forward(
self,
input_ids=None,
attention_mask=None,
token_type_ids=None,
position_ids=None,
head_mask=None,
inputs_embeds=None
):
"""
Forward pass through BERT model.
Parameters:
- input_ids (torch.Tensor): Token IDs of shape (batch_size, sequence_length)
- attention_mask (torch.Tensor): Attention mask to avoid padding tokens
- token_type_ids (torch.Tensor): Segment token indices for sentence pairs
- position_ids (torch.Tensor): Position indices
- head_mask (torch.Tensor): Mask to nullify selected heads
- inputs_embeds (torch.Tensor): Pre-computed embeddings
Returns:
BaseModelOutputWithPooling: Object with last_hidden_state and pooler_output
"""Usage Example:
from pytorch_transformers import BertModel, BertTokenizer
import torch
# Load model and tokenizer
model = BertModel.from_pretrained("bert-base-uncased")
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
# Prepare input
text = "The quick brown fox jumps over the lazy dog."
inputs = tokenizer(text, return_tensors="pt")
# Get model outputs
with torch.no_grad():
outputs = model(**inputs)
# Access representations
last_hidden_state = outputs.last_hidden_state # Shape: (1, seq_len, 768)
pooled_output = outputs.pooler_output # Shape: (1, 768)
print(f"Sequence representation shape: {last_hidden_state.shape}")
print(f"Pooled representation shape: {pooled_output.shape}")Abstract base class for all BERT models that handles weight initialization and provides a simple interface for downloading and loading pre-trained models.
class BertPreTrainedModel(PreTrainedModel):
config_class = BertConfig
pretrained_model_archive_map = BERT_PRETRAINED_MODEL_ARCHIVE_MAP
load_tf_weights = load_tf_weights_in_bert
base_model_prefix = "bert"
def _init_weights(self, module):
"""
Initialize the weights for BERT models.
Parameters:
- module (nn.Module): Module to initialize
"""Usage Example:
from pytorch_transformers import BertPreTrainedModel, BertConfig
# BertPreTrainedModel is typically used as a base class for custom BERT models
class CustomBertModel(BertPreTrainedModel):
def __init__(self, config):
super().__init__(config)
# Custom model implementation
def forward(self, input_ids):
# Custom forward implementation
pass
# Initialize with proper weight initialization
config = BertConfig()
model = CustomBertModel(config)
# Weights are automatically initialized according to BERT standards
BERT model for pre-training with both masked language modeling and next sentence prediction heads.
class BertForPreTraining(BertPreTrainedModel):
def __init__(self, config):
"""
Initialize BERT for pre-training with MLM and NSP heads.
Parameters:
- config (BertConfig): Model configuration
"""
def forward(
self,
input_ids=None,
attention_mask=None,
token_type_ids=None,
position_ids=None,
head_mask=None,
inputs_embeds=None,
masked_lm_labels=None,
next_sentence_label=None
):
"""
Forward pass for pre-training with MLM and NSP tasks.
Parameters:
- input_ids (torch.Tensor): Token IDs
- attention_mask (torch.Tensor): Attention mask
- token_type_ids (torch.Tensor): Segment token indices
- position_ids (torch.Tensor): Position indices
- head_mask (torch.Tensor): Head mask
- inputs_embeds (torch.Tensor): Pre-computed embeddings
- masked_lm_labels (torch.Tensor): Labels for MLM loss
- next_sentence_label (torch.Tensor): Labels for NSP loss
Returns:
BertForPreTrainingOutput: Object with prediction_logits, seq_relationship_logits, and losses
"""Usage Example:
from pytorch_transformers import BertForPreTraining, BertTokenizer
import torch
# Load model and tokenizer
model = BertForPreTraining.from_pretrained("bert-base-uncased")
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
# Prepare pre-training data
text_a = "The cat sat on the"
text_b = "mat and slept peacefully"
inputs = tokenizer(text_a, text_b, return_tensors="pt")
# Add masked LM labels (replace some tokens with [MASK])
masked_inputs = inputs.copy()
masked_inputs['input_ids'][0, 4] = tokenizer.mask_token_id # Mask "on" ([CLS]=0, "the"=1, "cat"=2, "sat"=3, "on"=4)
masked_lm_labels = inputs['input_ids'].clone()
masked_lm_labels[masked_inputs['input_ids'] != tokenizer.mask_token_id] = -1
# Add NSP label (0 = sentence B follows A, 1 = random sentence B)
next_sentence_label = torch.tensor([0])
# Forward pass
outputs = model(**masked_inputs,
masked_lm_labels=masked_lm_labels,
next_sentence_label=next_sentence_label)
print(f"MLM loss: {outputs.loss}")
print(f"NSP predictions: {torch.softmax(outputs.seq_relationship_logits, dim=-1)}")BERT model with only a next sentence prediction head for determining if two sentences are consecutive.
class BertForNextSentencePrediction(BertPreTrainedModel):
def __init__(self, config):
"""
Initialize BERT for next sentence prediction task.
Parameters:
- config (BertConfig): Model configuration
"""
def forward(
self,
input_ids=None,
attention_mask=None,
token_type_ids=None,
position_ids=None,
head_mask=None,
inputs_embeds=None,
next_sentence_label=None
):
"""
Forward pass for next sentence prediction.
Parameters:
- input_ids (torch.Tensor): Token IDs for sentence pair
- attention_mask (torch.Tensor): Attention mask
- token_type_ids (torch.Tensor): Segment token indices (0 for sentence A, 1 for sentence B)
- position_ids (torch.Tensor): Position indices
- head_mask (torch.Tensor): Head mask
- inputs_embeds (torch.Tensor): Pre-computed embeddings
- next_sentence_label (torch.Tensor): Labels (0=consecutive, 1=random)
Returns:
NextSentencePredictorOutput: Object with logits (next-sentence classification scores) and loss
"""Usage Example:
from pytorch_transformers import BertForNextSentencePrediction, BertTokenizer
import torch
# Load model and tokenizer
model = BertForNextSentencePrediction.from_pretrained("bert-base-uncased")
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
# Prepare sentence pairs
sentence_a = "The weather is nice today"
sentence_b = "I think I'll go for a walk" # Consecutive sentence
sentence_c = "Machine learning is fascinating" # Random sentence
# Encode pairs
consecutive_inputs = tokenizer(sentence_a, sentence_b, return_tensors="pt")
random_inputs = tokenizer(sentence_a, sentence_c, return_tensors="pt")
# Predict
with torch.no_grad():
consecutive_outputs = model(**consecutive_inputs)
random_outputs = model(**random_inputs)
# Get predictions (0=consecutive, 1=random)
consecutive_probs = torch.softmax(consecutive_outputs.logits, dim=-1)
random_probs = torch.softmax(random_outputs.logits, dim=-1)
print(f"Consecutive pair - P(consecutive): {consecutive_probs[0, 0]:.3f}")
print(f"Random pair - P(consecutive): {random_probs[0, 0]:.3f}")BERT model with a language modeling head for masked language modeling (MLM) tasks.
class BertForMaskedLM(BertPreTrainedModel):
def __init__(self, config):
"""
Initialize BERT for masked language modeling.
Parameters:
- config (BertConfig): Model configuration
"""
def forward(
self,
input_ids=None,
attention_mask=None,
token_type_ids=None,
position_ids=None,
head_mask=None,
inputs_embeds=None,
labels=None
):
"""
Forward pass for masked language modeling.
Parameters:
- input_ids (torch.Tensor): Token IDs with [MASK] tokens
- attention_mask (torch.Tensor): Attention mask
- token_type_ids (torch.Tensor): Segment token indices
- position_ids (torch.Tensor): Position indices
- head_mask (torch.Tensor): Head mask
- inputs_embeds (torch.Tensor): Pre-computed embeddings
- labels (torch.Tensor): True token IDs for masked positions
Returns:
MaskedLMOutput: Object with loss and prediction_scores
"""BERT model with a classification head for sequence-level classification tasks.
class BertForSequenceClassification(BertPreTrainedModel):
def __init__(self, config):
"""
Initialize BERT for sequence classification.
Parameters:
- config (BertConfig): Model configuration with num_labels
"""
def forward(
self,
input_ids=None,
attention_mask=None,
token_type_ids=None,
position_ids=None,
head_mask=None,
inputs_embeds=None,
labels=None
):
"""
Forward pass for sequence classification.
Parameters:
- input_ids (torch.Tensor): Token IDs
- attention_mask (torch.Tensor): Attention mask
- token_type_ids (torch.Tensor): Segment token indices
- position_ids (torch.Tensor): Position indices
- head_mask (torch.Tensor): Head mask
- inputs_embeds (torch.Tensor): Pre-computed embeddings
- labels (torch.Tensor): Classification labels
Returns:
SequenceClassifierOutput: Object with loss and logits
"""Usage Example:
from pytorch_transformers import BertForSequenceClassification, BertTokenizer
import torch
# Load model for binary classification
model = BertForSequenceClassification.from_pretrained(
"bert-base-uncased",
num_labels=2
)
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
# Prepare input
text = "This movie is fantastic!"
inputs = tokenizer(text, return_tensors="pt")
# Get predictions
with torch.no_grad():
outputs = model(**inputs)
predictions = torch.nn.functional.softmax(outputs.logits, dim=-1)
print(f"Positive probability: {predictions[0][1].item():.3f}")BERT model with a span classification head for extractive question answering.
class BertForQuestionAnswering(BertPreTrainedModel):
def __init__(self, config):
"""
Initialize BERT for question answering.
Parameters:
- config (BertConfig): Model configuration
"""
def forward(
self,
input_ids=None,
attention_mask=None,
token_type_ids=None,
position_ids=None,
head_mask=None,
inputs_embeds=None,
start_positions=None,
end_positions=None
):
"""
Forward pass for question answering.
Parameters:
- input_ids (torch.Tensor): Token IDs for question and context
- attention_mask (torch.Tensor): Attention mask
- token_type_ids (torch.Tensor): Segment IDs (0 for question, 1 for context)
- position_ids (torch.Tensor): Position indices
- head_mask (torch.Tensor): Head mask
- inputs_embeds (torch.Tensor): Pre-computed embeddings
- start_positions (torch.Tensor): Start positions of answer spans
- end_positions (torch.Tensor): End positions of answer spans
Returns:
QuestionAnsweringModelOutput: Object with loss, start_logits, end_logits
"""Usage Example:
from pytorch_transformers import BertForQuestionAnswering, BertTokenizer
import torch
# Load model and tokenizer
model = BertForQuestionAnswering.from_pretrained("bert-base-uncased")
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
# Prepare question and context
question = "What is the capital of France?"
context = "France is a country in Europe. The capital of France is Paris."
# Tokenize with proper formatting
inputs = tokenizer.encode_plus(
question,
context,
return_tensors="pt",
max_length=512,
truncation=True
)
# Get answer span predictions
with torch.no_grad():
outputs = model(**inputs)
start_scores = outputs.start_logits
end_scores = outputs.end_logits
# Find best answer span
start_idx = torch.argmax(start_scores)
end_idx = torch.argmax(end_scores)
# Extract answer
answer_tokens = inputs["input_ids"][0][start_idx:end_idx+1]
answer = tokenizer.decode(answer_tokens)
print(f"Answer: {answer}")BERT model with a token classification head for token-level tasks like named entity recognition.
class BertForTokenClassification(BertPreTrainedModel):
def __init__(self, config):
"""
Initialize BERT for token classification.
Parameters:
- config (BertConfig): Model configuration with num_labels
"""
def forward(
self,
input_ids=None,
attention_mask=None,
token_type_ids=None,
position_ids=None,
head_mask=None,
inputs_embeds=None,
labels=None
):
"""
Forward pass for token classification.
Parameters:
- input_ids (torch.Tensor): Token IDs
- attention_mask (torch.Tensor): Attention mask
- token_type_ids (torch.Tensor): Segment token indices
- position_ids (torch.Tensor): Position indices
- head_mask (torch.Tensor): Head mask
- inputs_embeds (torch.Tensor): Pre-computed embeddings
- labels (torch.Tensor): Token-level labels
Returns:
TokenClassifierOutput: Object with loss and logits
"""BERT model for multiple choice tasks with a classification head over multiple choice options.
class BertForMultipleChoice(BertPreTrainedModel):
def __init__(self, config):
"""
Initialize BERT for multiple choice.
Parameters:
- config (BertConfig): Model configuration
"""
def forward(
self,
input_ids=None,
attention_mask=None,
token_type_ids=None,
position_ids=None,
head_mask=None,
inputs_embeds=None,
labels=None
):
"""
Forward pass for multiple choice.
Parameters:
- input_ids (torch.Tensor): Token IDs of shape (batch_size, num_choices, seq_len)
- attention_mask (torch.Tensor): Attention mask
- token_type_ids (torch.Tensor): Segment token indices
- position_ids (torch.Tensor): Position indices
- head_mask (torch.Tensor): Head mask
- inputs_embeds (torch.Tensor): Pre-computed embeddings
- labels (torch.Tensor): Correct choice indices
Returns:
MultipleChoiceModelOutput: Object with loss and logits
"""WordPiece tokenizer for BERT models with proper handling of special tokens and subword tokenization.
class BertTokenizer(PreTrainedTokenizer):
def __init__(
self,
vocab_file,
do_lower_case=True,
do_basic_tokenize=True,
never_split=None,
unk_token="[UNK]",
sep_token="[SEP]",
pad_token="[PAD]",
cls_token="[CLS]",
mask_token="[MASK]",
tokenize_chinese_chars=True,
**kwargs
):
"""
Initialize BERT tokenizer.
Parameters:
- vocab_file (str): Path to vocabulary file
- do_lower_case (bool): Whether to lowercase input
- do_basic_tokenize (bool): Whether to do basic tokenization
- never_split (List[str]): Tokens never to split
- unk_token (str): Unknown token
- sep_token (str): Separator token
- pad_token (str): Padding token
- cls_token (str): Classification token
- mask_token (str): Mask token
- tokenize_chinese_chars (bool): Whether to tokenize Chinese characters
"""def load_tf_weights_in_bert(model, tf_checkpoint_path):
"""
Load TensorFlow BERT checkpoint weights into a PyTorch BERT model.
Parameters:
- model (BertModel): PyTorch BERT model
- tf_checkpoint_path (str): Path to TensorFlow checkpoint
Returns:
BertModel: Model with loaded weights
"""BERT_PRETRAINED_MODEL_ARCHIVE_MAP: Dict[str, str]
# Maps model names to download URLs for pre-trained weights
BERT_PRETRAINED_CONFIG_ARCHIVE_MAP: Dict[str, str]
# Maps model names to download URLs for configurations
Available Pre-trained Models:
bert-base-uncased: 12-layer, 768-hidden, 12-heads, 110M parameters
bert-large-uncased: 24-layer, 1024-hidden, 16-heads, 340M parameters
bert-base-cased: 12-layer, 768-hidden, 12-heads, 110M parameters (cased)
bert-large-cased: 24-layer, 1024-hidden, 16-heads, 340M parameters (cased)
bert-base-multilingual-uncased: 12-layer, 768-hidden, 12-heads, 110M parameters (multilingual)
bert-base-multilingual-cased: 12-layer, 768-hidden, 12-heads, 110M parameters (multilingual, cased)
bert-base-chinese: 12-layer, 768-hidden, 12-heads, 110M parameters (Chinese)

Install with Tessl CLI
npx tessl i tessl/pypi-pytorch-transformers