Repository of pre-trained NLP Transformer models: BERT & RoBERTa, GPT & GPT-2, Transformer-XL, XLNet and XLM
—
This section covers transformer architectures beyond BERT and GPT-2: OpenAI GPT, Transformer-XL, XLNet, XLM, RoBERTa, and DistilBERT. Each architecture has design characteristics suited to different NLP tasks, languages, or efficiency requirements.
XLNet uses permutation-based training and relative positional encodings, combining the best of autoregressive and autoencoding approaches.
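As a concrete illustration, the permutation mechanism is exposed through the perm_mask and target_mapping arguments documented below. The following sketch predicts a single target position; it assumes XLNetLMHeadModel is exported by the package (it exists in the upstream library but is not listed in this section), and uses the xlnet-base-cased checkpoint from the pretrained list later in this section.

import torch
from pytorch_transformers import XLNetLMHeadModel, XLNetTokenizer

tokenizer = XLNetTokenizer.from_pretrained("xlnet-base-cased")
model = XLNetLMHeadModel.from_pretrained("xlnet-base-cased")

input_ids = torch.tensor(tokenizer.encode("Hello, my dog is very cute")).unsqueeze(0)
seq_len = input_ids.shape[1]

# Hide the last token from every position, then ask the model to predict only that position
perm_mask = torch.zeros((1, seq_len, seq_len))
perm_mask[:, :, -1] = 1.0              # no token may attend to the last token
target_mapping = torch.zeros((1, 1, seq_len))
target_mapping[0, 0, -1] = 1.0         # the single prediction target is the last position

outputs = model(input_ids, perm_mask=perm_mask, target_mapping=target_mapping)
# outputs contain prediction scores for the one target position
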
class XLNetConfig(PretrainedConfig):
def __init__(
self,
vocab_size=32000,
d_model=1024,
n_layer=24,
n_head=16,
d_inner=4096,
ff_activation="gelu",
untie_r=True,
attn_type="bi",
initializer_range=0.02,
layer_norm_eps=1e-12,
dropout=0.1,
mem_len=None,
reuse_len=None,
bi_data=False,
clamp_len=-1,
same_length=False,
**kwargs
):
"""
Configuration for XLNet models.
Parameters:
- vocab_size (int): Vocabulary size
- d_model (int): Hidden layer dimensionality
- n_layer (int): Number of transformer layers
- n_head (int): Number of attention heads
- d_inner (int): Feed-forward layer dimensionality
- ff_activation (str): Feed-forward activation function
- untie_r (bool): Whether to untie relative position bias
- attn_type (str): Attention type ("bi" for bidirectional)
- dropout (float): Dropout probability
- mem_len (int): Memory length for recurrence
- reuse_len (int): Reuse length for recurrence
"""

class XLNetModel(PreTrainedModel):
def forward(
self,
input_ids=None,
attention_mask=None,
mems=None,
perm_mask=None,
target_mapping=None,
token_type_ids=None,
input_mask=None,
head_mask=None,
inputs_embeds=None
):
"""
Forward pass through XLNet model.
Parameters:
- input_ids (torch.Tensor): Token IDs
- attention_mask (torch.Tensor): Attention mask
- mems (List[torch.Tensor]): Memory from previous segments
- perm_mask (torch.Tensor): Permutation mask for attention
- target_mapping (torch.Tensor): Target mapping for partial prediction
- token_type_ids (torch.Tensor): Segment token indices
- input_mask (torch.Tensor): Input mask
- head_mask (torch.Tensor): Head mask
- inputs_embeds (torch.Tensor): Pre-computed embeddings
Returns:
XLNetModelOutput: Object with last_hidden_state and mems
"""

class XLNetForSequenceClassification(PreTrainedModel):
def forward(
self,
input_ids=None,
attention_mask=None,
mems=None,
perm_mask=None,
target_mapping=None,
token_type_ids=None,
input_mask=None,
head_mask=None,
inputs_embeds=None,
labels=None
):
"""
Forward pass for XLNet sequence classification.
Returns:
SequenceClassifierOutput: Object with loss and logits
"""

class XLNetTokenizer(PreTrainedTokenizer):
def __init__(
self,
vocab_file,
do_lower_case=False,
remove_space=True,
keep_accents=False,
bos_token="<s>",
eos_token="</s>",
unk_token="<unk>",
sep_token="<sep>",
pad_token="<pad>",
cls_token="<cls>",
mask_token="<mask>",
**kwargs
):
"""
SentencePiece-based tokenizer for XLNet.
"""

SPIECE_UNDERLINE: str = "▁"
# SentencePiece underline character used by the XLNet tokenizer
# Represents the beginning of words in subword tokenization

Usage Example:
from pytorch_transformers import XLNetForSequenceClassification, XLNetTokenizer
model = XLNetForSequenceClassification.from_pretrained("xlnet-base-cased", num_labels=2)
tokenizer = XLNetTokenizer.from_pretrained("xlnet-base-cased")
text = "This is a great movie!"
inputs = tokenizer(text, return_tensors="pt")
outputs = model(**inputs)

RoBERTa (Robustly Optimized BERT Pretraining Approach) improves on BERT with a more robust pretraining recipe: longer training on more data, larger batches, dynamic masking, and removal of the next-sentence prediction objective.
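A minimal masked-token sketch using the RoBERTa classes documented below; it follows the same callable-tokenizer conventions as the XLNet usage example above, and the checkpoint name comes from the pretrained list at the end of this section.

from pytorch_transformers import RobertaForMaskedLM, RobertaTokenizer

tokenizer = RobertaTokenizer.from_pretrained("roberta-base")
model = RobertaForMaskedLM.from_pretrained("roberta-base")

# <mask> is the mask token configured in RobertaTokenizer below
inputs = tokenizer("The capital of France is <mask>.", return_tensors="pt")
outputs = model(**inputs)
# outputs carry prediction scores of shape (batch_size, sequence_length, vocab_size)
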
class RobertaConfig(BertConfig):
def __init__(self, pad_token_id=1, bos_token_id=0, eos_token_id=2, **kwargs):
"""
Configuration for RoBERTa models (extends BertConfig).
Parameters:
- pad_token_id (int): Padding token ID
- bos_token_id (int): Beginning of sequence token ID
- eos_token_id (int): End of sequence token ID
"""

class RobertaModel(PreTrainedModel):
def forward(
self,
input_ids=None,
attention_mask=None,
token_type_ids=None,
position_ids=None,
head_mask=None,
inputs_embeds=None
):
"""
Forward pass through RoBERTa model.
Returns:
BaseModelOutputWithPooling: Object with last_hidden_state and pooler_output
"""

class RobertaForMaskedLM(PreTrainedModel):
def forward(
self,
input_ids=None,
attention_mask=None,
token_type_ids=None,
position_ids=None,
head_mask=None,
inputs_embeds=None,
labels=None
):
"""
Forward pass for RoBERTa masked language modeling.
Returns:
MaskedLMOutput: Object with loss and prediction_scores
"""

class RobertaTokenizer(PreTrainedTokenizer):
def __init__(
self,
vocab_file,
merges_file,
errors="replace",
bos_token="<s>",
eos_token="</s>",
sep_token="</s>",
cls_token="<s>",
unk_token="<unk>",
pad_token="<pad>",
mask_token="<mask>",
add_prefix_space=False,
**kwargs
):
"""
RoBERTa tokenizer (inherits from GPT2Tokenizer with different special tokens).
"""

DistilBERT is a distilled version of BERT that is 40% smaller and 60% faster while retaining 97% of BERT's language understanding performance.
class DistilBertConfig(PretrainedConfig):
def __init__(
self,
vocab_size=30522,
max_position_embeddings=512,
sinusoidal_pos_embds=False,
n_layers=6,
n_heads=12,
dim=768,
hidden_dim=3072,
dropout=0.1,
attention_dropout=0.1,
activation="gelu",
initializer_range=0.02,
**kwargs
):
"""
Configuration for DistilBERT models.
Parameters:
- vocab_size (int): Vocabulary size
- max_position_embeddings (int): Maximum sequence length
- sinusoidal_pos_embds (bool): Whether to use sinusoidal position embeddings
- n_layers (int): Number of transformer layers
- n_heads (int): Number of attention heads
- dim (int): Hidden layer dimensionality
- hidden_dim (int): Feed-forward layer dimensionality
- dropout (float): Dropout probability
- attention_dropout (float): Attention dropout probability
- activation (str): Activation function
"""

class DistilBertModel(PreTrainedModel):
def forward(
self,
input_ids=None,
attention_mask=None,
head_mask=None,
inputs_embeds=None
):
"""
Forward pass through DistilBERT model.
Returns:
BaseModelOutput: Object with last_hidden_state
"""

class DistilBertForSequenceClassification(PreTrainedModel):
def forward(
self,
input_ids=None,
attention_mask=None,
head_mask=None,
inputs_embeds=None,
labels=None
):
"""
Forward pass for DistilBERT sequence classification.
Returns:
SequenceClassifierOutput: Object with loss and logits
"""

class DistilBertTokenizer(PreTrainedTokenizer):
# Identical to BertTokenizer - uses same WordPiece tokenization
pass

XLM (Cross-lingual Language Model) for multilingual understanding and cross-lingual transfer learning.
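A minimal sketch with the XLM classes documented below. The langs tensor marks each token with a language ID; the mapping from language code to ID depends on the pretrained checkpoint's configuration, so the value 0 used here is only an illustrative assumption.

import torch
from pytorch_transformers import XLMModel, XLMTokenizer

tokenizer = XLMTokenizer.from_pretrained("xlm-mlm-100-1280")
model = XLMModel.from_pretrained("xlm-mlm-100-1280")

inputs = tokenizer("Hello, how are you?", return_tensors="pt")
# One language ID per token; 0 is assumed to be the ID for English in this checkpoint
langs = torch.zeros_like(inputs["input_ids"])
outputs = model(input_ids=inputs["input_ids"], langs=langs)
# outputs expose last_hidden_state for every input token
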
class XLMConfig(PretrainedConfig):
def __init__(
self,
vocab_size=30145,
emb_dim=2048,
n_layers=12,
n_heads=16,
dropout=0.1,
attention_dropout=0.1,
gelu_activation=True,
sinusoidal_embeddings=False,
causal=False,
asm=False,
n_langs=1,
use_lang_emb=True,
max_position_embeddings=512,
**kwargs
):
"""
Configuration for XLM models.
"""

class XLMModel(PreTrainedModel):
def forward(
self,
input_ids=None,
attention_mask=None,
langs=None,
token_type_ids=None,
position_ids=None,
lengths=None,
cache=None,
head_mask=None,
inputs_embeds=None
):
"""
Forward pass through XLM model.
Returns:
XLMModelOutput: Object with last_hidden_state
"""

class XLMTokenizer(PreTrainedTokenizer):
def __init__(
self,
vocab_file,
merges_file,
unk_token="<unk>",
bos_token="<s>",
sep_token="</s>",
pad_token="<pad>",
cls_token="</s>",
mask_token="<special1>",
**kwargs
):
"""
BPE tokenizer for XLM multilingual models.
"""

Transformer-XL enables learning longer-term dependencies with a segment-level recurrence mechanism and relative positional encodings.
class TransfoXLConfig(PretrainedConfig):
def __init__(
self,
vocab_size=267735,
cutoffs=[20000, 40000, 200000],
d_model=1024,
d_embed=1024,
n_head=16,
d_head=64,
d_inner=4096,
div_val=4,
pre_lnorm=False,
n_layer=18,
mem_len=1600,
clamp_len=1000,
same_length=True,
**kwargs
):
"""
Configuration for Transformer-XL models.
"""

class TransfoXLModel(PreTrainedModel):
def forward(
self,
input_ids=None,
mems=None,
head_mask=None,
inputs_embeds=None
):
"""
Forward pass through Transformer-XL model.
Returns:
TransfoXLModelOutput: Object with last_hidden_state and mems
"""

class TransfoXLTokenizer(PreTrainedTokenizer):
def __init__(
self,
special=None,
min_freq=0,
max_size=None,
lower_case=False,
delimiter=None,
vocab_file=None,
**kwargs
):
"""
Word-level tokenizer for Transformer-XL.
"""

The original OpenAI GPT (Generative Pre-trained Transformer) model.
class OpenAIGPTConfig(PretrainedConfig):
def __init__(
self,
vocab_size=40478,
n_positions=512,
n_ctx=512,
n_embd=768,
n_layer=12,
n_head=12,
afn="gelu",
resid_pdrop=0.1,
embd_pdrop=0.1,
attn_pdrop=0.1,
layer_norm_epsilon=1e-5,
initializer_range=0.02,
**kwargs
):
"""
Configuration for OpenAI GPT models.
"""

class OpenAIGPTModel(PreTrainedModel):
def forward(
self,
input_ids=None,
attention_mask=None,
token_type_ids=None,
position_ids=None,
head_mask=None,
inputs_embeds=None
):
"""
Forward pass through OpenAI GPT model.
Returns:
BaseModelOutput: Object with last_hidden_state
"""

class OpenAIGPTTokenizer(PreTrainedTokenizer):
def __init__(
self,
vocab_file,
merges_file,
unk_token="<unk>",
**kwargs
):
"""
BPE tokenizer for OpenAI GPT.
"""

XLNet:
xlnet-base-cased: 12-layer, 768-hidden, 12-heads, 110M parameters
xlnet-large-cased: 24-layer, 1024-hidden, 16-heads, 340M parameters

RoBERTa:
roberta-base: 12-layer, 768-hidden, 12-heads, 125M parameters
roberta-large: 24-layer, 1024-hidden, 16-heads, 355M parameters

DistilBERT:
distilbert-base-uncased: 6-layer, 768-hidden, 12-heads, 66M parameters
distilbert-base-cased: 6-layer, 768-hidden, 12-heads, 65M parameters (cased)

XLM:
xlm-mlm-en-2048: English MLM model, 2048-hidden
xlm-mlm-100-1280: 100-language MLM model, 1280-hidden

Transformer-XL:
transfo-xl-wt103: Trained on WikiText-103, 1024-hidden, 18-layer

OpenAI GPT:
openai-gpt: 12-layer, 768-hidden, 12-heads, 117M parameters

# XLNet for sequence classification
from pytorch_transformers import XLNetForSequenceClassification, XLNetTokenizer
xlnet_model = XLNetForSequenceClassification.from_pretrained("xlnet-base-cased", num_labels=2)
xlnet_tokenizer = XLNetTokenizer.from_pretrained("xlnet-base-cased")
# RoBERTa for masked language modeling
from pytorch_transformers import RobertaForMaskedLM, RobertaTokenizer
roberta_model = RobertaForMaskedLM.from_pretrained("roberta-base")
roberta_tokenizer = RobertaTokenizer.from_pretrained("roberta-base")
# DistilBERT for efficient inference
from pytorch_transformers import DistilBertForSequenceClassification, DistilBertTokenizer
distilbert_model = DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased")
distilbert_tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")
# Process text with any model
text = "This is an example sentence."
inputs = tokenizer(text, return_tensors="pt")
outputs = model(**inputs)

Install with Tessl CLI
npx tessl i tessl/pypi-pytorch-transformers