CtrlK
Blog · Docs · Log in · Get started
Tessl Logo

tessl/pypi-pytorch-pretrained-bert

PyTorch implementations of transformer-based language models including BERT, OpenAI GPT, GPT-2, and Transformer-XL with pre-trained models, tokenizers, and utilities for NLP tasks

Pending
Overview
Eval results
Files

docs/gpt-models.md

GPT Models

OpenAI GPT, GPT-2, and Transformer-XL model families with their configurations and specialized components for autoregressive language modeling, text generation, and extended context processing.

Capabilities

OpenAI GPT Models

Original OpenAI GPT models with configuration and task-specific variants for language modeling and classification.

Configuration

class OpenAIGPTConfig:
    """Configuration holder for OpenAI GPT models.

    NOTE(review): this file is an interface specification — method
    bodies are intentionally empty stubs.
    """

    def __init__(
        self,
        vocab_size_or_config_json_file=40478,  # int vocab size, or str path to a JSON config file
        n_positions: int = 512,
        n_ctx: int = 512,
        n_embd: int = 768,
        n_layer: int = 12,
        n_head: int = 12,
        afn: str = "gelu",
        resid_pdrop: float = 0.1,
        embd_pdrop: float = 0.1,
        attn_pdrop: float = 0.1,
        layer_norm_epsilon: float = 1e-5,
        initializer_range: float = 0.02
    ):
        """
        Initialize OpenAI GPT configuration.
        
        Args:
            vocab_size_or_config_json_file (int or str): Vocabulary size or config path
            n_positions (int): Maximum position embeddings
            n_ctx (int): Context size
            n_embd (int): Embedding dimension
            n_layer (int): Number of transformer layers
            n_head (int): Number of attention heads
            afn (str): Activation function
            resid_pdrop (float): Residual dropout probability
            embd_pdrop (float): Embedding dropout probability
            attn_pdrop (float): Attention dropout probability
            layer_norm_epsilon (float): Layer normalization epsilon
            initializer_range (float): Weight initialization range
        """
    
    @classmethod
    def from_dict(cls, json_object):
        """Create configuration from a plain dict of attribute values."""
    
    @classmethod
    def from_json_file(cls, json_file):
        """Create configuration from a JSON file on disk."""
    
    def to_dict(self):
        """Convert configuration to a plain dictionary."""
    
    def to_json_string(self):
        """Serialize configuration to a JSON string."""

Base Model

class OpenAIGPTModel:
    """Bare OpenAI GPT transformer that outputs per-token hidden states."""

    def __init__(self, config):
        """Build the base GPT transformer described by *config*.

        Args:
            config (OpenAIGPTConfig): Hyperparameters for the model.
        """

    def forward(self, input_ids, position_ids=None, token_type_ids=None):
        """Run the transformer stack over a batch of token IDs.

        Args:
            input_ids (torch.Tensor): Token IDs, shape [batch_size, seq_len].
            position_ids (torch.Tensor, optional): Explicit position indices.
            token_type_ids (torch.Tensor, optional): Segment/type indices.

        Returns:
            torch.Tensor: Final hidden states, shape
            [batch_size, seq_len, hidden_size].
        """

    @classmethod
    def from_pretrained(cls, pretrained_model_name_or_path, cache_dir=None, **kwargs):
        """Instantiate the model from a published pre-trained OpenAI GPT checkpoint."""

Language Modeling Head

class OpenAIGPTLMHeadModel:
    """OpenAI GPT transformer topped with a tied language-modeling head."""

    def __init__(self, config):
        """Build the GPT transformer plus LM head from *config*.

        Args:
            config (OpenAIGPTConfig): Hyperparameters for the model.
        """

    def forward(
        self,
        input_ids,
        position_ids=None,
        token_type_ids=None,
        lm_labels=None
    ):
        """Compute next-token logits, or a loss when labels are supplied.

        Args:
            input_ids (torch.Tensor): Token IDs.
            position_ids (torch.Tensor, optional): Explicit position indices.
            token_type_ids (torch.Tensor, optional): Segment/type indices.
            lm_labels (torch.Tensor, optional): Target token IDs.

        Returns:
            torch.Tensor: LM logits; the LM loss instead when *lm_labels*
            is given.
        """

    @classmethod
    def from_pretrained(cls, pretrained_model_name_or_path, cache_dir=None, **kwargs):
        """Instantiate the model from a published pre-trained checkpoint."""

Double Heads Model

class OpenAIGPTDoubleHeadsModel:
    """OpenAI GPT with two output heads: language modeling and classification."""

    def __init__(self, config):
        """Build the GPT transformer with LM and classification heads.

        Args:
            config (OpenAIGPTConfig): Hyperparameters for the model.
        """

    def forward(
        self,
        input_ids,
        position_ids=None,
        token_type_ids=None,
        lm_labels=None,
        multiple_choice_labels=None
    ):
        """Run both heads over the input.

        Args:
            input_ids (torch.Tensor): Token IDs.
            position_ids (torch.Tensor, optional): Explicit position indices.
            token_type_ids (torch.Tensor, optional): Segment/type indices.
            lm_labels (torch.Tensor, optional): Target token IDs.
            multiple_choice_labels (torch.Tensor, optional): Choice labels.

        Returns:
            tuple: (lm_logits, classification_logits); the corresponding
            losses instead when labels are given.
        """

    @classmethod
    def from_pretrained(cls, pretrained_model_name_or_path, cache_dir=None, **kwargs):
        """Instantiate the model from a published pre-trained checkpoint."""

GPT-2 Models

GPT-2 model family with improved architecture and byte-level BPE tokenization.

Configuration

class GPT2Config:
    """Configuration holder for GPT-2 models.

    NOTE(review): this file is an interface specification — method
    bodies are intentionally empty stubs.
    """

    def __init__(
        self,
        vocab_size_or_config_json_file=50257,  # int vocab size, or str path to a JSON config file
        n_positions: int = 1024,
        n_ctx: int = 1024,
        n_embd: int = 768,
        n_layer: int = 12,
        n_head: int = 12,
        n_inner=None,  # int or None; None means 4 * n_embd (per docstring below)
        afn: str = "gelu_new",
        resid_pdrop: float = 0.1,
        embd_pdrop: float = 0.1,
        attn_pdrop: float = 0.1,
        layer_norm_epsilon: float = 1e-5,
        initializer_range: float = 0.02
    ):
        """
        Initialize GPT-2 configuration.
        
        Args:
            vocab_size_or_config_json_file (int or str): Vocabulary size or config path
            n_positions (int): Maximum position embeddings
            n_ctx (int): Context size
            n_embd (int): Embedding dimension
            n_layer (int): Number of layers
            n_head (int): Number of attention heads
            n_inner (int, optional): Inner dimension (defaults to 4 * n_embd)
            afn (str): Activation function
            resid_pdrop (float): Residual dropout
            embd_pdrop (float): Embedding dropout
            attn_pdrop (float): Attention dropout
            layer_norm_epsilon (float): Layer norm epsilon
            initializer_range (float): Initialization range
        """
    
    @classmethod
    def from_dict(cls, json_object):
        """Create configuration from a plain dict of attribute values."""
    
    @classmethod
    def from_json_file(cls, json_file):
        """Create configuration from a JSON file on disk."""
    
    def to_dict(self):
        """Convert configuration to a plain dictionary."""
    
    def to_json_string(self):
        """Serialize configuration to a JSON string."""

Base Model

class GPT2Model:
    """Bare GPT-2 transformer producing per-token hidden states."""

    def __init__(self, config):
        """Construct the GPT-2 transformer described by *config*.

        Args:
            config (GPT2Config): Hyperparameters for the model.
        """

    def forward(self, input_ids, position_ids=None, token_type_ids=None):
        """Encode a batch of token IDs into hidden states.

        Args:
            input_ids (torch.Tensor): Token IDs.
            position_ids (torch.Tensor, optional): Explicit position indices.
            token_type_ids (torch.Tensor, optional): Segment/type indices.

        Returns:
            torch.Tensor: Final-layer hidden states.
        """

    @classmethod
    def from_pretrained(cls, pretrained_model_name_or_path, cache_dir=None, **kwargs):
        """Instantiate the model from a published pre-trained GPT-2 checkpoint."""

Language Modeling Head

class GPT2LMHeadModel:
    """GPT-2 transformer topped with a tied language-modeling head."""

    def __init__(self, config):
        """Build the GPT-2 transformer plus LM head from *config*.

        Args:
            config (GPT2Config): Hyperparameters for the model.
        """

    def forward(
        self,
        input_ids,
        position_ids=None,
        token_type_ids=None,
        lm_labels=None
    ):
        """Compute next-token logits, or a loss when labels are supplied.

        Returns:
            torch.Tensor: LM logits; the LM loss instead when *lm_labels*
            is given.
        """

    @classmethod
    def from_pretrained(cls, pretrained_model_name_or_path, cache_dir=None, **kwargs):
        """Instantiate the model from a published pre-trained checkpoint."""

Double Heads Model

class GPT2DoubleHeadsModel:
    """GPT-2 with two output heads: language modeling and classification."""

    def __init__(self, config):
        """Build the GPT-2 transformer with LM and classification heads.

        Args:
            config (GPT2Config): Hyperparameters for the model.
        """

    def forward(
        self,
        input_ids,
        position_ids=None,
        token_type_ids=None,
        lm_labels=None,
        multiple_choice_labels=None
    ):
        """Run both heads over the input.

        Returns:
            tuple: (lm_logits, classification_logits); the corresponding
            losses instead when labels are given.
        """

    @classmethod
    def from_pretrained(cls, pretrained_model_name_or_path, cache_dir=None, **kwargs):
        """Instantiate the model from a published pre-trained checkpoint."""

Transformer-XL Models

Transformer-XL models with extended context capability through recurrence mechanism and adaptive attention.

Configuration

class TransfoXLConfig:
    """Configuration holder for Transformer-XL models.

    NOTE(review): this file is an interface specification — method
    bodies are intentionally empty stubs.
    """

    def __init__(
        self,
        vocab_size_or_config_json_file=267735,  # int vocab size, or str path to a JSON config file
        cutoffs=[20000, 40000, 200000],  # NOTE(review): mutable list default — harmless in this stub, but a tuple would be safer
        d_model: int = 1024,
        d_embed: int = 1024,
        n_head: int = 16,
        d_head: int = 64,
        d_inner: int = 4096,
        div_val: int = 4,
        pre_lnorm: bool = False,
        n_layer: int = 18,
        tgt_len: int = 128,
        ext_len: int = 0,
        mem_len: int = 1600,
        clamp_len: int = 1000,
        same_length: bool = True,
        attn_type: int = 0,
        sample_softmax: int = -1,
        adaptive: bool = True,
        tie_weight: bool = True,
        dropout: float = 0.1,
        dropatt: float = 0.0,
        untie_r: bool = True,
        embd_init: str = 'normal',
        init: str = 'normal',
        init_range: float = 0.01,
        proj_init_std: float = 0.01,
        init_std: float = 0.02
    ):
        """
        Initialize Transformer-XL configuration.
        
        Args:
            vocab_size_or_config_json_file (int or str): Vocabulary size or config path
            cutoffs (list): Adaptive softmax cutoffs
            d_model (int): Model dimension
            d_embed (int): Embedding dimension
            n_head (int): Number of attention heads
            d_head (int): Dimension per attention head
            d_inner (int): Inner feed-forward dimension
            div_val (int): Dimension reduction factor
            pre_lnorm (bool): Whether to use pre-layer normalization
            n_layer (int): Number of layers
            tgt_len (int): Target sequence length
            ext_len (int): Extended sequence length
            mem_len (int): Memory length
            clamp_len (int): Clamp length for positional encoding
            same_length (bool): Whether to use same length
            attn_type (int): Attention type
            sample_softmax (int): Sample softmax parameter
            adaptive (bool): Whether to use adaptive softmax
            tie_weight (bool): Whether to tie weights
            dropout (float): Dropout probability
            dropatt (float): Attention dropout
            untie_r (bool): Whether to untie relative position bias
            embd_init (str): Embedding initialization
            init (str): General initialization
            init_range (float): Initialization range
            proj_init_std (float): Projection initialization std
            init_std (float): Initialization std
        """
    
    @classmethod
    def from_dict(cls, json_object):
        """Create configuration from a plain dict of attribute values."""
    
    @classmethod
    def from_json_file(cls, json_file):
        """Create configuration from a JSON file on disk."""
    
    def to_dict(self):
        """Convert configuration to a plain dictionary."""
    
    def to_json_string(self):
        """Serialize configuration to a JSON string."""

Base Model

class TransfoXLModel:
    """Bare Transformer-XL that carries segment-level memory across calls."""

    def __init__(self, config):
        """Construct the Transformer-XL transformer described by *config*.

        Args:
            config (TransfoXLConfig): Hyperparameters for the model.
        """

    def forward(self, input_ids, mems=None):
        """Encode a segment, optionally conditioned on cached memory.

        Args:
            input_ids (torch.Tensor): Token IDs for the current segment.
            mems (list, optional): Memory states carried over from the
                previously processed segment(s).

        Returns:
            tuple: (hidden_states, new_mems) — the segment's hidden states
            and the refreshed memory to pass into the next call.
        """

    @classmethod
    def from_pretrained(cls, pretrained_model_name_or_path, cache_dir=None, **kwargs):
        """Instantiate the model from a published pre-trained Transformer-XL checkpoint."""

Language Modeling Head

class TransfoXLLMHeadModel:
    """Transformer-XL topped with an (adaptive) language-modeling head."""

    def __init__(self, config):
        """Build the Transformer-XL transformer plus LM head from *config*.

        Args:
            config (TransfoXLConfig): Hyperparameters for the model.
        """

    def forward(self, input_ids, labels=None, mems=None):
        """Score the segment with the LM head, threading memory through.

        Args:
            input_ids (torch.Tensor): Token IDs for the current segment.
            labels (torch.Tensor, optional): Target token IDs.
            mems (list, optional): Memory states from the previous segment.

        Returns:
            tuple: (prediction_scores, new_mems); when *labels* is given,
            the loss is returned in place of the scores.
        """

    @classmethod
    def from_pretrained(cls, pretrained_model_name_or_path, cache_dir=None, **kwargs):
        """Instantiate the model from a published pre-trained checkpoint."""

Weight Loading Functions

Functions to convert TensorFlow checkpoints to PyTorch format for each model family.

def load_tf_weights_in_openai_gpt(model, openai_checkpoint_folder_path):
    """Copy weights from a TensorFlow OpenAI GPT checkpoint into *model*.

    Args:
        model: Target PyTorch OpenAI GPT model.
        openai_checkpoint_folder_path (str): Folder containing the TF
            checkpoint files.

    Returns:
        The same PyTorch model, with weights loaded.
    """

def load_tf_weights_in_gpt2(model, gpt2_checkpoint_path):
    """Copy weights from a TensorFlow GPT-2 checkpoint into *model*.

    Args:
        model: Target PyTorch GPT-2 model.
        gpt2_checkpoint_path (str): Location of the TF checkpoint.

    Returns:
        The same PyTorch model, with weights loaded.
    """

def load_tf_weights_in_transfo_xl(model, config, tf_path):
    """Copy weights from a TensorFlow Transformer-XL checkpoint into *model*.

    Args:
        model: Target PyTorch Transformer-XL model.
        config (TransfoXLConfig): Configuration matching the checkpoint.
        tf_path (str): Location of the TF checkpoint.

    Returns:
        The same PyTorch model, with weights loaded.
    """

Usage Examples

OpenAI GPT Text Generation

from pytorch_pretrained_bert import OpenAIGPTLMHeadModel, OpenAIGPTTokenizer
import torch

# Load pre-trained model and matching tokenizer
model = OpenAIGPTLMHeadModel.from_pretrained('openai-gpt')
tokenizer = OpenAIGPTTokenizer.from_pretrained('openai-gpt')

# Prepare input: encode and add a batch dimension
text = "The artificial intelligence will"
input_ids = torch.tensor([tokenizer.encode(text)])

# Generate text
model.eval()
with torch.no_grad():
    # With no lm_labels, forward() returns the LM logits tensor directly
    # (shape [batch_size, seq_len, vocab_size]) — not a tuple, per the
    # OpenAIGPTLMHeadModel.forward contract. The original example's
    # `outputs[0]` indexed the batch dimension and then re-indexed it.
    lm_logits = model(input_ids)

    # Sample the next token from the distribution at the last position
    next_token_logits = lm_logits[0, -1, :]
    next_token = torch.multinomial(torch.softmax(next_token_logits, dim=-1), 1)

    # Decode next token
    next_word = tokenizer.decode([next_token.item()])
    print(f"Next word: {next_word}")

GPT-2 with Custom Configuration

from pytorch_pretrained_bert import GPT2Config, GPT2LMHeadModel

# Create custom configuration.
# NOTE: the parameter is `vocab_size_or_config_json_file`, not `vocab_size` —
# the original example passed a keyword that GPT2Config.__init__ does not accept.
config = GPT2Config(
    vocab_size_or_config_json_file=50257,
    n_positions=1024,
    n_embd=768,
    n_layer=12,
    n_head=12
)

# Initialize model with custom config (randomly initialized weights)
model = GPT2LMHeadModel(config)

# Or load pre-trained weights instead
model = GPT2LMHeadModel.from_pretrained('gpt2')

Transformer-XL with Memory

from pytorch_pretrained_bert import TransfoXLLMHeadModel, TransfoXLTokenizer
import torch

# Load model and tokenizer (downloads and caches the pre-trained weights)
model = TransfoXLLMHeadModel.from_pretrained('transfo-xl-wt103')
tokenizer = TransfoXLTokenizer.from_pretrained('transfo-xl-wt103')

# Process sequences with memory: the second segment continues the first
sequence1 = "The weather today is beautiful and"
sequence2 = "sunny with clear blue skies."

# Encode sequences (each wrapped in a batch dimension)
input_ids_1 = torch.tensor([tokenizer.encode(sequence1)])
input_ids_2 = torch.tensor([tokenizer.encode(sequence2)])

# Forward pass with memory
model.eval()
with torch.no_grad():
    # Process first sequence; forward() returns (prediction_scores, new_mems)
    outputs_1 = model(input_ids_1)
    mems = outputs_1[1]  # Extract memory states
    
    # Process second sequence conditioned on the first segment's memory,
    # giving it context beyond its own tokens
    outputs_2 = model(input_ids_2, mems=mems)
    logits = outputs_2[0]

Double Heads Model for Multiple Tasks

from pytorch_pretrained_bert import GPT2DoubleHeadsModel, GPT2Tokenizer
import torch

# Load the double-heads model (LM head + classification head)
model = GPT2DoubleHeadsModel.from_pretrained('gpt2')
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')

# Prepare input for both language modeling and classification
text = "This movie is great!"
input_ids = torch.tensor([tokenizer.encode(text)])

# Forward pass: with no labels, forward() returns the two heads' logits
model.eval()
with torch.no_grad():
    outputs = model(input_ids)
    lm_logits = outputs[0]      # Language modeling logits
    cls_logits = outputs[1]     # Classification logits
    
    print(f"LM logits shape: {lm_logits.shape}")
    print(f"Classification logits shape: {cls_logits.shape}")

Install with Tessl CLI

npx tessl i tessl/pypi-pytorch-pretrained-bert

docs

bert-models.md

gpt-models.md

index.md

optimizers.md

tokenizers.md

utilities.md

tile.json