0
# GPT Models
1
2
OpenAI GPT, GPT-2, and Transformer-XL model families with their configurations and specialized components for autoregressive language modeling, text generation, and extended context processing.
3
4
## Capabilities
5
6
### OpenAI GPT Models
7
8
Original OpenAI GPT models with configuration and task-specific variants for language modeling and classification.
9
10
#### Configuration
11
12
```python { .api }
13
class OpenAIGPTConfig:
14
def __init__(
15
self,
16
vocab_size_or_config_json_file=40478,
17
n_positions=512,
18
n_ctx=512,
19
n_embd=768,
20
n_layer=12,
21
n_head=12,
22
afn="gelu",
23
resid_pdrop=0.1,
24
embd_pdrop=0.1,
25
attn_pdrop=0.1,
26
layer_norm_epsilon=1e-5,
27
initializer_range=0.02
28
):
29
"""
30
Initialize OpenAI GPT configuration.
31
32
Args:
33
vocab_size_or_config_json_file (int or str): Vocabulary size or config path
34
n_positions (int): Maximum position embeddings
35
n_ctx (int): Context size
36
n_embd (int): Embedding dimension
37
n_layer (int): Number of transformer layers
38
n_head (int): Number of attention heads
39
afn (str): Activation function
40
resid_pdrop (float): Residual dropout probability
41
embd_pdrop (float): Embedding dropout probability
42
attn_pdrop (float): Attention dropout probability
43
layer_norm_epsilon (float): Layer normalization epsilon
44
initializer_range (float): Weight initialization range
45
"""
46
47
@classmethod
48
def from_dict(cls, json_object):
49
"""Create configuration from dictionary."""
50
51
@classmethod
52
def from_json_file(cls, json_file):
53
"""Create configuration from JSON file."""
54
55
def to_dict(self):
56
"""Convert to dictionary."""
57
58
def to_json_string(self):
59
"""Convert to JSON string."""
60
```
61
62
#### Base Model
63
64
```python { .api }
65
class OpenAIGPTModel:
66
def __init__(self, config):
67
"""
68
Initialize OpenAI GPT base model.
69
70
Args:
71
config (OpenAIGPTConfig): Model configuration
72
"""
73
74
def forward(self, input_ids, position_ids=None, token_type_ids=None):
75
"""
76
Forward pass through GPT model.
77
78
Args:
79
input_ids (torch.Tensor): Token IDs of shape [batch_size, seq_len]
80
position_ids (torch.Tensor, optional): Position IDs
81
token_type_ids (torch.Tensor, optional): Token type IDs
82
83
Returns:
84
torch.Tensor: Hidden states of shape [batch_size, seq_len, hidden_size]
85
"""
86
87
@classmethod
88
def from_pretrained(cls, pretrained_model_name_or_path, cache_dir=None, **kwargs):
89
"""Load pre-trained OpenAI GPT model."""
90
```
91
92
#### Language Modeling Head
93
94
```python { .api }
95
class OpenAIGPTLMHeadModel:
96
def __init__(self, config):
97
"""
98
Initialize OpenAI GPT with language modeling head.
99
100
Args:
101
config (OpenAIGPTConfig): Model configuration
102
"""
103
104
def forward(
105
self,
106
input_ids,
107
position_ids=None,
108
token_type_ids=None,
109
lm_labels=None
110
):
111
"""
112
Forward pass with language modeling head.
113
114
Args:
115
input_ids (torch.Tensor): Token IDs
116
position_ids (torch.Tensor, optional): Position IDs
117
token_type_ids (torch.Tensor, optional): Token type IDs
118
lm_labels (torch.Tensor, optional): Language modeling labels
119
120
Returns:
121
torch.Tensor: Language modeling logits or loss if labels provided
122
"""
123
124
@classmethod
125
def from_pretrained(cls, pretrained_model_name_or_path, cache_dir=None, **kwargs):
126
"""Load pre-trained model."""
127
```
128
129
#### Double Heads Model
130
131
```python { .api }
132
class OpenAIGPTDoubleHeadsModel:
133
def __init__(self, config):
134
"""
135
Initialize OpenAI GPT with both language modeling and classification heads.
136
137
Args:
138
config (OpenAIGPTConfig): Model configuration
139
"""
140
141
def forward(
142
self,
143
input_ids,
144
position_ids=None,
145
token_type_ids=None,
146
lm_labels=None,
147
multiple_choice_labels=None
148
):
149
"""
150
Forward pass with both heads.
151
152
Args:
153
input_ids (torch.Tensor): Token IDs
154
position_ids (torch.Tensor, optional): Position IDs
155
token_type_ids (torch.Tensor, optional): Token type IDs
156
lm_labels (torch.Tensor, optional): Language modeling labels
157
multiple_choice_labels (torch.Tensor, optional): Classification labels
158
159
Returns:
160
tuple: (lm_logits, classification_logits) or losses if labels provided
161
"""
162
163
@classmethod
164
def from_pretrained(cls, pretrained_model_name_or_path, cache_dir=None, **kwargs):
165
"""Load pre-trained model."""
166
```
167
168
### GPT-2 Models
169
170
GPT-2 model family with improved architecture and byte-level BPE tokenization.
171
172
#### Configuration
173
174
```python { .api }
175
class GPT2Config:
176
def __init__(
177
self,
178
vocab_size_or_config_json_file=50257,
179
n_positions=1024,
180
n_ctx=1024,
181
n_embd=768,
182
n_layer=12,
183
n_head=12,
184
n_inner=None,
185
afn="gelu_new",
186
resid_pdrop=0.1,
187
embd_pdrop=0.1,
188
attn_pdrop=0.1,
189
layer_norm_epsilon=1e-5,
190
initializer_range=0.02
191
):
192
"""
193
Initialize GPT-2 configuration.
194
195
Args:
196
vocab_size_or_config_json_file (int or str): Vocabulary size or config path
197
n_positions (int): Maximum position embeddings
198
n_ctx (int): Context size
199
n_embd (int): Embedding dimension
200
n_layer (int): Number of layers
201
n_head (int): Number of attention heads
202
n_inner (int, optional): Inner dimension (defaults to 4 * n_embd)
203
afn (str): Activation function
204
resid_pdrop (float): Residual dropout
205
embd_pdrop (float): Embedding dropout
206
attn_pdrop (float): Attention dropout
207
layer_norm_epsilon (float): Layer norm epsilon
208
initializer_range (float): Initialization range
209
"""
210
211
@classmethod
212
def from_dict(cls, json_object):
213
"""Create from dictionary."""
214
215
@classmethod
216
def from_json_file(cls, json_file):
217
"""Create from JSON file."""
218
219
def to_dict(self):
220
"""Convert to dictionary."""
221
222
def to_json_string(self):
223
"""Convert to JSON string."""
224
```
225
226
#### Base Model
227
228
```python { .api }
229
class GPT2Model:
230
def __init__(self, config):
231
"""
232
Initialize GPT-2 base model.
233
234
Args:
235
config (GPT2Config): Model configuration
236
"""
237
238
def forward(self, input_ids, position_ids=None, token_type_ids=None):
239
"""
240
Forward pass through GPT-2.
241
242
Args:
243
input_ids (torch.Tensor): Token IDs
244
position_ids (torch.Tensor, optional): Position IDs
245
token_type_ids (torch.Tensor, optional): Token type IDs
246
247
Returns:
248
torch.Tensor: Hidden states
249
"""
250
251
@classmethod
252
def from_pretrained(cls, pretrained_model_name_or_path, cache_dir=None, **kwargs):
253
"""Load pre-trained GPT-2 model."""
254
```
255
256
#### Language Modeling Head
257
258
```python { .api }
259
class GPT2LMHeadModel:
260
def __init__(self, config):
261
"""
262
Initialize GPT-2 with language modeling head.
263
264
Args:
265
config (GPT2Config): Model configuration
266
"""
267
268
def forward(
269
self,
270
input_ids,
271
position_ids=None,
272
token_type_ids=None,
273
lm_labels=None
274
):
275
"""
276
Forward pass with LM head.
277
278
Returns:
279
torch.Tensor: Language modeling logits or loss
280
"""
281
282
@classmethod
283
def from_pretrained(cls, pretrained_model_name_or_path, cache_dir=None, **kwargs):
284
"""Load pre-trained model."""
285
```
286
287
#### Double Heads Model
288
289
```python { .api }
290
class GPT2DoubleHeadsModel:
291
def __init__(self, config):
292
"""
293
Initialize GPT-2 with language modeling and classification heads.
294
295
Args:
296
config (GPT2Config): Model configuration
297
"""
298
299
def forward(
300
self,
301
input_ids,
302
position_ids=None,
303
token_type_ids=None,
304
lm_labels=None,
305
multiple_choice_labels=None
306
):
307
"""
308
Forward pass with both heads.
309
310
Returns:
311
tuple: (lm_logits, classification_logits) or losses
312
"""
313
314
@classmethod
315
def from_pretrained(cls, pretrained_model_name_or_path, cache_dir=None, **kwargs):
316
"""Load pre-trained model."""
317
```
318
319
320
### Transformer-XL Models
321
322
Transformer-XL models with extended context capability through a segment-level recurrence mechanism (memory states carried over from previous segments) and an adaptive softmax output layer.
323
324
#### Configuration
325
326
```python { .api }
327
class TransfoXLConfig:
328
def __init__(
329
self,
330
vocab_size_or_config_json_file=267735,
331
cutoffs=[20000, 40000, 200000],
332
d_model=1024,
333
d_embed=1024,
334
n_head=16,
335
d_head=64,
336
d_inner=4096,
337
div_val=4,
338
pre_lnorm=False,
339
n_layer=18,
340
tgt_len=128,
341
ext_len=0,
342
mem_len=1600,
343
clamp_len=1000,
344
same_length=True,
345
attn_type=0,
346
sample_softmax=-1,
347
adaptive=True,
348
tie_weight=True,
349
dropout=0.1,
350
dropatt=0.0,
351
untie_r=True,
352
embd_init='normal',
353
init='normal',
354
init_range=0.01,
355
proj_init_std=0.01,
356
init_std=0.02
357
):
358
"""
359
Initialize Transformer-XL configuration.
360
361
Args:
362
vocab_size_or_config_json_file (int or str): Vocabulary size or config path
363
cutoffs (list): Adaptive softmax cutoffs
364
d_model (int): Model dimension
365
d_embed (int): Embedding dimension
366
n_head (int): Number of attention heads
367
d_head (int): Dimension per attention head
368
d_inner (int): Inner feed-forward dimension
369
div_val (int): Dimension reduction factor
370
pre_lnorm (bool): Whether to use pre-layer normalization
371
n_layer (int): Number of layers
372
tgt_len (int): Target sequence length
373
ext_len (int): Extended sequence length
374
mem_len (int): Memory length
375
clamp_len (int): Clamp length for positional encoding
376
same_length (bool): Whether to use the same attention length for every token
377
attn_type (int): Attention type
378
sample_softmax (int): Number of samples for sampled softmax (-1 disables sampling)
379
adaptive (bool): Whether to use adaptive softmax
380
tie_weight (bool): Whether to tie weights
381
dropout (float): Dropout probability
382
dropatt (float): Attention dropout
383
untie_r (bool): Whether to untie relative position bias
384
embd_init (str): Embedding initialization
385
init (str): General initialization
386
init_range (float): Initialization range
387
proj_init_std (float): Projection initialization std
388
init_std (float): Initialization std
389
"""
390
391
@classmethod
392
def from_dict(cls, json_object):
393
"""Create from dictionary."""
394
395
@classmethod
396
def from_json_file(cls, json_file):
397
"""Create from JSON file."""
398
399
def to_dict(self):
400
"""Convert to dictionary."""
401
402
def to_json_string(self):
403
"""Convert to JSON string."""
404
```
405
406
#### Base Model
407
408
```python { .api }
409
class TransfoXLModel:
410
def __init__(self, config):
411
"""
412
Initialize Transformer-XL base model.
413
414
Args:
415
config (TransfoXLConfig): Model configuration
416
"""
417
418
def forward(self, input_ids, mems=None):
419
"""
420
Forward pass with memory mechanism.
421
422
Args:
423
input_ids (torch.Tensor): Token IDs
424
mems (list, optional): Memory states from previous segments
425
426
Returns:
427
tuple: (hidden_states, new_mems) where:
428
- hidden_states (torch.Tensor): Output hidden states
429
- new_mems (list): Updated memory states
430
"""
431
432
@classmethod
433
def from_pretrained(cls, pretrained_model_name_or_path, cache_dir=None, **kwargs):
434
"""Load pre-trained Transformer-XL model."""
435
```
436
437
#### Language Modeling Head
438
439
```python { .api }
440
class TransfoXLLMHeadModel:
441
def __init__(self, config):
442
"""
443
Initialize Transformer-XL with language modeling head.
444
445
Args:
446
config (TransfoXLConfig): Model configuration
447
"""
448
449
def forward(self, input_ids, labels=None, mems=None):
450
"""
451
Forward pass with LM head and memory.
452
453
Args:
454
input_ids (torch.Tensor): Token IDs
455
labels (torch.Tensor, optional): Language modeling labels
456
mems (list, optional): Memory states
457
458
Returns:
459
tuple: (prediction_scores, new_mems) or loss if labels provided
460
"""
461
462
@classmethod
463
def from_pretrained(cls, pretrained_model_name_or_path, cache_dir=None, **kwargs):
464
"""Load pre-trained model."""
465
```
466
467
## Weight Loading Functions
468
469
Functions to convert TensorFlow checkpoints to PyTorch format for each model family.
470
471
```python { .api }
472
def load_tf_weights_in_openai_gpt(model, openai_checkpoint_folder_path):
473
"""
474
Load TensorFlow OpenAI GPT checkpoint into PyTorch model.
475
476
Args:
477
model: PyTorch OpenAI GPT model
478
openai_checkpoint_folder_path (str): Path to TF checkpoint folder
479
480
Returns:
481
PyTorch model with loaded weights
482
"""
483
484
def load_tf_weights_in_gpt2(model, gpt2_checkpoint_path):
485
"""
486
Load TensorFlow GPT-2 checkpoint into PyTorch model.
487
488
Args:
489
model: PyTorch GPT-2 model
490
gpt2_checkpoint_path (str): Path to TF checkpoint
491
492
Returns:
493
PyTorch model with loaded weights
494
"""
495
496
def load_tf_weights_in_transfo_xl(model, config, tf_path):
497
"""
498
Load TensorFlow Transformer-XL checkpoint into PyTorch model.
499
500
Args:
501
model: PyTorch Transformer-XL model
502
config (TransfoXLConfig): Model configuration
503
tf_path (str): Path to TF checkpoint
504
505
Returns:
506
PyTorch model with loaded weights
507
"""
508
```
509
510
## Usage Examples
511
512
### OpenAI GPT Text Generation
513
514
```python
515
from pytorch_pretrained_bert import OpenAIGPTLMHeadModel, OpenAIGPTTokenizer
516
import torch
517
518
# Load model and tokenizer
519
model = OpenAIGPTLMHeadModel.from_pretrained('openai-gpt')
520
tokenizer = OpenAIGPTTokenizer.from_pretrained('openai-gpt')
521
522
# Prepare input
523
text = "The artificial intelligence will"
524
input_ids = torch.tensor([tokenizer.encode(text)])
525
526
# Generate text
527
model.eval()
528
with torch.no_grad():
529
outputs = model(input_ids)
530
predictions = outputs  # forward() returns the logits tensor [batch_size, seq_len, vocab_size] when no labels are passed
531
532
# Get next token probabilities
533
next_token_logits = predictions[0, -1, :]
534
next_token = torch.multinomial(torch.softmax(next_token_logits, dim=-1), 1)
535
536
# Decode next token
537
next_word = tokenizer.decode([next_token.item()])
538
print(f"Next word: {next_word}")
539
```
540
541
### GPT-2 with Custom Configuration
542
543
```python
544
from pytorch_pretrained_bert import GPT2Config, GPT2LMHeadModel
545
546
# Create custom configuration
547
config = GPT2Config(
548
vocab_size_or_config_json_file=50257,
549
n_positions=1024,
550
n_embd=768,
551
n_layer=12,
552
n_head=12
553
)
554
555
# Initialize model with custom config
556
model = GPT2LMHeadModel(config)
557
558
# Or load pre-trained
559
model = GPT2LMHeadModel.from_pretrained('gpt2')
560
```
561
562
### Transformer-XL with Memory
563
564
```python
565
from pytorch_pretrained_bert import TransfoXLLMHeadModel, TransfoXLTokenizer
566
import torch
567
568
# Load model and tokenizer
569
model = TransfoXLLMHeadModel.from_pretrained('transfo-xl-wt103')
570
tokenizer = TransfoXLTokenizer.from_pretrained('transfo-xl-wt103')
571
572
# Process sequences with memory
573
sequence1 = "The weather today is beautiful and"
574
sequence2 = "sunny with clear blue skies."
575
576
# Encode sequences
577
input_ids_1 = torch.tensor([tokenizer.encode(sequence1)])
578
input_ids_2 = torch.tensor([tokenizer.encode(sequence2)])
579
580
# Forward pass with memory
581
model.eval()
582
with torch.no_grad():
583
# Process first sequence
584
outputs_1 = model(input_ids_1)
585
mems = outputs_1[1] # Extract memory states
586
587
# Process second sequence with memory from first
588
outputs_2 = model(input_ids_2, mems=mems)
589
logits = outputs_2[0]
590
```
591
592
### Double Heads Model for Multiple Tasks
593
594
```python
595
from pytorch_pretrained_bert import GPT2DoubleHeadsModel, GPT2Tokenizer
596
import torch
597
598
# Load double heads model
599
model = GPT2DoubleHeadsModel.from_pretrained('gpt2')
600
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
601
602
# Prepare input for both language modeling and classification
603
text = "This movie is great!"
604
input_ids = torch.tensor([tokenizer.encode(text)])
605
606
# Forward pass
607
model.eval()
608
with torch.no_grad():
609
outputs = model(input_ids)
610
lm_logits = outputs[0] # Language modeling logits
611
cls_logits = outputs[1] # Classification logits
612
613
print(f"LM logits shape: {lm_logits.shape}")
614
print(f"Classification logits shape: {cls_logits.shape}")
615
```