or run

npx @tessl/cli init
Log in

Version

Tile

Overview

Evals

Files

Files

docs

bert-models.md gpt-models.md index.md optimizers.md tokenizers.md utilities.md

docs/gpt-models.md

0

# GPT Models

1

2

OpenAI GPT, GPT-2, and Transformer-XL model families with their configurations and specialized components for autoregressive language modeling, text generation, and extended context processing.

3

4

## Capabilities

5

6

### OpenAI GPT Models

7

8

Original OpenAI GPT models with configuration and task-specific variants for language modeling and classification.

9

10

#### Configuration

11

12

```python { .api }

13

class OpenAIGPTConfig:

14

def __init__(

15

self,

16

vocab_size_or_config_json_file=40478,

17

n_positions=512,

18

n_ctx=512,

19

n_embd=768,

20

n_layer=12,

21

n_head=12,

22

afn="gelu",

23

resid_pdrop=0.1,

24

embd_pdrop=0.1,

25

attn_pdrop=0.1,

26

layer_norm_epsilon=1e-5,

27

initializer_range=0.02

28

):

29

"""

30

Initialize OpenAI GPT configuration.

31

32

Args:

33

vocab_size_or_config_json_file (int or str): Vocabulary size or config path

34

n_positions (int): Maximum position embeddings

35

n_ctx (int): Context size

36

n_embd (int): Embedding dimension

37

n_layer (int): Number of transformer layers

38

n_head (int): Number of attention heads

39

afn (str): Activation function

40

resid_pdrop (float): Residual dropout probability

41

embd_pdrop (float): Embedding dropout probability

42

attn_pdrop (float): Attention dropout probability

43

layer_norm_epsilon (float): Layer normalization epsilon

44

initializer_range (float): Weight initialization range

45

"""

46

47

@classmethod

48

def from_dict(cls, json_object):

49

"""Create configuration from dictionary."""

50

51

@classmethod

52

def from_json_file(cls, json_file):

53

"""Create configuration from JSON file."""

54

55

def to_dict(self):

56

"""Convert to dictionary."""

57

58

def to_json_string(self):

59

"""Convert to JSON string."""

60

```

61

62

#### Base Model

63

64

```python { .api }

65

class OpenAIGPTModel:

66

def __init__(self, config):

67

"""

68

Initialize OpenAI GPT base model.

69

70

Args:

71

config (OpenAIGPTConfig): Model configuration

72

"""

73

74

def forward(self, input_ids, position_ids=None, token_type_ids=None):

75

"""

76

Forward pass through GPT model.

77

78

Args:

79

input_ids (torch.Tensor): Token IDs of shape [batch_size, seq_len]

80

position_ids (torch.Tensor, optional): Position IDs

81

token_type_ids (torch.Tensor, optional): Token type IDs

82

83

Returns:

84

torch.Tensor: Hidden states of shape [batch_size, seq_len, hidden_size]

85

"""

86

87

@classmethod

88

def from_pretrained(cls, pretrained_model_name_or_path, cache_dir=None, **kwargs):

89

"""Load pre-trained OpenAI GPT model."""

90

```

91

92

#### Language Modeling Head

93

94

```python { .api }

95

class OpenAIGPTLMHeadModel:

96

def __init__(self, config):

97

"""

98

Initialize OpenAI GPT with language modeling head.

99

100

Args:

101

config (OpenAIGPTConfig): Model configuration

102

"""

103

104

def forward(

105

self,

106

input_ids,

107

position_ids=None,

108

token_type_ids=None,

109

lm_labels=None

110

):

111

"""

112

Forward pass with language modeling head.

113

114

Args:

115

input_ids (torch.Tensor): Token IDs

116

position_ids (torch.Tensor, optional): Position IDs

117

token_type_ids (torch.Tensor, optional): Token type IDs

118

lm_labels (torch.Tensor, optional): Language modeling labels

119

120

Returns:

121

torch.Tensor: Language modeling logits or loss if labels provided

122

"""

123

124

@classmethod

125

def from_pretrained(cls, pretrained_model_name_or_path, cache_dir=None, **kwargs):

126

"""Load pre-trained model."""

127

```

128

129

#### Double Heads Model

130

131

```python { .api }

132

class OpenAIGPTDoubleHeadsModel:

133

def __init__(self, config):

134

"""

135

Initialize OpenAI GPT with both language modeling and classification heads.

136

137

Args:

138

config (OpenAIGPTConfig): Model configuration

139

"""

140

141

def forward(

142

self,

143

input_ids,

144

position_ids=None,

145

token_type_ids=None,

146

lm_labels=None,

147

multiple_choice_labels=None

148

):

149

"""

150

Forward pass with both heads.

151

152

Args:

153

input_ids (torch.Tensor): Token IDs

154

position_ids (torch.Tensor, optional): Position IDs

155

token_type_ids (torch.Tensor, optional): Token type IDs

156

lm_labels (torch.Tensor, optional): Language modeling labels

157

multiple_choice_labels (torch.Tensor, optional): Classification labels

158

159

Returns:

160

tuple: (lm_logits, classification_logits) or losses if labels provided

161

"""

162

163

@classmethod

164

def from_pretrained(cls, pretrained_model_name_or_path, cache_dir=None, **kwargs):

165

"""Load pre-trained model."""

166

```

167

168

### GPT-2 Models

169

170

GPT-2 model family with improved architecture and byte-level BPE tokenization.

171

172

#### Configuration

173

174

```python { .api }

175

class GPT2Config:

176

def __init__(

177

self,

178

vocab_size_or_config_json_file=50257,

179

n_positions=1024,

180

n_ctx=1024,

181

n_embd=768,

182

n_layer=12,

183

n_head=12,

184

n_inner=None,

185

afn="gelu_new",

186

resid_pdrop=0.1,

187

embd_pdrop=0.1,

188

attn_pdrop=0.1,

189

layer_norm_epsilon=1e-5,

190

initializer_range=0.02

191

):

192

"""

193

Initialize GPT-2 configuration.

194

195

Args:

196

vocab_size_or_config_json_file (int or str): Vocabulary size or config path

197

n_positions (int): Maximum position embeddings

198

n_ctx (int): Context size

199

n_embd (int): Embedding dimension

200

n_layer (int): Number of layers

201

n_head (int): Number of attention heads

202

n_inner (int, optional): Inner dimension (defaults to 4 * n_embd)

203

afn (str): Activation function

204

resid_pdrop (float): Residual dropout

205

embd_pdrop (float): Embedding dropout

206

attn_pdrop (float): Attention dropout

207

layer_norm_epsilon (float): Layer norm epsilon

208

initializer_range (float): Initialization range

209

"""

210

211

@classmethod

212

def from_dict(cls, json_object):

213

"""Create from dictionary."""

214

215

@classmethod

216

def from_json_file(cls, json_file):

217

"""Create from JSON file."""

218

219

def to_dict(self):

220

"""Convert to dictionary."""

221

222

def to_json_string(self):

223

"""Convert to JSON string."""

224

```

225

226

#### Base Model

227

228

```python { .api }

229

class GPT2Model:

230

def __init__(self, config):

231

"""

232

Initialize GPT-2 base model.

233

234

Args:

235

config (GPT2Config): Model configuration

236

"""

237

238

def forward(self, input_ids, position_ids=None, token_type_ids=None):

239

"""

240

Forward pass through GPT-2.

241

242

Args:

243

input_ids (torch.Tensor): Token IDs

244

position_ids (torch.Tensor, optional): Position IDs

245

token_type_ids (torch.Tensor, optional): Token type IDs

246

247

Returns:

248

torch.Tensor: Hidden states

249

"""

250

251

@classmethod

252

def from_pretrained(cls, pretrained_model_name_or_path, cache_dir=None, **kwargs):

253

"""Load pre-trained GPT-2 model."""

254

```

255

256

#### Language Modeling Head

257

258

```python { .api }

259

class GPT2LMHeadModel:

260

def __init__(self, config):

261

"""

262

Initialize GPT-2 with language modeling head.

263

264

Args:

265

config (GPT2Config): Model configuration

266

"""

267

268

def forward(

269

self,

270

input_ids,

271

position_ids=None,

272

token_type_ids=None,

273

lm_labels=None

274

):

275

"""

276

Forward pass with LM head.

277

278

Returns:

279

torch.Tensor: Language modeling logits or loss

280

"""

281

282

@classmethod

283

def from_pretrained(cls, pretrained_model_name_or_path, cache_dir=None, **kwargs):

284

"""Load pre-trained model."""

285

```

286

287

#### Double Heads Model

288

289

```python { .api }

290

class GPT2DoubleHeadsModel:

291

def __init__(self, config):

292

"""

293

Initialize GPT-2 with language modeling and classification heads.

294

295

Args:

296

config (GPT2Config): Model configuration

297

"""

298

299

def forward(

300

self,

301

input_ids,

302

position_ids=None,

303

token_type_ids=None,

304

lm_labels=None,

305

multiple_choice_labels=None

306

):

307

"""

308

Forward pass with both heads.

309

310

Returns:

311

tuple: (lm_logits, classification_logits) or losses

312

"""

313

314

@classmethod

315

def from_pretrained(cls, pretrained_model_name_or_path, cache_dir=None, **kwargs):

316

"""Load pre-trained model."""

317

```

318

319

320

### Transformer-XL Models

321

322

Transformer-XL models with extended context capability through a segment-level recurrence (memory) mechanism and adaptive softmax.

323

324

#### Configuration

325

326

```python { .api }

327

class TransfoXLConfig:

328

def __init__(

329

self,

330

vocab_size_or_config_json_file=267735,

331

cutoffs=[20000, 40000, 200000],

332

d_model=1024,

333

d_embed=1024,

334

n_head=16,

335

d_head=64,

336

d_inner=4096,

337

div_val=4,

338

pre_lnorm=False,

339

n_layer=18,

340

tgt_len=128,

341

ext_len=0,

342

mem_len=1600,

343

clamp_len=1000,

344

same_length=True,

345

attn_type=0,

346

sample_softmax=-1,

347

adaptive=True,

348

tie_weight=True,

349

dropout=0.1,

350

dropatt=0.0,

351

untie_r=True,

352

embd_init='normal',

353

init='normal',

354

init_range=0.01,

355

proj_init_std=0.01,

356

init_std=0.02

357

):

358

"""

359

Initialize Transformer-XL configuration.

360

361

Args:

362

vocab_size_or_config_json_file (int or str): Vocabulary size or config path

363

cutoffs (list): Adaptive softmax cutoffs

364

d_model (int): Model dimension

365

d_embed (int): Embedding dimension

366

n_head (int): Number of attention heads

367

d_head (int): Dimension per attention head

368

d_inner (int): Inner feed-forward dimension

369

div_val (int): Dimension reduction factor

370

pre_lnorm (bool): Whether to use pre-layer normalization

371

n_layer (int): Number of layers

372

tgt_len (int): Target sequence length

373

ext_len (int): Extended sequence length

374

mem_len (int): Memory length

375

clamp_len (int): Clamp length for positional encoding

376

same_length (bool): Whether to use the same attention length for all tokens

377

attn_type (int): Attention type

378

sample_softmax (int): Number of samples for sampled softmax (-1 disables it)

379

adaptive (bool): Whether to use adaptive softmax

380

tie_weight (bool): Whether to tie weights

381

dropout (float): Dropout probability

382

dropatt (float): Attention dropout

383

untie_r (bool): Whether to untie relative position bias

384

embd_init (str): Embedding initialization

385

init (str): General initialization

386

init_range (float): Initialization range

387

proj_init_std (float): Projection initialization std

388

init_std (float): Initialization std

389

"""

390

391

@classmethod

392

def from_dict(cls, json_object):

393

"""Create from dictionary."""

394

395

@classmethod

396

def from_json_file(cls, json_file):

397

"""Create from JSON file."""

398

399

def to_dict(self):

400

"""Convert to dictionary."""

401

402

def to_json_string(self):

403

"""Convert to JSON string."""

404

```

405

406

#### Base Model

407

408

```python { .api }

409

class TransfoXLModel:

410

def __init__(self, config):

411

"""

412

Initialize Transformer-XL base model.

413

414

Args:

415

config (TransfoXLConfig): Model configuration

416

"""

417

418

def forward(self, input_ids, mems=None):

419

"""

420

Forward pass with memory mechanism.

421

422

Args:

423

input_ids (torch.Tensor): Token IDs

424

mems (list, optional): Memory states from previous segments

425

426

Returns:

427

tuple: (hidden_states, new_mems) where:

428

- hidden_states (torch.Tensor): Output hidden states

429

- new_mems (list): Updated memory states

430

"""

431

432

@classmethod

433

def from_pretrained(cls, pretrained_model_name_or_path, cache_dir=None, **kwargs):

434

"""Load pre-trained Transformer-XL model."""

435

```

436

437

#### Language Modeling Head

438

439

```python { .api }

440

class TransfoXLLMHeadModel:

441

def __init__(self, config):

442

"""

443

Initialize Transformer-XL with language modeling head.

444

445

Args:

446

config (TransfoXLConfig): Model configuration

447

"""

448

449

def forward(self, input_ids, labels=None, mems=None):

450

"""

451

Forward pass with LM head and memory.

452

453

Args:

454

input_ids (torch.Tensor): Token IDs

455

labels (torch.Tensor, optional): Language modeling labels

456

mems (list, optional): Memory states

457

458

Returns:

459

tuple: (prediction_scores, new_mems) or loss if labels provided

460

"""

461

462

@classmethod

463

def from_pretrained(cls, pretrained_model_name_or_path, cache_dir=None, **kwargs):

464

"""Load pre-trained model."""

465

```

466

467

## Weight Loading Functions

468

469

Functions to convert TensorFlow checkpoints to PyTorch format for each model family.

470

471

```python { .api }

472

def load_tf_weights_in_openai_gpt(model, openai_checkpoint_folder_path):

473

"""

474

Load TensorFlow OpenAI GPT checkpoint into PyTorch model.

475

476

Args:

477

model: PyTorch OpenAI GPT model

478

openai_checkpoint_folder_path (str): Path to TF checkpoint folder

479

480

Returns:

481

PyTorch model with loaded weights

482

"""

483

484

def load_tf_weights_in_gpt2(model, gpt2_checkpoint_path):

485

"""

486

Load TensorFlow GPT-2 checkpoint into PyTorch model.

487

488

Args:

489

model: PyTorch GPT-2 model

490

gpt2_checkpoint_path (str): Path to TF checkpoint

491

492

Returns:

493

PyTorch model with loaded weights

494

"""

495

496

def load_tf_weights_in_transfo_xl(model, config, tf_path):

497

"""

498

Load TensorFlow Transformer-XL checkpoint into PyTorch model.

499

500

Args:

501

model: PyTorch Transformer-XL model

502

config (TransfoXLConfig): Model configuration

503

tf_path (str): Path to TF checkpoint

504

505

Returns:

506

PyTorch model with loaded weights

507

"""

508

```

509

510

## Usage Examples

511

512

### OpenAI GPT Text Generation

513

514

```python

515

from pytorch_pretrained_bert import OpenAIGPTLMHeadModel, OpenAIGPTTokenizer

516

import torch

517

518

# Load model and tokenizer

519

model = OpenAIGPTLMHeadModel.from_pretrained('openai-gpt')

520

tokenizer = OpenAIGPTTokenizer.from_pretrained('openai-gpt')

521

522

# Prepare input

523

text = "The artificial intelligence will"

524

input_ids = torch.tensor([tokenizer.encode(text)])

525

526

# Generate text

527

model.eval()

528

with torch.no_grad():

529

outputs = model(input_ids)

530

predictions = outputs  # logits tensor of shape [batch_size, seq_len, vocab_size]

531

532

# Sample the next token from the distribution at the last position

533

next_token_logits = predictions[0, -1, :]

534

next_token = torch.multinomial(torch.softmax(next_token_logits, dim=-1), 1)

535

536

# Decode next token

537

next_word = tokenizer.decode([next_token.item()])

538

print(f"Next word: {next_word}")

539

```

540

541

### GPT-2 with Custom Configuration

542

543

```python

544

from pytorch_pretrained_bert import GPT2Config, GPT2LMHeadModel

545

546

# Create custom configuration

547

config = GPT2Config(

548

vocab_size_or_config_json_file=50257,

549

n_positions=1024,

550

n_embd=768,

551

n_layer=12,

552

n_head=12

553

)

554

555

# Initialize model with custom config

556

model = GPT2LMHeadModel(config)

557

558

# Or load pre-trained

559

model = GPT2LMHeadModel.from_pretrained('gpt2')

560

```

561

562

### Transformer-XL with Memory

563

564

```python

565

from pytorch_pretrained_bert import TransfoXLLMHeadModel, TransfoXLTokenizer

566

import torch

567

568

# Load model and tokenizer

569

model = TransfoXLLMHeadModel.from_pretrained('transfo-xl-wt103')

570

tokenizer = TransfoXLTokenizer.from_pretrained('transfo-xl-wt103')

571

572

# Process sequences with memory

573

sequence1 = "The weather today is beautiful and"

574

sequence2 = "sunny with clear blue skies."

575

576

# Encode sequences

577

input_ids_1 = torch.tensor([tokenizer.encode(sequence1)])

578

input_ids_2 = torch.tensor([tokenizer.encode(sequence2)])

579

580

# Forward pass with memory

581

model.eval()

582

with torch.no_grad():

583

# Process first sequence

584

outputs_1 = model(input_ids_1)

585

mems = outputs_1[1] # Extract memory states

586

587

# Process second sequence with memory from first

588

outputs_2 = model(input_ids_2, mems=mems)

589

logits = outputs_2[0]

590

```

591

592

### Double Heads Model for Multiple Tasks

593

594

```python

595

from pytorch_pretrained_bert import GPT2DoubleHeadsModel, GPT2Tokenizer

596

import torch

597

598

# Load double heads model

599

model = GPT2DoubleHeadsModel.from_pretrained('gpt2')

600

tokenizer = GPT2Tokenizer.from_pretrained('gpt2')

601

602

# Prepare input for both language modeling and classification

603

text = "This movie is great!"

604

input_ids = torch.tensor([tokenizer.encode(text)])

605

606

# Forward pass

607

model.eval()

608

with torch.no_grad():

609

outputs = model(input_ids)

610

lm_logits = outputs[0] # Language modeling logits

611

cls_logits = outputs[1] # Classification logits

612

613

print(f"LM logits shape: {lm_logits.shape}")

614

print(f"Classification logits shape: {cls_logits.shape}")

615

```