0
# Optimizers
1
2
Specialized optimizers with learning rate scheduling designed for transformer training, including BERT-specific and OpenAI-specific Adam variants with warmup schedules, weight decay corrections, and gradient clipping.
3
4
## Capabilities
5
6
### BERT Adam Optimizer
7
8
Adam optimizer with BERT-specific weight decay handling, learning rate scheduling, and gradient clipping designed for transformer fine-tuning.
9
10
```python { .api }
11
class BertAdam:
    """Adam optimizer with BERT-specific weight decay, LR scheduling and gradient clipping.

    This is an API summary: only signatures and contracts are shown, method
    bodies are intentionally omitted.
    """

    def __init__(
        self,
        params,
        lr,
        warmup=-1,
        t_total=-1,
        schedule='warmup_linear',
        b1=0.9,
        b2=0.999,
        e=1e-6,
        weight_decay=0.01,
        max_grad_norm=1.0
    ):
        """
        Initialize BERT Adam optimizer.

        Args:
            params: Model parameters to optimize. Either an iterable of
                parameters (e.g. ``model.parameters()``) or a list of
                parameter-group dicts with a ``'params'`` entry and optional
                per-group overrides such as ``'weight_decay'``
            lr (float): Learning rate (required)
            warmup (float): Warmup proportion of total training steps (-1 for no warmup)
            t_total (int): Total training steps (-1 for no scheduling)
            schedule (str): Learning rate schedule type, e.g. ``'warmup_linear'``
                (default) or ``'warmup_cosine'``
            b1 (float): Adam beta1 parameter
            b2 (float): Adam beta2 parameter
            e (float): Adam epsilon parameter
            weight_decay (float): Weight decay coefficient
            max_grad_norm (float): Maximum gradient norm for clipping
        """

    def step(self, closure=None):
        """
        Perform single optimization step.

        Args:
            closure (callable, optional): A closure that reevaluates model and returns loss

        Returns:
            Optional loss value if closure is provided
        """

    def zero_grad(self):
        """Clear gradients of all optimized parameters."""

    def state_dict(self):
        """
        Return optimizer state as dictionary.

        Returns:
            dict: Optimizer state dictionary
        """

    def load_state_dict(self, state_dict):
        """
        Load optimizer state from dictionary.

        Args:
            state_dict (dict): Optimizer state dictionary, as produced by
                ``state_dict()``
        """
70
```
71
72
### OpenAI Adam Optimizer
73
74
OpenAI's Adam optimizer variant with improved weight decay handling and learning rate scheduling.
75
76
```python { .api }
77
class OpenAIAdam:
    """OpenAI's Adam variant with weight decay handling and LR scheduling.

    This is an API summary: only signatures and contracts are shown, method
    bodies are intentionally omitted.
    """

    def __init__(
        self,
        params,
        lr,
        schedule='warmup_linear',
        warmup=-1,
        t_total=-1,
        b1=0.9,
        b2=0.999,
        e=1e-8,
        weight_decay=0,
        vector_l2=False,
        max_grad_norm=-1,
        **kwargs
    ):
        """
        Initialize OpenAI Adam optimizer.

        Args:
            params: Model parameters to optimize
            lr (float): Learning rate (required)
            schedule (str): Learning rate schedule type
            warmup (float): Warmup proportion (-1 for no warmup)
            t_total (int): Total training steps (-1 for no scheduling)
            b1 (float): Adam beta1 parameter
            b2 (float): Adam beta2 parameter
            e (float): Adam epsilon parameter
            weight_decay (float): Weight decay coefficient
            vector_l2 (bool): Whether to also apply weight decay to 1-D
                parameters (vectors such as biases). When False, only
                parameters with more than one dimension are decayed
            max_grad_norm (float): Maximum gradient norm (-1 for no clipping)
            **kwargs: Extra keyword arguments accepted by the signature;
                their handling is not described here
        """

    def step(self, closure=None):
        """
        Perform single optimization step.

        Args:
            closure (callable, optional): A closure that reevaluates model and returns loss
        """

    def zero_grad(self):
        """Clear gradients of all optimized parameters."""

    def state_dict(self):
        """Return optimizer state as dictionary."""

    def load_state_dict(self, state_dict):
        """Load optimizer state from dictionary."""
121
```
122
123
124
## Usage Examples
125
126
### Basic BERT Fine-tuning Setup
127
128
```python
129
from pytorch_pretrained_bert import BertForSequenceClassification, BertAdam
import torch

# Load model
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=3)

# Prepare optimizer parameters with weight decay:
# biases and LayerNorm parameters go in a second group with no decay.
param_optimizer = list(model.named_parameters())
no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
optimizer_grouped_parameters = [
    {
        'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
        'weight_decay': 0.01
    },
    {
        'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
        'weight_decay': 0.0
    }
]

# Setup BERT Adam optimizer
# t_total drives the warmup/decay schedule, so it should equal the number of
# optimizer.step() calls made over the whole run.
num_train_steps = 1000
optimizer = BertAdam(
    optimizer_grouped_parameters,
    lr=2e-5,
    warmup=0.1,
    t_total=num_train_steps
)

# Training loop
# `train_dataloader` is assumed to be defined elsewhere and to yield dicts
# with 'input_ids' and 'labels' tensors.
model.train()
for step, batch in enumerate(train_dataloader):
    # Forward pass: when labels are supplied the model returns the loss itself
    loss = model(batch['input_ids'], labels=batch['labels'])

    # Backward pass
    loss.backward()

    # Optimization step
    optimizer.step()
    optimizer.zero_grad()

    print(f"Step {step}, Loss: {loss.item()}")
172
```
173
174
### OpenAI GPT Fine-tuning
175
176
```python
177
from pytorch_pretrained_bert import OpenAIGPTLMHeadModel, OpenAIAdam

# Load model
model = OpenAIGPTLMHeadModel.from_pretrained('openai-gpt')

# Setup OpenAI Adam optimizer
# t_total drives the warmup/decay schedule, so it should equal the number of
# optimizer.step() calls made over the whole run.
num_train_steps = 1000
optimizer = OpenAIAdam(
    model.parameters(),
    lr=6.25e-5,
    warmup=0.002,
    t_total=num_train_steps,
    weight_decay=0.01,
    max_grad_norm=1.0
)

# Training with gradient clipping
# `train_dataloader` is assumed to be defined elsewhere and to yield dicts
# with 'input_ids' and 'labels' tensors.
for batch in train_dataloader:
    # When lm_labels are supplied the model returns the loss itself
    loss = model(batch['input_ids'], lm_labels=batch['labels'])
    loss.backward()

    # Gradient clipping is handled automatically by OpenAIAdam
    optimizer.step()
    optimizer.zero_grad()
200
```
201
202
203
### Advanced Optimizer Configuration
204
205
```python
206
from pytorch_pretrained_bert import BertAdam

# `model` is assumed to be an already-loaded model (any module exposing
# .parameters()).

# Setup with custom parameters
optimizer = BertAdam(
    model.parameters(),
    lr=1e-4,                   # Learning rate
    warmup=0.1,                # 10% warmup
    t_total=5000,              # Total training steps
    schedule='warmup_cosine',  # Cosine decay after warmup
    b1=0.9,                    # Adam beta1
    b2=0.999,                  # Adam beta2
    e=1e-6,                    # Adam epsilon
    weight_decay=0.01,         # Weight decay
    max_grad_norm=1.0          # Gradient clipping
)

# Save and load optimizer state
optimizer_state = optimizer.state_dict()

# Later restore
optimizer.load_state_dict(optimizer_state)
227
```
228
229
### Comparing Optimizer Effects
230
231
```python
232
from pytorch_pretrained_bert import BertForSequenceClassification, BertAdam, OpenAIAdam
import torch.optim as optim

# Different optimizers for comparison.
# Each entry is a factory rather than a ready-made optimizer: an optimizer only
# updates the parameters it was constructed with, so it has to be built from
# the parameters of the model that is actually trained in that run.
optimizer_factories = {
    'bert_adam': lambda params: BertAdam(params, lr=2e-5, warmup=0.1, t_total=1000),
    'openai_adam': lambda params: OpenAIAdam(params, lr=2e-5, warmup=0.1, t_total=1000),
    'standard_adam': lambda params: optim.Adam(params, lr=2e-5)
}

# Training comparison
# `train_dataloader` is assumed to be defined elsewhere and to yield dicts
# with 'input_ids' and 'labels' tensors.
for name, make_optimizer in optimizer_factories.items():
    print(f"Training with {name}")

    # Start every run from the same pretrained weights
    model_copy = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)
    model_copy.train()
    optimizer = make_optimizer(model_copy.parameters())

    for step, batch in enumerate(train_dataloader):
        # When labels are supplied the model returns the loss itself
        loss = model_copy(batch['input_ids'], labels=batch['labels'])
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

        if step % 100 == 0:
            print(f" Step {step}, Loss: {loss.item()}")
257
```