Tessl Tile for pypi/pytorch-pretrained-bert@0.6.0

or run

npx @tessl/cli init

Version

Tile

Overview

Evals

Files

docs

bert-models.md gpt-models.md index.md optimizers.md tokenizers.md utilities.md

optimizers.mddocs/

0
# Optimizers
1

2
Specialized optimizers with learning rate scheduling designed for transformer training, including BERT-specific and OpenAI-specific Adam variants with warmup schedules, weight decay corrections, and gradient clipping.
3

4
## Capabilities
5

6
### BERT Adam Optimizer
7

8
Adam optimizer with BERT-specific weight decay handling, learning rate scheduling, and gradient clipping designed for transformer fine-tuning.
9

10
```python { .api }
11
class BertAdam:
12
    def __init__(
13
        self,
14
        params,
15
        lr,
16
        warmup=-1,
17
        t_total=-1,
18
        schedule='warmup_linear',
19
        b1=0.9,
20
        b2=0.999,
21
        e=1e-6,
22
        weight_decay=0.01,
23
        max_grad_norm=1.0
24
    ):
25
        """
26
        Initialize BERT Adam optimizer.
27
        
28
        Args:
29
            params: Model parameters to optimize
30
            lr (float): Learning rate (required)
31
            warmup (float): Warmup proportion of total training steps (-1 for no warmup)
32
            t_total (int): Total training steps (-1 for no scheduling)
33
            schedule (str): Learning rate schedule type ('warmup_linear', 'warmup_cosine', or 'warmup_constant')
34
            b1 (float): Adam beta1 parameter
35
            b2 (float): Adam beta2 parameter
36
            e (float): Adam epsilon parameter
37
            weight_decay (float): Weight decay coefficient
38
            max_grad_norm (float): Maximum gradient norm for clipping (-1 for no clipping)
39
        """
40
    
41
    def step(self, closure=None):
42
        """
43
        Perform single optimization step.
44
        
45
        Args:
46
            closure (callable, optional): A closure that reevaluates model and returns loss
47
            
48
        Returns:
49
            Optional loss value if closure is provided
50
        """
51
    
52
    def zero_grad(self):
53
        """Clear gradients of all optimized parameters."""
54
    
55
    def state_dict(self):
56
        """
57
        Return optimizer state as dictionary.
58
        
59
        Returns:
60
            dict: Optimizer state dictionary
61
        """
62
    
63
    def load_state_dict(self, state_dict):
64
        """
65
        Load optimizer state from dictionary.
66
        
67
        Args:
68
            state_dict (dict): Optimizer state dictionary
69
        """
70
```
71

72
### OpenAI Adam Optimizer
73

74
OpenAI's Adam optimizer variant with improved weight decay handling and learning rate scheduling.
75

76
```python { .api }
77
class OpenAIAdam:
78
    def __init__(
79
        self,
80
        params,
81
        lr,
82
        schedule='warmup_linear',
83
        warmup=-1,
84
        t_total=-1,
85
        b1=0.9,
86
        b2=0.999,
87
        e=1e-8,
88
        weight_decay=0,
89
        vector_l2=False,
90
        max_grad_norm=-1,
91
        **kwargs
92
    ):
93
        """
94
        Initialize OpenAI Adam optimizer.
95
        
96
        Args:
97
            params: Model parameters to optimize
98
            lr (float): Learning rate (required)
99
            schedule (str): Learning rate schedule type
100
            warmup (float): Warmup proportion (-1 for no warmup)
101
            t_total (int): Total training steps (-1 for no scheduling)
102
            b1 (float): Adam beta1 parameter
103
            b2 (float): Adam beta2 parameter
104
            e (float): Adam epsilon parameter
105
            weight_decay (float): Weight decay coefficient
106
            vector_l2 (bool): Whether to also apply weight decay to 1-D (vector) parameters such as biases; when False, only parameters with more than one dimension are decayed
107
            max_grad_norm (float): Maximum gradient norm (-1 for no clipping)
108
        """
109
    
110
    def step(self, closure=None):
111
        """Perform single optimization step."""
112
    
113
    def zero_grad(self):
114
        """Clear gradients of all optimized parameters."""
115
    
116
    def state_dict(self):
117
        """Return optimizer state as dictionary."""
118
    
119
    def load_state_dict(self, state_dict):
120
        """Load optimizer state from dictionary."""
121
```
122

123

124
## Usage Examples
125

126
### Basic BERT Fine-tuning Setup
127

128
```python
129
from pytorch_pretrained_bert import BertForSequenceClassification, BertAdam
130
import torch
131

132
# Load model
133
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=3)
134

135
# Prepare optimizer parameters with weight decay
136
param_optimizer = list(model.named_parameters())
137
no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
138
optimizer_grouped_parameters = [
139
    {
140
        'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 
141
        'weight_decay': 0.01
142
    },
143
    {
144
        'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 
145
        'weight_decay': 0.0
146
    }
147
]
148

149
# Setup BERT Adam optimizer
150
num_train_steps = 1000
151
optimizer = BertAdam(
152
    optimizer_grouped_parameters,
153
    lr=2e-5,
154
    warmup=0.1,
155
    t_total=num_train_steps
156
)
157

158
# Training loop
159
model.train()
160
for step, batch in enumerate(train_dataloader):
161
    # Forward pass
162
    loss = model(batch['input_ids'], labels=batch['labels'])[0]
163
    
164
    # Backward pass
165
    loss.backward()
166
    
167
    # Optimization step
168
    optimizer.step()
169
    optimizer.zero_grad()
170
    
171
    print(f"Step {step}, Loss: {loss.item()}")
172
```
173

174
### OpenAI GPT Fine-tuning
175

176
```python
177
from pytorch_pretrained_bert import OpenAIGPTLMHeadModel, OpenAIAdam
178

179
# Load model
180
model = OpenAIGPTLMHeadModel.from_pretrained('openai-gpt')
181

182
# Setup OpenAI Adam optimizer
183
optimizer = OpenAIAdam(
184
    model.parameters(),
185
    lr=6.25e-5,
186
    warmup=0.002,
187
    t_total=num_train_steps,
188
    weight_decay=0.01,
189
    max_grad_norm=1.0
190
)
191

192
# Training with gradient clipping
193
for batch in train_dataloader:
194
    loss = model(batch['input_ids'], lm_labels=batch['labels'])[0]
195
    loss.backward()
196
    
197
    # Gradient clipping is handled automatically by OpenAIAdam
198
    optimizer.step()
199
    optimizer.zero_grad()
200
```
201

202

203
### Advanced Optimizer Configuration
204

205
```python
206
from pytorch_pretrained_bert import BertAdam
207

208
# Setup with custom parameters
209
optimizer = BertAdam(
210
    model.parameters(),
211
    lr=1e-4,                    # Learning rate
212
    warmup=0.1,                 # 10% warmup
213
    t_total=5000,              # Total training steps
214
    schedule='warmup_cosine',   # Cosine decay after warmup
215
    b1=0.9,                    # Adam beta1
216
    b2=0.999,                  # Adam beta2
217
    e=1e-6,                    # Adam epsilon
218
    weight_decay=0.01,         # Weight decay
219
    max_grad_norm=1.0          # Gradient clipping
220
)
221

222
# Save and load optimizer state
223
optimizer_state = optimizer.state_dict()
224

225
# Later restore
226
optimizer.load_state_dict(optimizer_state)
227
```
228

229
### Comparing Optimizer Effects
230

231
```python
232
from pytorch_pretrained_bert import BertForSequenceClassification, BertAdam, OpenAIAdam
233
import torch.optim as optim
234

235
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)
236

237
# Different optimizers for comparison
238
optimizers = {
239
    'bert_adam': BertAdam(model.parameters(), lr=2e-5, warmup=0.1, t_total=1000),
240
    'openai_adam': OpenAIAdam(model.parameters(), lr=2e-5, warmup=0.1, t_total=1000),
241
    'standard_adam': optim.Adam(model.parameters(), lr=2e-5)
242
}
243

244
# Training comparison
245
for name, optimizer in optimizers.items():
246
    print(f"Training with {name}")
247
    model_copy = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)
248
    
249
    for step, batch in enumerate(train_dataloader):
250
        loss = model_copy(batch['input_ids'], labels=batch['labels'])[0]
251
        loss.backward()
252
        optimizer.step()
253
        optimizer.zero_grad()
254
        
255
        if step % 100 == 0:
256
            print(f"  Step {step}, Loss: {loss.item()}")
257
```

Version

Tile

Files

optimizers.mddocs/

optimizers.mddocs/