or run

npx @tessl/cli init
Log in

Version

Tile

Overview

Evals

Files

Files

docs

bert-models.md, gpt-models.md, index.md, optimizers.md, tokenizers.md, utilities.md

docs/optimizers.md

0

# Optimizers

1

2

Specialized optimizers with learning rate scheduling designed for transformer training, including BERT-specific and OpenAI-specific Adam variants with warmup schedules, weight decay corrections, and gradient clipping.

3

4

## Capabilities

5

6

### BERT Adam Optimizer

7

8

Adam optimizer with BERT-specific weight decay handling, learning rate scheduling, and gradient clipping designed for transformer fine-tuning.

9

10

```python { .api }

11

class BertAdam:
    """Adam optimizer with BERT-specific weight decay handling, learning rate scheduling, and gradient clipping."""

    def __init__(
        self,
        params,
        lr,
        warmup=-1,
        t_total=-1,
        schedule='warmup_linear',
        b1=0.9,
        b2=0.999,
        e=1e-6,
        weight_decay=0.01,
        max_grad_norm=1.0
    ):
        """
        Initialize BERT Adam optimizer.

        Args:
            params: Model parameters to optimize
            lr (float): Learning rate (required)
            warmup (float): Warmup proportion of total training steps (-1 for no warmup)
            t_total (int): Total training steps (-1 for no scheduling)
            schedule (str): Learning rate schedule type
            b1 (float): Adam beta1 parameter
            b2 (float): Adam beta2 parameter
            e (float): Adam epsilon parameter
            weight_decay (float): Weight decay coefficient
            max_grad_norm (float): Maximum gradient norm for clipping
        """

    def step(self, closure=None):
        """
        Perform single optimization step.

        Args:
            closure (callable, optional): A closure that reevaluates model and returns loss

        Returns:
            Optional loss value if closure is provided
        """

    def zero_grad(self):
        """Clear gradients of all optimized parameters."""

    def state_dict(self):
        """
        Return optimizer state as dictionary.

        Returns:
            dict: Optimizer state dictionary
        """

    def load_state_dict(self, state_dict):
        """
        Load optimizer state from dictionary.

        Args:
            state_dict (dict): Optimizer state dictionary
        """

70

```

71

72

### OpenAI Adam Optimizer

73

74

OpenAI's Adam optimizer variant with improved weight decay handling and learning rate scheduling.

75

76

```python { .api }

77

class OpenAIAdam:
    """OpenAI's Adam optimizer variant with weight decay handling and learning rate scheduling."""

    def __init__(
        self,
        params,
        lr,
        schedule='warmup_linear',
        warmup=-1,
        t_total=-1,
        b1=0.9,
        b2=0.999,
        e=1e-8,
        weight_decay=0,
        vector_l2=False,
        max_grad_norm=-1,
        **kwargs
    ):
        """
        Initialize OpenAI Adam optimizer.

        Args:
            params: Model parameters to optimize
            lr (float): Learning rate (required)
            schedule (str): Learning rate schedule type
            warmup (float): Warmup proportion (-1 for no warmup)
            t_total (int): Total training steps (-1 for no scheduling)
            b1 (float): Adam beta1 parameter
            b2 (float): Adam beta2 parameter
            e (float): Adam epsilon parameter
            weight_decay (float): Weight decay coefficient
            vector_l2 (bool): Whether to also apply weight decay to 1-D (vector)
                parameters such as biases; when False, only parameters with more
                than one dimension are decayed
            max_grad_norm (float): Maximum gradient norm (-1 for no clipping)
            **kwargs: Additional keyword arguments accepted by the constructor
        """

    def step(self, closure=None):
        """Perform single optimization step."""

    def zero_grad(self):
        """Clear gradients of all optimized parameters."""

    def state_dict(self):
        """Return optimizer state as dictionary."""

    def load_state_dict(self, state_dict):
        """Load optimizer state from dictionary."""

121

```

122

123

124

## Usage Examples

125

126

### Basic BERT Fine-tuning Setup

127

128

```python

129

from pytorch_pretrained_bert import BertForSequenceClassification, BertAdam
import torch

# NOTE: `train_dataloader` is not defined in this snippet; it is assumed to be
# an iterable of batches exposing 'input_ids' and 'labels' tensors.

# Load model
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=3)

# Prepare optimizer parameters with weight decay
# Parameters whose name matches an entry in `no_decay` go into a second group
# with weight decay disabled.
param_optimizer = list(model.named_parameters())
no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
optimizer_grouped_parameters = [
    {
        'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
        'weight_decay': 0.01
    },
    {
        'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
        'weight_decay': 0.0
    }
]

# Setup BERT Adam optimizer
# t_total drives the warmup/decay schedule, so it must equal the number of
# optimizer steps actually taken (one pass over the dataloader here).
num_train_steps = len(train_dataloader)
optimizer = BertAdam(
    optimizer_grouped_parameters,
    lr=2e-5,
    warmup=0.1,
    t_total=num_train_steps
)

# Training loop
model.train()
for step, batch in enumerate(train_dataloader):
    # Forward pass
    # With `labels` supplied, the pytorch_pretrained_bert model returns the
    # scalar loss directly, so the result must not be indexed.
    loss = model(batch['input_ids'], labels=batch['labels'])

    # Backward pass
    loss.backward()

    # Optimization step
    optimizer.step()
    optimizer.zero_grad()

    print(f"Step {step}, Loss: {loss.item()}")

172

```

173

174

### OpenAI GPT Fine-tuning

175

176

```python

177

from pytorch_pretrained_bert import OpenAIGPTLMHeadModel, OpenAIAdam

# NOTE: `train_dataloader` is not defined in this snippet; it is assumed to be
# an iterable of batches exposing 'input_ids' and 'labels' tensors.

# Load model
model = OpenAIGPTLMHeadModel.from_pretrained('openai-gpt')

# Setup OpenAI Adam optimizer
# t_total drives the warmup/decay schedule, so it must equal the number of
# optimizer steps actually taken (one pass over the dataloader here).
num_train_steps = len(train_dataloader)
optimizer = OpenAIAdam(
    model.parameters(),
    lr=6.25e-5,
    warmup=0.002,
    t_total=num_train_steps,
    weight_decay=0.01,
    max_grad_norm=1.0
)

# Training with gradient clipping
for batch in train_dataloader:
    # With `lm_labels` supplied, the pytorch_pretrained_bert model returns the
    # scalar loss directly, so the result must not be indexed.
    loss = model(batch['input_ids'], lm_labels=batch['labels'])
    loss.backward()

    # Gradient clipping is handled automatically by OpenAIAdam
    optimizer.step()
    optimizer.zero_grad()

200

```

201

202

203

### Advanced Optimizer Configuration

204

205

```python

206

from pytorch_pretrained_bert import BertAdam

# Custom hyper-parameters, gathered in one mapping and unpacked into the
# constructor as keyword arguments
bert_adam_options = {
    'lr': 1e-4,                    # Learning rate
    'warmup': 0.1,                 # 10% warmup
    't_total': 5000,               # Total training steps
    'schedule': 'warmup_cosine',   # Cosine decay after warmup
    'b1': 0.9,                     # Adam beta1
    'b2': 0.999,                   # Adam beta2
    'e': 1e-6,                     # Adam epsilon
    'weight_decay': 0.01,          # Weight decay
    'max_grad_norm': 1.0,          # Gradient clipping
}
optimizer = BertAdam(model.parameters(), **bert_adam_options)

# Snapshot the optimizer state so it can be restored later
optimizer_state = optimizer.state_dict()

# Later: restore the optimizer from the snapshot
optimizer.load_state_dict(optimizer_state)

227

```

228

229

### Comparing Optimizer Effects

230

231

```python

232

from pytorch_pretrained_bert import BertForSequenceClassification, BertAdam, OpenAIAdam
import torch.optim as optim

# NOTE: `train_dataloader` is not defined in this snippet; it is assumed to be
# an iterable of batches exposing 'input_ids' and 'labels' tensors.

# t_total drives the warmup/decay schedule, so it must equal the number of
# optimizer steps actually taken (one pass over the dataloader per optimizer).
num_train_steps = len(train_dataloader)

# Different optimizers for comparison
# Each entry is a factory rather than a ready-made optimizer: an optimizer only
# updates the parameters it was constructed with, so it has to be built from
# the parameters of the model it will actually train.
optimizer_factories = {
    'bert_adam': lambda params: BertAdam(params, lr=2e-5, warmup=0.1, t_total=num_train_steps),
    'openai_adam': lambda params: OpenAIAdam(params, lr=2e-5, warmup=0.1, t_total=num_train_steps),
    'standard_adam': lambda params: optim.Adam(params, lr=2e-5)
}

# Training comparison
for name, make_optimizer in optimizer_factories.items():
    print(f"Training with {name}")
    # Fresh model per optimizer so every run starts from the same pretrained weights
    model_copy = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)
    optimizer = make_optimizer(model_copy.parameters())

    for step, batch in enumerate(train_dataloader):
        # With `labels` supplied, the pytorch_pretrained_bert model returns the
        # scalar loss directly, so the result must not be indexed.
        loss = model_copy(batch['input_ids'], labels=batch['labels'])
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

        if step % 100 == 0:
            print(f" Step {step}, Loss: {loss.item()}")

257

```