or run

npx @tessl/cli init
Log in

Version

Tile

Overview

Evals

Files

Files

docs

activations.md applications.md backend-config.md core-framework.md index.md initializers.md layers.md losses-metrics.md operations.md optimizers.md preprocessing.md regularizers.md training-callbacks.md

docs/optimizers.md

0

# Optimizers

1

2

Optimization algorithms for training neural networks, from basic gradient descent to advanced adaptive methods with automatic learning rate adjustment and momentum variants.

3

4

## Capabilities

5

6

### Gradient Descent Optimizers

7

8

Fundamental gradient-based optimization algorithms: basic SGD with optional (Nesterov) momentum, plus Adam and its Nadam and Adamax variants.

9

10

```python { .api }

11

class SGD:

12

"""

13

Stochastic Gradient Descent optimizer.

14

15

Args:

16

learning_rate (float): Learning rate

17

momentum (float): Momentum factor

18

nesterov (bool): Whether to apply Nesterov momentum

19

weight_decay (float, optional): Weight decay rate

20

clipnorm (float, optional): Gradient clipping by norm

21

clipvalue (float, optional): Gradient clipping by value

22

global_clipnorm (float, optional): Global gradient clipping by norm

23

use_ema (bool): Whether to use Exponential Moving Average

24

ema_momentum (float): EMA momentum

25

ema_overwrite_frequency (int, optional): EMA overwrite frequency

26

name (str): Name of the optimizer

27

"""

28

def __init__(self, learning_rate=0.01, momentum=0.0, nesterov=False, **kwargs): ...

29

30

class Adam:

31

"""

32

Adam optimizer with adaptive learning rates.

33

34

Args:

35

learning_rate (float): Initial learning rate

36

beta_1 (float): Exponential decay rate for first moment estimates

37

beta_2 (float): Exponential decay rate for second moment estimates

38

epsilon (float): Small constant for numerical stability

39

amsgrad (bool): Whether to apply AMSGrad variant

40

weight_decay (float, optional): Weight decay rate

41

clipnorm (float, optional): Gradient clipping by norm

42

clipvalue (float, optional): Gradient clipping by value

43

global_clipnorm (float, optional): Global gradient clipping by norm

44

use_ema (bool): Whether to use Exponential Moving Average

45

ema_momentum (float): EMA momentum

46

ema_overwrite_frequency (int, optional): EMA overwrite frequency

47

name (str): Name of the optimizer

48

"""

49

def __init__(self, learning_rate=0.001, beta_1=0.9, beta_2=0.999, epsilon=1e-7,

50

amsgrad=False, **kwargs): ...

51

52

class Nadam:

53

"""

54

Nesterov-accelerated Adam optimizer.

55

56

Args:

57

learning_rate (float): Initial learning rate

58

beta_1 (float): Exponential decay rate for first moment estimates

59

beta_2 (float): Exponential decay rate for second moment estimates

60

epsilon (float): Small constant for numerical stability

61

"""

62

def __init__(self, learning_rate=0.001, beta_1=0.9, beta_2=0.999, epsilon=1e-7, **kwargs): ...

63

64

class Adamax:

65

"""

66

Adamax optimizer (Adam based on infinity norm).

67

68

Args:

69

learning_rate (float): Initial learning rate

70

beta_1 (float): Exponential decay rate for first moment estimates

71

beta_2 (float): Exponential decay rate for weighted infinity norm

72

epsilon (float): Small constant for numerical stability

73

"""

74

def __init__(self, learning_rate=0.001, beta_1=0.9, beta_2=0.999, epsilon=1e-7, **kwargs): ...

75

```

76

77

### Adaptive Learning Rate Optimizers

78

79

Optimizers that automatically adapt learning rates based on gradient history and parameter-specific statistics.

80

81

```python { .api }

82

class Adagrad:

83

"""

84

Adagrad optimizer with adaptive learning rates.

85

86

Args:

87

learning_rate (float): Initial learning rate

88

initial_accumulator_value (float): Initial value for gradient accumulator

89

epsilon (float): Small constant for numerical stability

90

weight_decay (float, optional): Weight decay rate

91

clipnorm (float, optional): Gradient clipping by norm

92

clipvalue (float, optional): Gradient clipping by value

93

global_clipnorm (float, optional): Global gradient clipping by norm

94

use_ema (bool): Whether to use Exponential Moving Average

95

ema_momentum (float): EMA momentum

96

ema_overwrite_frequency (int, optional): EMA overwrite frequency

97

name (str): Name of the optimizer

98

"""

99

def __init__(self, learning_rate=0.001, initial_accumulator_value=0.1, epsilon=1e-7, **kwargs): ...

100

101

class Adadelta:

102

"""

103

Adadelta optimizer with adaptive learning rates.

104

105

Args:

106

learning_rate (float): Initial learning rate

107

rho (float): Decay rate for moving averages

108

epsilon (float): Small constant for numerical stability

109

"""

110

def __init__(self, learning_rate=0.001, rho=0.95, epsilon=1e-7, **kwargs): ...

111

112

class RMSprop:

113

"""

114

RMSprop optimizer with adaptive learning rates.

115

116

Args:

117

learning_rate (float): Initial learning rate

118

rho (float): Decay rate for moving average of squared gradients

119

momentum (float): Momentum factor

120

epsilon (float): Small constant for numerical stability

121

centered (bool): Whether to compute centered RMSprop

122

"""

123

def __init__(self, learning_rate=0.001, rho=0.9, momentum=0.0, epsilon=1e-7,

124

centered=False, **kwargs): ...

125

126

class Ftrl:

127

"""

128

Follow The Regularized Leader optimizer.

129

130

Args:

131

learning_rate (float): Initial learning rate

132

learning_rate_power (float): Power to scale learning rate

133

initial_accumulator_value (float): Initial value for accumulator

134

l1_regularization_strength (float): L1 regularization strength

135

l2_regularization_strength (float): L2 regularization strength

136

l2_shrinkage_regularization_strength (float): L2 shrinkage regularization

137

beta (float): Beta parameter

138

"""

139

def __init__(self, learning_rate=0.001, learning_rate_power=-0.5,

140

initial_accumulator_value=0.1, l1_regularization_strength=0.0,

141

l2_regularization_strength=0.0, **kwargs): ...

142

```

143

144

### Advanced Optimizers

145

146

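More recent optimization algorithms: AdamW (decoupled weight decay), Adafactor (factorized second moments), Lamb (layer-wise adaptive moments), Lion (evolved sign momentum), and Muon (orthogonalized momentum).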

147

148

```python { .api }

149

class AdamW:

150

"""

151

Adam optimizer with decoupled weight decay.

152

153

Args:

154

learning_rate (float): Initial learning rate

155

weight_decay (float): Weight decay coefficient

156

beta_1 (float): Exponential decay rate for first moment estimates

157

beta_2 (float): Exponential decay rate for second moment estimates

158

epsilon (float): Small constant for numerical stability

159

amsgrad (bool): Whether to apply AMSGrad variant

160

clipnorm (float, optional): Gradient clipping by norm

161

clipvalue (float, optional): Gradient clipping by value

162

global_clipnorm (float, optional): Global gradient clipping by norm

163

use_ema (bool): Whether to use Exponential Moving Average

164

ema_momentum (float): EMA momentum

165

ema_overwrite_frequency (int, optional): EMA overwrite frequency

166

name (str): Name of the optimizer

167

"""

168

def __init__(self, learning_rate=0.001, weight_decay=0.004, beta_1=0.9, beta_2=0.999,

169

epsilon=1e-7, amsgrad=False, **kwargs): ...

170

171

class Adafactor:

172

"""

173

Adafactor optimizer with factorized second moments.

174

175

Args:

176

learning_rate (float): Initial learning rate

177

epsilon2 (float): Second epsilon value

178

cliping_threshold (float): Clipping threshold

179

decay_rate (float): Decay rate for moving averages

180

beta1 (float, optional): Beta1 parameter

181

weight_decay_rate (float): Weight decay rate

182

eps_scale (float): Epsilon scaling factor

183

clip_threshold (float): Gradient clipping threshold

184

relative_step (bool): Whether to use relative step size

185

warmup_init (bool): Whether to use warmup initialization

186

"""

187

def __init__(self, learning_rate=0.001, epsilon2=1e-30, cliping_threshold=1.0,

188

decay_rate=0.8, beta1=None, **kwargs): ...

189

190

class Lamb:

191

"""

192

Layer-wise Adaptive Moments optimizer.

193

194

Args:

195

learning_rate (float): Initial learning rate

196

beta_1 (float): Exponential decay rate for first moment estimates

197

beta_2 (float): Exponential decay rate for second moment estimates

198

epsilon (float): Small constant for numerical stability

199

weight_decay_rate (float): Weight decay rate

200

always_adapt (bool): Whether to always adapt learning rate

201

"""

202

def __init__(self, learning_rate=0.001, beta_1=0.9, beta_2=0.999, epsilon=1e-6,

203

weight_decay_rate=0.01, always_adapt=False, **kwargs): ...

204

205

class Lion:

206

"""

207

EvoLved Sign Momentum optimizer.

208

209

Args:

210

learning_rate (float): Initial learning rate

211

beta (float): Momentum coefficient

212

weight_decay (float): Weight decay coefficient

213

"""

214

def __init__(self, learning_rate=0.0001, beta=0.99, weight_decay=0.0, **kwargs): ...

215

216

class Muon:

217

"""

218

MomentUm Orthogonalized by Newton-Schulz optimizer.

219

220

Args:

221

learning_rate (float): Initial learning rate

222

momentum (float): Momentum coefficient

223

nesterov (bool): Whether to use Nesterov momentum

224

backend_update_momentum (float): Backend update momentum

225

k (int): K parameter for Muon

226

norm_axes (tuple): Axes for normalization

227

"""

228

def __init__(self, learning_rate=0.02, momentum=0.95, nesterov=True, **kwargs): ...

229

```

230

231

### Mixed Precision Training

232

233

Optimizer wrapper for mixed precision training with automatic loss scaling.

234

235

```python { .api }

236

class LossScaleOptimizer:

237

"""

238

Optimizer wrapper for mixed precision training with loss scaling.

239

240

Args:

241

inner_optimizer: Base optimizer to wrap

242

dynamic (bool): Whether to use dynamic loss scaling

243

initial_scale (float): Initial loss scale value

244

dynamic_growth_steps (int): Steps between scale increases

245

name (str): Name of the optimizer

246

"""

247

def __init__(self, inner_optimizer, dynamic=True, initial_scale=32768.0,

248

dynamic_growth_steps=2000, **kwargs): ...

249

250

def get_scaled_loss(self, loss):

251

"""Scale loss for mixed precision training."""

252

253

def get_unscaled_gradients(self, scaled_gradients):

254

"""Unscale gradients after backpropagation."""

255

```

256

257

### Base Optimizer Class

258

259

Base class for all optimizers providing common functionality.

260

261

```python { .api }

262

class Optimizer:

263

"""

264

Base class for all optimizers.

265

266

Args:

267

name (str): Name of the optimizer

268

weight_decay (float, optional): Weight decay rate

269

clipnorm (float, optional): Gradient clipping by norm

270

clipvalue (float, optional): Gradient clipping by value

271

global_clipnorm (float, optional): Global gradient clipping by norm

272

use_ema (bool): Whether to use Exponential Moving Average

273

ema_momentum (float): EMA momentum

274

ema_overwrite_frequency (int, optional): EMA overwrite frequency

275

loss_scale_factor (float, optional): Loss scale factor for mixed precision

276

gradient_accumulation_steps (int, optional): Gradient accumulation steps

277

"""

278

def __init__(self, name, **kwargs): ...

279

280

def apply_gradients(self, grads_and_vars):

281

"""Apply gradients to variables."""

282

283

def minimize(self, loss, var_list=None, tape=None):

284

"""Minimize loss by updating variables."""

285

286

def get_config(self):

287

"""Get optimizer configuration."""

288

289

def from_config(cls, config):

290

"""Create optimizer from configuration."""

291

292

def get_weights(self):

293

"""Get optimizer state as list of arrays."""

294

295

def set_weights(self, weights):

296

"""Set optimizer state from list of arrays."""

297

```

298

299

### Learning Rate Schedules

300

301

Schedules that vary the learning rate as a function of the training step. Pass a schedule instance as an optimizer's `learning_rate` argument.

302

303

```python { .api }

304

# Available in keras.optimizers.schedules

305

class ExponentialDecay:

306

"""

307

Exponential decay schedule.

308

309

Args:

310

initial_learning_rate (float): Initial learning rate

311

decay_steps (int): Steps for decay

312

decay_rate (float): Decay rate

313

staircase (bool): Whether to apply decay in discrete intervals

314

name (str): Name of the schedule

315

"""

316

def __init__(self, initial_learning_rate, decay_steps, decay_rate,

317

staircase=False, name=None): ...

318

319

class InverseTimeDecay:

320

"""

321

Inverse time decay schedule.

322

323

Args:

324

initial_learning_rate (float): Initial learning rate

325

decay_steps (int): Steps for decay

326

decay_rate (float): Decay rate

327

staircase (bool): Whether to apply decay in discrete intervals

328

name (str): Name of the schedule

329

"""

330

def __init__(self, initial_learning_rate, decay_steps, decay_rate,

331

staircase=False, name=None): ...

332

333

class CosineDecay:

334

"""

335

Cosine decay schedule.

336

337

Args:

338

initial_learning_rate (float): Initial learning rate

339

decay_steps (int): Steps for decay

340

alpha (float): Minimum learning rate as fraction of initial

341

name (str): Name of the schedule

342

"""

343

def __init__(self, initial_learning_rate, decay_steps, alpha=0.0, name=None): ...

344

345

class PiecewiseConstantDecay:

346

"""

347

Piecewise constant decay schedule.

348

349

Args:

350

boundaries (list): Step boundaries for rate changes

351

values (list): Learning rate values for each interval

352

name (str): Name of the schedule

353

"""

354

def __init__(self, boundaries, values, name=None): ...

355

```

356

357

### Utility Functions

358

359

Functions for looking up an optimizer by identifier and for serializing and deserializing optimizer configurations.

360

361

```python { .api }

362

def get(identifier):

363

"""

364

Retrieve an optimizer instance from its identifier; an existing optimizer instance is returned unchanged.

365

366

Args:

367

identifier (str, dict, or Optimizer): Optimizer name, configuration dict, or optimizer instance

368

369

Returns:

370

Optimizer: Optimizer instance

371

"""

372

373

def serialize(optimizer):

374

"""

375

Serialize optimizer to JSON-serializable dict.

376

377

Args:

378

optimizer: Optimizer to serialize

379

380

Returns:

381

dict: Serialized optimizer configuration

382

"""

383

384

def deserialize(config, custom_objects=None):

385

"""

386

Deserialize optimizer from config dict.

387

388

Args:

389

config (dict): Optimizer configuration

390

custom_objects (dict, optional): Custom objects for deserialization

391

392

Returns:

393

Optimizer: Deserialized optimizer

394

"""

395

```

396

397

## Usage Examples

398

399

### Basic Optimizer Usage

400

401

```python

402

import keras

403

from keras import optimizers

404

405

# Using string identifier (default parameters)

406

model.compile(optimizer='adam', loss='mse', metrics=['mae'])

407

408

# Using optimizer class with custom parameters

409

optimizer = optimizers.Adam(learning_rate=0.001, beta_1=0.9, beta_2=0.999)

410

model.compile(optimizer=optimizer, loss='mse', metrics=['mae'])

411

412

# Advanced configuration with weight decay and clipping

413

optimizer = optimizers.AdamW(

414

learning_rate=0.001,

415

weight_decay=0.01,

416

clipnorm=1.0,

417

use_ema=True,

418

ema_momentum=0.99

419

)

420

model.compile(optimizer=optimizer, loss='mse', metrics=['mae'])

421

```

422

423

### Learning Rate Scheduling

424

425

```python

426

import keras

427

from keras import optimizers

428

429

# Exponential decay schedule

430

lr_schedule = optimizers.schedules.ExponentialDecay(

431

initial_learning_rate=0.1,

432

decay_steps=10000,

433

decay_rate=0.96,

434

staircase=True

435

)

436

437

optimizer = optimizers.SGD(learning_rate=lr_schedule, momentum=0.9)

438

model.compile(optimizer=optimizer, loss='sparse_categorical_crossentropy', metrics=['accuracy'])

439

440

# Cosine decay schedule

441

lr_schedule = optimizers.schedules.CosineDecay(

442

initial_learning_rate=0.001,

443

decay_steps=1000,

444

alpha=0.01

445

)

446

447

optimizer = optimizers.Adam(learning_rate=lr_schedule)

448

model.compile(optimizer=optimizer, loss='mse')

449

```

450

451

### Mixed Precision Training

452

453

```python

454

import keras

455

from keras import mixed_precision, optimizers

456

457

# Enable mixed precision

458

mixed_precision.set_global_policy('mixed_float16')

459

460

# Wrap optimizer for mixed precision

461

base_optimizer = optimizers.Adam(learning_rate=0.001)

462

optimizer = optimizers.LossScaleOptimizer(base_optimizer)

463

464

model.compile(optimizer=optimizer, loss='sparse_categorical_crossentropy', metrics=['accuracy'])

465

466

# The model will automatically handle loss scaling

467

history = model.fit(x_train, y_train, epochs=10, validation_data=(x_val, y_val))

468

```

469

470

### Custom Training Loop with Optimizer

471

472

```python

473

import keras

474

from keras import optimizers, ops

475

476

# Create model and optimizer

477

model = keras.Sequential([

478

keras.layers.Dense(64, activation='relu'),

479

keras.layers.Dense(10, activation='softmax')

480

])

481

482

optimizer = optimizers.Adam(learning_rate=0.001)

483

484

# Custom training step

485

@keras.utils.jit_compile

486

def train_step(x, y):

487

with keras.device('gpu'):

488

with keras.ops.GradientTape() as tape:

489

predictions = model(x, training=True)

490

loss = keras.losses.sparse_categorical_crossentropy(y, predictions)

491

loss = ops.mean(loss)

492

493

gradients = tape.gradient(loss, model.trainable_variables)

494

optimizer.apply_gradients(zip(gradients, model.trainable_variables))

495

496

return loss

497

498

# Training loop

499

for epoch in range(10):

500

for step, (x_batch, y_batch) in enumerate(train_dataset):

501

loss = train_step(x_batch, y_batch)

502

503

if step % 100 == 0:

504

print(f"Epoch {epoch}, Step {step}, Loss: {loss:.4f}")

505

```

506

507

### Optimizer State Management

508

509

```python

510

import keras

511

from keras import optimizers

512

513

# Create and configure optimizer

514

optimizer = optimizers.Adam(learning_rate=0.001)

515

model.compile(optimizer=optimizer, loss='mse')

516

517

# Train model

518

model.fit(x_train, y_train, epochs=5)

519

520

# Save optimizer state

521

optimizer_weights = optimizer.get_weights()

522

523

# Create new optimizer and restore state

524

new_optimizer = optimizers.Adam(learning_rate=0.001)

525

model.compile(optimizer=new_optimizer, loss='mse')

526

527

# Build optimizer by running one step

528

model.fit(x_train[:1], y_train[:1], epochs=1)

529

530

# Restore saved state

531

new_optimizer.set_weights(optimizer_weights)

532

533

# Continue training from saved state

534

model.fit(x_train, y_train, epochs=5)

535

```