Tessl Tile for pypi/keras-nightly@3.11.0

or run

npx @tessl/cli init

Version

Tile

Overview

Evals

Files

docs

activations.md applications.md backend-config.md core-framework.md index.md initializers.md layers.md losses-metrics.md operations.md optimizers.md preprocessing.md regularizers.md training-callbacks.md

docs/optimizers.md

0
# Optimizers
1

2
Optimization algorithms for training neural networks, from basic gradient descent to advanced adaptive methods with automatic learning rate adjustment and momentum variants.
3

4
## Capabilities
5

6
### Gradient Descent Optimizers
7

8
Fundamental gradient-based optimization algorithms: basic SGD with optional (Nesterov) momentum, and the Adam family of moment-based variants (Adam, Nadam, Adamax).
9

10
```python { .api }
11
class SGD:
12
    """
13
    Stochastic Gradient Descent optimizer.
14
    
15
    Args:
16
        learning_rate (float): Learning rate
17
        momentum (float): Momentum factor
18
        nesterov (bool): Whether to apply Nesterov momentum
19
        weight_decay (float, optional): Weight decay rate
20
        clipnorm (float, optional): Gradient clipping by norm
21
        clipvalue (float, optional): Gradient clipping by value
22
        global_clipnorm (float, optional): Global gradient clipping by norm
23
        use_ema (bool): Whether to use Exponential Moving Average
24
        ema_momentum (float): EMA momentum
25
        ema_overwrite_frequency (int, optional): EMA overwrite frequency
26
        name (str): Name of the optimizer
27
    """
28
    def __init__(self, learning_rate=0.01, momentum=0.0, nesterov=False, **kwargs): ...
29

30
class Adam:
31
    """
32
    Adam optimizer with adaptive learning rates.
33
    
34
    Args:
35
        learning_rate (float): Initial learning rate
36
        beta_1 (float): Exponential decay rate for first moment estimates
37
        beta_2 (float): Exponential decay rate for second moment estimates
38
        epsilon (float): Small constant for numerical stability
39
        amsgrad (bool): Whether to apply AMSGrad variant
40
        weight_decay (float, optional): Weight decay rate
41
        clipnorm (float, optional): Gradient clipping by norm
42
        clipvalue (float, optional): Gradient clipping by value
43
        global_clipnorm (float, optional): Global gradient clipping by norm
44
        use_ema (bool): Whether to use Exponential Moving Average
45
        ema_momentum (float): EMA momentum
46
        ema_overwrite_frequency (int, optional): EMA overwrite frequency
47
        name (str): Name of the optimizer
48
    """
49
    def __init__(self, learning_rate=0.001, beta_1=0.9, beta_2=0.999, epsilon=1e-7, 
50
                 amsgrad=False, **kwargs): ...
51

52
class Nadam:
53
    """
54
    Nesterov-accelerated Adam optimizer.
55
    
56
    Args:
57
        learning_rate (float): Initial learning rate
58
        beta_1 (float): Exponential decay rate for first moment estimates
59
        beta_2 (float): Exponential decay rate for second moment estimates
60
        epsilon (float): Small constant for numerical stability
61
    """
62
    def __init__(self, learning_rate=0.001, beta_1=0.9, beta_2=0.999, epsilon=1e-7, **kwargs): ...
63

64
class Adamax:
65
    """
66
    Adamax optimizer (Adam based on infinity norm).
67
    
68
    Args:
69
        learning_rate (float): Initial learning rate
70
        beta_1 (float): Exponential decay rate for first moment estimates
71
        beta_2 (float): Exponential decay rate for weighted infinity norm
72
        epsilon (float): Small constant for numerical stability
73
    """
74
    def __init__(self, learning_rate=0.001, beta_1=0.9, beta_2=0.999, epsilon=1e-7, **kwargs): ...
75
```
76

77
### Adaptive Learning Rate Optimizers
78

79
Optimizers that automatically adapt learning rates based on gradient history and parameter-specific statistics.
80

81
```python { .api }
82
class Adagrad:
83
    """
84
    Adagrad optimizer with adaptive learning rates.
85
    
86
    Args:
87
        learning_rate (float): Initial learning rate
88
        initial_accumulator_value (float): Initial value for gradient accumulator
89
        epsilon (float): Small constant for numerical stability
90
        weight_decay (float, optional): Weight decay rate
91
        clipnorm (float, optional): Gradient clipping by norm
92
        clipvalue (float, optional): Gradient clipping by value
93
        global_clipnorm (float, optional): Global gradient clipping by norm
94
        use_ema (bool): Whether to use Exponential Moving Average
95
        ema_momentum (float): EMA momentum
96
        ema_overwrite_frequency (int, optional): EMA overwrite frequency
97
        name (str): Name of the optimizer
98
    """
99
    def __init__(self, learning_rate=0.001, initial_accumulator_value=0.1, epsilon=1e-7, **kwargs): ...
100

101
class Adadelta:
102
    """
103
    Adadelta optimizer with adaptive learning rates.
104
    
105
    Args:
106
        learning_rate (float): Initial learning rate
107
        rho (float): Decay rate for moving averages
108
        epsilon (float): Small constant for numerical stability
109
    """
110
    def __init__(self, learning_rate=0.001, rho=0.95, epsilon=1e-7, **kwargs): ...
111

112
class RMSprop:
113
    """
114
    RMSprop optimizer with adaptive learning rates.
115
    
116
    Args:
117
        learning_rate (float): Initial learning rate
118
        rho (float): Decay rate for moving average of squared gradients
119
        momentum (float): Momentum factor
120
        epsilon (float): Small constant for numerical stability
121
        centered (bool): Whether to compute centered RMSprop
122
    """
123
    def __init__(self, learning_rate=0.001, rho=0.9, momentum=0.0, epsilon=1e-7, 
124
                 centered=False, **kwargs): ...
125

126
class Ftrl:
127
    """
128
    Follow The Regularized Leader optimizer.
129
    
130
    Args:
131
        learning_rate (float): Initial learning rate
132
        learning_rate_power (float): Power to scale learning rate
133
        initial_accumulator_value (float): Initial value for accumulator
134
        l1_regularization_strength (float): L1 regularization strength
135
        l2_regularization_strength (float): L2 regularization strength
136
        l2_shrinkage_regularization_strength (float): L2 shrinkage regularization
137
        beta (float): Beta parameter
138
    """
139
    def __init__(self, learning_rate=0.001, learning_rate_power=-0.5, 
140
                 initial_accumulator_value=0.1, l1_regularization_strength=0.0,
141
                 l2_regularization_strength=0.0, **kwargs): ...
142
```
143

144
### Advanced Optimizers
145

146
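More recent optimization algorithms: Adam with decoupled weight decay (AdamW), factorized second moments (Adafactor), layer-wise adaptive moments (Lamb), sign-based momentum (Lion), and orthogonalized momentum (Muon).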
147

148
```python { .api }
149
class AdamW:
150
    """
151
    Adam optimizer with decoupled weight decay.
152
    
153
    Args:
154
        learning_rate (float): Initial learning rate
155
        weight_decay (float): Weight decay coefficient
156
        beta_1 (float): Exponential decay rate for first moment estimates
157
        beta_2 (float): Exponential decay rate for second moment estimates
158
        epsilon (float): Small constant for numerical stability
159
        amsgrad (bool): Whether to apply AMSGrad variant
160
        clipnorm (float, optional): Gradient clipping by norm
161
        clipvalue (float, optional): Gradient clipping by value
162
        global_clipnorm (float, optional): Global gradient clipping by norm
163
        use_ema (bool): Whether to use Exponential Moving Average
164
        ema_momentum (float): EMA momentum
165
        ema_overwrite_frequency (int, optional): EMA overwrite frequency
166
        name (str): Name of the optimizer
167
    """
168
    def __init__(self, learning_rate=0.001, weight_decay=0.004, beta_1=0.9, beta_2=0.999, 
169
                 epsilon=1e-7, amsgrad=False, **kwargs): ...
170

171
class Adafactor:
172
    """
173
    Adafactor optimizer with factorized second moments.
174
    
175
    Args:
176
        learning_rate (float): Initial learning rate
177
        epsilon2 (float): Second epsilon value
178
        cliping_threshold (float): Clipping threshold
179
        decay_rate (float): Decay rate for moving averages
180
        beta1 (float, optional): Beta1 parameter
181
        weight_decay_rate (float): Weight decay rate
182
        eps_scale (float): Epsilon scaling factor
183
        clip_threshold (float): Gradient clipping threshold
184
        relative_step (bool): Whether to use relative step size
185
        warmup_init (bool): Whether to use warmup initialization
186
    """
187
    def __init__(self, learning_rate=0.001, epsilon2=1e-30, cliping_threshold=1.0, 
188
                 decay_rate=0.8, beta1=None, **kwargs): ...
189

190
class Lamb:
191
    """
192
    Layer-wise Adaptive Moments optimizer.
193
    
194
    Args:
195
        learning_rate (float): Initial learning rate
196
        beta_1 (float): Exponential decay rate for first moment estimates
197
        beta_2 (float): Exponential decay rate for second moment estimates
198
        epsilon (float): Small constant for numerical stability
199
        weight_decay_rate (float): Weight decay rate
200
        always_adapt (bool): Whether to always adapt learning rate
201
    """
202
    def __init__(self, learning_rate=0.001, beta_1=0.9, beta_2=0.999, epsilon=1e-6,
203
                 weight_decay_rate=0.01, always_adapt=False, **kwargs): ...
204

205
class Lion:
206
    """
207
    EvoLved Sign Momentum optimizer.
208
    
209
    Args:
210
        learning_rate (float): Initial learning rate
211
        beta (float): Momentum coefficient
212
        weight_decay (float): Weight decay coefficient
213
    """
214
    def __init__(self, learning_rate=0.0001, beta=0.99, weight_decay=0.0, **kwargs): ...
215

216
class Muon:
217
    """
218
    Momentum Orthogonalized by Newton's method.
219
    
220
    Args:
221
        learning_rate (float): Initial learning rate
222
        momentum (float): Momentum coefficient
223
        nesterov (bool): Whether to use Nesterov momentum
224
        backend_update_momentum (float): Backend update momentum
225
        k (int): K parameter for Muon
226
        norm_axes (tuple): Axes for normalization
227
    """
228
    def __init__(self, learning_rate=0.02, momentum=0.95, nesterov=True, **kwargs): ...
229
```
230

231
### Mixed Precision Training
232

233
Optimizer wrapper for mixed precision training with automatic loss scaling.
234

235
```python { .api }
236
class LossScaleOptimizer:
237
    """
238
    Optimizer wrapper for mixed precision training with loss scaling.
239
    
240
    Args:
241
        inner_optimizer: Base optimizer to wrap
242
        dynamic (bool): Whether to use dynamic loss scaling
243
        initial_scale (float): Initial loss scale value
244
        dynamic_growth_steps (int): Steps between scale increases
245
        name (str): Name of the optimizer
246
    """
247
    def __init__(self, inner_optimizer, dynamic=True, initial_scale=32768.0, 
248
                 dynamic_growth_steps=2000, **kwargs): ...
249
    
250
    def get_scaled_loss(self, loss):
251
        """Scale loss for mixed precision training."""
252
    
253
    def get_unscaled_gradients(self, scaled_gradients):
254
        """Unscale gradients after backpropagation."""
255
```
256

257
### Base Optimizer Class
258

259
Base class for all optimizers providing common functionality.
260

261
```python { .api }
262
class Optimizer:
263
    """
264
    Base class for all optimizers.
265
    
266
    Args:
267
        name (str): Name of the optimizer
268
        weight_decay (float, optional): Weight decay rate
269
        clipnorm (float, optional): Gradient clipping by norm
270
        clipvalue (float, optional): Gradient clipping by value
271
        global_clipnorm (float, optional): Global gradient clipping by norm
272
        use_ema (bool): Whether to use Exponential Moving Average
273
        ema_momentum (float): EMA momentum
274
        ema_overwrite_frequency (int, optional): EMA overwrite frequency
275
        loss_scale_factor (float, optional): Loss scale factor for mixed precision
276
        gradient_accumulation_steps (int, optional): Gradient accumulation steps
277
    """
278
    def __init__(self, name, **kwargs): ...
279
    
280
    def apply_gradients(self, grads_and_vars):
281
        """Apply gradients to variables."""
282
    
283
    def minimize(self, loss, var_list=None, tape=None):
284
        """Minimize loss by updating variables."""
285
    
286
    def get_config(self):
287
        """Get optimizer configuration."""
288
    
289
    def from_config(cls, config):
290
        """Create optimizer from configuration."""
291
    
292
    def get_weights(self):
293
        """Get optimizer state as list of arrays."""
294
    
295
    def set_weights(self, weights):
296
        """Set optimizer state from list of arrays."""
297
```
298

299
### Learning Rate Schedules
300

301
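Learning rate schedules that can be passed as an optimizer's `learning_rate` argument in place of a fixed float, so the learning rate changes over the course of training.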
302

303
```python { .api }
304
# Available in keras.optimizers.schedules
305
class ExponentialDecay:
306
    """
307
    Exponential decay schedule.
308
    
309
    Args:
310
        initial_learning_rate (float): Initial learning rate
311
        decay_steps (int): Steps for decay
312
        decay_rate (float): Decay rate
313
        staircase (bool): Whether to apply decay in discrete intervals
314
        name (str): Name of the schedule
315
    """
316
    def __init__(self, initial_learning_rate, decay_steps, decay_rate, 
317
                 staircase=False, name=None): ...
318

319
class InverseTimeDecay:
320
    """
321
    Inverse time decay schedule.
322
    
323
    Args:
324
        initial_learning_rate (float): Initial learning rate
325
        decay_steps (int): Steps for decay
326
        decay_rate (float): Decay rate
327
        staircase (bool): Whether to apply decay in discrete intervals
328
        name (str): Name of the schedule
329
    """
330
    def __init__(self, initial_learning_rate, decay_steps, decay_rate, 
331
                 staircase=False, name=None): ...
332

333
class CosineDecay:
334
    """
335
    Cosine decay schedule.
336
    
337
    Args:
338
        initial_learning_rate (float): Initial learning rate
339
        decay_steps (int): Steps for decay
340
        alpha (float): Minimum learning rate as fraction of initial
341
        name (str): Name of the schedule
342
    """
343
    def __init__(self, initial_learning_rate, decay_steps, alpha=0.0, name=None): ...
344

345
class PiecewiseConstantDecay:
346
    """
347
    Piecewise constant decay schedule.
348
    
349
    Args:
350
        boundaries (list): Step boundaries for rate changes
351
        values (list): Learning rate values for each interval
352
        name (str): Name of the schedule
353
    """
354
    def __init__(self, boundaries, values, name=None): ...
355
```
356

357
### Utility Functions
358

359
Functions for optimizer management and configuration.
360

361
```python { .api }
362
def get(identifier):
363
    """
364
    Retrieve optimizer by name or return callable.
365
    
366
    Args:
367
        identifier (str or callable): Optimizer name or instance
368
        
369
    Returns:
370
        Optimizer: Optimizer instance
371
    """
372

373
def serialize(optimizer):
374
    """
375
    Serialize optimizer to JSON-serializable dict.
376
    
377
    Args:
378
        optimizer: Optimizer to serialize
379
        
380
    Returns:
381
        dict: Serialized optimizer configuration
382
    """
383

384
def deserialize(config, custom_objects=None):
385
    """
386
    Deserialize optimizer from config dict.
387
    
388
    Args:
389
        config (dict): Optimizer configuration
390
        custom_objects (dict, optional): Custom objects for deserialization
391
        
392
    Returns:
393
        Optimizer: Deserialized optimizer
394
    """
395
```
396

397
## Usage Examples
398

399
### Basic Optimizer Usage
400

401
```python
402
import keras
403
from keras import optimizers
404

405
# Using string identifier (default parameters)
406
model.compile(optimizer='adam', loss='mse', metrics=['mae'])
407

408
# Using optimizer class with custom parameters
409
optimizer = optimizers.Adam(learning_rate=0.001, beta_1=0.9, beta_2=0.999)
410
model.compile(optimizer=optimizer, loss='mse', metrics=['mae'])
411

412
# Advanced configuration with weight decay and clipping
413
optimizer = optimizers.AdamW(
414
    learning_rate=0.001,
415
    weight_decay=0.01,
416
    clipnorm=1.0,
417
    use_ema=True,
418
    ema_momentum=0.99
419
)
420
model.compile(optimizer=optimizer, loss='mse', metrics=['mae'])
421
```
422

423
### Learning Rate Scheduling
424

425
```python
426
import keras
427
from keras import optimizers
428

429
# Exponential decay schedule
430
lr_schedule = optimizers.schedules.ExponentialDecay(
431
    initial_learning_rate=0.1,
432
    decay_steps=10000,
433
    decay_rate=0.96,
434
    staircase=True
435
)
436

437
optimizer = optimizers.SGD(learning_rate=lr_schedule, momentum=0.9)
438
model.compile(optimizer=optimizer, loss='sparse_categorical_crossentropy', metrics=['accuracy'])
439

440
# Cosine decay schedule
441
lr_schedule = optimizers.schedules.CosineDecay(
442
    initial_learning_rate=0.001,
443
    decay_steps=1000,
444
    alpha=0.01
445
)
446

447
optimizer = optimizers.Adam(learning_rate=lr_schedule)
448
model.compile(optimizer=optimizer, loss='mse')
449
```
450

451
### Mixed Precision Training
452

453
```python
454
import keras
455
from keras import mixed_precision, optimizers
456

457
# Enable mixed precision
458
mixed_precision.set_global_policy('mixed_float16')
459

460
# Wrap optimizer for mixed precision
461
base_optimizer = optimizers.Adam(learning_rate=0.001)
462
optimizer = optimizers.LossScaleOptimizer(base_optimizer)
463

464
model.compile(optimizer=optimizer, loss='sparse_categorical_crossentropy', metrics=['accuracy'])
465

466
# The model will automatically handle loss scaling
467
history = model.fit(x_train, y_train, epochs=10, validation_data=(x_val, y_val))
468
```
469

470
### Custom Training Loop with Optimizer
471

472
```python
473
import keras
474
from keras import optimizers, ops
475

476
# Create model and optimizer
477
model = keras.Sequential([
478
    keras.layers.Dense(64, activation='relu'),
479
    keras.layers.Dense(10, activation='softmax')
480
])
481

482
optimizer = optimizers.Adam(learning_rate=0.001)
483

484
# Custom training step
485
@keras.utils.jit_compile
486
def train_step(x, y):
487
    with keras.device('gpu'):
488
        with keras.ops.GradientTape() as tape:
489
            predictions = model(x, training=True)
490
            loss = keras.losses.sparse_categorical_crossentropy(y, predictions)
491
            loss = ops.mean(loss)
492
        
493
        gradients = tape.gradient(loss, model.trainable_variables)
494
        optimizer.apply_gradients(zip(gradients, model.trainable_variables))
495
        
496
        return loss
497

498
# Training loop
499
for epoch in range(10):
500
    for step, (x_batch, y_batch) in enumerate(train_dataset):
501
        loss = train_step(x_batch, y_batch)
502
        
503
        if step % 100 == 0:
504
            print(f"Epoch {epoch}, Step {step}, Loss: {loss:.4f}")
505
```
506

507
### Optimizer State Management
508

509
```python
510
import keras
511
from keras import optimizers
512

513
# Create and configure optimizer
514
optimizer = optimizers.Adam(learning_rate=0.001)
515
model.compile(optimizer=optimizer, loss='mse')
516

517
# Train model
518
model.fit(x_train, y_train, epochs=5)
519

520
# Save optimizer state
521
optimizer_weights = optimizer.get_weights()
522

523
# Create new optimizer and restore state
524
new_optimizer = optimizers.Adam(learning_rate=0.001)
525
model.compile(optimizer=new_optimizer, loss='mse')
526

527
# Build optimizer by running one step
528
model.fit(x_train[:1], y_train[:1], epochs=1)
529

530
# Restore saved state
531
new_optimizer.set_weights(optimizer_weights)
532

533
# Continue training from saved state
534
model.fit(x_train, y_train, epochs=5)
535
```

Version

Tile

Files

docs/optimizers.md

docs/optimizers.md