Multi-backend deep learning framework providing a unified API for building and training neural networks across JAX, TensorFlow, PyTorch, and OpenVINO backends
—
Optimization algorithms for training neural networks, from basic gradient descent to advanced adaptive methods with automatic learning rate adjustment and momentum variants.
Fundamental gradient-based optimization algorithms including basic SGD and momentum variants.
class SGD:
"""
Stochastic Gradient Descent optimizer.
Args:
learning_rate (float): Learning rate
momentum (float): Momentum factor
nesterov (bool): Whether to apply Nesterov momentum
weight_decay (float, optional): Weight decay rate
clipnorm (float, optional): Gradient clipping by norm
clipvalue (float, optional): Gradient clipping by value
global_clipnorm (float, optional): Global gradient clipping by norm
use_ema (bool): Whether to use Exponential Moving Average
ema_momentum (float): EMA momentum
ema_overwrite_frequency (int, optional): EMA overwrite frequency
name (str): Name of the optimizer
"""
def __init__(self, learning_rate=0.01, momentum=0.0, nesterov=False, **kwargs): ...
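For example, SGD with classical or Nesterov momentum can be configured directly from the arguments above (a minimal sketch; `model` stands in for any Keras model you have built):
from keras import optimizers
optimizer = optimizers.SGD(learning_rate=0.01, momentum=0.9, nesterov=True)
model.compile(optimizer=optimizer, loss='mse')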
class Adam:
"""
Adam optimizer with adaptive learning rates.
Args:
learning_rate (float): Initial learning rate
beta_1 (float): Exponential decay rate for first moment estimates
beta_2 (float): Exponential decay rate for second moment estimates
epsilon (float): Small constant for numerical stability
amsgrad (bool): Whether to apply AMSGrad variant
weight_decay (float, optional): Weight decay rate
clipnorm (float, optional): Gradient clipping by norm
clipvalue (float, optional): Gradient clipping by value
global_clipnorm (float, optional): Global gradient clipping by norm
use_ema (bool): Whether to use Exponential Moving Average
ema_momentum (float): EMA momentum
ema_overwrite_frequency (int, optional): EMA overwrite frequency
name (str): Name of the optimizer
"""
def __init__(self, learning_rate=0.001, beta_1=0.9, beta_2=0.999, epsilon=1e-7,
amsgrad=False, **kwargs): ...
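A sketch of Adam with the AMSGrad variant and global-norm gradient clipping, using only the arguments documented in the stub above (`model` is a placeholder):
from keras import optimizers
optimizer = optimizers.Adam(learning_rate=3e-4, amsgrad=True, global_clipnorm=1.0)
model.compile(optimizer=optimizer, loss='binary_crossentropy')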
class Nadam:
"""
Nesterov-accelerated Adam optimizer.
Args:
learning_rate (float): Initial learning rate
beta_1 (float): Exponential decay rate for first moment estimates
beta_2 (float): Exponential decay rate for second moment estimates
epsilon (float): Small constant for numerical stability
"""
def __init__(self, learning_rate=0.001, beta_1=0.9, beta_2=0.999, epsilon=1e-7, **kwargs): ...
class Adamax:
"""
Adamax optimizer (Adam based on infinity norm).
Args:
learning_rate (float): Initial learning rate
beta_1 (float): Exponential decay rate for first moment estimates
beta_2 (float): Exponential decay rate for weighted infinity norm
epsilon (float): Small constant for numerical stability
"""
def __init__(self, learning_rate=0.001, beta_1=0.9, beta_2=0.999, epsilon=1e-7, **kwargs): ...
Optimizers that automatically adapt learning rates based on gradient history and parameter-specific statistics.
class Adagrad:
"""
Adagrad optimizer with adaptive learning rates.
Args:
learning_rate (float): Initial learning rate
initial_accumulator_value (float): Initial value for gradient accumulator
epsilon (float): Small constant for numerical stability
weight_decay (float, optional): Weight decay rate
clipnorm (float, optional): Gradient clipping by norm
clipvalue (float, optional): Gradient clipping by value
global_clipnorm (float, optional): Global gradient clipping by norm
use_ema (bool): Whether to use Exponential Moving Average
ema_momentum (float): EMA momentum
ema_overwrite_frequency (int, optional): EMA overwrite frequency
name (str): Name of the optimizer
"""
def __init__(self, learning_rate=0.001, initial_accumulator_value=0.1, epsilon=1e-7, **kwargs): ...
class Adadelta:
"""
Adadelta optimizer with adaptive learning rates.
Args:
learning_rate (float): Initial learning rate
rho (float): Decay rate for moving averages
epsilon (float): Small constant for numerical stability
"""
def __init__(self, learning_rate=0.001, rho=0.95, epsilon=1e-7, **kwargs): ...
class RMSprop:
"""
RMSprop optimizer with adaptive learning rates.
Args:
learning_rate (float): Initial learning rate
rho (float): Decay rate for moving average of squared gradients
momentum (float): Momentum factor
epsilon (float): Small constant for numerical stability
centered (bool): Whether to compute centered RMSprop
"""
def __init__(self, learning_rate=0.001, rho=0.9, momentum=0.0, epsilon=1e-7,
centered=False, **kwargs): ...
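As an illustration of the adaptive optimizers in this group, a centered RMSprop with momentum might be configured as in the following sketch (`model` is a placeholder):
from keras import optimizers
optimizer = optimizers.RMSprop(learning_rate=0.001, rho=0.9, momentum=0.9, centered=True)
model.compile(optimizer=optimizer, loss='mse')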
class Ftrl:
"""
Follow The Regularized Leader optimizer.
Args:
learning_rate (float): Initial learning rate
learning_rate_power (float): Power to scale learning rate
initial_accumulator_value (float): Initial value for accumulator
l1_regularization_strength (float): L1 regularization strength
l2_regularization_strength (float): L2 regularization strength
l2_shrinkage_regularization_strength (float): L2 shrinkage regularization strength
beta (float): Beta parameter
"""
def __init__(self, learning_rate=0.001, learning_rate_power=-0.5,
initial_accumulator_value=0.1, l1_regularization_strength=0.0,
l2_regularization_strength=0.0, **kwargs): ...
State-of-the-art optimization algorithms with advanced features for improved convergence and performance.
class AdamW:
"""
Adam optimizer with decoupled weight decay.
Args:
learning_rate (float): Initial learning rate
weight_decay (float): Weight decay coefficient
beta_1 (float): Exponential decay rate for first moment estimates
beta_2 (float): Exponential decay rate for second moment estimates
epsilon (float): Small constant for numerical stability
amsgrad (bool): Whether to apply AMSGrad variant
clipnorm (float, optional): Gradient clipping by norm
clipvalue (float, optional): Gradient clipping by value
global_clipnorm (float, optional): Global gradient clipping by norm
use_ema (bool): Whether to use Exponential Moving Average
ema_momentum (float): EMA momentum
ema_overwrite_frequency (int, optional): EMA overwrite frequency
name (str): Name of the optimizer
"""
def __init__(self, learning_rate=0.001, weight_decay=0.004, beta_1=0.9, beta_2=0.999,
epsilon=1e-7, amsgrad=False, **kwargs): ...
class Adafactor:
"""
Adafactor optimizer with factorized second moments.
Args:
learning_rate (float): Initial learning rate
epsilon2 (float): Second epsilon value
cliping_threshold (float): Clipping threshold
decay_rate (float): Decay rate for moving averages
beta1 (float, optional): Beta1 parameter
weight_decay_rate (float): Weight decay rate
eps_scale (float): Epsilon scaling factor
clip_threshold (float): Gradient clipping threshold
relative_step (bool): Whether to use relative step size
warmup_init (bool): Whether to use warmup initialization
"""
def __init__(self, learning_rate=0.001, epsilon2=1e-30, cliping_threshold=1.0,
decay_rate=0.8, beta1=None, **kwargs): ...
class Lamb:
"""
Layer-wise Adaptive Moments optimizer.
Args:
learning_rate (float): Initial learning rate
beta_1 (float): Exponential decay rate for first moment estimates
beta_2 (float): Exponential decay rate for second moment estimates
epsilon (float): Small constant for numerical stability
weight_decay_rate (float): Weight decay rate
always_adapt (bool): Whether to always adapt learning rate
"""
def __init__(self, learning_rate=0.001, beta_1=0.9, beta_2=0.999, epsilon=1e-6,
weight_decay_rate=0.01, always_adapt=False, **kwargs): ...
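Lamb is typically used for large-batch training; a minimal sketch based on the signature above (`model` is a placeholder):
from keras import optimizers
optimizer = optimizers.Lamb(learning_rate=0.001, weight_decay_rate=0.01)
model.compile(optimizer=optimizer, loss='sparse_categorical_crossentropy')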
class Lion:
"""
EvoLved Sign Momentum optimizer.
Args:
learning_rate (float): Initial learning rate
beta (float): Momentum coefficient
weight_decay (float): Weight decay coefficient
"""
def __init__(self, learning_rate=0.0001, beta=0.99, weight_decay=0.0, **kwargs): ...
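Lion updates weights using the sign of the momentum, so it is commonly run with a smaller learning rate and larger weight decay than Adam; a sketch using the arguments above (`model` is a placeholder):
from keras import optimizers
optimizer = optimizers.Lion(learning_rate=1e-4, weight_decay=0.01)
model.compile(optimizer=optimizer, loss='categorical_crossentropy')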
class Muon:
"""
Momentum Orthogonalized by Newton-Schulz (Muon) optimizer.
Args:
learning_rate (float): Initial learning rate
momentum (float): Momentum coefficient
nesterov (bool): Whether to use Nesterov momentum
backend_update_momentum (float): Backend update momentum
k (int): K parameter for Muon
norm_axes (tuple): Axes for normalization
"""
def __init__(self, learning_rate=0.02, momentum=0.95, nesterov=True, **kwargs): ...
Optimizer wrapper for mixed precision training with automatic loss scaling.
class LossScaleOptimizer:
"""
Optimizer wrapper for mixed precision training with loss scaling.
Args:
inner_optimizer: Base optimizer to wrap
dynamic (bool): Whether to use dynamic loss scaling
initial_scale (float): Initial loss scale value
dynamic_growth_steps (int): Steps between scale increases
name (str): Name of the optimizer
"""
def __init__(self, inner_optimizer, dynamic=True, initial_scale=32768.0,
dynamic_growth_steps=2000, **kwargs): ...
def get_scaled_loss(self, loss):
"""Scale loss for mixed precision training."""
def get_unscaled_gradients(self, scaled_gradients):
"""Unscale gradients after backpropagation."""Base class for all optimizers providing common functionality.
class Optimizer:
"""
Base class for all optimizers.
Args:
name (str): Name of the optimizer
weight_decay (float, optional): Weight decay rate
clipnorm (float, optional): Gradient clipping by norm
clipvalue (float, optional): Gradient clipping by value
global_clipnorm (float, optional): Global gradient clipping by norm
use_ema (bool): Whether to use Exponential Moving Average
ema_momentum (float): EMA momentum
ema_overwrite_frequency (int, optional): EMA overwrite frequency
loss_scale_factor (float, optional): Loss scale factor for mixed precision
gradient_accumulation_steps (int, optional): Gradient accumulation steps
"""
def __init__(self, name, **kwargs): ...
def apply_gradients(self, grads_and_vars):
"""Apply gradients to variables."""
def minimize(self, loss, var_list=None, tape=None):
"""Minimize loss by updating variables."""
def get_config(self):
"""Get optimizer configuration."""
def from_config(cls, config):
"""Create optimizer from configuration."""
def get_weights(self):
"""Get optimizer state as list of arrays."""
def set_weights(self, weights):
"""Set optimizer state from list of arrays."""Learning rate scheduling utilities for adaptive training.
# Available in keras.optimizers.schedules
class ExponentialDecay:
"""
Exponential decay schedule.
Args:
initial_learning_rate (float): Initial learning rate
decay_steps (int): Steps for decay
decay_rate (float): Decay rate
staircase (bool): Whether to apply decay in discrete intervals
name (str): Name of the schedule
"""
def __init__(self, initial_learning_rate, decay_steps, decay_rate,
staircase=False, name=None): ...
class InverseTimeDecay:
"""
Inverse time decay schedule.
Args:
initial_learning_rate (float): Initial learning rate
decay_steps (int): Steps for decay
decay_rate (float): Decay rate
staircase (bool): Whether to apply decay in discrete intervals
name (str): Name of the schedule
"""
def __init__(self, initial_learning_rate, decay_steps, decay_rate,
staircase=False, name=None): ...
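Like the other schedules here, InverseTimeDecay can be passed anywhere a learning rate is expected; a minimal sketch (`model` is a placeholder):
from keras import optimizers
lr_schedule = optimizers.schedules.InverseTimeDecay(initial_learning_rate=0.1, decay_steps=1000, decay_rate=0.5)
optimizer = optimizers.SGD(learning_rate=lr_schedule)
model.compile(optimizer=optimizer, loss='mse')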
class CosineDecay:
"""
Cosine decay schedule.
Args:
initial_learning_rate (float): Initial learning rate
decay_steps (int): Steps for decay
alpha (float): Minimum learning rate as fraction of initial
name (str): Name of the schedule
"""
def __init__(self, initial_learning_rate, decay_steps, alpha=0.0, name=None): ...
class PiecewiseConstantDecay:
"""
Piecewise constant decay schedule.
Args:
boundaries (list): Step boundaries for rate changes
values (list): Learning rate values for each interval
name (str): Name of the schedule
"""
def __init__(self, boundaries, values, name=None): ...
Functions for optimizer management and configuration.
def get(identifier):
"""
Retrieve optimizer by name or return callable.
Args:
identifier (str or callable): Optimizer name or instance
Returns:
Optimizer: Optimizer instance
"""
def serialize(optimizer):
"""
Serialize optimizer to JSON-serializable dict.
Args:
optimizer: Optimizer to serialize
Returns:
dict: Serialized optimizer configuration
"""
def deserialize(config, custom_objects=None):
"""
Deserialize optimizer from config dict.
Args:
config (dict): Optimizer configuration
custom_objects (dict, optional): Custom objects for deserialization
Returns:
Optimizer: Deserialized optimizer
"""import keras
from keras import optimizers
# Using string identifier (default parameters)
model.compile(optimizer='adam', loss='mse', metrics=['mae'])
# Using optimizer class with custom parameters
optimizer = optimizers.Adam(learning_rate=0.001, beta_1=0.9, beta_2=0.999)
model.compile(optimizer=optimizer, loss='mse', metrics=['mae'])
# Advanced configuration with weight decay and clipping
optimizer = optimizers.AdamW(
learning_rate=0.001,
weight_decay=0.01,
clipnorm=1.0,
use_ema=True,
ema_momentum=0.99
)
model.compile(optimizer=optimizer, loss='mse', metrics=['mae'])
import keras
from keras import optimizers
# Exponential decay schedule
lr_schedule = optimizers.schedules.ExponentialDecay(
initial_learning_rate=0.1,
decay_steps=10000,
decay_rate=0.96,
staircase=True
)
optimizer = optimizers.SGD(learning_rate=lr_schedule, momentum=0.9)
model.compile(optimizer=optimizer, loss='sparse_categorical_crossentropy', metrics=['accuracy'])
# Cosine decay schedule
lr_schedule = optimizers.schedules.CosineDecay(
initial_learning_rate=0.001,
decay_steps=1000,
alpha=0.01
)
optimizer = optimizers.Adam(learning_rate=lr_schedule)
model.compile(optimizer=optimizer, loss='mse')
import keras
from keras import mixed_precision, optimizers
# Enable mixed precision
mixed_precision.set_global_policy('mixed_float16')
# Wrap optimizer for mixed precision
base_optimizer = optimizers.Adam(learning_rate=0.001)
optimizer = optimizers.LossScaleOptimizer(base_optimizer)
model.compile(optimizer=optimizer, loss='sparse_categorical_crossentropy', metrics=['accuracy'])
# The model will automatically handle loss scaling
history = model.fit(x_train, y_train, epochs=10, validation_data=(x_val, y_val))
import keras
from keras import optimizers, ops
# Create model and optimizer
model = keras.Sequential([
keras.layers.Dense(64, activation='relu'),
keras.layers.Dense(10, activation='softmax')
])
optimizer = optimizers.Adam(learning_rate=0.001)
# Custom training step (assumes the TensorFlow backend; gradient tapes are a TensorFlow API)
import tensorflow as tf
@tf.function  # compile the step into a graph for speed
def train_step(x, y):
    with keras.device('gpu:0'):  # optional device placement; drop if no GPU is available
        with tf.GradientTape() as tape:
            predictions = model(x, training=True)
            loss = keras.losses.sparse_categorical_crossentropy(y, predictions)
            loss = ops.mean(loss)
        gradients = tape.gradient(loss, model.trainable_variables)
        optimizer.apply_gradients(zip(gradients, model.trainable_variables))
    return loss
# Training loop
for epoch in range(10):
    for step, (x_batch, y_batch) in enumerate(train_dataset):
        loss = train_step(x_batch, y_batch)
        if step % 100 == 0:
            print(f"Epoch {epoch}, Step {step}, Loss: {float(loss):.4f}")
import keras
from keras import optimizers
# Create and configure optimizer
optimizer = optimizers.Adam(learning_rate=0.001)
model.compile(optimizer=optimizer, loss='mse')
# Train model
model.fit(x_train, y_train, epochs=5)
# Save optimizer state
optimizer_weights = optimizer.get_weights()
# Create new optimizer and restore state
new_optimizer = optimizers.Adam(learning_rate=0.001)
model.compile(optimizer=new_optimizer, loss='mse')
# Build optimizer by running one step
model.fit(x_train[:1], y_train[:1], epochs=1)
# Restore saved state
new_optimizer.set_weights(optimizer_weights)
# Continue training from saved state
model.fit(x_train, y_train, epochs=5)
Install with Tessl CLI
npx tessl i tessl/pypi-keras-nightly