
tessl/pypi-keras-nightly

Multi-backend deep learning framework providing a unified API for building and training neural networks across JAX, TensorFlow, PyTorch, and OpenVINO backends

Optimizers

Optimization algorithms for training neural networks, ranging from plain stochastic gradient descent to adaptive methods with per-parameter learning rates, momentum variants, and built-in gradient clipping.

Capabilities

Gradient Descent Optimizers

Fundamental gradient-based optimization algorithms, including plain SGD, momentum variants, and the Adam family of adaptive-moment methods.

class SGD:
    """
    Stochastic Gradient Descent optimizer.
    
    Args:
        learning_rate (float): Learning rate
        momentum (float): Momentum factor
        nesterov (bool): Whether to apply Nesterov momentum
        weight_decay (float, optional): Weight decay rate
        clipnorm (float, optional): Gradient clipping by norm
        clipvalue (float, optional): Gradient clipping by value
        global_clipnorm (float, optional): Global gradient clipping by norm
        use_ema (bool): Whether to use Exponential Moving Average
        ema_momentum (float): EMA momentum
        ema_overwrite_frequency (int, optional): EMA overwrite frequency
        name (str): Name of the optimizer
    """
    def __init__(self, learning_rate=0.01, momentum=0.0, nesterov=False, **kwargs): ...

class Adam:
    """
    Adam optimizer with adaptive learning rates.
    
    Args:
        learning_rate (float): Initial learning rate
        beta_1 (float): Exponential decay rate for first moment estimates
        beta_2 (float): Exponential decay rate for second moment estimates
        epsilon (float): Small constant for numerical stability
        amsgrad (bool): Whether to apply AMSGrad variant
        weight_decay (float, optional): Weight decay rate
        clipnorm (float, optional): Gradient clipping by norm
        clipvalue (float, optional): Gradient clipping by value
        global_clipnorm (float, optional): Global gradient clipping by norm
        use_ema (bool): Whether to use Exponential Moving Average
        ema_momentum (float): EMA momentum
        ema_overwrite_frequency (int, optional): EMA overwrite frequency
        name (str): Name of the optimizer
    """
    def __init__(self, learning_rate=0.001, beta_1=0.9, beta_2=0.999, epsilon=1e-7, 
                 amsgrad=False, **kwargs): ...

class Nadam:
    """
    Nesterov-accelerated Adam optimizer.
    
    Args:
        learning_rate (float): Initial learning rate
        beta_1 (float): Exponential decay rate for first moment estimates
        beta_2 (float): Exponential decay rate for second moment estimates
        epsilon (float): Small constant for numerical stability
    """
    def __init__(self, learning_rate=0.001, beta_1=0.9, beta_2=0.999, epsilon=1e-7, **kwargs): ...

class Adamax:
    """
    Adamax optimizer (Adam based on infinity norm).
    
    Args:
        learning_rate (float): Initial learning rate
        beta_1 (float): Exponential decay rate for first moment estimates
        beta_2 (float): Exponential decay rate for weighted infinity norm
        epsilon (float): Small constant for numerical stability
    """
    def __init__(self, learning_rate=0.001, beta_1=0.9, beta_2=0.999, epsilon=1e-7, **kwargs): ...
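
A quick sketch instantiating the classes above (parameter values are illustrative, not tuned recommendations):

from keras import optimizers

# Plain SGD with Nesterov momentum and global-norm gradient clipping
sgd = optimizers.SGD(learning_rate=0.01, momentum=0.9, nesterov=True, global_clipnorm=1.0)

# Adam with the AMSGrad variant enabled
adam = optimizers.Adam(learning_rate=0.001, amsgrad=True)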

Adaptive Learning Rate Optimizers

Optimizers that automatically adapt learning rates based on gradient history and parameter-specific statistics.

class Adagrad:
    """
    Adagrad optimizer with adaptive learning rates.
    
    Args:
        learning_rate (float): Initial learning rate
        initial_accumulator_value (float): Initial value for gradient accumulator
        epsilon (float): Small constant for numerical stability
        weight_decay (float, optional): Weight decay rate
        clipnorm (float, optional): Gradient clipping by norm
        clipvalue (float, optional): Gradient clipping by value
        global_clipnorm (float, optional): Global gradient clipping by norm
        use_ema (bool): Whether to use Exponential Moving Average
        ema_momentum (float): EMA momentum
        ema_overwrite_frequency (int, optional): EMA overwrite frequency
        name (str): Name of the optimizer
    """
    def __init__(self, learning_rate=0.001, initial_accumulator_value=0.1, epsilon=1e-7, **kwargs): ...

class Adadelta:
    """
    Adadelta optimizer with adaptive learning rates.
    
    Args:
        learning_rate (float): Initial learning rate
        rho (float): Decay rate for moving averages
        epsilon (float): Small constant for numerical stability
    """
    def __init__(self, learning_rate=0.001, rho=0.95, epsilon=1e-7, **kwargs): ...

class RMSprop:
    """
    RMSprop optimizer with adaptive learning rates.
    
    Args:
        learning_rate (float): Initial learning rate
        rho (float): Decay rate for moving average of squared gradients
        momentum (float): Momentum factor
        epsilon (float): Small constant for numerical stability
        centered (bool): Whether to compute centered RMSprop
    """
    def __init__(self, learning_rate=0.001, rho=0.9, momentum=0.0, epsilon=1e-7, 
                 centered=False, **kwargs): ...

class Ftrl:
    """
    Follow The Regularized Leader optimizer.
    
    Args:
        learning_rate (float): Initial learning rate
        learning_rate_power (float): Power to scale learning rate
        initial_accumulator_value (float): Initial value for accumulator
        l1_regularization_strength (float): L1 regularization strength
        l2_regularization_strength (float): L2 regularization strength
        l2_shrinkage_regularization_strength (float): L2 shrinkage regularization
        beta (float): Beta parameter
    """
    def __init__(self, learning_rate=0.001, learning_rate_power=-0.5, 
                 initial_accumulator_value=0.1, l1_regularization_strength=0.0,
                 l2_regularization_strength=0.0, **kwargs): ...
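
The adaptive variants above can be configured directly; values here are illustrative:

from keras import optimizers

# Centered RMSprop normalizes updates by an estimate of the gradient variance
rmsprop = optimizers.RMSprop(learning_rate=0.001, rho=0.9, momentum=0.9, centered=True)

# Ftrl with L1 regularization, often used for sparse linear models
ftrl = optimizers.Ftrl(learning_rate=0.05, l1_regularization_strength=0.01)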

Advanced Optimizers

Recent optimization algorithms with features for improved convergence, such as decoupled weight decay (AdamW), factorized second moments (Adafactor), layer-wise adaptation (Lamb), sign-based updates (Lion), and orthogonalized momentum (Muon).

class AdamW:
    """
    Adam optimizer with decoupled weight decay.
    
    Args:
        learning_rate (float): Initial learning rate
        weight_decay (float): Weight decay coefficient
        beta_1 (float): Exponential decay rate for first moment estimates
        beta_2 (float): Exponential decay rate for second moment estimates
        epsilon (float): Small constant for numerical stability
        amsgrad (bool): Whether to apply AMSGrad variant
        clipnorm (float, optional): Gradient clipping by norm
        clipvalue (float, optional): Gradient clipping by value
        global_clipnorm (float, optional): Global gradient clipping by norm
        use_ema (bool): Whether to use Exponential Moving Average
        ema_momentum (float): EMA momentum
        ema_overwrite_frequency (int, optional): EMA overwrite frequency
        name (str): Name of the optimizer
    """
    def __init__(self, learning_rate=0.001, weight_decay=0.004, beta_1=0.9, beta_2=0.999, 
                 epsilon=1e-7, amsgrad=False, **kwargs): ...

class Adafactor:
    """
    Adafactor optimizer with factorized second moments.
    
    Args:
        learning_rate (float): Initial learning rate
        epsilon2 (float): Second epsilon value
        clip_threshold (float): Gradient clipping threshold
        decay_rate (float): Decay rate for moving averages
        beta1 (float, optional): Beta1 parameter
        weight_decay_rate (float): Weight decay rate
        eps_scale (float): Epsilon scaling factor
        relative_step (bool): Whether to use relative step size
        warmup_init (bool): Whether to use warmup initialization
    """
    def __init__(self, learning_rate=0.001, epsilon2=1e-30, clip_threshold=1.0, 
                 decay_rate=0.8, beta1=None, **kwargs): ...

class Lamb:
    """
    Layer-wise Adaptive Moments optimizer.
    
    Args:
        learning_rate (float): Initial learning rate
        beta_1 (float): Exponential decay rate for first moment estimates
        beta_2 (float): Exponential decay rate for second moment estimates
        epsilon (float): Small constant for numerical stability
        weight_decay_rate (float): Weight decay rate
        always_adapt (bool): Whether to always adapt learning rate
    """
    def __init__(self, learning_rate=0.001, beta_1=0.9, beta_2=0.999, epsilon=1e-6,
                 weight_decay_rate=0.01, always_adapt=False, **kwargs): ...

class Lion:
    """
    Lion optimizer (EvoLved Sign Momentum).
    
    Args:
        learning_rate (float): Initial learning rate
        beta_1 (float): Rate for blending the current gradient into the update direction
        beta_2 (float): Exponential decay rate for the momentum estimate
        weight_decay (float): Weight decay coefficient
    """
    def __init__(self, learning_rate=0.001, beta_1=0.9, beta_2=0.99, weight_decay=0.0, **kwargs): ...

class Muon:
    """
MomentUm Orthogonalized by Newton-Schulz (Muon) optimizer.
    
    Args:
        learning_rate (float): Initial learning rate
        momentum (float): Momentum coefficient
        nesterov (bool): Whether to use Nesterov momentum
        backend_update_momentum (float): Backend update momentum
        k (int): K parameter for Muon
        norm_axes (tuple): Axes for normalization
    """
    def __init__(self, learning_rate=0.02, momentum=0.95, nesterov=True, **kwargs): ...
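
A brief sketch of the decoupled-weight-decay optimizers above (hyperparameters are illustrative):

from keras import optimizers

# AdamW applies weight decay directly to the weights rather than through the gradient
adamw = optimizers.AdamW(learning_rate=0.001, weight_decay=0.01)

# Lion typically works best with a smaller learning rate than Adam-style optimizers
lion = optimizers.Lion(learning_rate=1e-4)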

Mixed Precision Training

Optimizer wrapper for mixed precision training with automatic loss scaling.

class LossScaleOptimizer:
    """
    Optimizer wrapper for mixed precision training with loss scaling.
    
    Args:
        inner_optimizer: Base optimizer to wrap
        dynamic (bool): Whether to use dynamic loss scaling
        initial_scale (float): Initial loss scale value
        dynamic_growth_steps (int): Steps between scale increases
        name (str): Name of the optimizer
    """
    def __init__(self, inner_optimizer, dynamic=True, initial_scale=32768.0, 
                 dynamic_growth_steps=2000, **kwargs): ...
    
    def get_scaled_loss(self, loss):
        """Scale loss for mixed precision training."""
    
    def get_unscaled_gradients(self, scaled_gradients):
        """Unscale gradients after backpropagation."""

Base Optimizer Class

Base class for all optimizers providing common functionality.

class Optimizer:
    """
    Base class for all optimizers.
    
    Args:
        learning_rate (float or schedule): Learning rate
        weight_decay (float, optional): Weight decay rate
        clipnorm (float, optional): Gradient clipping by norm
        clipvalue (float, optional): Gradient clipping by value
        global_clipnorm (float, optional): Global gradient clipping by norm
        use_ema (bool): Whether to use Exponential Moving Average
        ema_momentum (float): EMA momentum
        ema_overwrite_frequency (int, optional): EMA overwrite frequency
        loss_scale_factor (float, optional): Loss scale factor for mixed precision
        gradient_accumulation_steps (int, optional): Gradient accumulation steps
        name (str, optional): Name of the optimizer
    """
    def __init__(self, learning_rate, **kwargs): ...
    
    def apply_gradients(self, grads_and_vars):
        """Apply gradients to variables."""
    
    def minimize(self, loss, var_list=None, tape=None):
        """Minimize loss by updating variables."""
    
    def get_config(self):
        """Get optimizer configuration."""
    
    @classmethod
    def from_config(cls, config):
        """Create optimizer from configuration."""
    
    def get_weights(self):
        """Get optimizer state as list of arrays."""
    
    def set_weights(self, weights):
        """Set optimizer state from list of arrays."""

Learning Rate Schedules

Learning rate scheduling utilities for adaptive training.

# Available in keras.optimizers.schedules
class ExponentialDecay:
    """
    Exponential decay schedule.
    
    Args:
        initial_learning_rate (float): Initial learning rate
        decay_steps (int): Steps for decay
        decay_rate (float): Decay rate
        staircase (bool): Whether to apply decay in discrete intervals
        name (str): Name of the schedule
    """
    def __init__(self, initial_learning_rate, decay_steps, decay_rate, 
                 staircase=False, name=None): ...

class InverseTimeDecay:
    """
    Inverse time decay schedule.
    
    Args:
        initial_learning_rate (float): Initial learning rate
        decay_steps (int): Steps for decay
        decay_rate (float): Decay rate
        staircase (bool): Whether to apply decay in discrete intervals
        name (str): Name of the schedule
    """
    def __init__(self, initial_learning_rate, decay_steps, decay_rate, 
                 staircase=False, name=None): ...

class CosineDecay:
    """
    Cosine decay schedule.
    
    Args:
        initial_learning_rate (float): Initial learning rate
        decay_steps (int): Steps for decay
        alpha (float): Minimum learning rate as fraction of initial
        name (str): Name of the schedule
    """
    def __init__(self, initial_learning_rate, decay_steps, alpha=0.0, name=None): ...

class PiecewiseConstantDecay:
    """
    Piecewise constant decay schedule.
    
    Args:
        boundaries (list): Step boundaries for rate changes
        values (list): Learning rate values for each interval
        name (str): Name of the schedule
    """
    def __init__(self, boundaries, values, name=None): ...
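
A short example of the piecewise schedule above; each boundary marks the step at which the next value takes effect:

from keras import optimizers

# 0.1 for steps 0-999, 0.01 for steps 1000-1999, 0.001 afterwards
lr_schedule = optimizers.schedules.PiecewiseConstantDecay(
    boundaries=[1000, 2000],
    values=[0.1, 0.01, 0.001],
)
optimizer = optimizers.SGD(learning_rate=lr_schedule, momentum=0.9)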

Utility Functions

Functions for optimizer management and configuration.

def get(identifier):
    """
    Retrieve an optimizer instance from an identifier.
    
    Args:
        identifier (str, dict, or Optimizer): Optimizer name, config dict, or optimizer instance
        
    Returns:
        Optimizer: Optimizer instance
    """

def serialize(optimizer):
    """
    Serialize optimizer to JSON-serializable dict.
    
    Args:
        optimizer: Optimizer to serialize
        
    Returns:
        dict: Serialized optimizer configuration
    """

def deserialize(config, custom_objects=None):
    """
    Deserialize optimizer from config dict.
    
    Args:
        config (dict): Optimizer configuration
        custom_objects (dict, optional): Custom objects for deserialization
        
    Returns:
        Optimizer: Deserialized optimizer
    """

Usage Examples

Basic Optimizer Usage

import keras
from keras import optimizers

# Using string identifier (default parameters)
model.compile(optimizer='adam', loss='mse', metrics=['mae'])

# Using optimizer class with custom parameters
optimizer = optimizers.Adam(learning_rate=0.001, beta_1=0.9, beta_2=0.999)
model.compile(optimizer=optimizer, loss='mse', metrics=['mae'])

# Advanced configuration with weight decay and clipping
optimizer = optimizers.AdamW(
    learning_rate=0.001,
    weight_decay=0.01,
    clipnorm=1.0,
    use_ema=True,
    ema_momentum=0.99
)
model.compile(optimizer=optimizer, loss='mse', metrics=['mae'])

Learning Rate Scheduling

import keras
from keras import optimizers

# Exponential decay schedule
lr_schedule = optimizers.schedules.ExponentialDecay(
    initial_learning_rate=0.1,
    decay_steps=10000,
    decay_rate=0.96,
    staircase=True
)

optimizer = optimizers.SGD(learning_rate=lr_schedule, momentum=0.9)
model.compile(optimizer=optimizer, loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Cosine decay schedule
lr_schedule = optimizers.schedules.CosineDecay(
    initial_learning_rate=0.001,
    decay_steps=1000,
    alpha=0.01
)

optimizer = optimizers.Adam(learning_rate=lr_schedule)
model.compile(optimizer=optimizer, loss='mse')

Mixed Precision Training

import keras
from keras import mixed_precision, optimizers

# Enable mixed precision
mixed_precision.set_global_policy('mixed_float16')

# Wrap optimizer for mixed precision
base_optimizer = optimizers.Adam(learning_rate=0.001)
optimizer = optimizers.LossScaleOptimizer(base_optimizer)

model.compile(optimizer=optimizer, loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# The model will automatically handle loss scaling
history = model.fit(x_train, y_train, epochs=10, validation_data=(x_val, y_val))

Custom Training Loop with Optimizer

import keras
import tensorflow as tf  # the gradient tape below assumes the TensorFlow backend
from keras import optimizers, ops

# Create model and optimizer
model = keras.Sequential([
    keras.layers.Dense(64, activation='relu'),
    keras.layers.Dense(10, activation='softmax')
])

optimizer = optimizers.Adam(learning_rate=0.001)

# Custom training step (written for the TensorFlow backend, where
# gradients are computed with tf.GradientTape)
@tf.function
def train_step(x, y):
    with tf.GradientTape() as tape:
        predictions = model(x, training=True)
        loss = keras.losses.sparse_categorical_crossentropy(y, predictions)
        loss = ops.mean(loss)

    gradients = tape.gradient(loss, model.trainable_variables)
    optimizer.apply_gradients(zip(gradients, model.trainable_variables))
    return loss

# Training loop
for epoch in range(10):
    for step, (x_batch, y_batch) in enumerate(train_dataset):
        loss = train_step(x_batch, y_batch)
        
        if step % 100 == 0:
            print(f"Epoch {epoch}, Step {step}, Loss: {loss:.4f}")

Optimizer State Management

import keras
from keras import optimizers

# Create and configure optimizer
optimizer = optimizers.Adam(learning_rate=0.001)
model.compile(optimizer=optimizer, loss='mse')

# Train model
model.fit(x_train, y_train, epochs=5)

# Save optimizer state
optimizer_weights = optimizer.get_weights()

# Create new optimizer and restore state
new_optimizer = optimizers.Adam(learning_rate=0.001)
model.compile(optimizer=new_optimizer, loss='mse')

# Build optimizer by running one step
model.fit(x_train[:1], y_train[:1], epochs=1)

# Restore saved state
new_optimizer.set_weights(optimizer_weights)

# Continue training from saved state
model.fit(x_train, y_train, epochs=5)
