LightGBM is a gradient boosting framework that uses tree-based learning algorithms, designed to be distributed and efficient with faster training speed, higher efficiency, lower memory usage, better accuracy, and support for parallel, distributed, and GPU learning.
—
Flexible training control through callback functions enabling early stopping, evaluation logging, parameter adjustment, and custom training behaviors. LightGBM's callback system supports both built-in callbacks for common use cases and custom callback implementations for specialized training requirements.
Automatically stop training when validation metric stops improving to prevent overfitting and save computation time.
def early_stopping(stopping_rounds, first_metric_only=False, verbose=True, min_delta=0.0):
    """
    Create early stopping callback for training.

    Parameters:
    - stopping_rounds: int - Number of rounds without improvement to trigger stopping
    - first_metric_only: bool - Whether to use only the first metric for early stopping
    - verbose: bool - Whether to print early stopping information
    - min_delta: float - Minimum change in monitored quantity to qualify as improvement

    Returns:
    - callable: Early stopping callback function for use in train() or cv()
    """


class EarlyStopException(Exception):
    """
    Exception raised for early stopping in training.

    This exception can be raised from custom callbacks to trigger early stopping
    with specific iteration and score information.
    """

    def __init__(self, best_iteration, best_score):
        """
        Create early stopping exception.

        Parameters:
        - best_iteration: int - Best iteration when early stopping occurred
        - best_score: list - Best evaluation scores when stopping
        """
        super().__init__()
        self.best_iteration = best_iteration
        self.best_score = best_score


# Control the frequency and format of evaluation metric logging during training.
def log_evaluation(period=1, show_stdv=True):
    """
    Create evaluation logging callback for training progress monitoring.

    Parameters:
    - period: int - Evaluation logging frequency (log every N iterations)
    - show_stdv: bool - Whether to show standard deviation in cross-validation

    Returns:
    - callable: Logging callback function for use in train() or cv()
    """


# Record evaluation results in a dictionary for later analysis and visualization.
def record_evaluation(eval_result):
    """
    Create evaluation recording callback to store training history.

    Parameters:
    - eval_result: dict - Dictionary to store evaluation results
      Will be populated with structure:
      {
          'dataset_name': {
              'metric_name': [score1, score2, ...]
          }
      }

    Returns:
    - callable: Recording callback function for use in train() or cv()
    """


# Dynamically adjust training parameters during the training process.
def reset_parameter(**kwargs):
    """
    Create parameter reset callback for dynamic parameter adjustment.

    Parameters:
    - **kwargs: Parameter names and values to reset during training
      Can include any LightGBM parameter (learning_rate, num_leaves, etc.)
      Each value may be a list of per-iteration values or a callable that
      takes the current iteration number and returns the new value.

    Returns:
    - callable: Parameter reset callback function for use in train() or cv()
    """


import lightgbm as lgb
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split

# Load data
X, y = load_breast_cancer(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Prepare datasets
train_data = lgb.Dataset(X_train, label=y_train)
test_data = lgb.Dataset(X_test, label=y_test, reference=train_data)

# Train with early stopping
model = lgb.train(
    {
        'objective': 'binary',
        'metric': 'binary_logloss',
        'num_leaves': 31,
        'learning_rate': 0.05,
        'verbose': -1
    },
    train_data,
    num_boost_round=200,
    valid_sets=[test_data],
    valid_names=['test'],
    callbacks=[
        lgb.early_stopping(stopping_rounds=20, verbose=True),
        lgb.log_evaluation(period=20)
    ]
)

print(f"Training stopped at iteration: {model.best_iteration}")
print(f"Best validation score: {model.best_score['test']['binary_logloss']:.4f}")

import lightgbm as lgb
import matplotlib.pyplot as plt
from sklearn.datasets import load_diabetes
from sklearn.model_selection import train_test_split

# Load data
X, y = load_diabetes(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Prepare datasets
train_data = lgb.Dataset(X_train, label=y_train)
test_data = lgb.Dataset(X_test, label=y_test, reference=train_data)

# Set up evaluation result recording
eval_result = {}

# Train with comprehensive logging
model = lgb.train(
    {
        'objective': 'regression',
        'metric': ['rmse', 'mae', 'mape'],
        'num_leaves': 31,
        'learning_rate': 0.05,
        'verbose': -1
    },
    train_data,
    num_boost_round=150,
    valid_sets=[train_data, test_data],
    valid_names=['train', 'test'],
    callbacks=[
        lgb.record_evaluation(eval_result),              # Record all metrics
        lgb.log_evaluation(period=25, show_stdv=False),  # Log every 25 iterations
        lgb.early_stopping(stopping_rounds=15, first_metric_only=True)
    ]
)

# Analyze recorded results
print("Recorded metrics:")
for dataset in eval_result:
    print(f" {dataset}:")
    for metric in eval_result[dataset]:
        final_score = eval_result[dataset][metric][-1]
        print(f" {metric}: {final_score:.4f}")

# Plot training curves
fig, axes = plt.subplots(1, 3, figsize=(15, 4))
for i, metric in enumerate(['rmse', 'mae', 'mape']):
    ax = axes[i]

    # Plot train and test curves
    train_scores = eval_result['train'][metric]
    test_scores = eval_result['test'][metric]
    ax.plot(range(len(train_scores)), train_scores, label='Train', color='blue')
    ax.plot(range(len(test_scores)), test_scores, label='Test', color='red')

    # Mark best iteration (0-based x axis, hence the -1)
    ax.axvline(x=model.best_iteration-1, color='green', linestyle='--',
               label=f'Best ({model.best_iteration})')

    ax.set_title(f'{metric.upper()} During Training')
    ax.set_xlabel('Iteration')
    ax.set_ylabel(metric.upper())
    ax.legend()
    ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

import lightgbm as lgb
from sklearn.datasets import make_regression

# Generate data
X, y = make_regression(n_samples=10000, n_features=20, noise=0.1, random_state=42)
train_data = lgb.Dataset(X, label=y)


# Create learning rate scheduler
def learning_rate_scheduler(current_round, learning_rate_start=0.1, decay_rate=0.95, decay_step=20):
    """Custom learning rate scheduler."""
    if current_round % decay_step == 0 and current_round > 0:
        new_lr = learning_rate_start * (decay_rate ** (current_round // decay_step))
        return {'learning_rate': new_lr}
    return {}


# Train with dynamic parameter adjustment
eval_result = {}
model = lgb.train(
    {
        'objective': 'regression',
        'metric': 'rmse',
        'num_leaves': 31,
        'learning_rate': 0.1,  # Starting learning rate
        'verbose': -1
    },
    train_data,
    num_boost_round=100,
    # record_evaluation/log_evaluation only see metrics for datasets passed in
    # valid_sets; without this, eval_result['training'] below would KeyError.
    valid_sets=[train_data],
    valid_names=['training'],
    callbacks=[
        lgb.record_evaluation(eval_result),
        lgb.log_evaluation(period=20),
        # Reset learning rate every 20 iterations. reset_parameter expects a
        # value list or a callable taking the current iteration number; the
        # previous zero-argument lambda (which also referenced `model` before
        # it was assigned) would fail at runtime.
        lgb.reset_parameter(learning_rate=lambda current_round: 0.1 * (0.95 ** (current_round // 20)))
    ]
)

print(f"Final RMSE: {eval_result['training']['rmse'][-1]:.4f}")

import lightgbm as lgb
import numpy as np
from sklearn.datasets import load_wine

# Load data
X, y = load_wine(return_X_y=True)
train_data = lgb.Dataset(X, label=y)

# Perform cross-validation with callbacks
cv_results = lgb.cv(
    {
        'objective': 'multiclass',
        'num_class': 3,
        'metric': 'multi_logloss',
        'num_leaves': 31,
        'learning_rate': 0.05,
        'verbose': -1
    },
    train_data,
    num_boost_round=100,
    nfold=5,
    stratified=True,
    shuffle=True,
    seed=42,
    callbacks=[
        lgb.log_evaluation(period=20, show_stdv=True),  # Show std dev in CV
        lgb.early_stopping(stopping_rounds=10)
    ]
)

print(f"CV Results:")
print(f"Best iteration: {len(cv_results['valid multi_logloss-mean'])}")
print(f"Best CV score: {cv_results['valid multi_logloss-mean'][-1]:.4f} ± {cv_results['valid multi_logloss-stdv'][-1]:.4f}")

# Plot CV results with error bars
import matplotlib.pyplot as plt

iterations = range(len(cv_results['valid multi_logloss-mean']))
means = cv_results['valid multi_logloss-mean']
stds = cv_results['valid multi_logloss-stdv']

plt.figure(figsize=(10, 6))
plt.plot(iterations, means, color='blue', label='CV Mean')
plt.fill_between(iterations,
                 np.array(means) - np.array(stds),
                 np.array(means) + np.array(stds),
                 alpha=0.3, color='blue', label='CV Std Dev')
plt.xlabel('Iteration')
plt.ylabel('Multi Log Loss')
plt.title('Cross-Validation Results with Standard Deviation')
plt.legend()
plt.grid(True, alpha=0.3)
plt.show()

import lightgbm as lgb
from sklearn.datasets import load_diabetes  # load_boston was removed in scikit-learn 1.2


def custom_metric_tracker(metric_threshold=0.1):
    """
    Custom callback to track when metrics cross a threshold.
    """
    def callback(env):
        # env contains information about current training state
        # env.model: current model
        # env.params: training parameters
        # env.iteration: current iteration
        # env.begin_iteration: beginning iteration
        # env.end_iteration: ending iteration
        # env.evaluation_result_list: current evaluation results
        if env.evaluation_result_list:
            for eval_result in env.evaluation_result_list:
                dataset_name, metric_name, metric_value, is_higher_better = eval_result
                if metric_name == 'rmse' and metric_value < metric_threshold:
                    print(f"🎯 Metric threshold reached! RMSE: {metric_value:.4f} at iteration {env.iteration}")
        # Continue training
        return False
    return callback


def custom_progress_bar(total_rounds, bar_length=50):
    """
    Custom progress bar callback.
    """
    def callback(env):
        current = env.iteration - env.begin_iteration + 1
        progress = current / total_rounds
        filled_length = int(bar_length * progress)
        bar = '█' * filled_length + '-' * (bar_length - filled_length)
        percent = progress * 100
        print(f'\rProgress: |{bar}| {percent:.1f}% ({current}/{total_rounds})', end='')
        if current == total_rounds:
            print()  # New line when complete
        return False
    return callback


# Load data
X, y = load_diabetes(return_X_y=True)
train_data = lgb.Dataset(X, label=y)

# Train with custom callbacks
model = lgb.train(
    {
        'objective': 'regression',
        'metric': 'rmse',
        'num_leaves': 31,
        'learning_rate': 0.05,
        'verbose': -1
    },
    train_data,
    num_boost_round=100,
    # Callbacks only receive evaluation results for datasets in valid_sets;
    # without this, env.evaluation_result_list would always be empty.
    valid_sets=[train_data],
    valid_names=['train'],
    callbacks=[
        custom_progress_bar(100),       # Custom progress tracking
        custom_metric_tracker(60.0),    # Alert when RMSE < 60.0 (diabetes targets span ~25-346)
        lgb.log_evaluation(period=25)   # Standard logging
    ]
)

print(f"\nTraining completed!")
print(f"Final RMSE: {model.eval_train()[0][2]:.4f}")

import lightgbm as lgb
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split

# Load data
X, y = load_iris(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Set up evaluation tracking
eval_result = {}

# Use callbacks with sklearn interface
model = lgb.LGBMClassifier(
    objective='multiclass',
    n_estimators=100,
    learning_rate=0.05,
    num_leaves=31,
    random_state=42
)

# Fit with callbacks. LightGBM 4.0 removed the early_stopping_rounds and
# verbose keyword arguments from fit(); pass the equivalent callbacks instead.
model.fit(
    X_train, y_train,
    eval_set=[(X_train, y_train), (X_test, y_test)],
    eval_names=['train', 'test'],
    eval_metric='multi_logloss',
    callbacks=[
        lgb.early_stopping(stopping_rounds=15),  # replaces early_stopping_rounds=15
        lgb.log_evaluation(period=1),            # replaces verbose=True
        lgb.record_evaluation(eval_result)
    ]
)

# Access recorded results
print(f"Training completed at iteration: {model.best_iteration_}")
print(f"Best test score: {eval_result['test']['multi_logloss'][model.best_iteration_-1]:.4f}")

# Make predictions
predictions = model.predict(X_test)
probabilities = model.predict_proba(X_test)
print(f"Test accuracy: {(predictions == y_test).mean():.4f}")def conditional_early_stopping(stopping_rounds, condition_func):
"""
Early stopping that only triggers when a condition is met.
"""
best_score = float('inf')
best_iteration = 0
current_rounds = 0
def callback(env):
nonlocal best_score, best_iteration, current_rounds
if env.evaluation_result_list:
current_score = env.evaluation_result_list[0][2] # First metric value
if current_score < best_score:
best_score = current_score
best_iteration = env.iteration
current_rounds = 0
else:
current_rounds += 1
# Only stop if condition is met AND stopping rounds exceeded
if condition_func(env) and current_rounds >= stopping_rounds:
print(f"Conditional early stopping at iteration {env.iteration}")
raise lgb.EarlyStopException(best_iteration, env.evaluation_result_list)
return False
return callback
# Example usage
def stop_condition(env):
"""Stop only if we've trained for at least 50 iterations."""
return env.iteration >= 50
# Use conditional early stopping
model = lgb.train(
params,
train_data,
num_boost_round=200,
valid_sets=[test_data],
callbacks=[
conditional_early_stopping(10, stop_condition),
lgb.log_evaluation(20)
]
)def multi_metric_monitor(metrics_config):
"""
Monitor multiple metrics with different thresholds and behaviors.
Args:
metrics_config: dict like {
'rmse': {'threshold': 5.0, 'action': 'alert'},
'mae': {'threshold': 3.0, 'action': 'stop'}
}
"""
def callback(env):
if env.evaluation_result_list:
for eval_result in env.evaluation_result_list:
dataset_name, metric_name, metric_value, is_higher_better = eval_result
if metric_name in metrics_config:
config = metrics_config[metric_name]
threshold = config['threshold']
action = config['action']
# Check threshold (assuming lower is better for this example)
if metric_value < threshold:
if action == 'alert':
print(f"🔔 {metric_name} threshold reached: {metric_value:.4f}")
elif action == 'stop':
print(f"🛑 Stopping due to {metric_name}: {metric_value:.4f}")
raise lgb.EarlyStopException(env.iteration, env.evaluation_result_list)
return False
return callback
# Example usage
metrics_config = {
'rmse': {'threshold': 4.0, 'action': 'alert'},
'mae': {'threshold': 3.0, 'action': 'stop'}
}
model = lgb.train(
{
'objective': 'regression',
'metric': ['rmse', 'mae'],
'verbose': -1
},
train_data,
num_boost_round=200,
valid_sets=[test_data],
callbacks=[
multi_metric_monitor(metrics_config),
lgb.log_evaluation(25)
]
)Install with Tessl CLI
npx tessl i tessl/pypi-lightgbm