LightGBM is a gradient boosting framework that uses tree-based learning algorithms, designed to be distributed and efficient with faster training speed, higher efficiency, lower memory usage, better accuracy, and support for parallel, distributed, and GPU learning.
—
Flexible training control through callback functions enabling early stopping, evaluation logging, parameter adjustment, and custom training behaviors. LightGBM's callback system supports both built-in callbacks for common use cases and custom callback implementations for specialized training requirements.
Automatically stop training when validation metric stops improving to prevent overfitting and save computation time.
def early_stopping(stopping_rounds, first_metric_only=False, verbose=True, min_delta=0.0):
    """
    Create early stopping callback for training.

    Parameters:
    - stopping_rounds: int - Number of rounds without improvement to trigger stopping
    - first_metric_only: bool - Whether to use only the first metric for early stopping
    - verbose: bool - Whether to print early stopping information
    - min_delta: float - Minimum change in monitored quantity to qualify as improvement

    Returns:
    - callable: Early stopping callback function for use in train() or cv()
    """


class EarlyStopException(Exception):
    """
    Exception raised for early stopping in training.

    This exception can be raised from custom callbacks to trigger early stopping
    with specific iteration and score information.
    """

    def __init__(self, best_iteration, best_score):
        """
        Create early stopping exception.

        Parameters:
        - best_iteration: int - Best iteration when early stopping occurred
        - best_score: list - Best evaluation scores when stopping
        """
        super().__init__()
        self.best_iteration = best_iteration
        self.best_score = best_score


# Control the frequency and format of evaluation metric logging during training.
def log_evaluation(period=1, show_stdv=True):
    """
    Create evaluation logging callback for training progress monitoring.

    Parameters:
    - period: int - Evaluation logging frequency (log every N iterations)
    - show_stdv: bool - Whether to show standard deviation in cross-validation

    Returns:
    - callable: Logging callback function for use in train() or cv()
    """


# Record evaluation results in a dictionary for later analysis and visualization.
def record_evaluation(eval_result):
    """
    Create evaluation recording callback to store training history.

    Parameters:
    - eval_result: dict - Dictionary to store evaluation results
      Will be populated with structure:
      {
          'dataset_name': {
              'metric_name': [score1, score2, ...]
          }
      }

    Returns:
    - callable: Recording callback function for use in train() or cv()
    """


# Dynamically adjust training parameters during the training process.
def reset_parameter(**kwargs):
    """
    Create parameter reset callback for dynamic parameter adjustment.

    Parameters:
    - **kwargs: Parameter names and values to reset during training
      Can include any LightGBM parameter (learning_rate, num_leaves, etc.)
      Each value may be a list of per-iteration values or a callable that
      takes the current iteration number and returns the new value.

    Returns:
    - callable: Parameter reset callback function for use in train() or cv()
    """


import lightgbm as lgb
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split

# Load data
X, y = load_breast_cancer(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Prepare datasets
train_data = lgb.Dataset(X_train, label=y_train)
test_data = lgb.Dataset(X_test, label=y_test, reference=train_data)

# Train with early stopping
model = lgb.train(
    {
        'objective': 'binary',
        'metric': 'binary_logloss',
        'num_leaves': 31,
        'learning_rate': 0.05,
        'verbose': -1
    },
    train_data,
    num_boost_round=200,
    valid_sets=[test_data],
    valid_names=['test'],
    callbacks=[
        lgb.early_stopping(stopping_rounds=20, verbose=True),
        lgb.log_evaluation(period=20)
    ]
)

print(f"Training stopped at iteration: {model.best_iteration}")
print(f"Best validation score: {model.best_score['test']['binary_logloss']:.4f}")

import lightgbm as lgb
import matplotlib.pyplot as plt
from sklearn.datasets import load_diabetes
from sklearn.model_selection import train_test_split

# Load data
X, y = load_diabetes(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Prepare datasets
train_data = lgb.Dataset(X_train, label=y_train)
test_data = lgb.Dataset(X_test, label=y_test, reference=train_data)

# Set up evaluation result recording
eval_result = {}

# Train with comprehensive logging
model = lgb.train(
    {
        'objective': 'regression',
        'metric': ['rmse', 'mae', 'mape'],
        'num_leaves': 31,
        'learning_rate': 0.05,
        'verbose': -1
    },
    train_data,
    num_boost_round=150,
    valid_sets=[train_data, test_data],
    valid_names=['train', 'test'],
    callbacks=[
        lgb.record_evaluation(eval_result),              # Record all metrics
        lgb.log_evaluation(period=25, show_stdv=False),  # Log every 25 iterations
        lgb.early_stopping(stopping_rounds=15, first_metric_only=True)
    ]
)

# Analyze recorded results
print("Recorded metrics:")
for dataset in eval_result:
    print(f" {dataset}:")
    for metric in eval_result[dataset]:
        final_score = eval_result[dataset][metric][-1]
        print(f" {metric}: {final_score:.4f}")

# Plot training curves
fig, axes = plt.subplots(1, 3, figsize=(15, 4))
for i, metric in enumerate(['rmse', 'mae', 'mape']):
    ax = axes[i]

    # Plot train and test curves
    train_scores = eval_result['train'][metric]
    test_scores = eval_result['test'][metric]
    ax.plot(range(len(train_scores)), train_scores, label='Train', color='blue')
    ax.plot(range(len(test_scores)), test_scores, label='Test', color='red')

    # Mark best iteration (0-based x axis, hence the -1)
    ax.axvline(x=model.best_iteration-1, color='green', linestyle='--',
               label=f'Best ({model.best_iteration})')

    ax.set_title(f'{metric.upper()} During Training')
    ax.set_xlabel('Iteration')
    ax.set_ylabel(metric.upper())
    ax.legend()
    ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

import lightgbm as lgb
from sklearn.datasets import make_regression

# Generate data
X, y = make_regression(n_samples=10000, n_features=20, noise=0.1, random_state=42)
train_data = lgb.Dataset(X, label=y)


# Create learning rate scheduler
def learning_rate_scheduler(current_round, learning_rate_start=0.1, decay_rate=0.95, decay_step=20):
    """Custom learning rate scheduler."""
    if current_round % decay_step == 0 and current_round > 0:
        new_lr = learning_rate_start * (decay_rate ** (current_round // decay_step))
        return {'learning_rate': new_lr}
    return {}


# Train with dynamic parameter adjustment
eval_result = {}
model = lgb.train(
    {
        'objective': 'regression',
        'metric': 'rmse',
        'num_leaves': 31,
        'learning_rate': 0.1,  # Starting learning rate
        'verbose': -1
    },
    train_data,
    num_boost_round=100,
    # record_evaluation/log_evaluation only see metrics for datasets passed in
    # valid_sets; without this, eval_result['training'] below would KeyError.
    valid_sets=[train_data],
    valid_names=['training'],
    callbacks=[
        lgb.record_evaluation(eval_result),
        lgb.log_evaluation(period=20),
        # Reset learning rate every 20 iterations. reset_parameter expects a
        # value list or a callable taking the current iteration number; the
        # previous zero-argument lambda (which also referenced `model` before
        # it was assigned) would fail at runtime.
        lgb.reset_parameter(learning_rate=lambda current_round: 0.1 * (0.95 ** (current_round // 20)))
    ]
)

print(f"Final RMSE: {eval_result['training']['rmse'][-1]:.4f}")

import lightgbm as lgb
import numpy as np
from sklearn.datasets import load_wine

# Load data
X, y = load_wine(return_X_y=True)
train_data = lgb.Dataset(X, label=y)

# Perform cross-validation with callbacks
cv_results = lgb.cv(
    {
        'objective': 'multiclass',
        'num_class': 3,
        'metric': 'multi_logloss',
        'num_leaves': 31,
        'learning_rate': 0.05,
        'verbose': -1
    },
    train_data,
    num_boost_round=100,
    nfold=5,
    stratified=True,
    shuffle=True,
    seed=42,
    callbacks=[
        lgb.log_evaluation(period=20, show_stdv=True),  # Show std dev in CV
        lgb.early_stopping(stopping_rounds=10)
    ]
)

print(f"CV Results:")
print(f"Best iteration: {len(cv_results['valid multi_logloss-mean'])}")
print(f"Best CV score: {cv_results['valid multi_logloss-mean'][-1]:.4f} ± {cv_results['valid multi_logloss-stdv'][-1]:.4f}")

# Plot CV results with error bars
import matplotlib.pyplot as plt

iterations = range(len(cv_results['valid multi_logloss-mean']))
means = cv_results['valid multi_logloss-mean']
stds = cv_results['valid multi_logloss-stdv']

plt.figure(figsize=(10, 6))
plt.plot(iterations, means, color='blue', label='CV Mean')
plt.fill_between(iterations,
                 np.array(means) - np.array(stds),
                 np.array(means) + np.array(stds),
                 alpha=0.3, color='blue', label='CV Std Dev')
plt.xlabel('Iteration')
plt.ylabel('Multi Log Loss')
plt.title('Cross-Validation Results with Standard Deviation')
plt.legend()
plt.grid(True, alpha=0.3)
plt.show()

import lightgbm as lgb
from sklearn.datasets import load_diabetes  # load_boston was removed in scikit-learn 1.2


def custom_metric_tracker(metric_threshold=0.1):
    """
    Custom callback to track when metrics cross a threshold.
    """
    def callback(env):
        # env contains information about current training state
        # env.model: current model
        # env.params: training parameters
        # env.iteration: current iteration
        # env.begin_iteration: beginning iteration
        # env.end_iteration: ending iteration
        # env.evaluation_result_list: current evaluation results
        if env.evaluation_result_list:
            for eval_result in env.evaluation_result_list:
                dataset_name, metric_name, metric_value, is_higher_better = eval_result
                if metric_name == 'rmse' and metric_value < metric_threshold:
                    print(f"🎯 Metric threshold reached! RMSE: {metric_value:.4f} at iteration {env.iteration}")
        # Continue training
        return False
    return callback


def custom_progress_bar(total_rounds, bar_length=50):
    """
    Custom progress bar callback.
    """
    def callback(env):
        current = env.iteration - env.begin_iteration + 1
        progress = current / total_rounds
        filled_length = int(bar_length * progress)
        bar = '█' * filled_length + '-' * (bar_length - filled_length)
        percent = progress * 100
        print(f'\rProgress: |{bar}| {percent:.1f}% ({current}/{total_rounds})', end='')
        if current == total_rounds:
            print()  # New line when complete
        return False
    return callback


# Load data
X, y = load_diabetes(return_X_y=True)
train_data = lgb.Dataset(X, label=y)

# Train with custom callbacks
model = lgb.train(
    {
        'objective': 'regression',
        'metric': 'rmse',
        'num_leaves': 31,
        'learning_rate': 0.05,
        'verbose': -1
    },
    train_data,
    num_boost_round=100,
    # Callbacks only receive evaluation results for datasets in valid_sets;
    # without this, env.evaluation_result_list would always be empty.
    valid_sets=[train_data],
    valid_names=['train'],
    callbacks=[
        custom_progress_bar(100),       # Custom progress tracking
        custom_metric_tracker(60.0),    # Alert when RMSE < 60.0 (diabetes targets span ~25-346)
        lgb.log_evaluation(period=25)   # Standard logging
    ]
)

print(f"\nTraining completed!")
print(f"Final RMSE: {model.eval_train()[0][2]:.4f}")

import lightgbm as lgb
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split

# Load data
X, y = load_iris(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Set up evaluation tracking
eval_result = {}

# Use callbacks with sklearn interface
model = lgb.LGBMClassifier(
    objective='multiclass',
    n_estimators=100,
    learning_rate=0.05,
    num_leaves=31,
    random_state=42
)

# Fit with callbacks. LightGBM 4.0 removed the early_stopping_rounds and
# verbose keyword arguments from fit(); pass the equivalent callbacks instead.
model.fit(
    X_train, y_train,
    eval_set=[(X_train, y_train), (X_test, y_test)],
    eval_names=['train', 'test'],
    eval_metric='multi_logloss',
    callbacks=[
        lgb.early_stopping(stopping_rounds=15),  # replaces early_stopping_rounds=15
        lgb.log_evaluation(period=1),            # replaces verbose=True
        lgb.record_evaluation(eval_result)
    ]
)

# Access recorded results
print(f"Training completed at iteration: {model.best_iteration_}")
print(f"Best test score: {eval_result['test']['multi_logloss'][model.best_iteration_-1]:.4f}")

# Make predictions
predictions = model.predict(X_test)
probabilities = model.predict_proba(X_test)
print(f"Test accuracy: {(predictions == y_test).mean():.4f}")def conditional_early_stopping(stopping_rounds, condition_func):
"""
Early stopping that only triggers when a condition is met.
"""
best_score = float('inf')
best_iteration = 0
current_rounds = 0
def callback(env):
nonlocal best_score, best_iteration, current_rounds
if env.evaluation_result_list:
current_score = env.evaluation_result_list[0][2] # First metric value
if current_score < best_score:
best_score = current_score
best_iteration = env.iteration
current_rounds = 0
else:
current_rounds += 1
# Only stop if condition is met AND stopping rounds exceeded
if condition_func(env) and current_rounds >= stopping_rounds:
print(f"Conditional early stopping at iteration {env.iteration}")
raise lgb.EarlyStopException(best_iteration, env.evaluation_result_list)
return False
return callback
# Example usage
def stop_condition(env):
"""Stop only if we've trained for at least 50 iterations."""
return env.iteration >= 50
# Use conditional early stopping
model = lgb.train(
params,
train_data,
num_boost_round=200,
valid_sets=[test_data],
callbacks=[
conditional_early_stopping(10, stop_condition),
lgb.log_evaluation(20)
]
)def multi_metric_monitor(metrics_config):
"""
Monitor multiple metrics with different thresholds and behaviors.
Args:
metrics_config: dict like {
'rmse': {'threshold': 5.0, 'action': 'alert'},
'mae': {'threshold': 3.0, 'action': 'stop'}
}
"""
def callback(env):
if env.evaluation_result_list:
for eval_result in env.evaluation_result_list:
dataset_name, metric_name, metric_value, is_higher_better = eval_result
if metric_name in metrics_config:
config = metrics_config[metric_name]
threshold = config['threshold']
action = config['action']
# Check threshold (assuming lower is better for this example)
if metric_value < threshold:
if action == 'alert':
print(f"🔔 {metric_name} threshold reached: {metric_value:.4f}")
elif action == 'stop':
print(f"🛑 Stopping due to {metric_name}: {metric_value:.4f}")
raise lgb.EarlyStopException(env.iteration, env.evaluation_result_list)
return False
return callback
# Example usage
metrics_config = {
'rmse': {'threshold': 4.0, 'action': 'alert'},
'mae': {'threshold': 3.0, 'action': 'stop'}
}
model = lgb.train(
{
'objective': 'regression',
'metric': ['rmse', 'mae'],
'verbose': -1
},
train_data,
num_boost_round=200,
valid_sets=[test_data],
callbacks=[
multi_metric_monitor(metrics_config),
lgb.log_evaluation(25)
]
)Install with Tessl CLI
npx tessl i tessl/pypi-lightgbm