CatBoost is a fast, scalable, high performance gradient boosting on decision trees library used for ranking, classification, regression and other ML tasks.
—
Quality
Pending
Does it follow best practices?
Impact
Pending
No eval scenarios have been run
CatBoost provides interactive visualization components specifically designed for Jupyter notebooks, along with compatibility layers for XGBoost and LightGBM plotting workflows. These tools enable real-time monitoring of training progress and model analysis.
Visualization components that integrate seamlessly with Jupyter notebook environments.
class MetricVisualizer:
    """Interactive widget for visualizing training metrics in Jupyter notebooks.

    Provides real-time plots of training and validation metrics during model
    training, with interactive controls for zooming, filtering, and metric
    selection.
    """

    def __init__(self, train_dirs=None, subdirs=None):
        """Initialize the MetricVisualizer widget.

        Parameters:
            train_dirs: Training directories to monitor (list of strings).
            subdirs: Subdirectories within ``train_dirs`` to include
                (list of strings).
        """

    def start(self, train_dirs=None, subdirs=None):
        """Start the metric visualization widget.

        Parameters:
            train_dirs: Training directories to visualize (list of strings).
            subdirs: Subdirectories to include (list of strings).

        Returns:
            Interactive Jupyter widget displaying training metrics.
        """

    def stop(self):
        """Stop the metric visualization widget."""
class MetricsPlotter:
    """Utility class for plotting training metrics with matplotlib integration.

    Provides static and dynamic plotting capabilities for CatBoost training
    metrics, with customizable styling and export options.
    """

    def __init__(self, train_dir=None):
        """Initialize MetricsPlotter.

        Parameters:
            train_dir: Training directory containing metric logs (string).
        """

    def plot_metrics(self, metrics=None, train_dir=None, figsize=(12, 8),
                     title=None, save_path=None):
        """Plot training metrics from log files.

        Parameters:
            metrics: Metrics to plot (list of strings).
            train_dir: Directory containing training logs (string).
            figsize: Figure size for matplotlib (tuple).
            title: Plot title (string).
            save_path: Path to save the plot image (string).

        Returns:
            matplotlib.figure.Figure: Generated plot figure.
        """

    def plot_feature_importance(self, model, feature_names=None,
                                max_features=20, figsize=(10, 8),
                                title="Feature Importance", save_path=None):
        """Plot feature importance from a trained model.

        Parameters:
            model: Trained CatBoost model.
            feature_names: Feature names for labeling (list of strings).
            max_features: Maximum number of features to show (int).
            figsize: Figure size for matplotlib (tuple).
            title: Plot title (string).
            save_path: Path to save the plot (string).

        Returns:
            matplotlib.figure.Figure: Feature importance plot.
        """

    def plot_learning_curve(self, train_scores, val_scores=None,
                            metric_name="Loss", figsize=(10, 6),
                            title="Learning Curve", save_path=None):
        """Plot learning curves for training and validation.

        Parameters:
            train_scores: Training metric scores (array-like).
            val_scores: Validation metric scores (array-like, optional).
            metric_name: Name of the metric being plotted (string).
            figsize: Figure size for matplotlib (tuple).
            title: Plot title (string).
            save_path: Path to save the plot (string).

        Returns:
            matplotlib.figure.Figure: Learning curve plot.
        """

# Plotting callbacks compatible with XGBoost and LightGBM workflows for easy migration.
def XGBPlottingCallback(period=1, show_stdv=False, figsize=(10, 6)):
    """Create an XGBoost-style plotting callback for CatBoost training.

    Provides compatibility with XGBoost plotting workflows when migrating to
    CatBoost, maintaining a similar API and visualization style.

    Parameters:
        period: Plotting update period in iterations (int).
        show_stdv: Show standard deviation bands for CV (bool).
        figsize: Figure size for matplotlib plots (tuple).

    Returns:
        Callback function for use with CatBoost training.

    Usage:
        model.fit(X, y, callbacks=[XGBPlottingCallback(period=10)])
    """
def lgbm_plotting_callback(period=1, show_stdv=False, figsize=(10, 6)):
    """Create a LightGBM-style plotting callback for CatBoost training.

    Provides compatibility with LightGBM plotting workflows when migrating to
    CatBoost, maintaining a similar API and visualization patterns.

    Parameters:
        period: Plotting update period in iterations (int).
        show_stdv: Show standard deviation bands for CV (bool).
        figsize: Figure size for matplotlib plots (tuple).

    Returns:
        Callback function for use with CatBoost training.

    Usage:
        model.fit(X, y, callbacks=[lgbm_plotting_callback(period=5)])
    """

# Direct plotting methods available on trained CatBoost models.
# These methods are available on trained CatBoost model objects
def plot_tree(self, tree_idx=0, pool=None, figsize=(20, 15),
              save_path=None, title=None):
    """Visualize an individual decision tree from the ensemble.

    Parameters:
        tree_idx: Index of the tree to visualize (int).
        pool: Pool used for leaf value calculation (Pool, optional).
        figsize: Figure size for the visualization (tuple).
        save_path: Path to save the tree visualization (string).
        title: Plot title (string).

    Returns:
        Tree visualization plot.
    """
def plot_predictions(self, data, target=None, figsize=(10, 6),
                     title="Predictions vs Actual", save_path=None):
    """Plot model predictions against actual values.

    Parameters:
        data: Input data for predictions (Pool or array-like).
        target: True target values (array-like, optional).
        figsize: Figure size for matplotlib (tuple).
        title: Plot title (string).
        save_path: Path to save the plot (string).

    Returns:
        matplotlib.figure.Figure: Predictions scatter plot.
    """
from catboost import CatBoostClassifier
from catboost.widget import MetricVisualizer
import pandas as pd

# Prepare data
df = pd.read_csv('train.csv')
X = df.drop('target', axis=1)
y = df['target']

# Initialize visualizer (in Jupyter notebook)
visualizer = MetricVisualizer()

# Train model with visualization
model = CatBoostClassifier(
    iterations=500,
    learning_rate=0.1,
    depth=6,
    eval_metric='AUC',
    train_dir='./catboost_training',  # Required for visualization
    verbose=True
)

# Start visualization widget
visualizer.start(train_dirs=['./catboost_training'])

# Fit model (metrics will be visualized in real-time)
# NOTE(review): X_val / y_val are not defined in this snippet — assumed to be
# prepared earlier (e.g. via train_test_split); confirm.
model.fit(
    X, y,
    eval_set=[(X_val, y_val)],
    plot=True  # Enable built-in plotting
)

# Stop visualization when done
visualizer.stop()
from catboost.widget import MetricsPlotter
from catboost import CatBoostRegressor, cv
import matplotlib.pyplot as plt

# Initialize plotter
plotter = MetricsPlotter()

# Train model with comprehensive logging
# NOTE(review): X_train / y_train / X_val / y_val are assumed to be defined
# earlier in the session. Also, CatBoost's `eval_metric` normally takes a
# single metric (extra metrics go in `custom_metric`) — confirm this list form.
model = CatBoostRegressor(
    iterations=1000,
    learning_rate=0.05,
    depth=8,
    eval_metric=['RMSE', 'MAE', 'R2'],
    train_dir='./detailed_training',
    metric_period=10,
    verbose=100
)
model.fit(
    X_train, y_train,
    eval_set=[(X_val, y_val)],
    early_stopping_rounds=50,
    use_best_model=True
)

# Plot multiple metrics
fig = plotter.plot_metrics(
    metrics=['RMSE', 'MAE', 'R2'],
    train_dir='./detailed_training',
    figsize=(15, 10),
    title='CatBoost Training Metrics',
    save_path='training_metrics.png'
)
plt.show()

# Plot feature importance
importance_fig = plotter.plot_feature_importance(
    model=model,
    feature_names=X_train.columns.tolist(),
    max_features=25,
    title='Top 25 Most Important Features'
)
plt.show()
from catboost import cv, Pool
from catboost.widget import MetricsPlotter
import numpy as np
import matplotlib.pyplot as plt

# Create pool for CV
cv_pool = Pool(X_train, y_train, cat_features=['category'])

# Perform cross-validation with detailed logging
cv_results = cv(
    pool=cv_pool,
    params={
        'iterations': 500,
        'learning_rate': 0.1,
        'depth': 6,
        'loss_function': 'RMSE',
        'eval_metric': 'RMSE',
        'train_dir': './cv_training'
    },
    fold_count=5,
    shuffle=True,
    partition_random_seed=42,
    plot=True,
    verbose=50
)

# Extract scores for custom plotting
train_scores = cv_results['train-RMSE-mean'].values
val_scores = cv_results['test-RMSE-mean'].values
train_std = cv_results['train-RMSE-std'].values
val_std = cv_results['test-RMSE-std'].values

# Create custom learning curve with confidence intervals
plotter = MetricsPlotter()
fig, ax = plt.subplots(figsize=(12, 8))
iterations = np.arange(1, len(train_scores) + 1)

# Plot mean scores
ax.plot(iterations, train_scores, 'b-', label='Training RMSE', linewidth=2)
ax.plot(iterations, val_scores, 'r-', label='Validation RMSE', linewidth=2)

# Add +/- one standard deviation confidence bands
ax.fill_between(iterations, train_scores - train_std, train_scores + train_std,
                alpha=0.2, color='blue')
ax.fill_between(iterations, val_scores - val_std, val_scores + val_std,
                alpha=0.2, color='red')

ax.set_xlabel('Iteration')
ax.set_ylabel('RMSE')
ax.set_title('5-Fold Cross-Validation Learning Curves')
ax.legend()
ax.grid(True, alpha=0.3)
plt.tight_layout()
plt.savefig('cv_learning_curves.png', dpi=300, bbox_inches='tight')
plt.show()

print(f"Best CV score: {val_scores.min():.4f} ± {val_std[val_scores.argmin()]:.4f}")
from catboost import CatBoostClassifier
from catboost.widget import XGBPlottingCallback, lgbm_plotting_callback

# XGBoost-style plotting
xgb_callback = XGBPlottingCallback(period=25, show_stdv=True, figsize=(12, 8))
model_xgb_style = CatBoostClassifier(
    iterations=300,
    learning_rate=0.1,
    depth=6,
    verbose=False
)
model_xgb_style.fit(
    X_train, y_train,
    eval_set=[(X_val, y_val)],
    callbacks=[xgb_callback]
)

# LightGBM-style plotting
lgbm_callback = lgbm_plotting_callback(period=20, figsize=(10, 6))
model_lgbm_style = CatBoostClassifier(
    iterations=300,
    learning_rate=0.1,
    depth=6,
    verbose=False
)
model_lgbm_style.fit(
    X_train, y_train,
    eval_set=[(X_val, y_val)],
    callbacks=[lgbm_callback]
)
from catboost import CatBoostClassifier, EFstrType
from catboost.widget import MetricsPlotter
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd

# Train model
model = CatBoostClassifier(iterations=200, verbose=False)
model.fit(X_train, y_train)

# Get SHAP values for visualization
shap_values = model.get_feature_importance(
    data=X_test[:100],  # First 100 samples for visualization
    type=EFstrType.ShapValues
)

# Create SHAP summary plot
# NOTE(review): ShapValues output typically carries one extra column (the
# expected value), i.e. n_features + 1 columns — confirm before labeling
# with X_train.columns.
plt.figure(figsize=(12, 8))
shap_df = pd.DataFrame(shap_values, columns=X_train.columns)

# Plot mean absolute SHAP values
mean_shap = shap_df.abs().mean().sort_values(ascending=True)
plt.barh(range(len(mean_shap)), mean_shap.values)
plt.yticks(range(len(mean_shap)), mean_shap.index)
plt.xlabel('Mean |SHAP Value|')
plt.title('Feature Importance (SHAP Values)')
plt.tight_layout()
plt.show()

# Feature interaction heatmap
# NOTE(review): Interaction importance is commonly returned as
# (feature1, feature2, score) triples rather than a dense matrix — may need
# pivoting before heatmap; confirm against the CatBoost version in use.
interactions = model.get_feature_importance(type=EFstrType.Interaction)
plt.figure(figsize=(12, 10))
sns.heatmap(
    interactions,
    xticklabels=X_train.columns,
    yticklabels=X_train.columns,
    annot=False,
    cmap='RdBu_r',
    center=0
)
plt.title('Feature Interaction Matrix')
plt.tight_layout()
plt.show()
import matplotlib.pyplot as plt
from matplotlib.gridspec import GridSpec
import numpy as np

def create_training_dashboard(model, X_test, y_test, cv_results=None):
    """Create a comprehensive training dashboard figure.

    Renders a 3x3 grid: learning curves (with +/- one std bands when
    ``cv_results`` is given), top-10 feature importance, predictions vs
    actual, residual scatter, residual histogram, and a text summary of
    RMSE / MAE / R2 plus model size. Saves the figure to
    'training_dashboard.png' and shows it.

    Parameters:
        model: Trained CatBoost model (must provide ``predict()``,
            ``get_feature_importance()``, ``tree_count_``, ``feature_count_``).
        X_test: Test features (array-like or DataFrame).
        y_test: Test targets; must support elementwise subtraction with
            the predictions (e.g. NumPy array or pandas Series).
        cv_results: Optional CV results table with
            'train-RMSE-mean/std' and 'test-RMSE-mean/std' columns.
    """
    fig = plt.figure(figsize=(20, 15))
    gs = GridSpec(3, 3, figure=fig)

    # 1. Learning curves (top row, spanning two columns)
    ax1 = fig.add_subplot(gs[0, :2])
    if cv_results is not None:
        iterations = range(1, len(cv_results) + 1)
        ax1.plot(iterations, cv_results['train-RMSE-mean'], 'b-', label='Train')
        ax1.plot(iterations, cv_results['test-RMSE-mean'], 'r-', label='Validation')
        # +/- one std confidence bands around the mean curves
        ax1.fill_between(iterations,
                         cv_results['train-RMSE-mean'] - cv_results['train-RMSE-std'],
                         cv_results['train-RMSE-mean'] + cv_results['train-RMSE-std'],
                         alpha=0.2, color='blue')
        ax1.fill_between(iterations,
                         cv_results['test-RMSE-mean'] - cv_results['test-RMSE-std'],
                         cv_results['test-RMSE-mean'] + cv_results['test-RMSE-std'],
                         alpha=0.2, color='red')
    ax1.set_title('Learning Curves')
    ax1.set_xlabel('Iteration')
    ax1.set_ylabel('RMSE')
    ax1.legend()
    ax1.grid(True, alpha=0.3)

    # 2. Feature importance (top-right)
    ax2 = fig.add_subplot(gs[0, 2])
    importance = model.get_feature_importance()
    top_features = np.argsort(importance)[-10:]
    ax2.barh(range(len(top_features)), importance[top_features])
    ax2.set_yticks(range(len(top_features)))
    ax2.set_yticklabels([f'Feature_{i}' for i in top_features])
    ax2.set_title('Top 10 Features')
    ax2.set_xlabel('Importance')

    # 3. Predictions vs actual (middle-left)
    ax3 = fig.add_subplot(gs[1, 0])
    predictions = model.predict(X_test)
    ax3.scatter(y_test, predictions, alpha=0.6)
    # Identity line spanning the combined value range
    min_val = min(y_test.min(), predictions.min())
    max_val = max(y_test.max(), predictions.max())
    ax3.plot([min_val, max_val], [min_val, max_val], 'r--', lw=2)
    ax3.set_xlabel('Actual')
    ax3.set_ylabel('Predicted')
    ax3.set_title('Predictions vs Actual')

    # 4. Residuals vs predicted (middle-center)
    ax4 = fig.add_subplot(gs[1, 1])
    residuals = y_test - predictions
    ax4.scatter(predictions, residuals, alpha=0.6)
    ax4.axhline(y=0, color='r', linestyle='--')
    ax4.set_xlabel('Predicted')
    ax4.set_ylabel('Residuals')
    ax4.set_title('Residual Plot')

    # 5. Residual distribution (middle-right)
    ax5 = fig.add_subplot(gs[1, 2])
    ax5.hist(residuals, bins=30, alpha=0.7, edgecolor='black')
    ax5.set_xlabel('Residuals')
    ax5.set_ylabel('Frequency')
    ax5.set_title('Residual Distribution')

    # 6. Text summary of model metrics (bottom row)
    ax6 = fig.add_subplot(gs[2, :])
    ax6.axis('off')
    # Local import keeps sklearn optional until the dashboard is built
    from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
    rmse = np.sqrt(mean_squared_error(y_test, predictions))
    mae = mean_absolute_error(y_test, predictions)
    r2 = r2_score(y_test, predictions)
    metrics_text = f"""
    Model Performance Metrics:
    RMSE: {rmse:.4f}
    MAE: {mae:.4f}
    R²: {r2:.4f}
    Model Info:
    Trees: {model.tree_count_}
    Features: {model.feature_count_}
    """
    ax6.text(0.1, 0.5, metrics_text, fontsize=12, verticalalignment='center',
             bbox=dict(boxstyle="round,pad=0.3", facecolor="lightgray"))

    plt.tight_layout()
    plt.savefig('training_dashboard.png', dpi=300, bbox_inches='tight')
    plt.show()
# Usage
create_training_dashboard(model, X_test, y_test, cv_results)

# Install with Tessl CLI
npx tessl i tessl/pypi-catboost