# Visualization

CatBoost provides interactive visualization components specifically designed for Jupyter notebooks, along with compatibility layers for XGBoost and LightGBM plotting workflows. These tools enable real-time monitoring of training progress and model analysis.

## Capabilities

### Interactive Jupyter Widgets

Visualization components that integrate seamlessly with Jupyter notebook environments.

```python { .api }
class MetricVisualizer:
    """
    Interactive widget for visualizing training metrics in Jupyter notebooks.

    Provides real-time plots of training and validation metrics during model training,
    with interactive controls for zooming, filtering, and metric selection.
    """

    def __init__(self, train_dirs=None, subdirs=None):
        """
        Initialize MetricVisualizer widget.

        Parameters:
        - train_dirs: List of training directories to monitor (list of strings)
        - subdirs: Subdirectories within train_dirs to include (list of strings)
        """

    def start(self, train_dirs=None, subdirs=None):
        """
        Start the metric visualization widget.

        Parameters:
        - train_dirs: Training directories to visualize (list of strings)
        - subdirs: Subdirectories to include (list of strings)

        Returns:
        Interactive Jupyter widget displaying training metrics
        """

    def stop(self):
        """Stop the metric visualization widget."""

class MetricsPlotter:
    """
    Utility class for plotting training metrics with matplotlib integration.

    Provides static and dynamic plotting capabilities for CatBoost training metrics,
    with customizable styling and export options.
    """

    def __init__(self, train_dir=None):
        """
        Initialize MetricsPlotter.

        Parameters:
        - train_dir: Training directory containing metric logs (string)
        """

    def plot_metrics(self, metrics=None, train_dir=None, figsize=(12, 8),
                     title=None, save_path=None):
        """
        Plot training metrics from log files.

        Parameters:
        - metrics: List of metrics to plot (list of strings)
        - train_dir: Directory containing training logs (string)
        - figsize: Figure size for matplotlib (tuple)
        - title: Plot title (string)
        - save_path: Path to save plot image (string)

        Returns:
        matplotlib.figure.Figure: Generated plot figure
        """

    def plot_feature_importance(self, model, feature_names=None,
                                max_features=20, figsize=(10, 8),
                                title="Feature Importance", save_path=None):
        """
        Plot feature importance from trained model.

        Parameters:
        - model: Trained CatBoost model
        - feature_names: Feature names for labeling (list of strings)
        - max_features: Maximum number of features to show (int)
        - figsize: Figure size for matplotlib (tuple)
        - title: Plot title (string)
        - save_path: Path to save plot (string)

        Returns:
        matplotlib.figure.Figure: Feature importance plot
        """

    def plot_learning_curve(self, train_scores, val_scores=None,
                            metric_name="Loss", figsize=(10, 6),
                            title="Learning Curve", save_path=None):
        """
        Plot learning curves for training and validation.

        Parameters:
        - train_scores: Training metric scores (array-like)
        - val_scores: Validation metric scores (array-like, optional)
        - metric_name: Name of the metric being plotted (string)
        - figsize: Figure size for matplotlib (tuple)
        - title: Plot title (string)
        - save_path: Path to save plot (string)

        Returns:
        matplotlib.figure.Figure: Learning curve plot
        """
```
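
The `plot_learning_curve` method is not exercised in the examples below, so here is a minimal sketch based on the signature documented above. The score lists are placeholder values; in practice they could come from `model.get_evals_result()` after fitting with an `eval_set`.

```python
from catboost.widget import MetricsPlotter

# Placeholder per-iteration scores; real values could be pulled from
# model.get_evals_result() after training with an eval_set.
train_scores = [0.69, 0.52, 0.41, 0.35, 0.31]
val_scores = [0.70, 0.55, 0.46, 0.42, 0.41]

plotter = MetricsPlotter()
fig = plotter.plot_learning_curve(
    train_scores=train_scores,
    val_scores=val_scores,
    metric_name='Logloss',
    title='Learning Curve',
    save_path='learning_curve.png'  # optional
)
```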

### Framework Compatibility Callbacks

Plotting callbacks compatible with XGBoost and LightGBM workflows for easy migration.

```python { .api }
def XGBPlottingCallback(period=1, show_stdv=False, figsize=(10, 6)):
    """
    Create XGBoost-style plotting callback for CatBoost training.

    Provides compatibility with XGBoost plotting workflows when migrating
    to CatBoost, maintaining similar API and visualization style.

    Parameters:
    - period: Plotting update period in iterations (int)
    - show_stdv: Show standard deviation bands for CV (bool)
    - figsize: Figure size for matplotlib plots (tuple)

    Returns:
    Callback function for use with CatBoost training

    Usage:
    model.fit(X, y, callbacks=[XGBPlottingCallback(period=10)])
    """

def lgbm_plotting_callback(period=1, show_stdv=False, figsize=(10, 6)):
    """
    Create LightGBM-style plotting callback for CatBoost training.

    Provides compatibility with LightGBM plotting workflows when migrating
    to CatBoost, maintaining similar API and visualization patterns.

    Parameters:
    - period: Plotting update period in iterations (int)
    - show_stdv: Show standard deviation bands for CV (bool)
    - figsize: Figure size for matplotlib plots (tuple)

    Returns:
    Callback function for use with CatBoost training

    Usage:
    model.fit(X, y, callbacks=[lgbm_plotting_callback(period=5)])
    """
```

### Built-in Model Plotting Methods

Direct plotting methods available on trained CatBoost models.

```python { .api }
# These methods are available on trained CatBoost model objects
def plot_tree(self, tree_idx=0, pool=None, figsize=(20, 15),
              save_path=None, title=None):
    """
    Visualize individual decision tree from the ensemble.

    Parameters:
    - tree_idx: Index of tree to visualize (int)
    - pool: Pool for leaf value calculation (Pool, optional)
    - figsize: Figure size for visualization (tuple)
    - save_path: Path to save tree visualization (string)
    - title: Plot title (string)

    Returns:
    Tree visualization plot
    """

def plot_predictions(self, data, target=None, figsize=(10, 6),
                     title="Predictions vs Actual", save_path=None):
    """
    Plot model predictions against actual values.

    Parameters:
    - data: Input data for predictions (Pool or array-like)
    - target: True target values (array-like, optional)
    - figsize: Figure size for matplotlib (tuple)
    - title: Plot title (string)
    - save_path: Path to save plot (string)

    Returns:
    matplotlib.figure.Figure: Predictions scatter plot
    """
```
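
A minimal sketch of how these methods might be called, following the signatures documented above; it assumes `model` is an already-fitted CatBoost model and `X_val`, `y_val` are held-out data from an earlier step.

```python
# Assumes `model`, `X_val`, and `y_val` exist from a previous training step.
# Signatures follow the API reference above.
tree_fig = model.plot_tree(
    tree_idx=0,
    title='First tree in the ensemble',
    save_path='tree_0.png'
)

pred_fig = model.plot_predictions(
    data=X_val,
    target=y_val,
    title='Predictions vs Actual (validation set)',
    save_path='predictions_vs_actual.png'
)
```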

## Visualization Examples

### Basic Training Visualization

```python
from catboost import CatBoostClassifier
from catboost.widget import MetricVisualizer
from sklearn.model_selection import train_test_split
import pandas as pd

# Prepare data
df = pd.read_csv('train.csv')
X = df.drop('target', axis=1)
y = df['target']
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize visualizer (in Jupyter notebook)
visualizer = MetricVisualizer()

# Train model with visualization
model = CatBoostClassifier(
    iterations=500,
    learning_rate=0.1,
    depth=6,
    eval_metric='AUC',
    train_dir='./catboost_training',  # Required for visualization
    verbose=True
)

# Start visualization widget
visualizer.start(train_dirs=['./catboost_training'])

# Fit model (metrics will be visualized in real-time)
model.fit(
    X_train, y_train,
    eval_set=[(X_val, y_val)],
    plot=True  # Enable built-in plotting
)

# Stop visualization when done
visualizer.stop()
```

### Advanced Metrics Plotting

```python
from catboost.widget import MetricsPlotter
from catboost import CatBoostRegressor
import matplotlib.pyplot as plt

# Initialize plotter
plotter = MetricsPlotter()

# Train model with comprehensive logging
# (eval_metric takes a single metric; additional metrics go in custom_metric)
model = CatBoostRegressor(
    iterations=1000,
    learning_rate=0.05,
    depth=8,
    eval_metric='RMSE',
    custom_metric=['MAE', 'R2'],
    train_dir='./detailed_training',
    metric_period=10,
    verbose=100
)

model.fit(
    X_train, y_train,
    eval_set=[(X_val, y_val)],
    early_stopping_rounds=50,
    use_best_model=True
)

# Plot multiple metrics
fig = plotter.plot_metrics(
    metrics=['RMSE', 'MAE', 'R2'],
    train_dir='./detailed_training',
    figsize=(15, 10),
    title='CatBoost Training Metrics',
    save_path='training_metrics.png'
)

plt.show()

# Plot feature importance
importance_fig = plotter.plot_feature_importance(
    model=model,
    feature_names=X_train.columns.tolist(),
    max_features=25,
    title='Top 25 Most Important Features'
)

plt.show()
```

### Cross-Validation Visualization

```python
from catboost import cv, Pool
import numpy as np
import matplotlib.pyplot as plt

# Create pool for CV
cv_pool = Pool(X_train, y_train, cat_features=['category'])

# Perform cross-validation with detailed logging
cv_results = cv(
    pool=cv_pool,
    params={
        'iterations': 500,
        'learning_rate': 0.1,
        'depth': 6,
        'loss_function': 'RMSE',
        'eval_metric': 'RMSE',
        'train_dir': './cv_training'
    },
    fold_count=5,
    shuffle=True,
    partition_random_seed=42,
    plot=True,
    verbose=50
)

# Extract scores for custom plotting
train_scores = cv_results['train-RMSE-mean'].values
val_scores = cv_results['test-RMSE-mean'].values
train_std = cv_results['train-RMSE-std'].values
val_std = cv_results['test-RMSE-std'].values

# Create custom learning curve with confidence intervals
fig, ax = plt.subplots(figsize=(12, 8))

iterations = np.arange(1, len(train_scores) + 1)

# Plot mean scores
ax.plot(iterations, train_scores, 'b-', label='Training RMSE', linewidth=2)
ax.plot(iterations, val_scores, 'r-', label='Validation RMSE', linewidth=2)

# Add confidence intervals
ax.fill_between(iterations, train_scores - train_std, train_scores + train_std,
                alpha=0.2, color='blue')
ax.fill_between(iterations, val_scores - val_std, val_scores + val_std,
                alpha=0.2, color='red')

ax.set_xlabel('Iteration')
ax.set_ylabel('RMSE')
ax.set_title('5-Fold Cross-Validation Learning Curves')
ax.legend()
ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.savefig('cv_learning_curves.png', dpi=300, bbox_inches='tight')
plt.show()

print(f"Best CV score: {val_scores.min():.4f} ± {val_std[val_scores.argmin()]:.4f}")
```

### Framework Compatibility Examples

```python
from catboost import CatBoostClassifier
from catboost.widget import XGBPlottingCallback, lgbm_plotting_callback

# XGBoost-style plotting
xgb_callback = XGBPlottingCallback(period=25, show_stdv=True, figsize=(12, 8))

model_xgb_style = CatBoostClassifier(
    iterations=300,
    learning_rate=0.1,
    depth=6,
    verbose=False
)

model_xgb_style.fit(
    X_train, y_train,
    eval_set=[(X_val, y_val)],
    callbacks=[xgb_callback]
)

# LightGBM-style plotting
lgbm_callback = lgbm_plotting_callback(period=20, figsize=(10, 6))

model_lgbm_style = CatBoostClassifier(
    iterations=300,
    learning_rate=0.1,
    depth=6,
    verbose=False
)

model_lgbm_style.fit(
    X_train, y_train,
    eval_set=[(X_val, y_val)],
    callbacks=[lgbm_callback]
)
```

### Interactive Feature Analysis Visualization

```python
from catboost import CatBoostClassifier, EFstrType, Pool
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import pandas as pd

# Train model
model = CatBoostClassifier(iterations=200, verbose=False)
model.fit(X_train, y_train)

# Get SHAP values for visualization (first 100 samples)
shap_values = model.get_feature_importance(
    data=Pool(X_test[:100]),
    type=EFstrType.ShapValues
)

# Create SHAP summary plot; the last column returned by CatBoost is the
# expected-value (bias) term, so drop it before building the DataFrame
plt.figure(figsize=(12, 8))
shap_df = pd.DataFrame(shap_values[:, :-1], columns=X_train.columns)

# Plot mean absolute SHAP values
mean_shap = shap_df.abs().mean().sort_values(ascending=True)
plt.barh(range(len(mean_shap)), mean_shap.values)
plt.yticks(range(len(mean_shap)), mean_shap.index)
plt.xlabel('Mean |SHAP Value|')
plt.title('Feature Importance (SHAP Values)')
plt.tight_layout()
plt.show()

# Feature interaction heatmap; interaction importance is returned as rows of
# (first feature index, second feature index, score), so rebuild a symmetric matrix
interactions = model.get_feature_importance(type=EFstrType.Interaction)
n_features = X_train.shape[1]
interaction_matrix = np.zeros((n_features, n_features))
for first_idx, second_idx, score in interactions:
    interaction_matrix[int(first_idx), int(second_idx)] = score
    interaction_matrix[int(second_idx), int(first_idx)] = score

plt.figure(figsize=(12, 10))
sns.heatmap(
    interaction_matrix,
    xticklabels=X_train.columns,
    yticklabels=X_train.columns,
    annot=False,
    cmap='RdBu_r',
    center=0
)
plt.title('Feature Interaction Matrix')
plt.tight_layout()
plt.show()
```

### Custom Visualization Dashboard

```python
import matplotlib.pyplot as plt
from matplotlib.gridspec import GridSpec
import numpy as np

def create_training_dashboard(model, X_test, y_test, cv_results=None):
    """Create comprehensive training dashboard."""

    fig = plt.figure(figsize=(20, 15))
    gs = GridSpec(3, 3, figure=fig)

    # 1. Learning curves
    ax1 = fig.add_subplot(gs[0, :2])
    if cv_results is not None:
        iterations = range(1, len(cv_results) + 1)
        ax1.plot(iterations, cv_results['train-RMSE-mean'], 'b-', label='Train')
        ax1.plot(iterations, cv_results['test-RMSE-mean'], 'r-', label='Validation')
        ax1.fill_between(iterations,
                         cv_results['train-RMSE-mean'] - cv_results['train-RMSE-std'],
                         cv_results['train-RMSE-mean'] + cv_results['train-RMSE-std'],
                         alpha=0.2, color='blue')
        ax1.fill_between(iterations,
                         cv_results['test-RMSE-mean'] - cv_results['test-RMSE-std'],
                         cv_results['test-RMSE-mean'] + cv_results['test-RMSE-std'],
                         alpha=0.2, color='red')
        ax1.set_title('Learning Curves')
        ax1.set_xlabel('Iteration')
        ax1.set_ylabel('RMSE')
        ax1.legend()
        ax1.grid(True, alpha=0.3)

    # 2. Feature importance
    ax2 = fig.add_subplot(gs[0, 2])
    importance = model.get_feature_importance()
    top_features = np.argsort(importance)[-10:]
    ax2.barh(range(len(top_features)), importance[top_features])
    ax2.set_yticks(range(len(top_features)))
    ax2.set_yticklabels([f'Feature_{i}' for i in top_features])
    ax2.set_title('Top 10 Features')
    ax2.set_xlabel('Importance')

    # 3. Predictions vs Actual
    ax3 = fig.add_subplot(gs[1, 0])
    predictions = model.predict(X_test)
    ax3.scatter(y_test, predictions, alpha=0.6)
    min_val = min(y_test.min(), predictions.min())
    max_val = max(y_test.max(), predictions.max())
    ax3.plot([min_val, max_val], [min_val, max_val], 'r--', lw=2)
    ax3.set_xlabel('Actual')
    ax3.set_ylabel('Predicted')
    ax3.set_title('Predictions vs Actual')

    # 4. Residuals
    ax4 = fig.add_subplot(gs[1, 1])
    residuals = y_test - predictions
    ax4.scatter(predictions, residuals, alpha=0.6)
    ax4.axhline(y=0, color='r', linestyle='--')
    ax4.set_xlabel('Predicted')
    ax4.set_ylabel('Residuals')
    ax4.set_title('Residual Plot')

    # 5. Residual distribution
    ax5 = fig.add_subplot(gs[1, 2])
    ax5.hist(residuals, bins=30, alpha=0.7, edgecolor='black')
    ax5.set_xlabel('Residuals')
    ax5.set_ylabel('Frequency')
    ax5.set_title('Residual Distribution')

    # 6. Model metrics summary
    ax6 = fig.add_subplot(gs[2, :])
    ax6.axis('off')

    # Calculate metrics
    from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
    rmse = np.sqrt(mean_squared_error(y_test, predictions))
    mae = mean_absolute_error(y_test, predictions)
    r2 = r2_score(y_test, predictions)

    metrics_text = f"""
    Model Performance Metrics:

    RMSE: {rmse:.4f}
    MAE: {mae:.4f}
    R²: {r2:.4f}

    Model Info:
    Trees: {model.tree_count_}
    Features: {model.feature_count_}
    """

    ax6.text(0.1, 0.5, metrics_text, fontsize=12, verticalalignment='center',
             bbox=dict(boxstyle="round,pad=0.3", facecolor="lightgray"))

    plt.tight_layout()
    plt.savefig('training_dashboard.png', dpi=300, bbox_inches='tight')
    plt.show()

# Usage
create_training_dashboard(model, X_test, y_test, cv_results)
```