CatBoost is a fast, scalable, high performance gradient boosting on decision trees library used for ranking, classification, regression and other ML tasks.
—
Quality: Pending — Does it follow best practices?
Impact: Pending — No eval scenarios have been run
CatBoost provides comprehensive training and evaluation capabilities including cross-validation, hyperparameter tuning, early stopping, and Gaussian process sampling for uncertainty estimation. These functions support both Pool objects and raw data formats.
Direct training functions that provide more control over the training process compared to the model classes.
def train(pool, params=None, dtrain=None, logging_level=None, verbose=None,
          iterations=None, num_boost_round=None, eval_set=None, plot=False,
          save_snapshot=None, snapshot_file=None, snapshot_interval=600,
          metric_period=1, verbose_eval=None, early_stopping_rounds=None,
          use_best_model=None, best_model_min_trees=1, log_cout=None,
          log_cerr=None):
    """Train a CatBoost model using specified parameters.

    Parameters:
    - pool: Training data (Pool object)
    - params: Training parameters (dict)
    - dtrain: Deprecated, use pool instead
    - logging_level: Logging level ('Silent', 'Verbose', 'Info', 'Debug')
    - verbose: Verbosity level (bool or int)
    - iterations: Number of boosting iterations (int)
    - num_boost_round: Alias for iterations
    - eval_set: Evaluation datasets (list of Pool objects)
    - plot: Enable plotting during training (bool)
    - save_snapshot: Save training snapshots (bool)
    - snapshot_file: Snapshot file name (string)
    - snapshot_interval: Snapshot interval in seconds (int, default: 600)
    - metric_period: Metric calculation period (int, default: 1)
    - verbose_eval: Verbose evaluation period (int)
    - early_stopping_rounds: Early stopping rounds (int)
    - use_best_model: Use best model from validation (bool)
    - best_model_min_trees: Minimum trees for best model (int, default: 1)
    - log_cout: Output stream for logging
    - log_cerr: Error stream for logging

    Returns:
    CatBoost: Trained CatBoost model
    """

# Robust cross-validation with stratification, custom folds, and comprehensive evaluation metrics.
def cv(pool, params=None, dtrain=None, iterations=None, num_boost_round=None,
       fold_count=3, inverted=False, shuffle=True, partition_random_seed=0,
       stratified=None, train_dir=None, verbose=None, logging_level=None,
       metric_period=1, verbose_eval=None, plot=False, save_snapshot=None,
       snapshot_file=None, snapshot_interval=600, folds=None,
       early_stopping_rounds=None, as_pandas=True, return_models=False,
       log_cout=None, log_cerr=None, type='Classical'):
    """Perform cross-validation with CatBoost.

    Parameters:
    - pool: Training data (Pool object)
    - params: Model parameters (dict)
    - dtrain: Deprecated, use pool instead
    - iterations: Number of boosting iterations (int)
    - num_boost_round: Alias for iterations
    - fold_count: Number of cross-validation folds (int, default: 3)
    - inverted: Use inverted folds (bool)
    - shuffle: Shuffle data before folding (bool, default: True)
    - partition_random_seed: Random seed for data partitioning (int)
    - stratified: Use stratified cross-validation (bool, auto for classification)
    - train_dir: Directory for training artifacts (string)
    - verbose: Verbosity level (bool or int)
    - logging_level: Logging level (string)
    - metric_period: Metric calculation period (int, default: 1)
    - verbose_eval: Verbose evaluation period (int)
    - plot: Enable plotting (bool)
    - save_snapshot: Save snapshots (bool)
    - snapshot_file: Snapshot file name (string)
    - snapshot_interval: Snapshot interval in seconds (int, default: 600)
    - folds: Custom fold indices (array-like)
    - early_stopping_rounds: Early stopping rounds (int)
    - as_pandas: Return results as pandas DataFrame (bool, default: True)
    - return_models: Return trained fold models (bool)
    - log_cout: Output stream for logging
    - log_cerr: Error stream for logging
    - type: Cross-validation type ('Classical', 'Inverted', 'TimeSeries');
      shadows the builtin `type`, but the name is fixed by the public API

    Returns:
    pandas.DataFrame or dict: Cross-validation results with metrics per iteration
    """

# Advanced uncertainty estimation using Gaussian process sampling for regression tasks.
def sample_gaussian_process(X, y, eval_set=None, cat_features=None,
                            text_features=None, embedding_features=None,
                            sample_weight=None, group_id=None, group_weight=None,
                            subgroup_id=None, pairs=None, pairs_weight=None,
                            baseline=None, n_samples=100, random_seed=None,
                            logging_level='Verbose', verbose=None, plot=False,
                            model_params=None, gp_params=None):
    """Sample from Gaussian process for uncertainty estimation.

    Parameters:
    - X: Input features (array-like or Pool)
    - y: Target values (array-like)
    - eval_set: Evaluation datasets (list of tuples)
    - cat_features: Categorical feature indices (list)
    - text_features: Text feature indices (list)
    - embedding_features: Embedding feature indices (list)
    - sample_weight: Sample weights (array-like)
    - group_id: Group identifiers (array-like)
    - group_weight: Group weights (array-like)
    - subgroup_id: Subgroup identifiers (array-like)
    - pairs: Pairs for ranking (array-like)
    - pairs_weight: Pairs weights (array-like)
    - baseline: Baseline values (array-like)
    - n_samples: Number of GP samples (int, default: 100)
    - random_seed: Random seed (int)
    - logging_level: Logging level (string, default: 'Verbose')
    - verbose: Verbosity level (bool or int)
    - plot: Enable plotting (bool)
    - model_params: CatBoost model parameters (dict)
    - gp_params: Gaussian process parameters (dict)

    Returns:
    tuple: (predictions_mean, predictions_std, gp_samples)
    - predictions_mean: Mean predictions (numpy.ndarray)
    - predictions_std: Standard deviation of predictions (numpy.ndarray)
    - gp_samples: Individual GP samples (numpy.ndarray)
    """

# Functions for combining multiple CatBoost models into ensembles.
def sum_models(models, weights=None, ctr_merge_policy='IntersectingCountersAverage'):
    """Create ensemble by summing multiple CatBoost models.

    Parameters:
    - models: List of CatBoost models to combine (list)
    - weights: Weights for each model (list of float, optional; equal
      weighting when omitted)
    - ctr_merge_policy: CTR merge policy ('IntersectingCountersAverage',
      'CountersSum')

    Returns:
    CatBoost: Combined model
    """
def _have_equal_features(models):
"""
Check if models have equal feature sets.
Parameters:
- models: List of CatBoost models (list)
Returns:
bool: True if all models have equal features
"""Grid search and randomized search utilities for hyperparameter optimization.
class CatBoostSearchCV:
    """Hyperparameter search using cross-validation.

    Similar to scikit-learn's GridSearchCV but optimized for CatBoost.
    """

    def __init__(self, estimator, param_grid, scoring=None, cv=3,
                 refit=True, verbose=0, n_jobs=1, return_train_score=False):
        """Initialize hyperparameter search.

        Parameters:
        - estimator: CatBoost estimator
        - param_grid: Parameter grid to search (dict)
        - scoring: Scoring metric (string or callable)
        - cv: Cross-validation strategy (int or cv splitter, default: 3)
        - refit: Refit best estimator (bool, default: True)
        - verbose: Verbosity level (int)
        - n_jobs: Number of parallel jobs (int, default: 1)
        - return_train_score: Return training scores (bool)
        """

    def fit(self, X, y=None, **fit_params):
        """Fit the search."""

    def predict(self, X):
        """Predict using the best estimator."""

    @property
    def best_estimator_(self):
        """Best estimator found."""

    @property
    def best_params_(self):
        """Best parameters found."""

    @property
    def best_score_(self):
        """Best cross-validation score."""


from catboost import Pool, train, cv
import pandas as pd

# Prepare data
df = pd.read_csv('train.csv')
X = df.drop('target', axis=1)
y = df['target']

# Create Pool
train_pool = Pool(
    data=X,
    label=y,
    cat_features=['category_column']
)

# Define parameters
params = {
    'iterations': 1000,
    'learning_rate': 0.1,
    'depth': 6,
    'loss_function': 'RMSE',
    'eval_metric': 'RMSE',
    'verbose': 100,
    'early_stopping_rounds': 50
}

# Perform cross-validation
cv_results = cv(
    pool=train_pool,
    params=params,
    fold_count=5,
    stratified=False,
    shuffle=True,
    partition_random_seed=42
)
# RMSE is a loss, so the best score is the minimum of the mean test metric
print(f"Best CV score: {cv_results['test-RMSE-mean'].min():.4f}")

# Train final model
model = train(
    pool=train_pool,
    params=params
)

from catboost import Pool, train
from sklearn.model_selection import train_test_split

# Split data
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Create pools
train_pool = Pool(X_train, y_train, cat_features=['category'])
val_pool = Pool(X_val, y_val, cat_features=['category'])

# Train with validation: early stopping monitors the eval_set, and
# use_best_model keeps the iteration with the best validation score
model = train(
    pool=train_pool,
    eval_set=[val_pool],
    params={
        'iterations': 1000,
        'learning_rate': 0.1,
        'depth': 6,
        'loss_function': 'RMSE',
        'eval_metric': 'RMSE',
        'early_stopping_rounds': 100,
        'use_best_model': True,
        'verbose': 100
    },
    plot=True  # Enable training plots
)

from catboost import sample_gaussian_process
import numpy as np

# Sample from Gaussian process
mean_pred, std_pred, gp_samples = sample_gaussian_process(
    X=X_train,
    y=y_train,
    eval_set=[(X_val, y_val)],
    cat_features=['category'],
    n_samples=100,
    model_params={
        'iterations': 500,
        'learning_rate': 0.1,
        'depth': 4
    },
    random_seed=42
)

# Get prediction intervals (1.96 sigma ~ 95% interval under normality)
lower_bound = mean_pred - 1.96 * std_pred
upper_bound = mean_pred + 1.96 * std_pred
print(f"Mean prediction: {mean_pred.mean():.4f}")
print(f"Prediction uncertainty: {std_pred.mean():.4f}")

from catboost import CatBoostRegressor, sum_models
# Train multiple models with different parameters
models = []
for depth in [4, 6, 8]:
    model = CatBoostRegressor(
        iterations=500,
        depth=depth,
        learning_rate=0.1,
        verbose=False
    )
    model.fit(X_train, y_train)
    models.append(model)

# Create ensemble
ensemble = sum_models(
    models=models,
    weights=[0.4, 0.4, 0.2]  # Weight models differently
)

# Make predictions with ensemble
# NOTE(review): X_test is not defined in the snippets above — presumably a
# held-out test split created alongside X_train/X_val; confirm before running
ensemble_pred = ensemble.predict(X_test)

# Install with Tessl CLI
npx tessl i tessl/pypi-catboost