CatBoost is a fast, scalable, high performance gradient boosting on decision trees library used for ranking, classification, regression and other ML tasks.
—
Quality: Pending — Does it follow best practices?
Impact: Pending — No eval scenarios have been run.
CatBoost provides extensive utility functions for model evaluation, GPU management, metric calculation, data conversion, and model export. These utilities enhance the machine learning workflow with comprehensive analysis and deployment capabilities.
Comprehensive model evaluation tools including confusion matrices, ROC curves, and threshold optimization.
def eval_metric(label, approx, metric, weight=None, group_id=None,
group_weight=None, subgroup_id=None, pairs=None, thread_count=-1):
"""
Evaluate a metric on predictions.
Parameters:
- label: True target values (array-like)
- approx: Model predictions (array-like)
- metric: Metric name to evaluate (string)
Classification: 'Logloss', 'CrossEntropy', 'AUC', 'Accuracy', 'Precision', 'Recall', 'F1'
Regression: 'RMSE', 'MAE', 'R2', 'MSLE', 'MedianAbsoluteError', 'SMAPE', 'MAPE'
Ranking: 'NDCG', 'DCG', 'MAP', 'MRR', 'ERR'
- weight: Sample weights (array-like, optional)
- group_id: Group identifiers for ranking metrics (array-like, optional)
- group_weight: Group weights (array-like, optional)
- subgroup_id: Subgroup identifiers (array-like, optional)
- pairs: Pairs for ranking metrics (array-like, optional)
- thread_count: Number of threads for computation (int)
Returns:
float: Metric value
"""
def get_confusion_matrix(model, data, thread_count=-1):
"""
Calculate confusion matrix for classification model.
Parameters:
- model: Trained CatBoost classifier
- data: Input data (Pool or array-like)
- thread_count: Number of threads for computation (int)
Returns:
numpy.ndarray: Confusion matrix (n_classes, n_classes)
"""
def get_roc_curve(model, data, thread_count=-1, plot=False):
"""
Calculate ROC curve data for binary classification.
Parameters:
- model: Trained CatBoost classifier
- data: Input data with true labels (Pool)
- thread_count: Number of threads for computation (int)
- plot: Whether to plot the ROC curve (bool)
Returns:
tuple: (fpr, tpr, thresholds)
- fpr: False positive rates (numpy.ndarray)
- tpr: True positive rates (numpy.ndarray)
- thresholds: Decision thresholds (numpy.ndarray)
"""
def get_fpr_curve(model, data, curve=None, thread_count=-1, plot=False):
"""
Calculate False Positive Rate curve.
Parameters:
- model: Trained CatBoost classifier
- data: Input data with true labels (Pool)
- curve: Curve type (string, optional)
- thread_count: Number of threads for computation (int)
- plot: Whether to plot the curve (bool)
Returns:
tuple: (thresholds, fpr_values)
"""
def get_fnr_curve(model, data, curve=None, thread_count=-1, plot=False):
"""
Calculate False Negative Rate curve.
Parameters:
- model: Trained CatBoost classifier
- data: Input data with true labels (Pool)
- curve: Curve type (string, optional)
- thread_count: Number of threads for computation (int)
- plot: Whether to plot the curve (bool)
Returns:
tuple: (thresholds, fnr_values)
"""
def select_threshold(model, data, curve=None, FPR=None, FNR=None, thread_count=-1):
"""
Select optimal decision threshold based on FPR/FNR constraints.
Parameters:
- model: Trained CatBoost classifier
- data: Input data with true labels (Pool)
- curve: Curve type for threshold selection (string, optional)
- FPR: Target false positive rate (float, 0-1)
- FNR: Target false negative rate (float, 0-1)
- thread_count: Number of threads for computation (int)
Returns:
float: Optimal threshold value
"""

System information and GPU management functions.
def get_gpu_device_count():
"""
Get the number of available GPU devices.
Returns:
int: Number of GPU devices available for CatBoost
"""
def reset_trace_backend(filename):
"""
Reset trace backend with filename.
Parameters:
- filename: Path to trace file (string)
"""

Functions for exporting models to various formats for deployment.
def convert_to_onnx_object(model, export_parameters=None):
"""
Convert CatBoost model to ONNX format object.
Parameters:
- model: Trained CatBoost model
- export_parameters: Export configuration parameters (dict, optional)
- 'onnx_domain': ONNX domain name (string)
- 'onnx_model_version': Model version (int)
- 'onnx_doc_string': Documentation string (string)
- 'onnx_graph_name': Graph name (string)
Returns:
onnx.ModelProto: ONNX model object
"""

Utilities for data preprocessing, quantization, and format conversion.
def calculate_quantization_grid(values, border_count, border_type='Median'):
"""
Calculate quantization grid for numerical values.
Parameters:
- values: Input numerical values (array-like)
- border_count: Number of quantization borders (int)
- border_type: Border selection method (string)
- 'Median': Median-based borders
- 'Uniform': Uniformly spaced borders
- 'UniformAndQuantiles': Mix of uniform and quantile borders
- 'MaxLogSum': Maximum log sum borders
- 'MinEntropy': Minimum entropy borders
- 'GreedyLogSum': Greedy log sum borders
Returns:
numpy.ndarray: Quantization border values
"""
def quantize(data_path, column_description=None, pairs=None, graph=None,
delimiter='\t', has_header=False, ignore_csv_quoting=False,
feature_names=None, thread_count=-1, ignored_features=None,
per_float_feature_quantization=None, border_count=None,
max_bin=None, feature_border_type=None, nan_mode=None,
input_borders=None, task_type=None, used_ram_limit=None,
random_seed=None, **kwargs):
"""
Construct quantized Pool from non-quantized pool stored in file.
Parameters:
- data_path: Path to data file (string)
- column_description: Path to column description file (string, optional)
- pairs: Path to pairs file (string, optional)
- graph: Path to graph file (string, optional)
- delimiter: Delimiter used in data file (string)
- has_header: Whether file has header row (bool)
- ignore_csv_quoting: Ignore CSV quoting (bool)
- feature_names: Feature names (list, optional)
- thread_count: Number of threads (int)
- ignored_features: Indices of ignored features (list, optional)
- per_float_feature_quantization: Per-feature quantization settings (dict, optional)
- border_count: Number of borders for quantization (int, optional)
- max_bin: Maximum number of bins (int, optional)
- feature_border_type: Border type for features (string, optional)
- nan_mode: NaN handling mode (string, optional)
- input_borders: Input borders (dict, optional)
- task_type: Task type ('CPU' or 'GPU', optional)
- used_ram_limit: RAM usage limit (string, optional)
- random_seed: Random seed (int, optional)
Returns:
Pool: Quantized Pool object
"""
def create_cd(label=None, cat_features=None, text_features=None,
embedding_features=None, weight=None, baseline=None,
doc_id=None, group_id=None, subgroup_id=None,
timestamp=None, auxiliary_columns=None, feature_names=None,
output_path='train.cd'):
"""
Create column description file for CatBoost data loading.
Parameters:
- label: Label column index (int, optional)
- cat_features: Categorical feature column indices (list of int, optional)
- text_features: Text feature column indices (list of int, optional)
- embedding_features: Embedding feature column indices (list of int, optional)
- weight: Weight column index (int, optional)
- baseline: Baseline column index (int, optional)
- doc_id: Document ID column index (int, optional)
- group_id: Group ID column index (int, optional)
- subgroup_id: Subgroup ID column index (int, optional)
- timestamp: Timestamp column index (int, optional)
- auxiliary_columns: Auxiliary column indices (list of int, optional)
- feature_names: Feature names (list of str, optional)
- output_path: Output file path (string)
"""
def read_cd(cd_file, column_count=None, data_file=None, canonize_column_types=False):
"""
Read column description file.
Parameters:
- cd_file: Path to column description file (string)
- column_count: Number of columns expected (int, optional)
- data_file: Path to data file for validation (string, optional)
- canonize_column_types: Whether to canonize column types (bool)
Returns:
dict: Column description information
"""

Other utility functions for advanced use cases.
def compute_wx_test():
"""
Compute Wilcoxon test statistic.
"""
class TargetStats:
"""
Target statistics computation class.
"""
class DataMetaInfo:
"""
Data metadata information class.
"""
def compute_training_options():
"""
Compute training options and parameters.
"""

from catboost import CatBoostClassifier, Pool
from catboost.utils import eval_metric, get_confusion_matrix, get_roc_curve
# Train model
model = CatBoostClassifier(iterations=100, verbose=False)
model.fit(X_train, y_train)
# Create test pool with labels for evaluation
test_pool = Pool(X_test, y_test, cat_features=['category'])
# Get predictions
predictions = model.predict(test_pool)
probabilities = model.predict_proba(test_pool)[:, 1] # Positive class probabilities
# Evaluate various metrics
accuracy = eval_metric(y_test, predictions, 'Accuracy')
auc = eval_metric(y_test, probabilities, 'AUC')
logloss = eval_metric(y_test, probabilities, 'Logloss')
print(f"Accuracy: {accuracy:.4f}")
print(f"AUC: {auc:.4f}")
print(f"LogLoss: {logloss:.4f}")
# Get confusion matrix
conf_matrix = get_confusion_matrix(model, test_pool)
print("Confusion Matrix:")
print(conf_matrix)
# Get ROC curve data
fpr, tpr, thresholds = get_roc_curve(model, test_pool, plot=True)
print(f"ROC curve computed with {len(thresholds)} thresholds")

from catboost.utils import select_threshold, get_fpr_curve, get_fnr_curve
# Get FPR and FNR curves
thresholds_fpr, fpr_values = get_fpr_curve(model, test_pool)
thresholds_fnr, fnr_values = get_fnr_curve(model, test_pool)
# Select threshold for specific FPR constraint
threshold_fpr = select_threshold(model, test_pool, FPR=0.05) # 5% FPR
print(f"Threshold for 5% FPR: {threshold_fpr:.4f}")
# Select threshold for specific FNR constraint
threshold_fnr = select_threshold(model, test_pool, FNR=0.10) # 10% FNR
print(f"Threshold for 10% FNR: {threshold_fnr:.4f}")
# Apply optimal threshold to predictions
optimal_predictions = (probabilities > threshold_fpr).astype(int)
optimal_accuracy = eval_metric(y_test, optimal_predictions, 'Accuracy')
print(f"Accuracy with optimal threshold: {optimal_accuracy:.4f}")

from catboost.utils import convert_to_onnx_object
import onnx
# Export model to ONNX format
onnx_model = convert_to_onnx_object(model, export_parameters={
'onnx_domain': 'ai.catboost',
'onnx_model_version': 1,
'onnx_doc_string': 'CatBoost classifier model',
'onnx_graph_name': 'CatBoostModel'
})
# Save ONNX model
onnx.save(onnx_model, 'model.onnx')
print("Model exported to ONNX format")
# Export as Python code
model.save_model('model.py', format='python')
print("Model exported as Python code")
# Export as C++ code
model.save_model('model.cpp', format='cpp')
print("Model exported as C++ code")

from catboost.utils import get_gpu_device_count
from catboost import CatBoostRegressor
# Check GPU availability
gpu_count = get_gpu_device_count()
print(f"Available GPU devices: {gpu_count}")
if gpu_count > 0:
# Train model on GPU
gpu_model = CatBoostRegressor(
iterations=500,
task_type='GPU',
devices='0', # Use first GPU
verbose=False
)
gpu_model.fit(X_train, y_train)
print("Model trained on GPU")
else:
print("No GPU devices available, using CPU")

from catboost.utils import calculate_quantization_grid, quantize
from catboost import Pool
import numpy as np
# Calculate custom quantization grid
feature_values = X_train.iloc[:, 0].values # First feature
custom_borders = calculate_quantization_grid(
values=feature_values,
border_count=64,
border_type='GreedyLogSum'
)
print(f"Custom quantization borders: {len(custom_borders)} borders")
print(f"Border range: [{custom_borders[0]:.4f}, {custom_borders[-1]:.4f}]")
# Create and quantize pool.
# NOTE: the standalone utils.quantize() documented above loads data from a
# file path; for an in-memory Pool, use the Pool.quantize() method, which
# quantizes the pool in place.
train_pool = Pool(X_train, y_train, cat_features=['category'])
train_pool.quantize(
    border_count=128,
    feature_border_type='GreedyLogSum',
    task_type='CPU'
)
print("Pool quantized successfully")
print(f"Pool is quantized: {train_pool.is_quantized()}")

import time
import psutil
import numpy as np
def benchmark_prediction(model, data, num_runs=100):
"""Custom benchmarking function."""
times = []
for _ in range(num_runs):
start_time = time.time()
predictions = model.predict(data)
end_time = time.time()
times.append(end_time - start_time)
avg_time = np.mean(times)
std_time = np.std(times)
predictions_per_second = len(data) / avg_time
return {
'avg_time': avg_time,
'std_time': std_time,
'predictions_per_second': predictions_per_second,
'num_predictions': len(data)
}
# Benchmark model performance
benchmark_results = benchmark_prediction(model, X_test, num_runs=50)
print(f"Average prediction time: {benchmark_results['avg_time']:.6f} seconds")
print(f"Standard deviation: {benchmark_results['std_time']:.6f} seconds")
print(f"Predictions per second: {benchmark_results['predictions_per_second']:.0f}")
print(f"Processed {benchmark_results['num_predictions']} samples")

Install with Tessl CLI
npx tessl i tessl/pypi-catboost