CatBoost is a fast, scalable, high performance gradient boosting on decision trees library used for ranking, classification, regression and other ML tasks.
—
Quality: Pending — Does it follow best practices?
Impact: Pending — No eval scenarios have been run.
CatBoost provides extensive utility functions for model evaluation, GPU management, metric calculation, data conversion, and model export. These utilities enhance the machine learning workflow with comprehensive analysis and deployment capabilities.
Comprehensive model evaluation tools including confusion matrices, ROC curves, and threshold optimization.
def eval_metric(label, approx, metric, weight=None, group_id=None,
group_weight=None, subgroup_id=None, pairs=None, thread_count=-1):
"""
Evaluate a metric on predictions.
Parameters:
- label: True target values (array-like)
- approx: Model predictions (array-like)
- metric: Metric name to evaluate (string)
Classification: 'Logloss', 'CrossEntropy', 'AUC', 'Accuracy', 'Precision', 'Recall', 'F1'
Regression: 'RMSE', 'MAE', 'R2', 'MSLE', 'MedianAbsoluteError', 'SMAPE', 'MAPE'
Ranking: 'NDCG', 'DCG', 'MAP', 'MRR', 'ERR'
- weight: Sample weights (array-like, optional)
- group_id: Group identifiers for ranking metrics (array-like, optional)
- group_weight: Group weights (array-like, optional)
- subgroup_id: Subgroup identifiers (array-like, optional)
- pairs: Pairs for ranking metrics (array-like, optional)
- thread_count: Number of threads for computation (int)
Returns:
float: Metric value
"""
def get_confusion_matrix(model, data, thread_count=-1):
"""
Calculate confusion matrix for classification model.
Parameters:
- model: Trained CatBoost classifier
- data: Input data (Pool or array-like)
- thread_count: Number of threads for computation (int)
Returns:
numpy.ndarray: Confusion matrix (n_classes, n_classes)
"""
def get_roc_curve(model, data, thread_count=-1, plot=False):
"""
Calculate ROC curve data for binary classification.
Parameters:
- model: Trained CatBoost classifier
- data: Input data with true labels (Pool)
- thread_count: Number of threads for computation (int)
- plot: Whether to plot the ROC curve (bool)
Returns:
tuple: (fpr, tpr, thresholds)
- fpr: False positive rates (numpy.ndarray)
- tpr: True positive rates (numpy.ndarray)
- thresholds: Decision thresholds (numpy.ndarray)
"""
def get_fpr_curve(model, data, curve=None, thread_count=-1, plot=False):
"""
Calculate False Positive Rate curve.
Parameters:
- model: Trained CatBoost classifier
- data: Input data with true labels (Pool)
- curve: Curve type (string, optional)
- thread_count: Number of threads for computation (int)
- plot: Whether to plot the curve (bool)
Returns:
tuple: (thresholds, fpr_values)
"""
def get_fnr_curve(model, data, curve=None, thread_count=-1, plot=False):
"""
Calculate False Negative Rate curve.
Parameters:
- model: Trained CatBoost classifier
- data: Input data with true labels (Pool)
- curve: Curve type (string, optional)
- thread_count: Number of threads for computation (int)
- plot: Whether to plot the curve (bool)
Returns:
tuple: (thresholds, fnr_values)
"""
def select_threshold(model, data, curve=None, FPR=None, FNR=None, thread_count=-1):
"""
Select optimal decision threshold based on FPR/FNR constraints.
Parameters:
- model: Trained CatBoost classifier
- data: Input data with true labels (Pool)
- curve: Curve type for threshold selection (string, optional)
- FPR: Target false positive rate (float, 0-1)
- FNR: Target false negative rate (float, 0-1)
- thread_count: Number of threads for computation (int)
Returns:
float: Optimal threshold value
"""

System information and GPU management functions.
def get_gpu_device_count():
"""
Get the number of available GPU devices.
Returns:
int: Number of GPU devices available for CatBoost
"""
def reset_trace_backend(filename):
"""
Reset trace backend with filename.
Parameters:
- filename: Path to trace file (string)
"""

Functions for exporting models to various formats for deployment.
def convert_to_onnx_object(model, export_parameters=None):
"""
Convert CatBoost model to ONNX format object.
Parameters:
- model: Trained CatBoost model
- export_parameters: Export configuration parameters (dict, optional)
- 'onnx_domain': ONNX domain name (string)
- 'onnx_model_version': Model version (int)
- 'onnx_doc_string': Documentation string (string)
- 'onnx_graph_name': Graph name (string)
Returns:
onnx.ModelProto: ONNX model object
"""

Utilities for data preprocessing, quantization, and format conversion.
def calculate_quantization_grid(values, border_count, border_type='Median'):
"""
Calculate quantization grid for numerical values.
Parameters:
- values: Input numerical values (array-like)
- border_count: Number of quantization borders (int)
- border_type: Border selection method (string)
- 'Median': Median-based borders
- 'Uniform': Uniformly spaced borders
- 'UniformAndQuantiles': Mix of uniform and quantile borders
- 'MaxLogSum': Maximum log sum borders
- 'MinEntropy': Minimum entropy borders
- 'GreedyLogSum': Greedy log sum borders
Returns:
numpy.ndarray: Quantization border values
"""
def quantize(data_path, column_description=None, pairs=None, graph=None,
delimiter='\t', has_header=False, ignore_csv_quoting=False,
feature_names=None, thread_count=-1, ignored_features=None,
per_float_feature_quantization=None, border_count=None,
max_bin=None, feature_border_type=None, nan_mode=None,
input_borders=None, task_type=None, used_ram_limit=None,
random_seed=None, **kwargs):
"""
Construct quantized Pool from non-quantized pool stored in file.
Parameters:
- data_path: Path to data file (string)
- column_description: Path to column description file (string, optional)
- pairs: Path to pairs file (string, optional)
- graph: Path to graph file (string, optional)
- delimiter: Delimiter used in data file (string)
- has_header: Whether file has header row (bool)
- ignore_csv_quoting: Ignore CSV quoting (bool)
- feature_names: Feature names (list, optional)
- thread_count: Number of threads (int)
- ignored_features: Indices of ignored features (list, optional)
- per_float_feature_quantization: Per-feature quantization settings (dict, optional)
- border_count: Number of borders for quantization (int, optional)
- max_bin: Maximum number of bins (int, optional)
- feature_border_type: Border type for features (string, optional)
- nan_mode: NaN handling mode (string, optional)
- input_borders: Input borders (dict, optional)
- task_type: Task type ('CPU' or 'GPU', optional)
- used_ram_limit: RAM usage limit (string, optional)
- random_seed: Random seed (int, optional)
Returns:
Pool: Quantized Pool object
"""
def create_cd(label=None, cat_features=None, text_features=None,
embedding_features=None, weight=None, baseline=None,
doc_id=None, group_id=None, subgroup_id=None,
timestamp=None, auxiliary_columns=None, feature_names=None,
output_path='train.cd'):
"""
Create column description file for CatBoost data loading.
Parameters:
- label: Label column index (int, optional)
- cat_features: Categorical feature column indices (list of int, optional)
- text_features: Text feature column indices (list of int, optional)
- embedding_features: Embedding feature column indices (list of int, optional)
- weight: Weight column index (int, optional)
- baseline: Baseline column index (int, optional)
- doc_id: Document ID column index (int, optional)
- group_id: Group ID column index (int, optional)
- subgroup_id: Subgroup ID column index (int, optional)
- timestamp: Timestamp column index (int, optional)
- auxiliary_columns: Auxiliary column indices (list of int, optional)
- feature_names: Feature names (list of str, optional)
- output_path: Output file path (string)
"""
def read_cd(cd_file, column_count=None, data_file=None, canonize_column_types=False):
"""
Read column description file.
Parameters:
- cd_file: Path to column description file (string)
- column_count: Number of columns expected (int, optional)
- data_file: Path to data file for validation (string, optional)
- canonize_column_types: Whether to canonize column types (bool)
Returns:
dict: Column description information
"""

Other utility functions for advanced use cases.
def compute_wx_test():
"""
Compute Wilcoxon test statistic.
"""
class TargetStats:
"""
Target statistics computation class.
"""
class DataMetaInfo:
"""
Data metadata information class.
"""
def compute_training_options():
"""
Compute training options and parameters.
"""

from catboost import CatBoostClassifier, Pool
from catboost.utils import eval_metric, get_confusion_matrix, get_roc_curve
# Train model
model = CatBoostClassifier(iterations=100, verbose=False)
model.fit(X_train, y_train)
# Create test pool with labels for evaluation
test_pool = Pool(X_test, y_test, cat_features=['category'])
# Get predictions
predictions = model.predict(test_pool)
probabilities = model.predict_proba(test_pool)[:, 1] # Positive class probabilities
# Evaluate various metrics
accuracy = eval_metric(y_test, predictions, 'Accuracy')
auc = eval_metric(y_test, probabilities, 'AUC')
logloss = eval_metric(y_test, probabilities, 'Logloss')
print(f"Accuracy: {accuracy:.4f}")
print(f"AUC: {auc:.4f}")
print(f"LogLoss: {logloss:.4f}")
# Get confusion matrix
conf_matrix = get_confusion_matrix(model, test_pool)
print("Confusion Matrix:")
print(conf_matrix)
# Get ROC curve data
fpr, tpr, thresholds = get_roc_curve(model, test_pool, plot=True)
print(f"ROC curve computed with {len(thresholds)} thresholds")

from catboost.utils import select_threshold, get_fpr_curve, get_fnr_curve
# Get FPR and FNR curves
thresholds_fpr, fpr_values = get_fpr_curve(model, test_pool)
thresholds_fnr, fnr_values = get_fnr_curve(model, test_pool)
# Select threshold for specific FPR constraint
threshold_fpr = select_threshold(model, test_pool, FPR=0.05) # 5% FPR
print(f"Threshold for 5% FPR: {threshold_fpr:.4f}")
# Select threshold for specific FNR constraint
threshold_fnr = select_threshold(model, test_pool, FNR=0.10) # 10% FNR
print(f"Threshold for 10% FNR: {threshold_fnr:.4f}")
# Apply optimal threshold to predictions
optimal_predictions = (probabilities > threshold_fpr).astype(int)
optimal_accuracy = eval_metric(y_test, optimal_predictions, 'Accuracy')
print(f"Accuracy with optimal threshold: {optimal_accuracy:.4f}")

from catboost.utils import convert_to_onnx_object
import onnx
# Export model to ONNX format
onnx_model = convert_to_onnx_object(model, export_parameters={
'onnx_domain': 'ai.catboost',
'onnx_model_version': 1,
'onnx_doc_string': 'CatBoost classifier model',
'onnx_graph_name': 'CatBoostModel'
})
# Save ONNX model
onnx.save(onnx_model, 'model.onnx')
print("Model exported to ONNX format")
# Export as Python code
model.save_model('model.py', format='python')
print("Model exported as Python code")
# Export as C++ code
model.save_model('model.cpp', format='cpp')
print("Model exported as C++ code")

from catboost.utils import get_gpu_device_count
from catboost import CatBoostRegressor
# Check GPU availability
gpu_count = get_gpu_device_count()
print(f"Available GPU devices: {gpu_count}")
if gpu_count > 0:
# Train model on GPU
gpu_model = CatBoostRegressor(
iterations=500,
task_type='GPU',
devices='0', # Use first GPU
verbose=False
)
gpu_model.fit(X_train, y_train)
print("Model trained on GPU")
else:
print("No GPU devices available, using CPU")

from catboost.utils import calculate_quantization_grid, quantize
from catboost import Pool
import numpy as np
# Calculate custom quantization grid
feature_values = X_train.iloc[:, 0].values # First feature
custom_borders = calculate_quantization_grid(
values=feature_values,
border_count=64,
border_type='GreedyLogSum'
)
print(f"Custom quantization borders: {len(custom_borders)} borders")
print(f"Border range: [{custom_borders[0]:.4f}, {custom_borders[-1]:.4f}]")
# Create and quantize pool.
# NOTE: the standalone utils.quantize() documented above loads data from a
# file path; for an in-memory Pool, use the Pool.quantize() method, which
# quantizes the pool in place.
train_pool = Pool(X_train, y_train, cat_features=['category'])
train_pool.quantize(
    border_count=128,
    feature_border_type='GreedyLogSum',
    task_type='CPU'
)
print("Pool quantized successfully")
print(f"Pool is quantized: {train_pool.is_quantized()}")

import time
import psutil
import numpy as np
def benchmark_prediction(model, data, num_runs=100):
"""Custom benchmarking function."""
times = []
for _ in range(num_runs):
start_time = time.time()
predictions = model.predict(data)
end_time = time.time()
times.append(end_time - start_time)
avg_time = np.mean(times)
std_time = np.std(times)
predictions_per_second = len(data) / avg_time
return {
'avg_time': avg_time,
'std_time': std_time,
'predictions_per_second': predictions_per_second,
'num_predictions': len(data)
}
# Benchmark model performance
benchmark_results = benchmark_prediction(model, X_test, num_runs=50)
print(f"Average prediction time: {benchmark_results['avg_time']:.6f} seconds")
print(f"Standard deviation: {benchmark_results['std_time']:.6f} seconds")
print(f"Predictions per second: {benchmark_results['predictions_per_second']:.0f}")
print(f"Processed {benchmark_results['num_predictions']} samples")

Install with Tessl CLI
npx tessl i tessl/pypi-catboost