tessl/pypi-catboost

CatBoost is a fast, scalable, high-performance library for gradient boosting on decision trees, used for ranking, classification, regression, and other ML tasks.

docs/core-models.md

Core Model Classes

Scikit-learn compatible model classes that provide the main interfaces for CatBoost gradient boosting. These classes handle classification, regression, and ranking tasks with comprehensive parameter configuration and training options.

Capabilities

CatBoost Base Class

The foundational model class providing core gradient boosting functionality with training, prediction, feature importance, and model persistence methods.

class CatBoost:
    def __init__(self, params=None):
        """
        Initialize CatBoost model with parameters.
        
        Parameters:
        - params (dict): Model parameters
        """

    def fit(self, X, y=None, cat_features=None, text_features=None, 
            embedding_features=None, pairs=None, graph=None, sample_weight=None, 
            group_id=None, group_weight=None, subgroup_id=None, pairs_weight=None, 
            baseline=None, use_best_model=None, eval_set=None, verbose=None, 
            logging_level=None, plot=False, plot_file=None, early_stopping_rounds=None, 
            save_snapshot=None, snapshot_file=None, snapshot_interval=600, 
            init_model=None):
        """
        Train the CatBoost model.
        
        Parameters:
        - X: Training data (Pool, list, numpy.ndarray, pandas.DataFrame, pandas.Series, FeaturesData, or file path)
        - y: Target values (array-like)
        - cat_features: Categorical feature column indices or names
        - text_features: Text feature column indices or names  
        - embedding_features: Embedding feature column indices or names
        - pairs: Pairs for ranking (array-like)
        - graph: Graph for collaborative filtering
        - sample_weight: Sample weights
        - group_id: Group identifiers for ranking
        - group_weight: Group weights
        - subgroup_id: Subgroup identifiers
        - pairs_weight: Pairs weights
        - baseline: Baseline values
        - use_best_model: Use best model from evaluation
        - eval_set: Evaluation datasets [(X, y), ...]
        - verbose: Verbosity level
        - logging_level: Logging level
        - plot: Enable plotting
        - plot_file: Plot output file
        - early_stopping_rounds: Early stopping rounds
        - save_snapshot: Save training snapshots
        - snapshot_file: Snapshot file name
        - snapshot_interval: Snapshot interval in seconds
        - init_model: Initial model for continued training
        
        Returns:
        Self
        """

    def predict(self, data, prediction_type='RawFormulaVal', ntree_start=0, 
               ntree_end=0, thread_count=-1, verbose=None, task_type='CPU'):
        """
        Make predictions on data.
        
        Parameters:
        - data: Input data (Pool or array-like)
        - prediction_type: Type of prediction ('RawFormulaVal', 'Class', 'Probability')
        - ntree_start: Start tree index
        - ntree_end: End tree index (0 means use all trees)
        - thread_count: Number of threads
        - verbose: Verbosity level
        - task_type: Task type ('CPU' or 'GPU')
        
        Returns:
        numpy.ndarray: Predictions
        """

    def get_feature_importance(self, data=None, type='FeatureImportance', 
                              prettified=False, thread_count=-1, shap_mode=None, 
                              interaction_indices=None, shap_calc_type='Regular', 
                              model_output_type='RawFormulaVal', **kwargs):
        """
        Calculate feature importance.
        
        Parameters:
        - data: Data for importance calculation (Pool or array-like)
        - type: Importance type (EFstrType enum value)
        - prettified: Return prettified DataFrame
        - thread_count: Number of threads
        - shap_mode: SHAP calculation mode
        - interaction_indices: Feature indices for interaction
        - shap_calc_type: SHAP calculation type
        - model_output_type: Model output type
        
        Returns:
        numpy.ndarray or pandas.DataFrame: Feature importance values
        """

    def get_object_importance(self, pool, train_pool, top_size=-1, 
                             type='Average', update_method='SinglePoint', 
                             importance_values_sign='All', thread_count=-1):
        """
        Calculate object importance (leaf influence).
        
        Parameters:
        - pool: Pool for importance calculation
        - train_pool: Training pool
        - top_size: Number of top important objects (-1 for all)
        - type: Importance type ('Average', 'PerObject')
        - update_method: Update method ('SinglePoint', 'TopKLeaves', 'AllPoints')
        - importance_values_sign: Values sign ('All', 'Positive', 'Negative')
        - thread_count: Number of threads
        
        Returns:
        numpy.ndarray: Object importance values
        """

    def save_model(self, fname, format='cbm', export_parameters=None, pool=None):
        """
        Save model to file.
        
        Parameters:
        - fname: File name or file-like object
        - format: Model format ('cbm', 'json', 'onnx', 'pmml', 'python', 'cpp')
        - export_parameters: Export parameters for specific formats
        - pool: Pool for ONNX export
        """

    def load_model(self, fname=None, format='cbm', stream=None, blob=None):
        """
        Load model from file.
        
        Parameters:
        - fname: File name
        - format: Model format
        - stream: Input stream
        - blob: Model blob data
        """

    def copy(self):
        """Create a copy of the model."""

    def get_params(self, deep=True):
        """Get model parameters."""

    def set_params(self, **params):
        """Set model parameters."""

CatBoostClassifier

Scikit-learn compatible classifier with binary and multi-class classification support, including probability predictions and class-specific methods.

class CatBoostClassifier(CatBoost):
    def __init__(self, iterations=500, learning_rate=None, depth=6, l2_leaf_reg=3.0, 
                 model_size_reg=0.5, rsm=1.0, loss_function='Logloss', 
                 border_count=254, feature_border_type='GreedyLogSum', 
                 per_float_feature_quantization=None, input_borders=None, 
                 output_borders=None, fold_permutation_block=1, 
                 od_pval=0.001, od_wait=20, od_type='IncToDec', nan_mode='Min', 
                 counter_calc_method='SkipTest', leaf_estimation_iterations=None, 
                 leaf_estimation_method='Newton', thread_count=-1, 
                 random_seed=None, use_best_model=None, best_model_min_trees=1, 
                 verbose=None, silent=None, logging_level=None, metric_period=1, 
                 ctr_leaf_count_limit=None, store_all_simple_ctr=None, 
                 max_ctr_complexity=4, has_time=False, allow_const_label=None, 
                 target_border=None, classes_count=None, class_weights=None, 
                 auto_class_weights=None, class_names=None, one_hot_max_size=None, 
                 random_strength=1.0, name='experiment', ignored_features=None, 
                 train_dir=None, custom_loss=None, custom_metric=None, 
                 eval_metric=None, bagging_temperature=1.0, save_snapshot=None, 
                 snapshot_file=None, snapshot_interval=600, fold_len_multiplier=2.0, 
                 used_ram_limit='1gb', gpu_ram_part=0.95, pinned_memory_size='104857600', 
                 allow_writing_files=True, final_ctr_computation_mode='Default', 
                 approx_on_full_history=False, boosting_type=None, simple_ctr=None, 
                 combinations_ctr=None, per_feature_ctr=None, ctr_description=None, 
                 ctr_target_border_count=None, task_type=None, device_config=None, 
                 devices=None, bootstrap_type=None, subsample=None, 
                 sampling_unit='Object', dev_score_calc_obj_block_size=None, 
                 max_depth=None, grow_policy='SymmetricTree', min_data_in_leaf=1, 
                 max_leaves=31, num_boost_round=None, feature_weights=None, 
                 penalties_coefficient=1.0, first_feature_use_penalties=None, 
                 model_shrink_rate=None, model_shrink_mode=None, langevin=False, 
                 diffusion_temperature=10000.0, posterior_sampling=False, 
                 boost_from_average=None, text_features=None, 
                 tokenizers=None, dictionaries=None, feature_calcers=None, 
                 text_processing=None, embedding_features=None, **kwargs):
        """
        Initialize CatBoost classifier.
        
        Key Parameters:
        - iterations (int): Number of boosting iterations (default: 500)
        - learning_rate (float): Learning rate (default: auto-calculated)
        - depth (int): Tree depth (default: 6)
        - l2_leaf_reg (float): L2 regularization coefficient (default: 3.0)
        - loss_function (str): Loss function ('Logloss', 'CrossEntropy', 'MultiClass', 'MultiClassOneVsAll')
        - class_weights (list/dict): Class weights for imbalanced datasets
        - auto_class_weights (str): Automatic class weight calculation ('Balanced', 'SqrtBalanced')
        - eval_metric (str): Evaluation metric ('Logloss', 'AUC', 'Accuracy', 'Precision', 'Recall', 'F1')
        - early_stopping_rounds (int): Early stopping rounds
        - task_type (str): Task type ('CPU' or 'GPU')
        - verbose (bool/int): Verbosity level
        """

    def fit(self, X, y, cat_features=None, text_features=None, 
            embedding_features=None, graph=None, sample_weight=None, 
            baseline=None, use_best_model=None, eval_set=None, verbose=None, 
            logging_level=None, plot=False, plot_file=None, 
            early_stopping_rounds=None, save_snapshot=None, snapshot_file=None, 
            snapshot_interval=600, init_model=None):
        """
        Train the classifier.
        
        Parameters: same as CatBoost.fit(), minus the ranking-specific arguments (pairs, group_id, group_weight, subgroup_id, pairs_weight)
        
        Returns:
        Self
        """

    def predict(self, data, prediction_type='Class', ntree_start=0, ntree_end=0, 
               thread_count=-1, verbose=None, task_type='CPU'):
        """
        Predict class labels.
        
        Parameters:
        - data: Input data
        - prediction_type: 'Class' for class labels, 'RawFormulaVal' for raw values
        
        Returns:
        numpy.ndarray: Predicted class labels
        """

    def predict_proba(self, X, ntree_start=0, ntree_end=0, thread_count=-1, 
                     verbose=None, task_type='CPU'):
        """
        Predict class probabilities.
        
        Parameters:
        - X: Input data
        - ntree_start: Start tree index  
        - ntree_end: End tree index
        - thread_count: Number of threads
        - verbose: Verbosity level
        - task_type: Task type
        
        Returns:
        numpy.ndarray: Class probabilities (n_samples, n_classes)
        """

    def predict_log_proba(self, data, ntree_start=0, ntree_end=0, thread_count=-1, 
                         verbose=None, task_type='CPU'):
        """
        Predict logarithm of class probabilities.
        
        Returns:
        numpy.ndarray: Log probabilities
        """

    def staged_predict(self, data, prediction_type='Class', ntree_start=0, 
                      ntree_end=0, eval_period=1, thread_count=-1, verbose=None):
        """
        Predict for each stage of boosting.
        
        Returns:
        generator: Predictions for each boosting iteration
        """

    def staged_predict_proba(self, data, ntree_start=0, ntree_end=0, eval_period=1, 
                           thread_count=-1, verbose=None):
        """
        Predict probabilities for each stage of boosting.
        
        Returns:
        generator: Probabilities for each boosting iteration
        """

    @property
    def classes_(self):
        """Get class labels."""

    @property  
    def feature_importances_(self):
        """Get feature importances (scikit-learn compatibility)."""

CatBoostRegressor

Scikit-learn compatible regressor supporting various loss functions for different regression tasks including standard regression, quantile regression, and survival analysis.

class CatBoostRegressor(CatBoost):
    def __init__(self, iterations=500, learning_rate=None, depth=6, l2_leaf_reg=3.0,
                 model_size_reg=0.5, rsm=1.0, loss_function='RMSE', 
                 border_count=128, feature_border_type='GreedyLogSum',
                 # ... (same parameters as CatBoostClassifier except loss-specific ones)
                 **kwargs):
        """
        Initialize CatBoost regressor.
        
        Key Parameters:
        - loss_function (str): Loss function ('RMSE', 'MAE', 'Quantile:alpha=0.5', 
                              'LogLinQuantile:alpha=0.5', 'Poisson', 'MAPE', 
                              'Lq:q=2', 'SurvivalAft:dist=Normal;scale=1.0')
        - eval_metric (str): Evaluation metric ('RMSE', 'MAE', 'R2', 'MSLE', 'MedianAbsoluteError')
        """

    def fit(self, X, y, **kwargs):
        """Train the regressor. Same interface as CatBoost.fit()."""

    def predict(self, data, **kwargs):
        """
        Predict target values.
        
        Returns:
        numpy.ndarray: Predicted values
        """

    def staged_predict(self, data, **kwargs):
        """
        Predict for each stage of boosting.
        
        Returns:
        generator: Predictions for each boosting iteration
        """

    @property
    def feature_importances_(self):
        """Get feature importances (scikit-learn compatibility)."""

CatBoostRanker

Scikit-learn compatible ranker for learning-to-rank tasks with support for various ranking loss functions and group-based evaluation.

class CatBoostRanker(CatBoost):
    def __init__(self, iterations=500, learning_rate=None, depth=6, l2_leaf_reg=3.0,
                 model_size_reg=0.5, rsm=1.0, loss_function='YetiRank',
                 # ... (same parameters as other CatBoost classes)
                 **kwargs):
        """
        Initialize CatBoost ranker.
        
        Key Parameters:
        - loss_function (str): Ranking loss function ('YetiRank', 'YetiRankPairwise',
                              'StochasticFilter', 'StochasticRank', 'QueryCrossEntropy',
                              'QueryRMSE', 'GroupQuantile:alpha=0.5', 'QuerySoftMax',
                              'PairLogit', 'PairLogitPairwise')
        - eval_metric (str): Ranking evaluation metric ('NDCG', 'DCG', 'MAP', 'MRR', 'ERR')
        """

    def fit(self, X, y, group_id=None, **kwargs):
        """
        Train the ranker.
        
        Parameters: same as CatBoost.fit()
        - group_id: Group (query) identifiers; required by most ranking loss functions
        """

    def predict(self, data, **kwargs):
        """
        Predict ranking scores.
        
        Returns:
        numpy.ndarray: Ranking scores
        """

    def staged_predict(self, data, **kwargs):
        """
        Predict ranking scores for each stage of boosting.
        
        Returns:
        generator: Ranking scores for each boosting iteration  
        """

    @property
    def feature_importances_(self):
        """Get feature importances (scikit-learn compatibility)."""

Model Conversion Functions

def to_classifier(model):
    """
    Convert CatBoost model to classifier.
    
    Parameters:
    - model: CatBoost model
    
    Returns:
    CatBoostClassifier: Converted classifier
    """

def to_regressor(model):
    """
    Convert CatBoost model to regressor.
    
    Parameters:
    - model: CatBoost model
    
    Returns:
    CatBoostRegressor: Converted regressor
    """

def to_ranker(model):
    """
    Convert CatBoost model to ranker.
    
    Parameters:
    - model: CatBoost model
    
    Returns:
    CatBoostRanker: Converted ranker
    """

Install with Tessl CLI

npx tessl i tessl/pypi-catboost
