CtrlK
BlogDocsLog inGet started
Tessl Logo

tessl/pypi-catboost

CatBoost is a fast, scalable, high performance gradient boosting on decision trees library used for ranking, classification, regression and other ML tasks.

Pending

Quality

Pending

Does it follow best practices?

Impact

Pending

No eval scenarios have been run

Overview
Eval results
Files

docs/advanced-features.md

Advanced Features

CatBoost provides specialized features for advanced use cases including text processing, monoforest model interpretation, custom metrics and objectives, and evaluation frameworks. These capabilities extend CatBoost's functionality for specialized domains and research applications.

Capabilities

Custom Metrics and Objectives

Base classes for implementing custom loss functions and evaluation metrics for specialized machine learning tasks.

class MultiRegressionCustomMetric:
    """
    Base class for user-defined evaluation metrics on multi-output regression.

    Subclasses override ``is_max_optimal`` and ``evaluate`` to plug a
    domain-specific metric into CatBoost training and validation.
    """
    
    def __init__(self):
        """Create the metric; the base class keeps no state."""
        pass
    
    def is_max_optimal(self):
        """
        Report the optimization direction of the metric.
        
        Returns:
        bool: True when larger metric values are better, False otherwise
        """
        raise NotImplementedError()
    
    def evaluate(self, approxes, target, weight):
        """
        Compute the metric for a batch of predictions.
        
        Parameters:
        - approxes: Model predictions (list of numpy.ndarray, one per target)
        - target: Ground-truth values (numpy.ndarray)
        - weight: Per-sample weights (numpy.ndarray, optional)
        
        Returns:
        tuple: (metric_value, weight_sum)
            - metric_value: Computed metric value (float)
            - weight_sum: Total weight that contributed (float)
        """
        raise NotImplementedError()
    
    def get_final_error(self, error, weight):
        """
        Reduce accumulated error and weight to the final metric value.
        
        Parameters:
        - error: Accumulated error value (float)
        - weight: Accumulated weight (float)
        
        Returns:
        float: error / weight, or 0 when no weight was accumulated
        """
        # Guard against division by zero when nothing was accumulated.
        if weight != 0:
            return error / weight
        return 0

class MultiRegressionCustomObjective:
    """
    Base class for user-defined loss functions on multi-output regression.

    Subclasses implement ``calc_ders_range`` to supply the derivatives that
    drive gradient boosting for a specialized objective.
    """
    
    def __init__(self):
        """Create the objective; the base class keeps no state."""
        pass
    
    def calc_ders_range(self, approxes, targets, weights):
        """
        Return first and second derivatives of the loss function.
        
        Parameters:
        - approxes: Current model predictions (list of numpy.ndarray)
        - targets: Ground-truth values (numpy.ndarray)
        - weights: Per-sample weights (numpy.ndarray)
        
        Returns:
        tuple: (first_derivatives, second_derivatives)
            - first_derivatives: First derivatives (list of numpy.ndarray)
            - second_derivatives: Second derivatives (list of numpy.ndarray)
        """
        raise NotImplementedError()

# Type aliases for multi-target scenarios: the "MultiTarget" names are
# alternate spellings of the same base classes, kept so either name works.
MultiTargetCustomMetric = MultiRegressionCustomMetric
MultiTargetCustomObjective = MultiRegressionCustomObjective

Text Processing

Specialized classes for handling text features within CatBoost's gradient boosting framework.

class Tokenizer:
    """
    Text tokenization utility for preprocessing text features in CatBoost.
    
    Provides various tokenization strategies optimized for gradient boosting
    on text data, with support for different languages and text types.
    """
    
    def __init__(self, tokenizer_id='Space', separator_type='ByDelimiter', 
                 delimiter=' ', **kwargs):
        """
        Initialize text tokenizer.
        
        Parameters:
        - tokenizer_id: Tokenizer type ('Space', 'SentencePiece', 'Regexp')
        - separator_type: How to separate tokens ('ByDelimiter', 'BySeparator')
        - delimiter: Token delimiter character (string)
        - kwargs: Additional tokenizer-specific parameters
        """
        # NOTE: the original docstring spelled 'SentencePiece' with a Cyrillic
        # 'с' (U+0441), which would break copy-pasted configuration values.
        self.tokenizer_id = tokenizer_id
        self.separator_type = separator_type
        self.delimiter = delimiter
    
    def tokenize(self, text):
        """
        Tokenize input text.
        
        Parameters:
        - text: Input text string
        
        Returns:
        list: List of tokens
        """
        # Implementation depends on tokenizer type
        pass

class Dictionary:
    """
    Vocabulary builder for CatBoost text-feature processing.

    Builds and manages token vocabularies, supporting frequency-based
    filtering and domain-specific dictionaries.
    """
    
    def __init__(self, dictionary_id='Word', max_dictionary_size=50000,
                 occurrence_lower_bound=1, **kwargs):
        """
        Create a dictionary builder.
        
        Parameters:
        - dictionary_id: Dictionary identifier (string)
        - max_dictionary_size: Cap on the vocabulary size (int)
        - occurrence_lower_bound: Minimum token frequency to keep (int)
        - kwargs: Additional dictionary parameters
        """
        self.occurrence_lower_bound = occurrence_lower_bound
        self.max_dictionary_size = max_dictionary_size
        self.dictionary_id = dictionary_id
    
    def build(self, texts):
        """
        Build the vocabulary from a text corpus.
        
        Parameters:
        - texts: List of text documents
        
        Returns:
        Dictionary object ready for use in text processing
        """
        return None
    
    def get_dictionary_info(self):
        """
        Describe the built dictionary.
        
        Returns:
        dict: Dictionary statistics including size and coverage
        """
        return None

Monoforest Interpretation

Tools for interpreting monotonic forest models and converting them to polynomial representations.

def to_polynom(model):
    """
    Build a polynomial representation of a monoforest model.
    
    Parameters:
    - model: Trained CatBoost model with monotonic constraints
    
    Returns:
    Polynomial form of the model, suitable for analyzing and interpreting
    feature relationships.
    """
    return None

def to_polynom_string(model):
    """
    Render a monoforest model as a human-readable polynomial string.
    
    Parameters:
    - model: Trained CatBoost model with monotonic constraints
    
    Returns:
    string: Mathematical polynomial expression for the model
    """
    return None

def explain_features(model):
    """
    Produce per-feature explanations for a monoforest model.
    
    Parameters:
    - model: Trained CatBoost model with monotonic constraints
    
    Returns:
    FeatureExplanation: Detailed breakdown of feature contributions
    """
    return None

class FeatureExplanation:
    """
    Structured feature-explanation data extracted from a monoforest model.

    Holds per-feature effects, the monotonic constraints in force, and any
    recorded feature interactions.
    """
    
    def __init__(self):
        """Start with empty effect, constraint, and interaction tables."""
        self.feature_effects = {}
        self.monotonic_constraints = {}
        self.feature_interactions = {}
    
    def get_feature_effect(self, feature_idx):
        """
        Look up the effect description for a single feature.
        
        Parameters:
        - feature_idx: Feature index (int)
        
        Returns:
        dict: Effect information (direction, magnitude); empty if unknown
        """
        if feature_idx in self.feature_effects:
            return self.feature_effects[feature_idx]
        return {}
    
    def get_monotonic_constraints(self):
        """
        Report the monotonic constraints applied to features.
        
        Returns:
        dict: Feature indices mapped to constraint types (1, -1, or 0)
        """
        return self.monotonic_constraints
    
    def summary(self):
        """
        Produce a summary of the feature explanations.
        
        Returns:
        string: Human-readable summary of model feature behavior
        """
        return None

Advanced Evaluation Framework

Comprehensive evaluation system for complex model assessment scenarios.

class CatboostEvaluation:
    """
    Framework for rigorous, multi-case assessment of CatBoost models.

    Supports statistical testing, confidence intervals, and side-by-side
    comparison of models across different scenarios.
    """
    
    def __init__(self, eval_type='Classification', score_type='Logloss'):
        """
        Set up the evaluation framework.
        
        Parameters:
        - eval_type: Kind of task evaluated ('Classification', 'Regression', 'Ranking')
        - score_type: Primary scoring metric name (string)
        """
        self.score_type = score_type
        self.eval_type = eval_type
    
    def add_case(self, case_name, model, test_data, test_labels):
        """
        Register one evaluation case with the framework.
        
        Parameters:
        - case_name: Identifier for this evaluation case (string)
        - model: Trained CatBoost model
        - test_data: Test dataset (Pool or array-like)
        - test_labels: True labels (array-like)
        """
        return None
    
    def evaluate(self):
        """
        Run the full evaluation over every registered case.
        
        Returns:
        EvaluationResults: Detailed results with statistical tests
        """
        return None

class EvaluationResults:
    """Aggregated output of a CatboostEvaluation run."""
    
    def __init__(self):
        # Result tables populated by the evaluation framework.
        self.case_results = {}
        self.statistical_tests = {}
        self.confidence_intervals = {}
    
    def get_case_result(self, case_name):
        """Return the results recorded for a case, or None if absent."""
        if case_name in self.case_results:
            return self.case_results[case_name]
        return None
    
    def get_statistical_comparison(self, case1, case2):
        """Return the statistical comparison between two cases."""
        return None
    
    def summary_table(self):
        """Render a summary table covering every evaluation result."""
        return None

def calc_wilcoxon_test(scores1, scores2, alternative='two-sided'):
    """
    Run a Wilcoxon signed-rank test comparing two models' performance.
    
    Parameters:
    - scores1: Performance scores from the first model (array-like)
    - scores2: Performance scores from the second model (array-like)
    - alternative: Test alternative ('two-sided', 'less', 'greater')
    
    Returns:
    tuple: (statistic, p_value)
        - statistic: Test statistic (float)
        - p_value: P-value of the test (float)
    """
    return None

def calc_bootstrap_ci_for_mean(scores, confidence_level=0.95, num_bootstrap=10000):
    """
    Estimate a bootstrap confidence interval for mean performance.
    
    Parameters:
    - scores: Performance scores (array-like)
    - confidence_level: Confidence level (float, 0-1)
    - num_bootstrap: Number of bootstrap resamples (int)
    
    Returns:
    tuple: (lower_bound, upper_bound, mean_estimate)
        - lower_bound: Lower confidence bound (float)
        - upper_bound: Upper confidence bound (float)
        - mean_estimate: Bootstrap mean estimate (float)
    """
    return None

Advanced Features Examples

Custom Metric Implementation

from catboost import CatBoostRegressor
from catboost import MultiRegressionCustomMetric
import numpy as np

class MeanAbsolutePercentageError(MultiRegressionCustomMetric):
    """Custom MAPE (mean absolute percentage error) metric implementation."""
    
    def is_max_optimal(self):
        return False  # Lower MAPE is better
    
    def evaluate(self, approxes, target, weight):
        """Calculate MAPE over samples with non-zero targets.

        Parameters:
        - approxes: list of prediction arrays; approxes[0] holds the
          predictions for single-output regression
        - target: true target values (numpy.ndarray)
        - weight: per-sample weights (numpy.ndarray or None)

        Returns:
        tuple: (mape_value, weight_sum)
        """
        # approxes[0] contains predictions for single output regression
        predictions = approxes[0]
        
        # Percentage error is undefined at target == 0, so skip those rows.
        mask = target != 0
        if not np.any(mask):
            # No valid rows: report zero error with a weight consistent with
            # the weighting scheme (fix: the original always used len(target),
            # even when explicit sample weights were provided).
            weight_sum = float(np.sum(weight)) if weight is not None else len(target)
            return 0.0, weight_sum
        
        # Calculate MAPE only for non-zero targets
        ape = np.abs((target[mask] - predictions[mask]) / target[mask]) * 100
        
        if weight is not None:
            weight_sum = np.sum(weight[mask])
            mape = np.sum(ape * weight[mask]) / weight_sum if weight_sum > 0 else 0
        else:
            mape = np.mean(ape)
            weight_sum = len(ape)
        
        return mape, weight_sum

# Use custom metric
custom_mape = MeanAbsolutePercentageError()

model = CatBoostRegressor(
    iterations=200,
    eval_metric=custom_mape,  # Use custom metric for evaluation
    verbose=50
)

model.fit(X_train, y_train, eval_set=[(X_val, y_val)])
print("Model trained with custom MAPE metric")

Text Processing Configuration

from catboost import CatBoostClassifier, Pool
from catboost.text_processing import Tokenizer, Dictionary
import pandas as pd  # fix: pd is used below but was never imported

# Prepare text data
text_data = pd.DataFrame({
    'text_feature': [
        "This is a positive review",
        "Negative sentiment example", 
        "Another positive text sample",
        "Bad negative review text"
    ],
    'category': ['A', 'B', 'A', 'B'],
    'target': [1, 0, 1, 0]
})

# Create pool with text features
text_pool = Pool(
    data=text_data.drop('target', axis=1),
    label=text_data['target'],
    text_features=['text_feature'],
    cat_features=['category']
)

# Configure text processing: two tokenizers, two dictionaries (unigrams and
# bigrams), and bag-of-words + Naive Bayes feature calcers over both.
text_processing_config = {
    'tokenizers': [
        {
            'tokenizer_id': 'Space',
            'separator_type': 'ByDelimiter',
            'delimiter': ' '
        },
        {
            'tokenizer_id': 'SentencePiece',
            'number_of_tokens': 1000
        }
    ],
    'dictionaries': [
        {
            'dictionary_id': 'Word',
            'max_dictionary_size': 50000,
            'occurrence_lower_bound': 1
        },
        {
            'dictionary_id': 'Bigram', 
            'max_dictionary_size': 50000,
            'occurrence_lower_bound': 2,
            'gram_order': 2
        }
    ],
    'feature_processing': {
        'default': [
            {
                'dictionaries_names': ['Word', 'Bigram'],
                'feature_calcers': ['BoW', 'NaiveBayes'],
                'tokenizers_names': ['Space']
            }
        ]
    }
}

# Train model with text processing
text_model = CatBoostClassifier(
    iterations=100,
    text_processing=text_processing_config,
    verbose=50
)

text_model.fit(text_pool)
print("Text classification model trained")

# Get text feature importance
text_importance = text_model.get_feature_importance(prettified=True)
print("Text feature importance calculated")

Monoforest Model Interpretation

from catboost import CatBoostRegressor
from catboost.monoforest import to_polynom_string, explain_features
import numpy as np
import pandas as pd  # fix: pd is used below but was never imported

# Create synthetic data with known monotonic relationships
np.random.seed(42)
n_samples = 1000

X_mono = pd.DataFrame({
    'increasing_feature': np.random.uniform(0, 10, n_samples),
    'decreasing_feature': np.random.uniform(0, 5, n_samples),
    'neutral_feature': np.random.uniform(-2, 2, n_samples)
})

# Create target with monotonic relationships
y_mono = (
    2 * X_mono['increasing_feature'] +          # Positive monotonic
    -1.5 * X_mono['decreasing_feature'] +       # Negative monotonic  
    0.5 * X_mono['neutral_feature']**2 +        # Non-monotonic
    np.random.normal(0, 0.5, n_samples)         # Noise
)

# Train model with monotonic constraints
mono_model = CatBoostRegressor(
    iterations=200,
    depth=4,
    monotone_constraints=[1, -1, 0],  # +1: increasing, -1: decreasing, 0: no constraint
    verbose=50
)

mono_model.fit(X_mono, y_mono)

# Convert to polynomial representation
try:
    poly_string = to_polynom_string(mono_model)
    print("Polynomial representation:")
    print(poly_string)
except Exception:  # fix: bare except also swallowed KeyboardInterrupt/SystemExit
    print("Polynomial conversion not available for this model type")

# Get feature explanations
try:
    explanations = explain_features(mono_model)
    print("\nFeature explanations:")
    print(explanations.summary())
except Exception:  # fix: narrowed from a bare except
    print("Feature explanations not available")

# Verify monotonic behavior: sweep one feature while holding the others fixed.
# Predict the whole grid in one batched call instead of 100 one-row calls.
test_values = np.linspace(0, 10, 100)

grid_increasing = pd.DataFrame({
    'increasing_feature': test_values,
    'decreasing_feature': np.full(test_values.shape, 2.5),  # Fixed value
    'neutral_feature': np.zeros(test_values.shape)          # Fixed value
})
predictions_increasing = mono_model.predict(grid_increasing)

grid_decreasing = pd.DataFrame({
    'increasing_feature': np.full(test_values.shape, 5.0),  # Fixed value
    'decreasing_feature': test_values,
    'neutral_feature': np.zeros(test_values.shape)          # Fixed value
})
predictions_decreasing = mono_model.predict(grid_decreasing)

# Check monotonicity: a constrained feature should produce no steps in the
# wrong direction across the sweep.
increasing_diff = np.diff(predictions_increasing) 
decreasing_diff = np.diff(predictions_decreasing)

print(f"\nMonotonic constraint verification:")
print(f"Increasing feature violations: {np.sum(increasing_diff < 0)} / {len(increasing_diff)}")
print(f"Decreasing feature violations: {np.sum(decreasing_diff > 0)} / {len(decreasing_diff)}")

Advanced Model Evaluation

from catboost import CatBoostClassifier
from catboost.eval import calc_wilcoxon_test, calc_bootstrap_ci_for_mean
from sklearn.model_selection import cross_val_score
from sklearn.metrics import roc_auc_score  # fix: dropped unused accuracy_score import
import numpy as np

# Train multiple models for comparison
models = {
    'shallow': CatBoostClassifier(iterations=100, depth=4, verbose=False),
    'medium': CatBoostClassifier(iterations=200, depth=6, verbose=False),
    'deep': CatBoostClassifier(iterations=300, depth=8, verbose=False)
}

# Perform cross-validation for each model
cv_scores = {}
for name, model in models.items():
    scores = cross_val_score(model, X_train, y_train, cv=5, scoring='roc_auc')
    cv_scores[name] = scores
    print(f"{name} model - CV AUC: {scores.mean():.4f} ± {scores.std():.4f}")

# Statistical comparison between models (pairwise Wilcoxon signed-rank tests)
comparisons = [('shallow', 'medium'), ('medium', 'deep'), ('shallow', 'deep')]

for model1, model2 in comparisons:
    statistic, p_value = calc_wilcoxon_test(
        cv_scores[model1], 
        cv_scores[model2],
        alternative='two-sided'
    )
    
    print(f"\nWilcoxon test: {model1} vs {model2}")
    print(f"Statistic: {statistic:.4f}, P-value: {p_value:.4f}")
    
    if p_value < 0.05:
        better_model = model1 if cv_scores[model1].mean() > cv_scores[model2].mean() else model2
        print(f"Significant difference (p < 0.05): {better_model} performs better")
    else:
        print("No significant difference (p >= 0.05)")

# Bootstrap confidence intervals for each model's mean CV score
for name, scores in cv_scores.items():
    lower, upper, mean_est = calc_bootstrap_ci_for_mean(
        scores, 
        confidence_level=0.95,
        num_bootstrap=10000
    )
    
    print(f"\n{name} model - Bootstrap 95% CI:")
    print(f"Mean: {mean_est:.4f}, CI: [{lower:.4f}, {upper:.4f}]")

# Final model selection and evaluation
best_model_name = max(cv_scores.keys(), key=lambda k: cv_scores[k].mean())
best_model = models[best_model_name]

print(f"\nSelected best model: {best_model_name}")

# Train best model on full training set and evaluate on test set
best_model.fit(X_train, y_train)
test_predictions = best_model.predict_proba(X_test)[:, 1]
test_auc = roc_auc_score(y_test, test_predictions)

print(f"Final test AUC: {test_auc:.4f}")

# Calculate bootstrap CI for test performance.
# fix: coerce labels to an ndarray so positional fancy indexing below also
# works when y_test is a pandas Series (label-based indexing would misalign).
y_test_arr = np.asarray(y_test)
test_bootstrap_scores = []
for _ in range(1000):
    indices = np.random.choice(len(y_test_arr), len(y_test_arr), replace=True)
    boot_auc = roc_auc_score(y_test_arr[indices], test_predictions[indices])
    test_bootstrap_scores.append(boot_auc)

test_lower, test_upper, test_mean = calc_bootstrap_ci_for_mean(
    test_bootstrap_scores,
    confidence_level=0.95
)

print(f"Test AUC 95% Bootstrap CI: [{test_lower:.4f}, {test_upper:.4f}]")

Install with Tessl CLI

npx tessl i tessl/pypi-catboost

docs

advanced-features.md

core-models.md

data-handling.md

datasets.md

evaluation.md

feature-analysis.md

index.md

metrics.md

training-evaluation.md

utilities.md

visualization.md

tile.json