CtrlK
Blog · Docs · Log in · Get started
Tessl Logo

tessl/pypi-catboost

CatBoost is a fast, scalable, high-performance library for gradient boosting on decision trees, used for ranking, classification, regression, and other ML tasks.

Pending

Quality

Pending

Does it follow best practices?

Impact

Pending

No eval scenarios have been run

Overview
Eval results
Files

docs/feature-analysis.md

Feature Analysis

CatBoost provides comprehensive feature analysis capabilities including feature importance calculation, SHAP values, feature interactions, and automatic feature selection. These tools help understand model behavior and identify the most important features for predictions.

Capabilities

Feature Importance Types

Enums and constants defining different methods for calculating feature importance.

class EFstrType:
    """Feature importance calculation types (mirrors catboost.EFstrType)."""

    # Importance measured by how prediction values change when the
    # feature is removed.
    PredictionValuesChange = 0
    # Importance measured by how the loss function changes when the
    # feature is removed.
    LossFunctionChange = 1
    # Default and recommended: LossFunctionChange for ranking problems,
    # PredictionValuesChange for everything else.
    FeatureImportance = 2
    # Pairwise interaction scores between all feature pairs.
    Interaction = 3
    # Per-object SHAP (SHapley Additive exPlanations) values.
    ShapValues = 4
    # Importance explaining the prediction difference between objects.
    PredictionDiff = 5
    # SHAP interaction values for feature pairs.
    ShapInteractionValues = 6
    # SAGE (Shapley Additive Global importancE) values per feature.
    SageValues = 7

class EShapCalcType:
    """SHAP value calculation modes (string-valued constants)."""

    # Standard SHAP algorithm.
    Regular = "Regular"
    # Approximate computation, traded for speed.
    Approximate = "Approximate"
    # Precise computation, slower than the other modes.
    Exact = "Exact"

Feature Selection

Algorithms and grouping methods for automatic feature selection.

class EFeaturesSelectionAlgorithm:
    """Feature selection algorithms (string-valued constants)."""

    # Recursive elimination driven by prediction-values change;
    # removes a batch of features at each step.
    RecursiveByPredictionValuesChange = "RecursiveByPredictionValuesChange"
    # Recursive elimination driven by loss-function change;
    # removes a batch of features at each step.
    RecursiveByLossFunctionChange = "RecursiveByLossFunctionChange"
    # Recursive elimination that estimates loss change from SHAP values;
    # removes features one at a time by SHAP importance.
    RecursiveByShapValues = "RecursiveByShapValues"

class EFeaturesSelectionGrouping:
    """Feature selection grouping methods (string-valued constants)."""

    # Each feature is selected or eliminated independently.
    Individual = "Individual"
    # Features are selected as groups, marked by tags in the Pool.
    ByTags = "ByTags"

Model Feature Importance Methods

Methods available on trained CatBoost models for feature analysis.

# These methods are available on CatBoost model objects
def get_feature_importance(self, data=None, type='FeatureImportance', 
                          prettified=False, thread_count=-1, shap_mode=None,
                          interaction_indices=None, shap_calc_type='Regular',
                          model_output_type='RawFormulaVal', train_pool=None,
                          fstr_type=None):
    """
    Calculate feature importance for the trained model.
    
    Parameters:
    - data: Data for importance calculation (Pool, array-like, or None for training data)
    - type: Importance type (EFstrType value or string)
        - 'FeatureImportance': Default feature importance
        - 'PredictionValuesChange': Prediction change importance
        - 'LossFunctionChange': Loss change importance  
        - 'ShapValues': SHAP values
        - 'Interaction': Feature interactions
        - 'ShapInteractionValues': SHAP interaction values
        - 'SageValues': SAGE values
    - prettified: Return results as formatted pandas DataFrame (bool);
      applies to per-feature importance types, not per-object SHAP output
    - thread_count: Number of threads for computation (int)
    - shap_mode: SHAP calculation mode ('SinglePoint', 'AllPoints')
    - interaction_indices: Feature pairs for interaction calculation (list of pairs)  
    - shap_calc_type: SHAP calculation type (EShapCalcType value)
    - model_output_type: Model output type ('RawFormulaVal', 'Probability', 'Class')
    - train_pool: Training pool for some importance types
    - fstr_type: Deprecated, use 'type' parameter
    
    Returns:
    numpy.ndarray or pandas.DataFrame: Feature importance values
        - For regular importance ('FeatureImportance',
          'PredictionValuesChange', 'LossFunctionChange'): (n_features,) array
        - For 'ShapValues': (n_objects, n_features + 1) array; the LAST
          column is the expected (base) prediction value, so per-feature
          SHAP contributions are shap[:, :-1]
        - For 'Interaction': one row per interacting feature pair, in the
          form (first_feature_index, second_feature_index, score) — NOT a
          dense (n_features, n_features) matrix
    """

def get_object_importance(self, pool, train_pool, top_size=-1, 
                         type='Average', update_method='SinglePoint',
                         importance_values_sign='All', thread_count=-1):
    """
    Calculate object importance (leaf influence) for understanding which 
    training objects most influence predictions on new data.
    
    Parameters:
    - pool: Pool for which to calculate object importance
    - train_pool: Training pool containing influential objects
    - top_size: Number of most important objects to return (-1 for all)
    - type: Importance calculation type
        - 'Average': Average importance across all test objects
        - 'PerObject': Individual importance for each test object
    - update_method: Leaf update method
        - 'SinglePoint': Single point update
        - 'TopKLeaves': Top K leaves update
        - 'AllPoints': All points update
    - importance_values_sign: Which importance values to return
        - 'All': All importance values
        - 'Positive': Only positive importance values
        - 'Negative': Only negative importance values
    - thread_count: Number of threads for computation
    
    Returns:
    tuple of numpy.ndarray: (indices, scores) — a pair, not a single array
        - indices: indices of training objects, ordered by influence
        - scores: matching importance values; for type='Average' a 1-D
          array over the returned training objects, for type='PerObject'
          one row per test object
    """

Feature Selection Methods

Methods for automatic feature selection using various algorithms.

def select_features(self, X, y=None, eval_set=None, features_for_select=None,
                   num_features_to_select=None, steps=1, algorithm='RecursiveByShapValues',
                   shap_calc_type='Regular', train_final_model=True, 
                   logging_level=None, plot=False, log_cout=None, log_cerr=None):
    """Run automatic feature selection on the model.

    Args:
        X: Input features (Pool or array-like).
        y: Target values; optional when X is already a Pool.
        eval_set: Evaluation datasets, given as a list of tuples.
        features_for_select: Feature indices/names eligible for selection.
        num_features_to_select: How many features to keep (int).
        steps: Features eliminated per step for batch algorithms (int).
        algorithm: Selection algorithm (EFeaturesSelectionAlgorithm value).
        shap_calc_type: SHAP calculation type for SHAP-based algorithms.
        train_final_model: Retrain the model on the selected features (bool).
        logging_level: Logging level used while selecting.
        plot: Plot the selection process (bool).
        log_cout: Stream for regular log output.
        log_cerr: Stream for error log output.

    Returns:
        dict: Selection results with keys:
            'selected_features': indices of kept features,
            'eliminated_features': indices of dropped features,
            'selected_features_names': kept feature names (if available),
            'eliminated_features_names': dropped feature names (if available),
            'loss_graph': loss values recorded during selection.
    """

Feature Analysis Examples

Basic Feature Importance

from catboost import CatBoostClassifier, Pool
import pandas as pd
import numpy as np

# Fit a small classifier on the training data
model = CatBoostClassifier(iterations=100, verbose=False)
model.fit(X_train, y_train, cat_features=['category'])

# Default importance scores, one per input feature
scores = model.get_feature_importance()
names = X_train.columns

# Rank features from most to least important
importance_df = pd.DataFrame({'feature': names, 'importance': scores})
importance_df = importance_df.sort_values('importance', ascending=False)

print("Top 10 most important features:")
print(importance_df.head(10))

SHAP Values Analysis

from catboost import CatBoostRegressor, Pool, EFstrType
import matplotlib.pyplot as plt

# Train model  
model = CatBoostRegressor(iterations=100, verbose=False)
model.fit(X_train, y_train)

# Get SHAP values for the test set. CatBoost returns an array of shape
# (n_samples, n_features + 1): the LAST column is the expected (base)
# prediction value, not a feature contribution. `prettified` applies to
# per-feature importance types, so it is omitted here.
test_pool = Pool(X_test, cat_features=['category'])
shap_values = model.get_feature_importance(
    data=test_pool,
    type=EFstrType.ShapValues
)

print(f"SHAP values shape: {shap_values.shape}")  # (n_samples, n_features + 1)

# Drop the trailing bias column before aggregating so the score vector
# lines up one-to-one with X_test.columns.
mean_shap = np.abs(shap_values[:, :-1]).mean(axis=0)
feature_ranking = pd.DataFrame({
    'feature': X_test.columns,
    'mean_abs_shap': mean_shap
}).sort_values('mean_abs_shap', ascending=False)

print("Feature ranking by mean absolute SHAP:")
print(feature_ranking.head())

Feature Interactions

from catboost import EFstrType

# Calculate pairwise feature interactions. For type=Interaction CatBoost
# returns one row per interacting feature pair in the form
# (first_feature_index, second_feature_index, score) — NOT a dense
# n_features x n_features matrix, so the result is iterated row-wise
# rather than indexed as interactions[i, j].
interactions = model.get_feature_importance(
    type=EFstrType.Interaction
)

# Collect one record per reported feature pair
top_interactions = []
for first_idx, second_idx, score in interactions:
    top_interactions.append({
        'feature1': X_train.columns[int(first_idx)],
        'feature2': X_train.columns[int(second_idx)],
        'interaction_score': score
    })

# Sort by interaction strength
top_interactions = sorted(top_interactions, 
                         key=lambda x: abs(x['interaction_score']), 
                         reverse=True)

print("Top 5 feature interactions:")
for interaction in top_interactions[:5]:
    print(f"{interaction['feature1']} × {interaction['feature2']}: {interaction['interaction_score']:.4f}")

Automatic Feature Selection

from catboost import CatBoostClassifier, EFeaturesSelectionAlgorithm

# Classifier that drives the selection procedure
model = CatBoostClassifier(
    iterations=200,
    learning_rate=0.1,
    depth=6,
    verbose=False
)

# Recursive SHAP-based elimination: drop five features per step until
# twenty remain, then retrain on the surviving set.
selection_results = model.select_features(
    X=X_train,
    y=y_train,
    eval_set=[(X_val, y_val)],
    features_for_select=None,  # every feature is a candidate
    num_features_to_select=20,  # keep the top 20
    algorithm=EFeaturesSelectionAlgorithm.RecursiveByShapValues,
    steps=5,  # eliminated per step
    train_final_model=True,
    plot=True
)

print(f"Selected {len(selection_results['selected_features'])} features:")
print("Selected features:", selection_results['selected_features_names'])
print("Eliminated features:", selection_results['eliminated_features_names'])

# Refit a fresh model restricted to the surviving columns
keep_idx = selection_results['selected_features']
X_train_kept = X_train.iloc[:, keep_idx]
X_val_kept = X_val.iloc[:, keep_idx]

final_model = CatBoostClassifier(iterations=500, verbose=False)
final_model.fit(X_train_kept, y_train)
final_score = final_model.score(X_val_kept, y_val)
print(f"Final model accuracy with selected features: {final_score:.4f}")

Advanced SHAP Analysis

from catboost import EFstrType, EShapCalcType

# Exact SHAP values for a subset of samples worth closer inspection
important_samples = X_test.iloc[:100]  # First 100 samples
test_pool = Pool(important_samples, cat_features=['category'])

exact_shap = model.get_feature_importance(
    data=test_pool,
    type=EFstrType.ShapValues,
    shap_calc_type=EShapCalcType.Exact
)

# SHAP interaction values restricted to ordered pairs of the top features
top_features = [0, 1, 2, 3, 4]  # Top 5 feature indices
pairs = [
    (i, j)
    for i in top_features
    for j in top_features
    if i != j
]
shap_interactions = model.get_feature_importance(
    data=test_pool,
    type=EFstrType.ShapInteractionValues,
    interaction_indices=pairs
)

print(f"SHAP interaction values shape: {shap_interactions.shape}")

Object Importance Analysis

# Analyze which training examples most influence test predictions.
# CatBoost's get_object_importance returns a PAIR of arrays
# (training-object indices, matching importance scores), so the result
# is unpacked rather than used as a single array.
test_pool = Pool(X_test[:10], cat_features=['category'])  # Analyze first 10 test samples
train_pool = Pool(X_train, y_train, cat_features=['category'])

# Average influence over all analyzed test objects
indices, scores = model.get_object_importance(
    pool=test_pool,
    train_pool=train_pool,
    top_size=50,  # Top 50 most influential training examples
    type='Average'
)

print(f"Top 10 most influential training examples (indices): {indices[:10]}")

# Per-test-object influence for detailed analysis
detailed_indices, detailed_scores = model.get_object_importance(
    pool=test_pool,
    train_pool=train_pool,
    type='PerObject',
    top_size=20
)

print(f"Per-object importance shape: {detailed_scores.shape}")  # (n_test, top_size)

Install with Tessl CLI

npx tessl i tessl/pypi-catboost

docs

advanced-features.md

core-models.md

data-handling.md

datasets.md

evaluation.md

feature-analysis.md

index.md

metrics.md

training-evaluation.md

utilities.md

visualization.md

tile.json