# Feature Analysis

CatBoost provides comprehensive feature analysis capabilities, including feature importance calculation, SHAP values, feature interactions, and automatic feature selection. These tools help you understand model behavior and identify the features that matter most for predictions.

## Capabilities

### Feature Importance Types

Enums and constants defining the available methods for calculating feature importance.

```python { .api }
class EFstrType:
    """Enumeration of feature importance calculation types."""

    PredictionValuesChange = 0
    """Calculate feature importance by measuring how much predictions change, on average, when the feature value changes."""

    LossFunctionChange = 1
    """Calculate feature importance by measuring the loss function change when the feature is excluded."""

    FeatureImportance = 2
    """
    Use LossFunctionChange for ranking problems and PredictionValuesChange for all other problems.
    This is the default and recommended method.
    """

    Interaction = 3
    """Calculate pairwise interaction scores between all feature pairs."""

    ShapValues = 4
    """Calculate SHAP (SHapley Additive exPlanations) values for every object."""

    PredictionDiff = 5
    """Calculate feature importance explaining the prediction difference for a pair of objects."""

    ShapInteractionValues = 6
    """Calculate SHAP interaction values for feature pairs."""

    SageValues = 7
    """Calculate SAGE (Shapley Additive Global importancE) values for every feature."""

class EShapCalcType:
    """Enumeration of SHAP calculation types."""

    Regular = "Regular"
    """Calculate regular SHAP values using the standard algorithm."""

    Approximate = "Approximate"
    """Calculate approximate SHAP values for faster computation."""

    Exact = "Exact"
    """Calculate exact SHAP values using a precise but slower algorithm."""
```
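
The `type` argument of `get_feature_importance` accepts either an `EFstrType` member or its string name, so both forms below are interchangeable (a minimal sketch, assuming a fitted CatBoost model named `model`):

```python
from catboost import EFstrType

# Both calls request the same importance type; enum members and
# their string names are interchangeable for the `type` argument.
fi_enum = model.get_feature_importance(type=EFstrType.PredictionValuesChange)
fi_str = model.get_feature_importance(type='PredictionValuesChange')

assert (fi_enum == fi_str).all()
```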

### Feature Selection

Algorithms and grouping methods for automatic feature selection.

```python { .api }
class EFeaturesSelectionAlgorithm:
    """Enumeration of feature selection algorithms."""

    RecursiveByPredictionValuesChange = "RecursiveByPredictionValuesChange"
    """
    Recursive feature elimination using prediction values change.
    Eliminates a batch of features at each step.
    """

    RecursiveByLossFunctionChange = "RecursiveByLossFunctionChange"
    """
    Recursive feature elimination using loss function change.
    Eliminates a batch of features at each step.
    """

    RecursiveByShapValues = "RecursiveByShapValues"
    """
    Recursive feature elimination using SHAP values to estimate loss change.
    Eliminates features one by one based on SHAP importance.
    """

class EFeaturesSelectionGrouping:
    """Enumeration of feature selection grouping methods."""

    Individual = "Individual"
    """Select individual features independently."""

    ByTags = "ByTags"
    """Select feature groups marked by tags in the Pool."""
```
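
`ByTags` selects whole groups of features at once. The sketch below is illustrative only: the `feature_tags` argument on `Pool` and the `grouping`/`features_tags_for_select` arguments of `select_features` are assumptions about newer CatBoost versions and should be verified against your installed release.

```python
from catboost import CatBoostClassifier, Pool, EFeaturesSelectionGrouping

# Assumption: Pool accepts a `feature_tags` mapping; the exact schema
# (tag -> {'features': [...]}) is not confirmed by this document.
train_pool = Pool(
    X_train, y_train,
    feature_tags={'geo': {'features': [0, 1]},
                  'text_stats': {'features': [2, 3, 4]}}
)

model = CatBoostClassifier(iterations=100, verbose=False)
# Assumption: `grouping` and `features_tags_for_select` parameter names.
summary = model.select_features(
    train_pool,
    num_features_to_select=1,   # number of tag groups to keep
    grouping=EFeaturesSelectionGrouping.ByTags,
    features_tags_for_select=['geo', 'text_stats']
)
```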

### Model Feature Importance Methods

Methods available on trained CatBoost models for feature analysis.

```python { .api }
# These methods are available on CatBoost model objects
def get_feature_importance(self, data=None, type='FeatureImportance',
                           prettified=False, thread_count=-1, shap_mode=None,
                           interaction_indices=None, shap_calc_type='Regular',
                           model_output_type='RawFormulaVal', train_pool=None,
                           fstr_type=None):
    """
    Calculate feature importance for the trained model.

    Parameters:
    - data: Data for importance calculation (Pool, array-like, or None for training data)
    - type: Importance type (EFstrType value or string)
      - 'FeatureImportance': Default feature importance
      - 'PredictionValuesChange': Prediction change importance
      - 'LossFunctionChange': Loss change importance
      - 'ShapValues': SHAP values
      - 'Interaction': Feature interactions
      - 'ShapInteractionValues': SHAP interaction values
      - 'SageValues': SAGE values
    - prettified: Return results as a formatted pandas DataFrame (bool)
    - thread_count: Number of threads for computation (int)
    - shap_mode: SHAP preprocessing mode ('Auto', 'UsePreCalc', 'NoPreCalc')
    - interaction_indices: Features to restrict SHAP interaction calculation to
    - shap_calc_type: SHAP calculation type (EShapCalcType value)
    - model_output_type: Model output type ('RawFormulaVal', 'Probability', 'Class')
    - train_pool: Training pool, required for some importance types
    - fstr_type: Deprecated, use the 'type' parameter instead

    Returns:
    numpy.ndarray or pandas.DataFrame: Feature importance values
    - For regular importance: (n_features,) array
    - For SHAP values: (n_objects, n_features + 1) array; the last column
      holds the expected value (bias) of the model
    - For interactions: (n_interactions, 3) array of
      (first feature index, second feature index, score) rows
    """

def get_object_importance(self, pool, train_pool, top_size=-1,
                          type='Average', update_method='SinglePoint',
                          importance_values_sign='All', thread_count=-1):
    """
    Calculate object importance (leaf influence) to understand which
    training objects most influence predictions on new data.

    Parameters:
    - pool: Pool for which to calculate object importance
    - train_pool: Training pool containing the candidate influential objects
    - top_size: Number of most important objects to return (-1 for all)
    - type: Importance calculation type
      - 'Average': Average importance across all test objects
      - 'PerObject': Individual importance for each test object
    - update_method: Leaf update method
      - 'SinglePoint': Single point update
      - 'TopKLeaves': Top K leaves update
      - 'AllPoints': All points update
    - importance_values_sign: Which importance values to return
      - 'All': All importance values
      - 'Positive': Only positive importance values
      - 'Negative': Only negative importance values
    - thread_count: Number of threads for computation

    Returns:
    tuple of numpy.ndarray: (indices, scores)
    - indices: Training object indices sorted by influence
    - scores: Corresponding importance values
    - For 'Average': flat arrays over training objects
    - For 'PerObject': one row per object in pool
    """
```
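
`PredictionDiff` explains why the model scores one object higher than another; it expects exactly two objects in `data`. A minimal sketch, assuming a fitted `model` and a pandas `X_test` with a categorical column named `'category'`:

```python
from catboost import Pool, EFstrType

# PredictionDiff requires exactly two objects: it attributes the gap
# between their predictions to individual features.
pair_pool = Pool(X_test.iloc[:2], cat_features=['category'])
diff_importance = model.get_feature_importance(
    data=pair_pool,
    type=EFstrType.PredictionDiff
)
print(diff_importance)  # one contribution per feature
```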

### Feature Selection Methods

Methods for automatic feature selection using various algorithms.

```python { .api }
def select_features(self, X, y=None, eval_set=None, features_for_select=None,
                    num_features_to_select=None, steps=1, algorithm='RecursiveByShapValues',
                    shap_calc_type='Regular', train_final_model=True,
                    logging_level=None, plot=False, log_cout=None, log_cerr=None):
    """
    Perform automatic feature selection on the model.

    Parameters:
    - X: Input features (Pool or array-like)
    - y: Target values (array-like, optional if X is a Pool)
    - eval_set: Evaluation datasets (list of tuples)
    - features_for_select: Features to consider for selection (list of indices/names)
    - num_features_to_select: Target number of features to select (int)
    - steps: Number of elimination steps; the model is retrained at each step
      and features are dropped in batches, so more steps give more accurate
      but slower selection (int)
    - algorithm: Feature selection algorithm (EFeaturesSelectionAlgorithm value)
    - shap_calc_type: SHAP calculation type for SHAP-based selection
    - train_final_model: Whether to retrain the model on the selected features (bool)
    - logging_level: Logging level during selection
    - plot: Enable plotting of the selection process (bool)
    - log_cout: Output stream for logging
    - log_cerr: Error stream for logging

    Returns:
    dict: Selection results containing:
    - 'selected_features': List of selected feature indices
    - 'eliminated_features': List of eliminated feature indices
    - 'selected_features_names': Names of selected features (if available)
    - 'eliminated_features_names': Names of eliminated features (if available)
    - 'loss_graph': Loss values recorded during the selection process
    """
```

## Feature Analysis Examples

### Basic Feature Importance

```python
from catboost import CatBoostClassifier
import pandas as pd

# Train model
model = CatBoostClassifier(iterations=100, verbose=False)
model.fit(X_train, y_train, cat_features=['category'])

# Get default feature importance
importance = model.get_feature_importance()
feature_names = X_train.columns

# Create importance DataFrame
importance_df = pd.DataFrame({
    'feature': feature_names,
    'importance': importance
}).sort_values('importance', ascending=False)

print("Top 10 most important features:")
print(importance_df.head(10))
```
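
The same table can be produced directly with `prettified=True`, which returns a DataFrame already sorted by importance (the exact column names follow CatBoost's convention and are worth verifying against your version):

```python
# Built-in DataFrame output; columns are typically
# 'Feature Id' and 'Importances'.
importance_df = model.get_feature_importance(prettified=True)
print(importance_df.head(10))
```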

### SHAP Values Analysis

```python
from catboost import CatBoostRegressor, Pool, EFstrType
import numpy as np
import pandas as pd

# Train model
model = CatBoostRegressor(iterations=100, verbose=False)
model.fit(X_train, y_train, cat_features=['category'])

# Get SHAP values for the test set
test_pool = Pool(X_test, cat_features=['category'])
shap_values = model.get_feature_importance(
    data=test_pool,
    type=EFstrType.ShapValues
)

# The last column holds the expected value (bias), so the shape is
# (n_samples, n_features + 1)
print(f"SHAP values shape: {shap_values.shape}")

# Drop the bias column before ranking features
feature_shap = shap_values[:, :-1]

# Calculate mean absolute SHAP values for feature ranking
mean_shap = np.abs(feature_shap).mean(axis=0)
feature_ranking = pd.DataFrame({
    'feature': X_test.columns,
    'mean_abs_shap': mean_shap
}).sort_values('mean_abs_shap', ascending=False)

print("Feature ranking by mean absolute SHAP:")
print(feature_ranking.head())
```
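
A useful sanity check is SHAP additivity: for each object, the feature contributions plus the expected value reconstruct the raw model prediction.

```python
# SHAP additivity: contributions + expected value == raw prediction
raw_predictions = model.predict(test_pool, prediction_type='RawFormulaVal')
reconstructed = shap_values.sum(axis=1)  # includes the bias column
assert np.allclose(reconstructed, raw_predictions, atol=1e-6)
```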

### Feature Interactions

```python
from catboost import EFstrType

# Calculate pairwise feature interactions; each row of the result is
# (first feature index, second feature index, interaction score)
interactions = model.get_feature_importance(
    type=EFstrType.Interaction
)

# Map index pairs back to feature names
top_interactions = []
for first_idx, second_idx, score in interactions:
    top_interactions.append({
        'feature1': X_train.columns[int(first_idx)],
        'feature2': X_train.columns[int(second_idx)],
        'interaction_score': score
    })

# Sort by interaction strength
top_interactions = sorted(top_interactions,
                          key=lambda x: abs(x['interaction_score']),
                          reverse=True)

print("Top 5 feature interactions:")
for interaction in top_interactions[:5]:
    print(f"{interaction['feature1']} × {interaction['feature2']}: {interaction['interaction_score']:.4f}")
```
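
To visualize the full interaction structure, the triples can be scattered into a symmetric matrix; a short sketch using only numpy and matplotlib:

```python
import numpy as np
import matplotlib.pyplot as plt

# Scatter the (i, j, score) triples into a symmetric matrix
n_features = len(X_train.columns)
interaction_matrix = np.zeros((n_features, n_features))
for first_idx, second_idx, score in interactions:
    i, j = int(first_idx), int(second_idx)
    interaction_matrix[i, j] = score
    interaction_matrix[j, i] = score

plt.imshow(interaction_matrix, cmap='viridis')
plt.colorbar(label='Interaction score')
plt.title('Pairwise feature interactions')
plt.show()
```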

### Automatic Feature Selection

```python
from catboost import CatBoostClassifier, EFeaturesSelectionAlgorithm

# Initialize model for feature selection
model = CatBoostClassifier(
    iterations=200,
    learning_rate=0.1,
    depth=6,
    verbose=False
)

# Perform feature selection
selection_results = model.select_features(
    X=X_train,
    y=y_train,
    eval_set=[(X_val, y_val)],
    features_for_select=None,   # Consider all features
    num_features_to_select=20,  # Select top 20 features
    algorithm=EFeaturesSelectionAlgorithm.RecursiveByShapValues,
    steps=5,                    # Eliminate features over 5 retraining steps
    train_final_model=True,
    plot=True
)

print(f"Selected {len(selection_results['selected_features'])} features:")
print("Selected features:", selection_results['selected_features_names'])
print("Eliminated features:", selection_results['eliminated_features_names'])

# `model` is already retrained on the selected features because
# train_final_model=True; retrain manually to customize hyperparameters
selected_features = selection_results['selected_features']
X_train_selected = X_train.iloc[:, selected_features]
X_val_selected = X_val.iloc[:, selected_features]

final_model = CatBoostClassifier(iterations=500, verbose=False)
final_model.fit(X_train_selected, y_train)
final_score = final_model.score(X_val_selected, y_val)
print(f"Final model accuracy with selected features: {final_score:.4f}")
```
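
The `'loss_graph'` entry records how the evaluation loss evolved as features were removed. A sketch of plotting it, assuming the keys `'removed_features_count'` and `'loss_values'` (verify against your CatBoost version):

```python
import matplotlib.pyplot as plt

# Assumed keys: 'removed_features_count' and 'loss_values'
loss_graph = selection_results['loss_graph']
plt.plot(loss_graph['removed_features_count'], loss_graph['loss_values'])
plt.xlabel('Number of removed features')
plt.ylabel('Evaluation loss')
plt.title('Feature selection loss trajectory')
plt.show()
```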

### Advanced SHAP Analysis

```python
from catboost import Pool, EFstrType, EShapCalcType

# Get exact SHAP values for a subset of samples
important_samples = X_test.iloc[:100]  # First 100 samples
test_pool = Pool(important_samples, cat_features=['category'])

exact_shap = model.get_feature_importance(
    data=test_pool,
    type=EFstrType.ShapValues,
    shap_calc_type=EShapCalcType.Exact
)

# Calculate the full tensor of SHAP interaction values;
# interaction_indices can restrict the computation to chosen features
shap_interactions = model.get_feature_importance(
    data=test_pool,
    type=EFstrType.ShapInteractionValues
)

# Shape: (n_samples, n_features + 1, n_features + 1); the extra
# row/column corresponds to the expected value (bias)
print(f"SHAP interaction values shape: {shap_interactions.shape}")
```
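
These values plug directly into the third-party `shap` package for visualization (assumes `shap` is installed; drop the bias column first):

```python
import shap

# Drop the last (bias) column before handing values to shap
shap.summary_plot(exact_shap[:, :-1], important_samples)
```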

### Object Importance Analysis

```python
# Analyze which training examples most influence test predictions
test_pool = Pool(X_test[:10], cat_features=['category'])  # First 10 test samples
train_pool = Pool(X_train, y_train, cat_features=['category'])

# Get average object importance; returns the indices of training
# objects sorted by influence together with their scores
indices, scores = model.get_object_importance(
    pool=test_pool,
    train_pool=train_pool,
    top_size=50,  # Top 50 most influential training examples
    type='Average'
)

print(f"Top 10 most influential training examples (indices): {indices[:10]}")
print(f"Their importance scores: {scores[:10]}")

# Get per-object importance for detailed analysis
detailed_indices, detailed_scores = model.get_object_importance(
    pool=test_pool,
    train_pool=train_pool,
    type='PerObject',
    top_size=20
)

print(f"Per-object importance shape: {detailed_scores.shape}")  # (n_test_objects, top_size)
```
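
Negative importance values flag training objects that hurt predictions on the evaluation pool, which makes object importance useful for spotting mislabeled or noisy training data; a short sketch:

```python
# Objects with negative influence are candidates for label noise
neg_indices, neg_scores = model.get_object_importance(
    pool=test_pool,
    train_pool=train_pool,
    importance_values_sign='Negative',
    top_size=20
)
print("Potentially harmful training objects:", neg_indices[:10])
```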