0
# Utilities
1
2
CatBoost provides extensive utility functions for model evaluation, GPU management, metric calculation, data conversion, and model export. These utilities enhance the machine learning workflow with comprehensive analysis and deployment capabilities.
3
4
## Capabilities
5
6
### Model Evaluation Utilities
7
8
Comprehensive model evaluation tools including confusion matrices, ROC curves, and threshold optimization.
9
10
```python { .api }
11
def eval_metric(label, approx, metric, weight=None, group_id=None,
12
group_weight=None, subgroup_id=None, pairs=None, thread_count=-1):
13
"""
14
Evaluate a metric on predictions.
15
16
Parameters:
17
- label: True target values (array-like)
18
- approx: Model predictions (array-like)
19
- metric: Metric name to evaluate (string)
20
Classification: 'Logloss', 'CrossEntropy', 'AUC', 'Accuracy', 'Precision', 'Recall', 'F1'
21
Regression: 'RMSE', 'MAE', 'R2', 'MSLE', 'MedianAbsoluteError', 'SMAPE', 'MAPE'
22
Ranking: 'NDCG', 'DCG', 'MAP', 'MRR', 'ERR'
23
- weight: Sample weights (array-like, optional)
24
- group_id: Group identifiers for ranking metrics (array-like, optional)
25
- group_weight: Group weights (array-like, optional)
26
- subgroup_id: Subgroup identifiers (array-like, optional)
27
- pairs: Pairs for ranking metrics (array-like, optional)
28
- thread_count: Number of threads for computation (int)
29
30
Returns:
31
list of float: Computed metric values (take element [0] for a single-value metric)
32
"""
33
34
def get_confusion_matrix(model, data, thread_count=-1):
35
"""
36
Calculate confusion matrix for classification model.
37
38
Parameters:
39
- model: Trained CatBoost classifier
40
- data: Input data (Pool or array-like)
41
- thread_count: Number of threads for computation (int)
42
43
Returns:
44
numpy.ndarray: Confusion matrix (n_classes, n_classes)
45
"""
46
47
def get_roc_curve(model, data, thread_count=-1, plot=False):
48
"""
49
Calculate ROC curve data for binary classification.
50
51
Parameters:
52
- model: Trained CatBoost classifier
53
- data: Input data with true labels (Pool)
54
- thread_count: Number of threads for computation (int)
55
- plot: Whether to plot the ROC curve (bool)
56
57
Returns:
58
tuple: (fpr, tpr, thresholds)
59
- fpr: False positive rates (numpy.ndarray)
60
- tpr: True positive rates (numpy.ndarray)
61
- thresholds: Decision thresholds (numpy.ndarray)
62
"""
63
64
def get_fpr_curve(model, data, curve=None, thread_count=-1, plot=False):
65
"""
66
Calculate False Positive Rate curve.
67
68
Parameters:
69
- model: Trained CatBoost classifier
70
- data: Input data with true labels (Pool)
71
- curve: Curve type (string, optional)
72
- thread_count: Number of threads for computation (int)
73
- plot: Whether to plot the curve (bool)
74
75
Returns:
76
tuple: (thresholds, fpr_values)
77
"""
78
79
def get_fnr_curve(model, data, curve=None, thread_count=-1, plot=False):
80
"""
81
Calculate False Negative Rate curve.
82
83
Parameters:
84
- model: Trained CatBoost classifier
85
- data: Input data with true labels (Pool)
86
- curve: Curve type (string, optional)
87
- thread_count: Number of threads for computation (int)
88
- plot: Whether to plot the curve (bool)
89
90
Returns:
91
tuple: (thresholds, fnr_values)
92
"""
93
94
def select_threshold(model, data, curve=None, FPR=None, FNR=None, thread_count=-1):
95
"""
96
Select optimal decision threshold based on FPR/FNR constraints.
97
98
Parameters:
99
- model: Trained CatBoost classifier
100
- data: Input data with true labels (Pool)
101
- curve: Curve type for threshold selection (string, optional)
102
- FPR: Target false positive rate (float, 0-1)
103
- FNR: Target false negative rate (float, 0-1)
104
- thread_count: Number of threads for computation (int)
105
106
Returns:
107
float: Optimal threshold value
108
"""
109
```
110
111
### GPU and System Utilities
112
113
System information and GPU management functions.
114
115
```python { .api }
116
def get_gpu_device_count():
117
"""
118
Get the number of available GPU devices.
119
120
Returns:
121
int: Number of GPU devices available for CatBoost
122
"""
123
124
def reset_trace_backend(filename):
125
"""
126
Reset trace backend with filename.
127
128
Parameters:
129
- filename: Path to trace file (string)
130
"""
131
```
132
133
### Model Export and Conversion
134
135
Functions for exporting models to various formats for deployment.
136
137
```python { .api }
138
def convert_to_onnx_object(model, export_parameters=None):
139
"""
140
Convert CatBoost model to ONNX format object.
141
142
Parameters:
143
- model: Trained CatBoost model
144
- export_parameters: Export configuration parameters (dict, optional)
145
- 'onnx_domain': ONNX domain name (string)
146
- 'onnx_model_version': Model version (int)
147
- 'onnx_doc_string': Documentation string (string)
148
- 'onnx_graph_name': Graph name (string)
149
150
Returns:
151
onnx.ModelProto: ONNX model object
152
"""
153
154
```
155
156
### Data Processing Utilities
157
158
Utilities for data preprocessing, quantization, and format conversion.
159
160
```python { .api }
161
def calculate_quantization_grid(values, border_count, border_type='Median'):
162
"""
163
Calculate quantization grid for numerical values.
164
165
Parameters:
166
- values: Input numerical values (array-like)
167
- border_count: Number of quantization borders (int)
168
- border_type: Border selection method (string)
169
- 'Median': Median-based borders
170
- 'Uniform': Uniformly spaced borders
171
- 'UniformAndQuantiles': Mix of uniform and quantile borders
172
- 'MaxLogSum': Maximum log sum borders
173
- 'MinEntropy': Minimum entropy borders
174
- 'GreedyLogSum': Greedy log sum borders
175
176
Returns:
177
numpy.ndarray: Quantization border values
178
"""
179
180
def quantize(data_path, column_description=None, pairs=None, graph=None,
181
delimiter='\t', has_header=False, ignore_csv_quoting=False,
182
feature_names=None, thread_count=-1, ignored_features=None,
183
per_float_feature_quantization=None, border_count=None,
184
max_bin=None, feature_border_type=None, nan_mode=None,
185
input_borders=None, task_type=None, used_ram_limit=None,
186
random_seed=None, **kwargs):
187
"""
188
Construct quantized Pool from non-quantized pool stored in file.
189
190
Parameters:
191
- data_path: Path to data file (string)
192
- column_description: Path to column description file (string, optional)
193
- pairs: Path to pairs file (string, optional)
194
- graph: Path to graph file (string, optional)
195
- delimiter: Delimiter used in data file (string)
196
- has_header: Whether file has header row (bool)
197
- ignore_csv_quoting: Ignore CSV quoting (bool)
198
- feature_names: Feature names (list, optional)
199
- thread_count: Number of threads (int)
200
- ignored_features: Indices of ignored features (list, optional)
201
- per_float_feature_quantization: Per-feature quantization settings (dict, optional)
202
- border_count: Number of borders for quantization (int, optional)
203
- max_bin: Maximum number of bins (int, optional)
204
- feature_border_type: Border type for features (string, optional)
205
- nan_mode: NaN handling mode (string, optional)
206
- input_borders: Input borders (dict, optional)
207
- task_type: Task type ('CPU' or 'GPU', optional)
208
- used_ram_limit: RAM usage limit (string, optional)
209
- random_seed: Random seed (int, optional)
210
211
Returns:
212
Pool: Quantized Pool object
213
"""
214
215
def create_cd(label=None, cat_features=None, text_features=None,
216
embedding_features=None, weight=None, baseline=None,
217
doc_id=None, group_id=None, subgroup_id=None,
218
timestamp=None, auxiliary_columns=None, feature_names=None,
219
output_path='train.cd'):
220
"""
221
Create column description file for CatBoost data loading.
222
223
Parameters:
224
- label: Label column index (int, optional)
225
- cat_features: Categorical feature column indices (list of int, optional)
226
- text_features: Text feature column indices (list of int, optional)
227
- embedding_features: Embedding feature column indices (list of int, optional)
228
- weight: Weight column index (int, optional)
229
- baseline: Baseline column index (int, optional)
230
- doc_id: Document ID column index (int, optional)
231
- group_id: Group ID column index (int, optional)
232
- subgroup_id: Subgroup ID column index (int, optional)
233
- timestamp: Timestamp column index (int, optional)
234
- auxiliary_columns: Auxiliary column indices (list of int, optional)
235
- feature_names: Feature names (list of str, optional)
236
- output_path: Output file path (string)
237
"""
238
239
def read_cd(cd_file, column_count=None, data_file=None, canonize_column_types=False):
240
"""
241
Read column description file.
242
243
Parameters:
244
- cd_file: Path to column description file (string)
245
- column_count: Number of columns expected (int, optional)
246
- data_file: Path to data file for validation (string, optional)
247
- canonize_column_types: Whether to canonize column types (bool)
248
249
Returns:
250
dict: Column description information
251
"""
252
```
253
254
### Additional Utility Functions
255
256
Other utility functions for advanced use cases.
257
258
```python { .api }
259
def compute_wx_test():
260
"""
261
Compute Wilcoxon test statistic.
262
"""
263
264
class TargetStats:
265
"""
266
Target statistics computation class.
267
"""
268
269
class DataMetaInfo:
270
"""
271
Data metadata information class.
272
"""
273
274
def compute_training_options():
275
"""
276
Compute training options and parameters.
277
"""
278
```
279
280
## Utility Examples
281
282
### Model Evaluation and Analysis
283
284
```python
285
from catboost import CatBoostClassifier, Pool
286
from catboost.utils import eval_metric, get_confusion_matrix, get_roc_curve
287
288
# Train model
289
model = CatBoostClassifier(iterations=100, verbose=False)
290
model.fit(X_train, y_train, cat_features=['category'])
291
292
# Create test pool with labels for evaluation
293
test_pool = Pool(X_test, y_test, cat_features=['category'])
294
295
# Get predictions
296
predictions = model.predict(test_pool)
297
probabilities = model.predict_proba(test_pool)[:, 1] # Positive class probabilities
298
299
# Evaluate various metrics
300
accuracy = eval_metric(y_test, predictions, 'Accuracy')[0]  # eval_metric returns a list
301
auc = eval_metric(y_test, probabilities, 'AUC')[0]  # eval_metric returns a list
302
logloss = eval_metric(y_test, model.predict(test_pool, prediction_type='RawFormulaVal'), 'Logloss')[0]  # Logloss expects raw scores, not probabilities
303
304
print(f"Accuracy: {accuracy:.4f}")
305
print(f"AUC: {auc:.4f}")
306
print(f"LogLoss: {logloss:.4f}")
307
308
# Get confusion matrix
309
conf_matrix = get_confusion_matrix(model, test_pool)
310
print("Confusion Matrix:")
311
print(conf_matrix)
312
313
# Get ROC curve data
314
fpr, tpr, thresholds = get_roc_curve(model, test_pool, plot=True)
315
print(f"ROC curve computed with {len(thresholds)} thresholds")
316
```
317
318
### Threshold Optimization
319
320
```python
321
from catboost.utils import select_threshold, get_fpr_curve, get_fnr_curve
322
323
# Get FPR and FNR curves
324
thresholds_fpr, fpr_values = get_fpr_curve(model, test_pool)
325
thresholds_fnr, fnr_values = get_fnr_curve(model, test_pool)
326
327
# Select threshold for specific FPR constraint
328
threshold_fpr = select_threshold(model, test_pool, FPR=0.05) # 5% FPR
329
print(f"Threshold for 5% FPR: {threshold_fpr:.4f}")
330
331
# Select threshold for specific FNR constraint
332
threshold_fnr = select_threshold(model, test_pool, FNR=0.10) # 10% FNR
333
print(f"Threshold for 10% FNR: {threshold_fnr:.4f}")
334
335
# Apply optimal threshold to predictions
336
optimal_predictions = (probabilities > threshold_fpr).astype(int)
337
optimal_accuracy = eval_metric(y_test, optimal_predictions, 'Accuracy')[0]  # eval_metric returns a list
338
print(f"Accuracy with optimal threshold: {optimal_accuracy:.4f}")
339
```
340
341
### Model Export and Deployment
342
343
```python
344
from catboost.utils import convert_to_onnx_object
345
import onnx
346
347
# Export model to ONNX format
348
onnx_model = convert_to_onnx_object(model, export_parameters={
349
'onnx_domain': 'ai.catboost',
350
'onnx_model_version': 1,
351
'onnx_doc_string': 'CatBoost classifier model',
352
'onnx_graph_name': 'CatBoostModel'
353
})
354
355
# Save ONNX model
356
onnx.save(onnx_model, 'model.onnx')
357
print("Model exported to ONNX format")
358
359
# Export as Python code
360
model.save_model('model.py', format='python')
361
print("Model exported as Python code")
362
363
# Export as C++ code
364
model.save_model('model.cpp', format='cpp')
365
print("Model exported as C++ code")
366
```
367
368
### GPU Utilization
369
370
```python
371
from catboost.utils import get_gpu_device_count
372
from catboost import CatBoostRegressor
373
374
# Check GPU availability
375
gpu_count = get_gpu_device_count()
376
print(f"Available GPU devices: {gpu_count}")
377
378
if gpu_count > 0:
379
# Train model on GPU
380
gpu_model = CatBoostRegressor(
381
iterations=500,
382
task_type='GPU',
383
devices='0', # Use first GPU
384
verbose=False
385
)
386
gpu_model.fit(X_train, y_train)
387
print("Model trained on GPU")
388
else:
389
print("No GPU devices available, using CPU")
390
```
391
392
### Data Quantization Optimization
393
394
```python
395
from catboost.utils import calculate_quantization_grid, quantize
396
from catboost import Pool
397
import numpy as np
398
399
# Calculate custom quantization grid
400
feature_values = X_train.iloc[:, 0].values # First feature
401
custom_borders = calculate_quantization_grid(
402
values=feature_values,
403
border_count=64,
404
border_type='GreedyLogSum'
405
)
406
407
print(f"Custom quantization borders: {len(custom_borders)} borders")
408
print(f"Border range: [{custom_borders[0]:.4f}, {custom_borders[-1]:.4f}]")
409
410
# Create and quantize pool
411
train_pool = Pool(X_train, y_train, cat_features=['category'])
412
quantized_pool = quantize(
413
data_path='train.tsv',  # quantize() reads a dataset file from disk; it does not accept an in-memory Pool
414
border_count=128,
415
feature_border_type='GreedyLogSum',
416
task_type='CPU'
417
)
418
419
print("Pool quantized successfully")
420
print(f"Original pool quantized: {train_pool.is_quantized()}")
421
print(f"Quantized pool quantized: {quantized_pool.is_quantized()}")
422
```
423
424
### Model Performance Benchmarking
425
426
```python
427
import time
428
import psutil
429
import numpy as np
430
431
def benchmark_prediction(model, data, num_runs=100):
    """Measure how long ``model.predict(data)`` takes over repeated runs.

    Parameters:
    - model: Object exposing a ``predict(data)`` method
    - data: Sized input passed to ``predict``; ``len(data)`` is the sample count
    - num_runs: Number of timed repetitions (int, must be >= 1)

    Returns:
    dict: 'avg_time' and 'std_time' in seconds, 'predictions_per_second',
    and 'num_predictions'

    Raises:
    ValueError: If num_runs is less than 1
    """
    if num_runs < 1:
        # An empty timing list would only yield NaN statistics and NumPy warnings.
        raise ValueError("num_runs must be at least 1")

    num_predictions = len(data)

    times = []
    for _ in range(num_runs):
        # perf_counter() is monotonic and high-resolution; time.time() can
        # report 0 elapsed (or jump) for fast calls.
        start_time = time.perf_counter()
        model.predict(data)
        times.append(time.perf_counter() - start_time)

    avg_time = float(np.mean(times))
    std_time = float(np.std(times))
    # Guard against a zero average on very coarse clocks.
    if avg_time > 0:
        predictions_per_second = num_predictions / avg_time
    else:
        predictions_per_second = float("inf")

    return {
        'avg_time': avg_time,
        'std_time': std_time,
        'predictions_per_second': predictions_per_second,
        'num_predictions': num_predictions,
    }
451
452
# Benchmark model performance
453
benchmark_results = benchmark_prediction(model, X_test, num_runs=50)
454
455
print(f"Average prediction time: {benchmark_results['avg_time']:.6f} seconds")
456
print(f"Standard deviation: {benchmark_results['std_time']:.6f} seconds")
457
print(f"Predictions per second: {benchmark_results['predictions_per_second']:.0f}")
458
print(f"Processed {benchmark_results['num_predictions']} samples")
459
```