Tessl Tile for pypi/catboost@1.2.0

or run

npx @tessl/cli init

Version

Tile

Overview

Evals

Files

docs

advanced-features.md core-models.md data-handling.md datasets.md evaluation.md feature-analysis.md index.md metrics.md training-evaluation.md utilities.md visualization.md

utilities.mddocs/

0
# Utilities
1

2
CatBoost provides extensive utility functions for model evaluation, GPU management, metric calculation, data conversion, and model export. These utilities enhance the machine learning workflow with comprehensive analysis and deployment capabilities.
3

4
## Capabilities
5

6
### Model Evaluation Utilities
7

8
Comprehensive model evaluation tools including confusion matrices, ROC curves, and threshold optimization.
9

10
```python { .api }
11
def eval_metric(label, approx, metric, weight=None, group_id=None, 
12
               group_weight=None, subgroup_id=None, pairs=None, thread_count=-1):
13
    """
14
    Evaluate a metric on predictions.
15
    
16
    Parameters:
17
    - label: True target values (array-like)
18
    - approx: Model predictions (array-like)
19
    - metric: Metric name to evaluate (string)
20
        Classification: 'Logloss', 'CrossEntropy', 'AUC', 'Accuracy', 'Precision', 'Recall', 'F1'
21
        Regression: 'RMSE', 'MAE', 'R2', 'MSLE', 'MedianAbsoluteError', 'SMAPE', 'MAPE'
22
        Ranking: 'NDCG', 'DCG', 'MAP', 'MRR', 'ERR'
23
    - weight: Sample weights (array-like, optional)
24
    - group_id: Group identifiers for ranking metrics (array-like, optional)
25
    - group_weight: Group weights (array-like, optional)
26
    - subgroup_id: Subgroup identifiers (array-like, optional)
27
    - pairs: Pairs for ranking metrics (array-like, optional)
28
    - thread_count: Number of threads for computation (int)
29
    
30
    Returns:
31
    list of float: Metric values (a single metric yields a one-element list)
32
    """
33

34
def get_confusion_matrix(model, data, thread_count=-1):
35
    """
36
    Calculate confusion matrix for classification model.
37
    
38
    Parameters:
39
    - model: Trained CatBoost classifier
40
    - data: Input data (Pool or array-like)
41
    - thread_count: Number of threads for computation (int)
42
    
43
    Returns:
44
    numpy.ndarray: Confusion matrix (n_classes, n_classes)
45
    """
46

47
def get_roc_curve(model, data, thread_count=-1, plot=False):
48
    """
49
    Calculate ROC curve data for binary classification.
50
    
51
    Parameters:
52
    - model: Trained CatBoost classifier
53
    - data: Input data with true labels (Pool)
54
    - thread_count: Number of threads for computation (int)
55
    - plot: Whether to plot the ROC curve (bool)
56
    
57
    Returns:
58
    tuple: (fpr, tpr, thresholds)
59
        - fpr: False positive rates (numpy.ndarray)
60
        - tpr: True positive rates (numpy.ndarray)  
61
        - thresholds: Decision thresholds (numpy.ndarray)
62
    """
63

64
def get_fpr_curve(model, data, curve=None, thread_count=-1, plot=False):
65
    """
66
    Calculate False Positive Rate curve.
67
    
68
    Parameters:
69
    - model: Trained CatBoost classifier
70
    - data: Input data with true labels (Pool)  
71
    - curve: Precomputed ROC curve points (fpr, tpr, thresholds) as returned by get_roc_curve; pass instead of model/data (tuple, optional)
72
    - thread_count: Number of threads for computation (int)
73
    - plot: Whether to plot the curve (bool)
74
    
75
    Returns:
76
    tuple: (thresholds, fpr_values)
77
    """
78

79
def get_fnr_curve(model, data, curve=None, thread_count=-1, plot=False):
80
    """
81
    Calculate False Negative Rate curve.
82
    
83
    Parameters:
84
    - model: Trained CatBoost classifier
85
    - data: Input data with true labels (Pool)
86
    - curve: Precomputed ROC curve points (fpr, tpr, thresholds) as returned by get_roc_curve; pass instead of model/data (tuple, optional)
87
    - thread_count: Number of threads for computation (int)
88
    - plot: Whether to plot the curve (bool)
89
    
90
    Returns:
91
    tuple: (thresholds, fnr_values)
92
    """
93

94
def select_threshold(model, data, curve=None, FPR=None, FNR=None, thread_count=-1):
95
    """
96
    Select optimal decision threshold based on FPR/FNR constraints.
97
    
98
    Parameters:
99
    - model: Trained CatBoost classifier
100
    - data: Input data with true labels (Pool)
101
    - curve: Precomputed ROC curve points (fpr, tpr, thresholds) as returned by get_roc_curve; pass instead of model/data (tuple, optional)
102
    - FPR: Target false positive rate (float, 0-1)
103
    - FNR: Target false negative rate (float, 0-1)  
104
    - thread_count: Number of threads for computation (int)
105
    
106
    Returns:
107
    float: Optimal threshold value
108
    """
109
```
110

111
### GPU and System Utilities
112

113
System information and GPU management functions.
114

115
```python { .api }
116
def get_gpu_device_count():
117
    """
118
    Get the number of available GPU devices.
119
    
120
    Returns:
121
    int: Number of GPU devices available for CatBoost
122
    """
123

124
def reset_trace_backend(filename):
125
    """
126
    Reset trace backend with filename.
127
    
128
    Parameters:
129
    - filename: Path to trace file (string)
130
    """
131
```
132

133
### Model Export and Conversion
134

135
Functions for exporting models to various formats for deployment.
136

137
```python { .api }
138
def convert_to_onnx_object(model, export_parameters=None):
139
    """
140
    Convert CatBoost model to ONNX format object.
141
    
142
    Parameters:
143
    - model: Trained CatBoost model
144
    - export_parameters: Export configuration parameters (dict, optional)
145
        - 'onnx_domain': ONNX domain name (string)
146
        - 'onnx_model_version': Model version (int)
147
        - 'onnx_doc_string': Documentation string (string)
148
        - 'onnx_graph_name': Graph name (string)
149
    
150
    Returns:
151
    onnx.ModelProto: ONNX model object
152
    """
153
154
```
155

156
### Data Processing Utilities
157

158
Utilities for data preprocessing, quantization, and format conversion.
159

160
```python { .api }
161
def calculate_quantization_grid(values, border_count, border_type='Median'):
162
    """
163
    Calculate quantization grid for numerical values.
164
    
165
    Parameters:
166
    - values: Input numerical values (array-like)
167
    - border_count: Number of quantization borders (int)
168
    - border_type: Border selection method (string)
169
        - 'Median': Median-based borders
170
        - 'Uniform': Uniformly spaced borders
171
        - 'UniformAndQuantiles': Mix of uniform and quantile borders
172
        - 'MaxLogSum': Maximum log sum borders
173
        - 'MinEntropy': Minimum entropy borders
174
        - 'GreedyLogSum': Greedy log sum borders
175
    
176
    Returns:
177
    numpy.ndarray: Quantization border values
178
    """
179

180
def quantize(data_path, column_description=None, pairs=None, graph=None,
181
            delimiter='\t', has_header=False, ignore_csv_quoting=False,
182
            feature_names=None, thread_count=-1, ignored_features=None,
183
            per_float_feature_quantization=None, border_count=None,
184
            max_bin=None, feature_border_type=None, nan_mode=None,
185
            input_borders=None, task_type=None, used_ram_limit=None,
186
            random_seed=None, **kwargs):
187
    """
188
    Construct quantized Pool from non-quantized pool stored in file.
189
    
190
    Parameters:
191
    - data_path: Path to data file (string)
192
    - column_description: Path to column description file (string, optional)
193
    - pairs: Path to pairs file (string, optional)
194
    - graph: Path to graph file (string, optional)
195
    - delimiter: Delimiter used in data file (string)
196
    - has_header: Whether file has header row (bool)
197
    - ignore_csv_quoting: Ignore CSV quoting (bool)
198
    - feature_names: Feature names (list, optional)
199
    - thread_count: Number of threads (int)
200
    - ignored_features: Indices of ignored features (list, optional)
201
    - per_float_feature_quantization: Per-feature quantization settings (dict, optional)
202
    - border_count: Number of borders for quantization (int, optional)
203
    - max_bin: Maximum number of bins (int, optional)
204
    - feature_border_type: Border type for features (string, optional)
205
    - nan_mode: NaN handling mode (string, optional)
206
    - input_borders: Input borders (dict, optional)
207
    - task_type: Task type ('CPU' or 'GPU', optional)
208
    - used_ram_limit: RAM usage limit (string, optional)
209
    - random_seed: Random seed (int, optional)
210
    
211
    Returns:
212
    Pool: Quantized Pool object
213
    """
214

215
def create_cd(label=None, cat_features=None, text_features=None,
216
             embedding_features=None, weight=None, baseline=None,
217
             doc_id=None, group_id=None, subgroup_id=None,
218
             timestamp=None, auxiliary_columns=None, feature_names=None,
219
             output_path='train.cd'):
220
    """
221
    Create column description file for CatBoost data loading.
222
    
223
    Parameters:
224
    - label: Label column index (int, optional)
225
    - cat_features: Categorical feature column indices (list of int, optional)
226
    - text_features: Text feature column indices (list of int, optional)
227
    - embedding_features: Embedding feature column indices (list of int, optional)
228
    - weight: Weight column index (int, optional)
229
    - baseline: Baseline column index (int, optional)
230
    - doc_id: Document ID column index (int, optional)
231
    - group_id: Group ID column index (int, optional)
232
    - subgroup_id: Subgroup ID column index (int, optional)
233
    - timestamp: Timestamp column index (int, optional)
234
    - auxiliary_columns: Auxiliary column indices (list of int, optional)
235
    - feature_names: Feature names (list of str, optional)
236
    - output_path: Output file path (string)
237
    """
238

239
def read_cd(cd_file, column_count=None, data_file=None, canonize_column_types=False):
240
    """
241
    Read column description file.
242
    
243
    Parameters:
244
    - cd_file: Path to column description file (string)
245
    - column_count: Number of columns expected (int, optional)
246
    - data_file: Path to data file for validation (string, optional)
247
    - canonize_column_types: Whether to canonize column types (bool)
248
    
249
    Returns:
250
    dict: Column description information
251
    """
252
```
253

254
### Additional Utility Functions
255

256
Other utility functions for advanced use cases.
257

258
```python { .api }
259
def compute_wx_test():
260
    """
261
    Compute Wilcoxon test statistic.
262
    """
263

264
class TargetStats:
265
    """
266
    Target statistics computation class.
267
    """
268

269
class DataMetaInfo:
270
    """
271
    Data metadata information class.
272
    """
273

274
def compute_training_options():
275
    """
276
    Compute training options and parameters.
277
    """
278
```
279

280
## Utility Examples
281

282
### Model Evaluation and Analysis
283

284
```python
285
from catboost import CatBoostClassifier, Pool
286
from catboost.utils import eval_metric, get_confusion_matrix, get_roc_curve
287

288
# Train model
289
model = CatBoostClassifier(iterations=100, verbose=False)
290
model.fit(X_train, y_train)
291

292
# Create test pool with labels for evaluation
293
test_pool = Pool(X_test, y_test, cat_features=['category'])
294

295
# Get predictions
296
predictions = model.predict(test_pool)
297
probabilities = model.predict_proba(test_pool)[:, 1]  # Positive class probabilities
298

299
# Evaluate various metrics
300
accuracy = eval_metric(y_test, predictions, 'Accuracy')[0]  # eval_metric returns a list of values
301
auc = eval_metric(y_test, probabilities, 'AUC')[0]
302
logloss = eval_metric(y_test, probabilities, 'Logloss')[0]
303

304
print(f"Accuracy: {accuracy:.4f}")
305
print(f"AUC: {auc:.4f}") 
306
print(f"LogLoss: {logloss:.4f}")
307

308
# Get confusion matrix
309
conf_matrix = get_confusion_matrix(model, test_pool)
310
print("Confusion Matrix:")
311
print(conf_matrix)
312

313
# Get ROC curve data
314
fpr, tpr, thresholds = get_roc_curve(model, test_pool, plot=True)
315
print(f"ROC curve computed with {len(thresholds)} thresholds")
316
```
317

318
### Threshold Optimization
319

320
```python
321
from catboost.utils import select_threshold, get_fpr_curve, get_fnr_curve
322

323
# Get FPR and FNR curves
324
thresholds_fpr, fpr_values = get_fpr_curve(model, test_pool)
325
thresholds_fnr, fnr_values = get_fnr_curve(model, test_pool)
326

327
# Select threshold for specific FPR constraint
328
threshold_fpr = select_threshold(model, test_pool, FPR=0.05)  # 5% FPR
329
print(f"Threshold for 5% FPR: {threshold_fpr:.4f}")
330

331
# Select threshold for specific FNR constraint  
332
threshold_fnr = select_threshold(model, test_pool, FNR=0.10)  # 10% FNR
333
print(f"Threshold for 10% FNR: {threshold_fnr:.4f}")
334

335
# Apply optimal threshold to predictions
336
optimal_predictions = (probabilities > threshold_fpr).astype(int)
337
optimal_accuracy = eval_metric(y_test, optimal_predictions, 'Accuracy')[0]  # eval_metric returns a list of values
338
print(f"Accuracy with optimal threshold: {optimal_accuracy:.4f}")
339
```
340

341
### Model Export and Deployment
342

343
```python
344
from catboost.utils import convert_to_onnx_object
345
import onnx
346

347
# Export model to ONNX format
348
onnx_model = convert_to_onnx_object(model, export_parameters={
349
    'onnx_domain': 'ai.catboost',
350
    'onnx_model_version': 1,
351
    'onnx_doc_string': 'CatBoost classifier model',
352
    'onnx_graph_name': 'CatBoostModel'
353
})
354

355
# Save ONNX model
356
onnx.save(onnx_model, 'model.onnx')
357
print("Model exported to ONNX format")
358

359
# Export as Python code
360
model.save_model('model.py', format='python')
361
print("Model exported as Python code")
362

363
# Export as C++ code
364
model.save_model('model.cpp', format='cpp')
365
print("Model exported as C++ code")
366
```
367

368
### GPU Utilization
369

370
```python
371
from catboost.utils import get_gpu_device_count
372
from catboost import CatBoostRegressor
373

374
# Check GPU availability
375
gpu_count = get_gpu_device_count()
376
print(f"Available GPU devices: {gpu_count}")
377

378
if gpu_count > 0:
379
    # Train model on GPU
380
    gpu_model = CatBoostRegressor(
381
        iterations=500,
382
        task_type='GPU',
383
        devices='0',  # Use first GPU
384
        verbose=False
385
    )
386
    gpu_model.fit(X_train, y_train)
387
    print("Model trained on GPU")
388
else:
389
    print("No GPU devices available, using CPU")
390
```
391

392
### Data Quantization Optimization
393

394
```python
395
from catboost.utils import calculate_quantization_grid, quantize
396
from catboost import Pool
397
import numpy as np
398

399
# Calculate custom quantization grid
400
feature_values = X_train.iloc[:, 0].values  # First feature
401
custom_borders = calculate_quantization_grid(
402
    values=feature_values,
403
    border_count=64,
404
    border_type='GreedyLogSum'
405
)
406

407
print(f"Custom quantization borders: {len(custom_borders)} borders")
408
print(f"Border range: [{custom_borders[0]:.4f}, {custom_borders[-1]:.4f}]")
409

410
# Create and quantize pool
411
train_pool = Pool(X_train, y_train, cat_features=['category'])
412
quantized_pool = quantize(
413
    data_path='train.tsv',  # quantize() builds the Pool from a data file; it has no 'pool' argument
414
    border_count=128,
415
    feature_border_type='GreedyLogSum',
416
    task_type='CPU'
417
)
418

419
print("Pool quantized successfully")
420
print(f"Original pool quantized: {train_pool.is_quantized()}")
421
print(f"Quantized pool quantized: {quantized_pool.is_quantized()}")
422
```
423

424
### Model Performance Benchmarking
425

426
```python
427
import time
428
import psutil
429
import numpy as np
430

431
def benchmark_prediction(model, data, num_runs=100):
432
    """Custom benchmarking function."""
433
    
434
    times = []
435
    for _ in range(num_runs):
436
        start_time = time.time()
437
        predictions = model.predict(data)
438
        end_time = time.time()
439
        times.append(end_time - start_time)
440
    
441
    avg_time = np.mean(times)
442
    std_time = np.std(times)
443
    predictions_per_second = len(data) / avg_time
444
    
445
    return {
446
        'avg_time': avg_time,
447
        'std_time': std_time,
448
        'predictions_per_second': predictions_per_second,
449
        'num_predictions': len(data)
450
    }
451

452
# Benchmark model performance
453
benchmark_results = benchmark_prediction(model, X_test, num_runs=50)
454

455
print(f"Average prediction time: {benchmark_results['avg_time']:.6f} seconds")
456
print(f"Standard deviation: {benchmark_results['std_time']:.6f} seconds")
457
print(f"Predictions per second: {benchmark_results['predictions_per_second']:.0f}")
458
print(f"Processed {benchmark_results['num_predictions']} samples")
459
```

Version

Tile

Files

utilities.md docs/

utilities.mddocs/