CatBoost is a fast, scalable, high performance gradient boosting on decision trees library used for ranking, classification, regression and other ML tasks.
—
Quality: Pending — does it follow best practices?
Impact: Pending — no eval scenarios have been run.
CatBoost's data handling capabilities center around the Pool class, which efficiently manages training data with categorical features, text features, embeddings, and metadata. The Pool class optimizes data storage and access patterns for CatBoost's gradient boosting algorithms.
The primary data container for CatBoost that handles various data types, feature specifications, and metadata required for training and prediction.
class Pool:
    """
    The primary data container for CatBoost.

    Manages input data together with feature-type specifications
    (categorical, text, embedding) and per-object metadata (weights,
    group ids, pairs, baselines) required for training and prediction.
    """

    def __init__(self, data, label=None, cat_features=None, text_features=None,
                 embedding_features=None, embedding_features_data=None,
                 column_description=None, pairs=None, graph=None, delimiter='\t',
                 has_header=False, weight=None, group_id=None, group_weight=None,
                 subgroup_id=None, pairs_weight=None, baseline=None, timestamp=None,
                 feature_names=None, feature_tags=None, thread_count=-1):
        """
        Create a Pool object for CatBoost training and prediction.

        Parameters:
        - data: Input data (list, numpy.ndarray, pandas.DataFrame, pandas.Series,
          FeaturesData, string path, or pathlib.Path)
        - label: Target values (array-like, string path, or pathlib.Path)
        - cat_features: Categorical feature column indices or names (list of int/str)
        - text_features: Text feature column indices or names (list of int/str)
        - embedding_features: Embedding feature column indices or names (list of int/str)
        - embedding_features_data: Embedding feature data (list of numpy.ndarray)
        - column_description: Path to column description file (string)
        - pairs: Pairs for ranking tasks (array-like or string path)
        - graph: Graph data (dict or string path)
          NOTE(review): original text said "for collaborative filtering" —
          confirm the intended semantics against the CatBoost documentation.
        - delimiter: Column delimiter for file inputs (default: '\t')
        - has_header: Whether input files have headers (bool)
        - weight: Sample weights (array-like)
        - group_id: Group identifiers for ranking (array-like)
        - group_weight: Group weights (array-like)
        - subgroup_id: Subgroup identifiers (array-like)
        - pairs_weight: Pairs weights for ranking (array-like)
        - baseline: Baseline values (array-like)
        - timestamp: Timestamp values (array-like)
        - feature_names: Feature names (list of str)
        - feature_tags: Feature tags for feature selection (dict)
        - thread_count: Number of threads for data processing
          (-1 presumably means "use all available cores" — confirm)
        """

    def slice(self, rindex):
        """
        Create a new Pool with a subset of objects.

        Parameters:
        - rindex: Row indices to include (array-like of int)

        Returns:
        Pool: New Pool object with selected rows
        """

    def set_feature_names(self, feature_names):
        """
        Set feature names for the Pool.

        Parameters:
        - feature_names: List of feature names (list of str)
        """

    def set_baseline(self, baseline):
        """
        Set baseline values for the Pool.

        Parameters:
        - baseline: Baseline values (array-like)
        """

    def set_weight(self, weight):
        """
        Set sample weights for the Pool.

        Parameters:
        - weight: Sample weights (array-like)
        """

    def set_group_id(self, group_id):
        """
        Set group identifiers for ranking tasks.

        Parameters:
        - group_id: Group identifiers (array-like)
        """

    def set_group_weight(self, group_weight):
        """
        Set group weights for ranking tasks.

        Parameters:
        - group_weight: Group weights (array-like)
        """

    def set_pairs(self, pairs):
        """
        Set pairs for ranking tasks.

        Parameters:
        - pairs: Pairs data (array-like)
        """

    def save(self, fname, format=None, pool_metainfo=None):
        """
        Save Pool to file.

        Parameters:
        - fname: Output file name (string)
        - format: Output format ('dsv' or None for auto-detection)
        - pool_metainfo: Additional pool metadata (dict)
        """

    def quantize(self, ignored_features=None, per_float_feature_quantization=None,
                 border_count=None, max_bin=None, feature_border_type=None,
                 sparse_features_conflict_fraction=0.0, nan_mode=None,
                 input_borders=None, task_type=None, used_ram_limit=None):
        """
        Quantize Pool data for faster training.

        Parameters:
        - ignored_features: Features to ignore during quantization (list)
        - per_float_feature_quantization: Per-feature quantization settings (list)
        - border_count: Number of borders for quantization (int)
        - max_bin: Maximum number of bins (int)
        - feature_border_type: Border selection method (str)
        - sparse_features_conflict_fraction: Conflict fraction for sparse features
        - nan_mode: NaN handling mode ('Min', 'Max')
        - input_borders: Pre-computed borders (dict)
        - task_type: Task type ('CPU' or 'GPU')
        - used_ram_limit: RAM usage limit (str)

        Returns:
        Pool: Quantized Pool object
        """

    @property
    def shape(self):
        """Get Pool shape (n_samples, n_features)."""

    @property
    def num_row(self):
        """Get number of rows in Pool."""

    @property
    def num_col(self):
        """Get number of columns in Pool."""

    def get_feature_names(self):
        """
        Get feature names.

        Returns:
        list: Feature names
        """

    def get_cat_feature_indices(self):
        """
        Get categorical feature indices.

        Returns:
        list: Categorical feature column indices
        """

    def get_text_feature_indices(self):
        """
        Get text feature indices.

        Returns:
        list: Text feature column indices
        """

    def get_embedding_feature_indices(self):
        """
        Get embedding feature indices.

        Returns:
        list: Embedding feature column indices
        """

    def is_empty(self):
        """
        Check if Pool is empty.

        Returns:
        bool: True if Pool is empty
        """

    def is_quantized(self):
        """
        Check if Pool is quantized.

        Returns:
        bool: True if Pool is quantized
        """

Low-level container for feature data with metadata, used internally by CatBoost for efficient data management.
class FeaturesData:
    """
    Container for feature data with metadata.

    This class is primarily used internally by CatBoost for efficient
    feature data storage and manipulation. Most users should use the
    Pool class instead.
    """

    def __init__(self, *args, **kwargs):
        """Initialize FeaturesData object."""
        # Internal methods and properties for feature data management.
        # Detailed API not exposed as this is primarily internal.

Utility functions for data preparation, column description files, and data format conversion.
def create_cd(label_column, cat_feature_indices=None, column_description_path="train.cd"):
    """
    Write a column-description (.cd) file for CatBoost file-based input.

    Parameters:
    - label_column (int): index of the column holding the target/label.
    - cat_feature_indices (list of int, optional): indices of the columns
      to be treated as categorical features.
    - column_description_path (str): path of the .cd file to write
      (default: "train.cd").
    """
def read_cd(column_description_path, delimiter='\t'):
    """
    Parse a CatBoost column-description (.cd) file.

    Parameters:
    - column_description_path (str): path to the column description file.
    - delimiter (str): field delimiter used in the file (default: tab).

    Returns:
    dict: parsed column description information.
    """
def quantize(pool, ignored_features=None, per_float_feature_quantization=None,
             border_count=None, max_bin=None, feature_border_type=None,
             sparse_features_conflict_fraction=0.0, nan_mode=None,
             input_borders=None, task_type=None, used_ram_limit=None):
    """
    Module-level convenience wrapper that quantizes a Pool for faster training.

    All parameters have the same meaning as in Pool.quantize().

    Returns:
    Pool: quantized Pool object.
    """
def calculate_quantization_grid(values, border_count=128, border_type='Median'):
    """
    Calculate quantization grid for numerical values.

    Parameters:
    - values: Input values (array-like)
    - border_count: Number of borders to create (int)
    - border_type: Border selection method ('Median', 'Uniform',
      'UniformAndQuantiles', 'MaxLogSum', 'MinEntropy', 'GreedyLogSum')

    Returns:
    numpy.ndarray: Quantization borders
    """

CatBoost Pool supports multiple input formats for maximum flexibility:
# Example 1: create a Pool from a pandas DataFrame.
import pandas as pd
from catboost import Pool

df = pd.DataFrame({
    'feature1': [1, 2, 3, 4],
    'feature2': [0.1, 0.2, 0.3, 0.4],
    'category': ['A', 'B', 'A', 'C']
})
labels = [0, 1, 0, 1]
pool = Pool(
    data=df,
    label=labels,
    cat_features=['category']
)

# Example 2: create a Pool from NumPy arrays.
import numpy as np
from catboost import Pool

data = np.array([[1, 0.1, 0], [2, 0.2, 1], [3, 0.3, 0], [4, 0.4, 2]])
labels = np.array([0, 1, 0, 1])
pool = Pool(
    data=data,
    label=labels,
    cat_features=[2]  # Third column is categorical
)

# Example 3: create a Pool from files on disk.
from catboost import Pool

pool = Pool(
    data='train.tsv',
    column_description='train.cd',
    delimiter='\t',
    has_header=True
)

# Example 4: Pool with comprehensive metadata.
# NOTE(review): sample_weights, group_ids, ranking_pairs and baseline_values
# are assumed to be defined elsewhere — this snippet is illustrative only.
from catboost import Pool

pool = Pool(
    data=df,
    label=labels,
    cat_features=['category'],
    text_features=['description'],
    embedding_features=['user_embedding'],
    weight=sample_weights,
    group_id=group_ids,  # For ranking
    pairs=ranking_pairs,  # For ranking
    baseline=baseline_values,
    feature_names=['feat1', 'feat2', 'cat1'],
    feature_tags={'important': [0, 1], 'text': [2]}
)

Install with Tessl CLI
npx tessl i tessl/pypi-catboost