or run

npx @tessl/cli init
Log in

Version

Tile

Overview

Evals

Files

Files

docs

advanced-features.md · core-models.md · data-handling.md · datasets.md · evaluation.md · feature-analysis.md · index.md · metrics.md · training-evaluation.md · utilities.md · visualization.md

docs/data-handling.md

0

# Data Handling

1

2

CatBoost's data handling capabilities center around the Pool class, which efficiently manages training data with categorical features, text features, embeddings, and metadata. The Pool class optimizes data storage and access patterns for CatBoost's gradient boosting algorithms.

3

4

## Capabilities

5

6

### Pool Class

7

8

`Pool` is the primary data container for CatBoost. It handles the various data types, feature specifications, and metadata required for training and prediction.

9

10

```python { .api }

11

class Pool:

12

def __init__(self, data, label=None, cat_features=None, text_features=None,

13

embedding_features=None, embedding_features_data=None,

14

column_description=None, pairs=None, graph=None, delimiter='\t',

15

has_header=False, weight=None, group_id=None, group_weight=None,

16

subgroup_id=None, pairs_weight=None, baseline=None, timestamp=None,

17

feature_names=None, feature_tags=None, thread_count=-1):

18

"""

19

Create a Pool object for CatBoost training and prediction.

20

21

Parameters:

22

- data: Input data (list, numpy.ndarray, pandas.DataFrame, pandas.Series,

23

FeaturesData, string path, or pathlib.Path)

24

- label: Target values (array-like, string path, or pathlib.Path)

25

- cat_features: Categorical feature column indices or names (list of int/str)

26

- text_features: Text feature column indices or names (list of int/str)

27

- embedding_features: Embedding feature column indices or names (list of int/str)

28

- embedding_features_data: Embedding feature data (list of numpy.ndarray)

29

- column_description: Path to column description file (string)

30

- pairs: Pairs for ranking tasks (array-like or string path)

31

- graph: Graph for collaborative filtering (dict or string path)

32

- delimiter: Column delimiter for file inputs (default: '\t')

33

- has_header: Whether input files have headers (bool)

34

- weight: Sample weights (array-like)

35

- group_id: Group identifiers for ranking (array-like)

36

- group_weight: Group weights (array-like)

37

- subgroup_id: Subgroup identifiers (array-like)

38

- pairs_weight: Pairs weights for ranking (array-like)

39

- baseline: Baseline values (array-like)

40

- timestamp: Timestamp values (array-like)

41

- feature_names: Feature names (list of str)

42

- feature_tags: Feature tags for feature selection (dict)

43

- thread_count: Number of threads for data processing

44

"""

45

46

def slice(self, rindex):

47

"""

48

Create a new Pool with a subset of objects.

49

50

Parameters:

51

- rindex: Row indices to include (array-like of int)

52

53

Returns:

54

Pool: New Pool object with selected rows

55

"""

56

57

def set_feature_names(self, feature_names):

58

"""

59

Set feature names for the Pool.

60

61

Parameters:

62

- feature_names: List of feature names (list of str)

63

"""

64

65

def set_baseline(self, baseline):

66

"""

67

Set baseline values for the Pool.

68

69

Parameters:

70

- baseline: Baseline values (array-like)

71

"""

72

73

def set_weight(self, weight):

74

"""

75

Set sample weights for the Pool.

76

77

Parameters:

78

- weight: Sample weights (array-like)

79

"""

80

81

def set_group_id(self, group_id):

82

"""

83

Set group identifiers for ranking tasks.

84

85

Parameters:

86

- group_id: Group identifiers (array-like)

87

"""

88

89

def set_group_weight(self, group_weight):

90

"""

91

Set group weights for ranking tasks.

92

93

Parameters:

94

- group_weight: Group weights (array-like)

95

"""

96

97

def set_pairs(self, pairs):

98

"""

99

Set pairs for ranking tasks.

100

101

Parameters:

102

- pairs: Pairs data (array-like)

103

"""

104

105

def save(self, fname, format=None, pool_metainfo=None):

106

"""

107

Save Pool to file.

108

109

Parameters:

110

- fname: Output file name (string)

111

- format: Output format ('dsv' or None for auto-detection)

112

- pool_metainfo: Additional pool metadata (dict)

113

"""

114

115

def quantize(self, ignored_features=None, per_float_feature_quantization=None,

116

border_count=None, max_bin=None, feature_border_type=None,

117

sparse_features_conflict_fraction=0.0, nan_mode=None,

118

input_borders=None, task_type=None, used_ram_limit=None):

119

"""

120

Quantize Pool data for faster training.

121

122

Parameters:

123

- ignored_features: Features to ignore during quantization (list)

124

- per_float_feature_quantization: Per-feature quantization settings (list)

125

- border_count: Number of borders for quantization (int)

126

- max_bin: Maximum number of bins (int)

127

- feature_border_type: Border selection method (str)

128

- sparse_features_conflict_fraction: Conflict fraction for sparse features

129

- nan_mode: NaN handling mode ('Forbidden', 'Min', 'Max')

130

- input_borders: Pre-computed borders (dict)

131

- task_type: Task type ('CPU' or 'GPU')

132

- used_ram_limit: RAM usage limit (str)

133

134

Returns:

135

Pool: Quantized Pool object

136

"""

137

138

@property

139

def shape(self):

140

"""Get Pool shape (n_samples, n_features)."""

141

142

@property

143

def num_row(self):

144

"""Get number of rows in Pool."""

145

146

@property

147

def num_col(self):

148

"""Get number of columns in Pool."""

149

150

def get_feature_names(self):

151

"""

152

Get feature names.

153

154

Returns:

155

list: Feature names

156

"""

157

158

def get_cat_feature_indices(self):

159

"""

160

Get categorical feature indices.

161

162

Returns:

163

list: Categorical feature column indices

164

"""

165

166

def get_text_feature_indices(self):

167

"""

168

Get text feature indices.

169

170

Returns:

171

list: Text feature column indices

172

"""

173

174

def get_embedding_feature_indices(self):

175

"""

176

Get embedding feature indices.

177

178

Returns:

179

list: Embedding feature column indices

180

"""

181

182

def is_empty(self):

183

"""

184

Check if Pool is empty.

185

186

Returns:

187

bool: True if Pool is empty

188

"""

189

190

def is_quantized(self):

191

"""

192

Check if Pool is quantized.

193

194

Returns:

195

bool: True if Pool is quantized

196

"""

197

```

198

199

### FeaturesData Class

200

201

Low-level container for feature data with metadata. It is used internally by CatBoost for efficient data management, and a `FeaturesData` object can also be passed directly as the `data` argument of `Pool`.

202

203

```python { .api }

204

class FeaturesData:

205

"""

206

Container for feature data with metadata.

207

208

This class is primarily used internally by CatBoost for efficient

209

feature data storage and manipulation. Most users should use the

210

Pool class instead.

211

"""

212

213

def __init__(self, *args, **kwargs):

214

"""Initialize FeaturesData object."""

215

216

# Internal methods and properties for feature data management

217

# Detailed API not exposed as this is primarily internal

218

```

219

220

### Data Loading and Conversion Utilities

221

222

Utility functions for data preparation, column description files, and data format conversion.

223

224

```python { .api }

225

def create_cd(label_column, cat_feature_indices=None, column_description_path="train.cd"):

226

"""

227

Create column description file for CatBoost.

228

229

Parameters:

230

- label_column: Index of label column (int)

231

- cat_feature_indices: Indices of categorical features (list of int)

232

- column_description_path: Output file path (string)

233

"""

234

235

def read_cd(column_description_path, delimiter='\t'):

236

"""

237

Read column description file.

238

239

Parameters:

240

- column_description_path: Path to column description file (string)

241

- delimiter: Column delimiter (string)

242

243

Returns:

244

dict: Column description information

245

"""

246

247

def quantize(pool, ignored_features=None, per_float_feature_quantization=None,

248

border_count=None, max_bin=None, feature_border_type=None,

249

sparse_features_conflict_fraction=0.0, nan_mode=None,

250

input_borders=None, task_type=None, used_ram_limit=None):

251

"""

252

Quantize Pool data for faster training.

253

254

Parameters: Same as Pool.quantize()

255

256

Returns:

257

Pool: Quantized Pool object

258

"""

259

260

def calculate_quantization_grid(values, border_count=128, border_type='Median'):

261

"""

262

Calculate quantization grid for numerical values.

263

264

Parameters:

265

- values: Input values (array-like)

266

- border_count: Number of borders to create (int)

267

- border_type: Border selection method ('Median', 'Uniform', 'UniformAndQuantiles', 'MaxLogSum', 'MinEntropy', 'GreedyLogSum')

268

269

Returns:

270

numpy.ndarray: Quantization borders

271

"""

272

```

273

274

## Data Input Formats

275

276

CatBoost Pool supports multiple input formats for maximum flexibility:

277

278

### DataFrame Input

279

```python

280

import pandas as pd

281

from catboost import Pool

282

283

# Create Pool from pandas DataFrame

284

df = pd.DataFrame({

285

'feature1': [1, 2, 3, 4],

286

'feature2': [0.1, 0.2, 0.3, 0.4],

287

'category': ['A', 'B', 'A', 'C']

288

})

289

labels = [0, 1, 0, 1]

290

291

pool = Pool(

292

data=df,

293

label=labels,

294

cat_features=['category']

295

)

296

```

297

298

### NumPy Array Input

299

```python

300

import numpy as np

301

from catboost import Pool

302

303

# Create Pool from NumPy arrays

304

data = np.array([[1, 0.1, 0], [2, 0.2, 1], [3, 0.3, 0], [4, 0.4, 2]])

305

labels = np.array([0, 1, 0, 1])

306

307

pool = Pool(

308

data=data,

309

label=labels,

310

cat_features=[2] # Third column is categorical

311

)

312

```

313

314

### File Input

315

```python

316

from catboost import Pool

317

318

# Create Pool from files

319

pool = Pool(

320

data='train.tsv',

321

column_description='train.cd',

322

delimiter='\t',

323

has_header=True

324

)

325

```

326

327

### Advanced Data Configurations

328

329

```python

330

from catboost import Pool

331

332

# Pool with comprehensive metadata

333

pool = Pool(

334

data=df,

335

label=labels,

336

cat_features=['category'],

337

text_features=['description'],

338

embedding_features=['user_embedding'],

339

weight=sample_weights,

340

group_id=group_ids, # For ranking

341

pairs=ranking_pairs, # For ranking

342

baseline=baseline_values,

343

feature_names=['feat1', 'feat2', 'cat1'],

344

feature_tags={'important': [0, 1], 'text': [2]}

345

)

346

```