or run

npx @tessl/cli init
Log in

Version

Tile

Overview

Evals

Files

Files

docs

core-data-models.md, distributed-computing.md, index.md, sklearn-interface.md, training-evaluation.md, utilities.md

docs/core-data-models.md

0

# Core Data Structures and Models

1

2

Fundamental XGBoost data structures and model objects that provide the foundation for training and prediction. These components handle data ingestion, optimization, and model storage with support for various data formats and memory optimization strategies.

3

4

## Capabilities

5

6

### DMatrix - Primary Data Structure

7

8

The core data structure for XGBoost, which optimizes data storage and access patterns for gradient boosting. DMatrix accepts various input formats, including NumPy arrays, pandas DataFrames, and scipy sparse matrices, and supports missing values, categorical features, and external-memory datasets.

9

10

```python { .api }

11

class DMatrix:
    """Optimized data matrix for XGBoost training and prediction.

    API reference listing: every method below carries only its signature
    and docstring.
    """

    def __init__(self, data, label=None, *, weight=None, base_margin=None,
                 missing=None, silent=False, feature_names=None,
                 feature_types=None, nthread=None, group=None, qid=None,
                 label_lower_bound=None, label_upper_bound=None,
                 feature_weights=None, enable_categorical=False,
                 data_split_mode=DataSplitMode.ROW):
        """
        Optimized data matrix for XGBoost training and prediction.

        Parameters:
        - data: Input data (array-like, DataFrame, sparse matrix, or file path)
        - label: Target values (array-like)
        - weight: Instance weights (array-like)
        - base_margin: Base prediction margins (array-like)
        - missing: Value to be treated as missing (float, default: NaN)
        - silent: Whether to suppress loading messages (bool)
        - feature_names: Names for features (list of str)
        - feature_types: Types for features ('int', 'float', 'c' for categorical)
        - nthread: Number of threads for loading data (int)
        - group: Group sizes for ranking (array-like)
        - qid: Query IDs for ranking (array-like)
        - label_lower_bound: Lower bound of the label for survival training (array-like)
        - label_upper_bound: Upper bound of the label for survival training (array-like)
        - feature_weights: Weights for features (array-like)
        - enable_categorical: Enable categorical feature support (bool)
        - data_split_mode: How to split data for distributed training
        """

    def set_info(self, *, label=None, weight=None, base_margin=None,
                 group=None, qid=None, label_lower_bound=None,
                 label_upper_bound=None, feature_names=None,
                 feature_types=None, feature_weights=None):
        """
        Set meta-information for the DMatrix.

        Parameters: Same as constructor parameters for updating specific fields
        """

    def get_label(self):
        """Get the labels of the DMatrix. Returns: numpy.ndarray"""

    def get_weight(self):
        """Get the weights of the DMatrix. Returns: numpy.ndarray"""

    def get_base_margin(self):
        """Get the base margins of the DMatrix. Returns: numpy.ndarray"""

    def get_group(self):
        """Get the group sizes of the DMatrix. Returns: numpy.ndarray"""

    def set_label(self, label):
        """Set labels for the DMatrix. Parameters: label (array-like)"""

    def set_weight(self, weight):
        """Set instance weights for the DMatrix. Parameters: weight (array-like)"""

    def set_base_margin(self, margin):
        """Set base prediction margins. Parameters: margin (array-like)"""

    def set_group(self, group):
        """Set group sizes for ranking. Parameters: group (array-like)"""

    def get_float_info(self, field):
        """Get float information by field name. Returns: numpy.ndarray"""

    def get_uint_info(self, field):
        """Get unsigned integer information by field name. Returns: numpy.ndarray"""

    def set_float_info(self, field, data):
        """Set float information. Parameters: field (str), data (array-like)"""

    def set_uint_info(self, field, data):
        """Set unsigned integer information. Parameters: field (str), data (array-like)"""

    def save_binary(self, fname, silent=True):
        """Save DMatrix to binary format. Parameters: fname (str), silent (bool)"""

    # NOTE(review): upstream xgboost.DMatrix does not appear to define
    # load_model(); a file written by save_binary() is normally loaded by
    # passing its path as `data` to the constructor. Confirm against the
    # installed xgboost version before relying on this entry.
    def load_model(self, fname):
        """Load DMatrix from file. Parameters: fname (str)"""

    def get_data(self):
        """Get the data matrix. Returns: CSR matrix representation"""

    def num_row(self):
        """Get number of rows. Returns: int"""

    def num_col(self):
        """Get number of columns. Returns: int"""

    def num_nonmissing(self):
        """Get number of non-missing values. Returns: int"""

    def slice(self, rindex, allow_groups=False):
        """
        Slice DMatrix by row indices.

        Parameters:
        - rindex: Row indices to select (array-like)
        - allow_groups: Whether to allow slicing with groups (bool)

        Returns: DMatrix
        """

    @property
    def feature_names(self):
        """Feature names. Returns: list of str or None"""

    @property
    def feature_types(self):
        """Feature types. Returns: list of str or None"""

122

```

123

124

### QuantileDMatrix - Memory-Efficient Data Structure

125

126

Memory-efficient variant of DMatrix that uses a quantized data representation, designed specifically for the hist tree method. It reduces memory usage while maintaining accuracy for large datasets.

127

128

```python { .api }

129

class QuantileDMatrix:
    """Memory-efficient DMatrix using quantized data for the hist tree method.

    API reference listing: signatures and docstrings only.
    """

    def __init__(self, data, label=None, *, ref=None, weight=None,
                 base_margin=None, missing=None, silent=False,
                 feature_names=None, feature_types=None, nthread=None,
                 max_bin=256, group=None, qid=None, label_lower_bound=None,
                 label_upper_bound=None, feature_weights=None,
                 enable_categorical=False):
        """
        Memory-efficient DMatrix using quantized data for hist tree method.

        Parameters: Similar to DMatrix with additional:
        - ref: Training QuantileDMatrix to reference when constructing
          validation/test data (QuantileDMatrix)
        - max_bin: Maximum number of bins for quantization (int)
        """

    @property
    def ref(self):
        """Reference to training QuantileDMatrix. Returns: QuantileDMatrix or None"""

147

```

148

149

### ExtMemQuantileDMatrix - External Memory Data Structure

150

151

External-memory version of QuantileDMatrix for datasets that don't fit in memory. It enables training on very large datasets by streaming data from disk.

152

153

```python { .api }

154

class ExtMemQuantileDMatrix:
    """External-memory QuantileDMatrix for datasets that don't fit in memory.

    API reference listing: signatures and docstrings only.
    """

    def __init__(self, data, *, missing=None, nthread=None, max_bin=None,
                 ref=None, enable_categorical=False, max_num_device_pages=None,
                 max_quantile_batches=None):
        """
        External memory QuantileDMatrix for large datasets that don't fit in memory.

        Parameters:
        - data: Iterator that yields data chunks (DataIter)
        - missing: Value representing missing data (float, optional)
        - nthread: Number of threads for processing (int, optional)
        - max_bin: Number of histogram bins for quantization (int, optional)
        - ref: Reference DMatrix for validation data (DMatrix, optional)
        - enable_categorical: Enable categorical feature support (bool)
        - max_num_device_pages: GPU device memory page limit (int, optional)
        - max_quantile_batches: Maximum quantile batches for processing (int, optional)
        """

    @property
    def ref(self):
        """Reference to training DMatrix. Returns: DMatrix or None"""

175

```

176

177

### Booster - Trained Model

178

179

The core XGBoost model class that contains the trained ensemble of decision trees. It provides methods for prediction, evaluation, model persistence, and introspection.

180

181

```python { .api }

182

class Booster:
    """XGBoost model containing training, prediction, and evaluation routines.

    API reference listing: signatures and docstrings only.
    """

    def __init__(self, params=None, cache=(), model_file=None):
        """
        XGBoost model containing training, prediction, and evaluation routines.

        Parameters:
        - params: Training parameters (dict)
        - cache: List of DMatrix objects to cache (list)
        - model_file: Path to load existing model (str)
        """

    def update(self, dtrain, iteration, fobj=None):
        """
        Update the model for one iteration.

        Parameters:
        - dtrain: Training DMatrix (DMatrix)
        - iteration: Current iteration number (int)
        - fobj: Custom objective function (callable, optional)
        """

    def boost(self, dtrain, iteration, grad, hess):
        """
        Boost the model for one iteration with custom gradients.

        Parameters:
        - dtrain: Training DMatrix (DMatrix)
        - iteration: Current iteration number (int)
        - grad: Gradient values (array-like)
        - hess: Hessian values (array-like)
        """

    def predict(self, data, *, output_margin=False, pred_leaf=False,
                pred_contribs=False, approx_contribs=False,
                pred_interactions=False, validate_features=True,
                training=False, iteration_range=(0, 0), strict_shape=False):
        """
        Make predictions using the trained model.

        Parameters:
        - data: Input data (DMatrix); use inplace_predict for raw arrays
          or DataFrames
        - output_margin: Whether to output margin values (bool)
        - pred_leaf: Whether to output leaf indices (bool)
        - pred_contribs: Whether to output feature contributions (bool)
        - approx_contribs: Whether to use approximate feature contributions (bool)
        - pred_interactions: Whether to output interaction contributions (bool)
        - validate_features: Whether to validate feature names (bool)
        - training: Whether to use training mode (bool)
        - iteration_range: Range of trees to use for prediction (tuple)
        - strict_shape: Whether to enforce strict shape checking (bool)

        Returns: numpy.ndarray - Predictions
        """

    def inplace_predict(self, data, *, iteration_range=(0, 0),
                        predict_type='value', missing=float('nan'),
                        validate_features=True, base_margin=None,
                        strict_shape=False):
        """
        Inplace prediction without creating DMatrix.

        Parameters:
        - data: Input data (array-like or DataFrame)
        - iteration_range: Range of trees to use (tuple)
        - predict_type: Type of prediction ('value', 'margin', 'contrib', 'leaf')
        - missing: Value to treat as missing (float)
        - validate_features: Whether to validate features (bool)
        - base_margin: Base prediction margins (array-like)
        - strict_shape: Whether to enforce strict shape checking (bool)

        Returns: numpy.ndarray - Predictions
        """

    def eval(self, data, name='eval', iteration=0):
        """
        Evaluate model on given data.

        Parameters:
        - data: Evaluation data (DMatrix)
        - name: Name for evaluation (str)
        - iteration: Iteration to evaluate (int)

        Returns: str - Evaluation result
        """

    def eval_set(self, evals, iteration=0, feval=None, output_margin=True):
        """
        Evaluate model on multiple datasets.

        Parameters:
        - evals: List of (DMatrix, name) tuples (list)
        - iteration: Iteration to evaluate (int)
        - feval: Custom evaluation function (callable)
        - output_margin: Whether to output margins (bool)

        Returns: str - Evaluation results
        """

    def save_model(self, fname):
        """Save model to file. Parameters: fname (str)"""

    def load_model(self, fname):
        """Load model from file. Parameters: fname (str)"""

    def save_raw(self, raw_format='ubj'):
        """
        Save model to raw format bytes.

        Parameters:
        - raw_format: Format ('json', 'ubj', 'deprecated') (str)

        Returns: bytes - Serialized model
        """

    def load_config(self, config):
        """Load configuration. Parameters: config (str)"""

    def save_config(self):
        """Save current configuration. Returns: str - JSON configuration"""

    def get_dump(self, fmap='', with_stats=False, dump_format='text'):
        """
        Get model dump as list of strings.

        Parameters:
        - fmap: Feature map file (str)
        - with_stats: Whether to include statistics (bool)
        - dump_format: Output format ('text', 'json') (str)

        Returns: list of str - Model trees
        """

    def get_fscore(self, fmap=''):
        """
        Get feature importance scores.

        Parameters:
        - fmap: Feature map file (str)

        Returns: dict - Feature importance scores
        """

    def get_score(self, fmap='', importance_type='weight'):
        """
        Get feature importance scores by type.

        Parameters:
        - fmap: Feature map file (str)
        - importance_type: Type ('weight', 'gain', 'cover', 'total_gain', 'total_cover')

        Returns: dict - Feature importance scores
        """

    def trees_to_dataframe(self, fmap=''):
        """
        Convert trees to pandas DataFrame.

        Parameters:
        - fmap: Feature map file (str)

        Returns: pandas.DataFrame - Tree structure
        """

    def num_boosted_rounds(self):
        """Get number of boosted rounds. Returns: int"""

    def num_features(self):
        """Get number of features. Returns: int"""

    def copy(self):
        """Create a copy of the booster. Returns: Booster"""

    def attr(self, key):
        """Get attribute by key. Parameters: key (str). Returns: str or None"""

    def attributes(self):
        """Get all attributes. Returns: dict"""

    def set_attr(self, **kwargs):
        """Set attributes. Parameters: **kwargs - Key-value pairs"""

    def set_param(self, params, value=None):
        """
        Set parameter(s).

        Parameters:
        - params: Parameter name (str) or parameter dict (dict)
        - value: Parameter value (any, optional)
        """

    @property
    def feature_names(self):
        """Feature names. Returns: list of str or None"""

    @property
    def feature_types(self):
        """Feature types. Returns: list of str or None"""

    @property
    def best_iteration(self):
        """Best iteration from early stopping. Returns: int"""

    @property
    def best_score(self):
        """Best score from early stopping. Returns: float"""

387

```

388

389

### DataIter - Custom Data Loading

390

391

Abstract base class for implementing custom data iterators, enabling external memory training and custom data loading strategies for very large datasets.

392

393

```python { .api }

394

class DataIter:
    """Abstract base class for user-defined data iteration for external memory.

    API reference listing: signatures and docstrings only. Subclasses are
    expected to implement reset() and next().
    """

    def __init__(self, cache_prefix=None, release_data=True, *, on_host=True,
                 min_cache_page_bytes=None):
        """
        Abstract base class for user-defined data iteration for external memory.

        Parameters:
        - cache_prefix: Prefix for cache files (str, optional)
        - release_data: Whether to release data during iteration (bool)
        - on_host: Cache on host memory vs file system for GPU (bool)
        - min_cache_page_bytes: Minimum bytes per cache page (int, optional)
        """

    def reset(self):
        """Reset iterator to the beginning. Must be implemented by subclasses."""

    def next(self, input_data):
        """
        Set the next batch of data. Must be implemented by subclasses.

        Parameters:
        - input_data: Callback function with data fields like DMatrix (callable)
          Should be called as: input_data(data=X, label=y, weight=w, ...)

        Returns: bool - False if no more batches, True if more data available
        """

    def get_callbacks(self, enable_categorical):
        """
        Get callback functions for iterating in C.

        Parameters:
        - enable_categorical: Enable categorical feature support (bool)

        Returns: tuple - (reset_callback, next_callback)
        """

    def reraise(self):
        """Reraise any exception thrown during iteration."""

    @property
    def proxy(self):
        """Handle of DMatrix proxy for internal use. Returns: _ProxyDMatrix"""

437

```

438

439

## Constants and Enums

440

441

### DataSplitMode

442

443

```python { .api }

444

class DataSplitMode:
    """Data splitting mode for distributed training."""

    ROW = 0  # Split by rows
    COL = 1  # Split by columns

448

```

449

450

## Usage Examples

451

452

### Basic DMatrix Creation

453

454

```python

455

import xgboost as xgb

456

import numpy as np

457

import pandas as pd

458

459

# NOTE: `dtrain` is rebound by each variant below; only the last one is kept.

# From NumPy arrays
X = np.random.randn(1000, 10)
y = np.random.randint(0, 2, 1000)
dtrain = xgb.DMatrix(X, label=y, feature_names=[f'f{i}' for i in range(10)])

# From pandas DataFrame
df = pd.DataFrame(X, columns=[f'feature_{i}' for i in range(10)])
dtrain = xgb.DMatrix(df, label=y)

# With additional information
weights = np.random.uniform(0.5, 2.0, 1000)
dtrain = xgb.DMatrix(X, label=y, weight=weights,
                     feature_names=[f'f{i}' for i in range(10)],
                     feature_types=['float'] * 10)

473

```

474

475

### Memory-Efficient Data Loading

476

477

```python

478

# Use QuantileDMatrix for large datasets
dtrain = xgb.QuantileDMatrix(X_train, label=y_train, max_bin=512)
# Validation/test data references the training matrix via `ref`
dtest = xgb.QuantileDMatrix(X_test, label=y_test, ref=dtrain)


# For external memory training
class CustomDataIter(xgb.DataIter):
    """Feed XGBoost one file per batch.

    `load_data_from_file` is a user-supplied helper (not defined in this
    example) that returns an `(X, y)` pair for a given path.
    """

    def __init__(self, data_files):
        self.data_files = data_files
        self.file_idx = 0
        super().__init__()

    def reset(self):
        """Rewind to the first file."""
        self.file_idx = 0

    def next(self, input_data):
        """Pass the next file's data to `input_data`.

        Returns False once every file has been consumed, True after a
        batch has been supplied.
        """
        if self.file_idx >= len(self.data_files):
            # No more batches: signal the end of iteration.
            return False

        # Load data from current file
        X, y = load_data_from_file(self.data_files[self.file_idx])
        input_data(data=X, label=y)
        self.file_idx += 1
        # A batch was supplied, so more data may follow.
        return True


data_iter = CustomDataIter(['data1.csv', 'data2.csv', 'data3.csv'])
dtrain = xgb.ExtMemQuantileDMatrix(data_iter)

504

```

505

506

### Model Operations

507

508

```python

509

# Train model
params = {'objective': 'binary:logistic', 'max_depth': 6}
model = xgb.train(params, dtrain, num_boost_round=100)

# Make predictions
predictions = model.predict(dtest)

# Get feature importance
importance = model.get_score(importance_type='gain')
print(importance)

# Save and load model
model.save_model('model.json')
loaded_model = xgb.Booster()
loaded_model.load_model('model.json')

# Model introspection
print(f"Number of trees: {model.num_boosted_rounds()}")
print(f"Number of features: {model.num_features()}")

528

```