0
# Core Data Structures and Models
1
2
Fundamental XGBoost data structures and model objects that provide the foundation for training and prediction. These components handle data ingestion, optimization, and model storage with support for various data formats and memory optimization strategies.
3
4
## Capabilities
5
6
### DMatrix - Primary Data Structure
7
8
The core data structure for XGBoost, which optimizes data storage and access patterns for gradient boosting. DMatrix accepts various input formats, including NumPy arrays, pandas DataFrames, and SciPy sparse matrices, and supports missing values, categorical features, and external-memory datasets.
9
10
```python { .api }
11
class DMatrix:
12
def __init__(self, data, label=None, *, weight=None, base_margin=None,
13
missing=None, silent=False, feature_names=None,
14
feature_types=None, nthread=None, group=None, qid=None,
15
label_lower_bound=None, label_upper_bound=None,
16
feature_weights=None, enable_categorical=False,
17
data_split_mode=DataSplitMode.ROW):
18
"""
19
Optimized data matrix for XGBoost training and prediction.
20
21
Parameters:
22
- data: Input data (array-like, DataFrame, sparse matrix, or file path)
23
- label: Target values (array-like)
24
- weight: Instance weights (array-like)
25
- base_margin: Base prediction margins (array-like)
26
- missing: Value to be treated as missing (float, default: NaN)
27
- silent: Whether to suppress loading messages (bool)
28
- feature_names: Names for features (list of str)
29
- feature_types: Types for features ('int', 'float', 'c' for categorical)
30
- nthread: Number of threads for loading data (int)
31
- group: Group sizes for ranking (array-like)
32
- qid: Query IDs for ranking (array-like)
33
- label_lower_bound: Lower bound of the label for survival training, e.g. the AFT objective (array-like)
34
- label_upper_bound: Upper bound of the label for survival training, e.g. the AFT objective (array-like)
35
- feature_weights: Weights for features (array-like)
36
- enable_categorical: Enable categorical feature support (bool)
37
- data_split_mode: How to split data for distributed training
38
"""
39
40
def set_info(self, *, label=None, weight=None, base_margin=None,
41
group=None, qid=None, label_lower_bound=None,
42
label_upper_bound=None, feature_names=None,
43
feature_types=None, feature_weights=None):
44
"""
45
Set meta-information for the DMatrix.
46
47
Parameters: Same as constructor parameters for updating specific fields
48
"""
49
50
def get_label(self):
51
"""Get the labels of the DMatrix. Returns: numpy.ndarray"""
52
53
def get_weight(self):
54
"""Get the weights of the DMatrix. Returns: numpy.ndarray"""
55
56
def get_base_margin(self):
57
"""Get the base margins of the DMatrix. Returns: numpy.ndarray"""
58
59
def get_group(self):
60
"""Get the group sizes of the DMatrix. Returns: numpy.ndarray"""
61
62
def set_label(self, label):
63
"""Set labels for the DMatrix. Parameters: label (array-like)"""
64
65
def set_weight(self, weight):
66
"""Set instance weights for the DMatrix. Parameters: weight (array-like)"""
67
68
def set_base_margin(self, margin):
69
"""Set base prediction margins. Parameters: margin (array-like)"""
70
71
def set_group(self, group):
72
"""Set group sizes for ranking. Parameters: group (array-like)"""
73
74
def get_float_info(self, field):
75
"""Get float information by field name. Returns: numpy.ndarray"""
76
77
def get_uint_info(self, field):
78
"""Get unsigned integer information by field name. Returns: numpy.ndarray"""
79
80
def set_float_info(self, field, data):
81
"""Set float information. Parameters: field (str), data (array-like)"""
82
83
def set_uint_info(self, field, data):
84
"""Set unsigned integer information. Parameters: field (str), data (array-like)"""
85
86
def save_binary(self, fname, silent=True):
87
"""Save DMatrix to binary format. Parameters: fname (str), silent (bool)"""
88
89
def load_model(self, fname):
90
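"""Load DMatrix from file. Parameters: fname (str). NOTE(review): xgboost.DMatrix does not appear to expose a load_model method; a file written by save_binary is normally reloaded by passing its path to the DMatrix constructor. Confirm before relying on this entry."""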
91
92
def get_data(self):
93
"""Get the data matrix. Returns: CSR matrix representation"""
94
95
def num_row(self):
96
"""Get number of rows. Returns: int"""
97
98
def num_col(self):
99
"""Get number of columns. Returns: int"""
100
101
def num_nonmissing(self):
102
"""Get number of non-missing values. Returns: int"""
103
104
def slice(self, rindex, allow_groups=False):
105
"""
106
Slice DMatrix by row indices.
107
108
Parameters:
109
- rindex: Row indices to select (array-like)
110
- allow_groups: Whether to allow slicing with groups (bool)
111
112
Returns: DMatrix
113
"""
114
115
@property
116
def feature_names(self):
117
"""Feature names. Returns: list of str or None"""
118
119
@property
120
def feature_types(self):
121
"""Feature types. Returns: list of str or None"""
122
```
123
124
### QuantileDMatrix - Memory-Efficient Data Structure
125
126
Memory-efficient variant of DMatrix that uses quantized data representation, designed specifically for the hist tree method. Reduces memory usage while maintaining accuracy for large datasets.
127
128
```python { .api }
129
class QuantileDMatrix:
130
def __init__(self, data, label=None, *, ref=None, weight=None,
131
base_margin=None, missing=None, silent=False,
132
feature_names=None, feature_types=None, nthread=None,
133
max_bin=256, group=None, qid=None, label_lower_bound=None,
134
label_upper_bound=None, feature_weights=None,
135
enable_categorical=False):
136
"""
137
Memory-efficient DMatrix using quantized data for hist tree method.
138
139
Parameters: Similar to DMatrix with additional:
140
- ref: Training matrix that supplies the quantile cuts; pass it when constructing validation or test data so they share the training bins (DMatrix)
141
- max_bin: Maximum number of bins for quantization (int)
142
"""
143
144
@property
145
def ref(self):
146
"""Reference to training QuantileDMatrix. Returns: QuantileDMatrix or None"""
147
```
148
149
### ExtMemQuantileDMatrix - External Memory Data Structure
150
151
External memory version of QuantileDMatrix for datasets that don't fit in memory. Enables training on very large datasets by streaming data from disk.
152
153
```python { .api }
154
class ExtMemQuantileDMatrix:
155
def __init__(self, data, *, missing=None, nthread=None, max_bin=None,
156
ref=None, enable_categorical=False, max_num_device_pages=None,
157
max_quantile_batches=None):
158
"""
159
External memory QuantileDMatrix for large datasets that don't fit in memory.
160
161
Parameters:
162
- data: Iterator that yields data chunks (DataIter)
163
- missing: Value representing missing data (float, optional)
164
- nthread: Number of threads for processing (int, optional)
165
- max_bin: Number of histogram bins for quantization (int, optional)
166
- ref: Reference DMatrix for validation data (DMatrix, optional)
167
- enable_categorical: Enable categorical feature support (bool)
168
- max_num_device_pages: GPU device memory page limit (int, optional)
169
- max_quantile_batches: Maximum quantile batches for processing (int, optional)
170
"""
171
172
@property
173
def ref(self):
174
"""Reference to training DMatrix. Returns: DMatrix or None"""
175
```
176
177
### Booster - Trained Model
178
179
The core XGBoost model class that contains the trained ensemble of decision trees. Provides methods for prediction, evaluation, model persistence, and introspection.
180
181
```python { .api }
182
class Booster:
183
def __init__(self, params=None, cache=(), model_file=None):
184
"""
185
XGBoost model containing training, prediction, and evaluation routines.
186
187
Parameters:
188
- params: Training parameters (dict)
189
- cache: List of DMatrix objects to cache (list)
190
- model_file: Path to load existing model (str)
191
"""
192
193
def update(self, dtrain, iteration, fobj=None):
194
"""
195
Update the model for one iteration.
196
197
Parameters:
198
- dtrain: Training DMatrix (DMatrix)
199
- iteration: Current iteration number (int)
200
- fobj: Custom objective function (callable, optional)
201
"""
202
203
def boost(self, dtrain, iteration, grad, hess):
204
"""
205
Boost the model for one iteration with custom gradients.
206
207
Parameters:
208
- dtrain: Training DMatrix (DMatrix)
209
- iteration: Current iteration number (int)
210
- grad: Gradient values (array-like)
211
- hess: Hessian values (array-like)
212
"""
213
214
def predict(self, data, *, output_margin=False, pred_leaf=False,
215
pred_contribs=False, approx_contribs=False,
216
pred_interactions=False, validate_features=True,
217
training=False, iteration_range=(0, 0), strict_shape=False):
218
"""
219
Make predictions using the trained model.
220
221
Parameters:
222
- data: Input data (DMatrix); wrap arrays or DataFrames in a DMatrix first, or use inplace_predict for raw array-like input
223
- output_margin: Whether to output margin values (bool)
224
- pred_leaf: Whether to output leaf indices (bool)
225
- pred_contribs: Whether to output feature contributions (bool)
226
- approx_contribs: Whether to use approximate feature contributions (bool)
227
- pred_interactions: Whether to output interaction contributions (bool)
228
- validate_features: Whether to validate feature names (bool)
229
- training: Whether to use training mode (bool)
230
- iteration_range: Half-open range (begin, end) of boosting rounds to use for prediction; the default (0, 0) uses all rounds (tuple)
231
- strict_shape: Whether to return output with a fixed number of dimensions regardless of whether the model is multi-class (bool)
232
233
Returns: numpy.ndarray - Predictions
234
"""
235
236
def inplace_predict(self, data, *, iteration_range=(0, 0),
237
predict_type='value', missing=float('nan'),
238
validate_features=True, base_margin=None,
239
strict_shape=False):
240
"""
241
Inplace prediction without creating DMatrix.
242
243
Parameters:
244
- data: Input data (array-like or DataFrame)
245
- iteration_range: Half-open range (begin, end) of boosting rounds to use; the default (0, 0) uses all rounds (tuple)
246
- predict_type: Type of prediction: 'value' for the transformed prediction, 'margin' for the raw untransformed margin (str)
247
- missing: Value to treat as missing (float)
248
- validate_features: Whether to validate features (bool)
249
- base_margin: Base prediction margins (array-like)
250
- strict_shape: Whether to enforce strict shape checking (bool)
251
252
Returns: numpy.ndarray - Predictions
253
"""
254
255
def eval(self, data, name='eval', iteration=0):
256
"""
257
Evaluate model on given data.
258
259
Parameters:
260
- data: Evaluation data (DMatrix)
261
- name: Name for evaluation (str)
262
- iteration: Iteration to evaluate (int)
263
264
Returns: str - Evaluation result
265
"""
266
267
def eval_set(self, evals, iteration=0, feval=None, output_margin=True):
268
"""
269
Evaluate model on multiple datasets.
270
271
Parameters:
272
- evals: List of (DMatrix, name) tuples (list)
273
- iteration: Iteration to evaluate (int)
274
- feval: Custom evaluation function (callable)
275
- output_margin: Whether to output margins (bool)
276
277
Returns: str - Evaluation results
278
"""
279
280
def save_model(self, fname):
281
"""Save model to file. Parameters: fname (str)"""
282
283
def load_model(self, fname):
284
"""Load model from file. Parameters: fname (str)"""
285
286
def save_raw(self, raw_format='ubj'):
287
"""
288
Save model to raw format bytes.
289
290
Parameters:
291
- raw_format: Format ('json', 'ubj', 'deprecated') (str)
292
293
Returns: bytes - Serialized model
294
"""
295
296
def load_config(self, config):
297
"""Load configuration. Parameters: config (str)"""
298
299
def save_config(self):
300
"""Save current configuration. Returns: str - JSON configuration"""
301
302
def get_dump(self, fmap='', with_stats=False, dump_format='text'):
303
"""
304
Get model dump as list of strings.
305
306
Parameters:
307
- fmap: Feature map file (str)
308
- with_stats: Whether to include statistics (bool)
309
- dump_format: Output format ('text', 'json') (str)
310
311
Returns: list of str - Model trees
312
"""
313
314
def get_fscore(self, fmap=''):
315
"""
316
Get feature importance scores based on split counts (same as get_score with importance_type='weight').
317
318
Parameters:
319
- fmap: Feature map file (str)
320
321
Returns: dict - Feature importance scores
322
"""
323
324
def get_score(self, fmap='', importance_type='weight'):
325
"""
326
Get feature importance scores by type.
327
328
Parameters:
329
- fmap: Feature map file (str)
330
- importance_type: Type ('weight', 'gain', 'cover', 'total_gain', 'total_cover')
331
332
Returns: dict - Feature importance scores
333
"""
334
335
def trees_to_dataframe(self, fmap=''):
336
"""
337
Convert trees to pandas DataFrame.
338
339
Parameters:
340
- fmap: Feature map file (str)
341
342
Returns: pandas.DataFrame - Tree structure
343
"""
344
345
def num_boosted_rounds(self):
346
"""Get number of boosted rounds. Returns: int"""
347
348
def num_features(self):
349
"""Get number of features. Returns: int"""
350
351
def copy(self):
352
"""Create a copy of the booster. Returns: Booster"""
353
354
def attr(self, key):
355
"""Get attribute by key. Parameters: key (str). Returns: str or None"""
356
357
def attributes(self):
358
"""Get all attributes. Returns: dict"""
359
360
def set_attr(self, **kwargs):
361
"""Set attributes. Parameters: **kwargs - Key-value pairs"""
362
363
def set_param(self, params, value=None):
364
"""
365
Set parameter(s).
366
367
Parameters:
368
- params: Parameter name (str) or parameter dict (dict)
369
- value: Parameter value (any, optional)
370
"""
371
372
@property
373
def feature_names(self):
374
"""Feature names. Returns: list of str or None"""
375
376
@property
377
def feature_types(self):
378
"""Feature types. Returns: list of str or None"""
379
380
@property
381
def best_iteration(self):
382
"""Best iteration from early stopping. Returns: int"""
383
384
@property
385
def best_score(self):
386
"""Best score from early stopping. Returns: float"""
387
```
388
389
### DataIter - Custom Data Loading
390
391
Abstract base class for implementing custom data iterators, enabling external memory training and custom data loading strategies for very large datasets.
392
393
```python { .api }
394
class DataIter:
395
def __init__(self, cache_prefix=None, release_data=True, *, on_host=True,
396
min_cache_page_bytes=None):
397
"""
398
Abstract base class for user-defined data iteration for external memory.
399
400
Parameters:
401
- cache_prefix: Prefix for cache files (str, optional)
402
- release_data: Whether to release data during iteration (bool)
403
- on_host: Whether to cache data in host memory instead of on the file system when using GPU-based external memory (bool)
404
- min_cache_page_bytes: Minimum bytes per cache page (int, optional)
405
"""
406
407
def reset(self):
408
"""Reset iterator to the beginning. Must be implemented by subclasses."""
409
410
def next(self, input_data):
411
"""
412
Set the next batch of data. Must be implemented by subclasses.
413
414
Parameters:
415
- input_data: Callback function with data fields like DMatrix (callable)
416
Should be called as: input_data(data=X, label=y, weight=w, ...)
417
418
Returns: bool - False if no more batches, True if more data available
419
"""
420
421
def get_callbacks(self, enable_categorical):
422
"""
423
Get callback functions for iterating in C.
424
425
Parameters:
426
- enable_categorical: Enable categorical feature support (bool)
427
428
Returns: tuple - (reset_callback, next_callback)
429
"""
430
431
def reraise(self):
432
"""Reraise any exception thrown during iteration."""
433
434
@property
435
def proxy(self):
436
"""Handle of DMatrix proxy for internal use. Returns: _ProxyDMatrix"""
437
```
438
439
## Constants and Enums
440
441
### DataSplitMode
442
443
```python { .api }
444
class DataSplitMode:
445
"""Data splitting mode for distributed training."""
446
ROW = 0 # Split by rows
447
COL = 1 # Split by columns
448
```
449
450
## Usage Examples
451
452
### Basic DMatrix Creation
453
454
```python
455
import xgboost as xgb
456
import numpy as np
457
import pandas as pd
458
459
# From NumPy arrays
460
X = np.random.randn(1000, 10)
461
y = np.random.randint(0, 2, 1000)
462
dtrain = xgb.DMatrix(X, label=y, feature_names=[f'f{i}' for i in range(10)])
463
464
# From pandas DataFrame
465
df = pd.DataFrame(X, columns=[f'feature_{i}' for i in range(10)])
466
dtrain = xgb.DMatrix(df, label=y)
467
468
# With additional information
469
weights = np.random.uniform(0.5, 2.0, 1000)
470
dtrain = xgb.DMatrix(X, label=y, weight=weights,
471
feature_names=[f'f{i}' for i in range(10)],
472
feature_types=['float'] * 10)
473
```
474
475
### Memory-Efficient Data Loading
476
477
```python
478
# Use QuantileDMatrix for large datasets
479
dtrain = xgb.QuantileDMatrix(X_train, label=y_train, max_bin=512)
480
dtest = xgb.QuantileDMatrix(X_test, label=y_test, ref=dtrain)
481
482
# For external memory training
483
class CustomDataIter(xgb.DataIter):
    """Example iterator that supplies one file's data per batch.

    ``load_data_from_file`` is a user-supplied helper that is not defined
    in this snippet; it is expected to return an ``(X, y)`` pair.
    """

    def __init__(self, data_files):
        # Files to stream, one batch per file.
        self.data_files = data_files
        # Index of the next file to hand over.
        self.file_idx = 0
        super().__init__()

    def reset(self):
        """Rewind to the first file so the data can be iterated again."""
        self.file_idx = 0

    def next(self, input_data):
        """Pass the next file's data to ``input_data``.

        Parameters:
        - input_data: Callback that receives the batch as keyword arguments

        Returns: bool - False if no more batches, True if a batch was supplied
        """
        if self.file_idx >= len(self.data_files):
            # Every file has been consumed: signal the end of this pass.
            return False

        # Load data from current file
        X, y = load_data_from_file(self.data_files[self.file_idx])
        input_data(data=X, label=y)
        self.file_idx += 1
        # A batch was supplied, so iteration continues.
        return True
501
502
data_iter = CustomDataIter(['data1.csv', 'data2.csv', 'data3.csv'])
503
dtrain = xgb.ExtMemQuantileDMatrix(data_iter)
504
```
505
506
### Model Operations
507
508
```python
509
# Train model
510
params = {'objective': 'binary:logistic', 'max_depth': 6}
511
model = xgb.train(params, dtrain, num_boost_round=100)
512
513
# Make predictions
514
predictions = model.predict(dtest)
515
516
# Get feature importance
517
importance = model.get_score(importance_type='gain')
518
print(importance)
519
520
# Save and load model
521
model.save_model('model.json')
522
loaded_model = xgb.Booster()
523
loaded_model.load_model('model.json')
524
525
# Model introspection
526
print(f"Number of trees: {model.num_boosted_rounds()}")
527
print(f"Number of features: {model.num_features()}")
528
```