0
# Scikit-learn Interface
1
2
Drop-in replacements for scikit-learn estimators that provide the familiar fit/predict API while leveraging XGBoost's high-performance gradient boosting implementation. These estimators integrate seamlessly with scikit-learn pipelines, cross-validation, and model selection tools.
3
4
## Capabilities
5
6
### XGBClassifier - Classification Estimator
7
8
XGBoost classifier that follows the scikit-learn API for binary and multi-class classification tasks. Supports probability prediction and integrates with scikit-learn's model evaluation tools.
9
10
```python { .api }
11
class XGBClassifier:
12
def __init__(self, *, max_depth=6, max_leaves=0, max_bin=256,
13
grow_policy='depthwise', learning_rate=0.3, n_estimators=100,
14
verbosity=1, objective=None, booster='gbtree',
15
tree_method='auto', n_jobs=None, gamma=0, min_child_weight=1,
16
max_delta_step=0, subsample=1, sampling_method='uniform',
17
colsample_bytree=1, colsample_bylevel=1, colsample_bynode=1,
18
reg_alpha=0, reg_lambda=1, scale_pos_weight=1, base_score=None,
19
random_state=None, missing=float('nan'), num_parallel_tree=1,
20
monotone_constraints=None, interaction_constraints=None,
21
importance_type='gain', device=None, validate_parameters=None,
22
enable_categorical=False, feature_types=None,
23
feature_weights=None, max_cat_to_onehot=4,
24
max_cat_threshold=64, multi_strategy='one_output_per_tree',
25
eval_metric=None, early_stopping_rounds=None, callbacks=None):
26
"""
27
XGBoost classifier following scikit-learn API.
28
29
Parameters:
30
- max_depth: Maximum tree depth (int)
31
- max_leaves: Maximum number of leaves (int, 0 means no limit)
32
- max_bin: Maximum number of discrete bins for features (int)
33
- grow_policy: Tree growing policy ('depthwise', 'lossguide')
34
- learning_rate: Boosting learning rate (float)
35
- n_estimators: Number of boosting rounds (int)
36
- verbosity: Verbosity level (0=silent, 1=warning, 2=info, 3=debug)
37
- objective: Learning objective (str or None for auto-detection)
38
- booster: Booster type ('gbtree', 'gblinear', 'dart')
39
- tree_method: Tree construction algorithm ('auto', 'exact', 'approx', 'hist')
40
- n_jobs: Number of parallel threads (int or None)
41
- gamma: Minimum loss reduction required for split (float)
42
- min_child_weight: Minimum sum of instance weight in child (float)
43
- max_delta_step: Maximum delta step allowed for each leaf output (float)
44
- subsample: Fraction of samples used for training each tree (float)
45
- sampling_method: Sampling method ('uniform', 'gradient_based')
46
- colsample_bytree: Fraction of features used per tree (float)
47
- colsample_bylevel: Fraction of features used per level (float)
48
- colsample_bynode: Fraction of features used per split (float)
49
- reg_alpha: L1 regularization term (float)
50
- reg_lambda: L2 regularization term (float)
51
- scale_pos_weight: Balancing weight for positive class (float)
52
- base_score: Global bias for all predictions (float)
53
- random_state: Random seed (int)
54
- missing: Value to be treated as missing (float)
55
- num_parallel_tree: Number of parallel trees per round (int)
56
- monotone_constraints: Monotonic constraints (dict or None)
57
- interaction_constraints: Interaction constraints (list or None)
58
- importance_type: Feature importance type ('gain', 'weight', 'cover', 'total_gain', 'total_cover')
59
- device: Device to use for training ('cpu', 'cuda', 'gpu')
60
- validate_parameters: Whether to validate parameters (bool)
61
- enable_categorical: Enable categorical feature support (bool)
62
- feature_types: Types for features (list or None)
63
- feature_weights: Weights for features (array-like or None)
64
- max_cat_to_onehot: Category-count threshold below which one-hot-encoding-based splits are used; otherwise categories are partitioned (int)
65
- max_cat_threshold: Maximum number of categories considered for each split; used only by partition-based splits (int)
66
- multi_strategy: Strategy for training multi-target models, including multi-output regression and multi-class classification ('one_output_per_tree', 'multi_output_tree')
67
- eval_metric: Evaluation metric (str, list, or callable)
68
- early_stopping_rounds: Early stopping rounds (int)
69
- callbacks: Callbacks for training (list)
70
"""
71
72
def fit(self, X, y, *, sample_weight=None, base_margin=None,
73
eval_set=None, verbose=True, xgb_model=None,
74
sample_weight_eval_set=None, base_margin_eval_set=None,
75
feature_weights=None):
76
"""
77
Fit the classifier to training data.
78
79
Parameters:
80
- X: Training data (array-like or DataFrame)
81
- y: Target values (array-like)
82
- sample_weight: Sample weights (array-like, optional)
83
- base_margin: Base prediction margins (array-like, optional)
84
- eval_set: Evaluation datasets as list of (X, y) tuples (list, optional)
85
- verbose: Whether to print evaluation results (bool)
86
- xgb_model: Existing model to continue training (Booster, optional)
87
- sample_weight_eval_set: Sample weights for evaluation sets (list, optional)
88
- base_margin_eval_set: Base margins for evaluation sets (list, optional)
89
- feature_weights: Feature weights (array-like, optional)
90
91
Returns: self
92
"""
93
94
def predict(self, X, *, output_margin=False, validate_features=True,
95
base_margin=None, iteration_range=None):
96
"""
97
Predict class labels.
98
99
Parameters:
100
- X: Input data (array-like or DataFrame)
101
- output_margin: Whether to output margin values (bool)
102
- validate_features: Whether to validate feature names (bool)
103
- base_margin: Base prediction margins (array-like, optional)
104
- iteration_range: Range of trees to use (tuple, optional)
105
106
Returns: numpy.ndarray - Predicted class labels
107
"""
108
109
def predict_proba(self, X, *, validate_features=True, base_margin=None,
110
iteration_range=None):
111
"""
112
Predict class probabilities.
113
114
Parameters:
115
- X: Input data (array-like or DataFrame)
116
- validate_features: Whether to validate feature names (bool)
117
- base_margin: Base prediction margins (array-like, optional)
118
- iteration_range: Range of trees to use (tuple, optional)
119
120
Returns: numpy.ndarray - Class probabilities
121
"""
122
123
@property
124
def classes_(self):
125
"""Unique class labels. Returns: numpy.ndarray"""
126
127
@property
128
def feature_importances_(self):
129
"""Feature importances. Returns: numpy.ndarray"""
130
131
@property
132
def best_score(self):
133
"""Best validation score. Returns: float"""
134
135
@property
136
def best_iteration(self):
137
"""Best iteration from early stopping. Returns: int"""
138
```
139
140
### XGBRegressor - Regression Estimator
141
142
XGBoost regressor for continuous target variables, providing high-performance gradient boosting for regression tasks with extensive hyperparameter control.
143
144
```python { .api }
145
class XGBRegressor:
146
def __init__(self, *, max_depth=6, max_leaves=0, max_bin=256,
147
grow_policy='depthwise', learning_rate=0.3, n_estimators=100,
148
verbosity=1, objective=None, booster='gbtree',
149
tree_method='auto', n_jobs=None, gamma=0, min_child_weight=1,
150
max_delta_step=0, subsample=1, sampling_method='uniform',
151
colsample_bytree=1, colsample_bylevel=1, colsample_bynode=1,
152
reg_alpha=0, reg_lambda=1, scale_pos_weight=1, base_score=None,
153
random_state=None, missing=float('nan'), num_parallel_tree=1,
154
monotone_constraints=None, interaction_constraints=None,
155
importance_type='gain', device=None, validate_parameters=None,
156
enable_categorical=False, feature_types=None,
157
feature_weights=None, max_cat_to_onehot=4,
158
max_cat_threshold=64, multi_strategy='one_output_per_tree',
159
eval_metric=None, early_stopping_rounds=None, callbacks=None):
160
"""
161
XGBoost regressor following scikit-learn API.
162
163
Parameters: Same as XGBClassifier
164
"""
165
166
def fit(self, X, y, *, sample_weight=None, base_margin=None,
167
eval_set=None, verbose=True, xgb_model=None,
168
sample_weight_eval_set=None, base_margin_eval_set=None,
169
feature_weights=None):
170
"""Fit the regressor to training data. Same interface as XGBClassifier.fit()."""
171
172
def predict(self, X, *, output_margin=False, validate_features=True,
173
base_margin=None, iteration_range=None):
174
"""
175
Predict target values.
176
177
Returns: numpy.ndarray - Predicted values
178
"""
179
```
180
181
### XGBRanker - Learning-to-Rank Estimator
182
183
XGBoost ranker for learning-to-rank tasks such as search result ranking, recommendation systems, and other applications where relative ordering matters more than absolute values.
184
185
```python { .api }
186
class XGBRanker:
187
def __init__(self, *, max_depth=6, max_leaves=0, max_bin=256,
188
grow_policy='depthwise', learning_rate=0.3, n_estimators=100,
189
verbosity=1, objective='rank:ndcg', booster='gbtree',
190
tree_method='auto', n_jobs=None, gamma=0, min_child_weight=1,
191
max_delta_step=0, subsample=1, sampling_method='uniform',
192
colsample_bytree=1, colsample_bylevel=1, colsample_bynode=1,
193
reg_alpha=0, reg_lambda=1, scale_pos_weight=1, base_score=None,
194
random_state=None, missing=float('nan'), num_parallel_tree=1,
195
monotone_constraints=None, interaction_constraints=None,
196
importance_type='gain', device=None, validate_parameters=None,
197
enable_categorical=False, feature_types=None,
198
feature_weights=None, max_cat_to_onehot=4,
199
max_cat_threshold=64, multi_strategy='one_output_per_tree',
200
eval_metric=None, early_stopping_rounds=None, callbacks=None):
201
"""
202
XGBoost ranker for learning-to-rank tasks.
203
204
Parameters: Same as XGBClassifier with default objective='rank:ndcg'
205
"""
206
207
def fit(self, X, y, *, group=None, qid=None, sample_weight=None,
208
base_margin=None, eval_set=None, verbose=True, xgb_model=None,
209
sample_weight_eval_set=None, base_margin_eval_set=None,
210
feature_weights=None, eval_group=None, eval_qid=None):
211
"""
212
Fit the ranker to training data.
213
214
Parameters: Same as XGBClassifier.fit() with additional:
215
- group: Group sizes for ranking (array-like)
216
- qid: Query IDs for ranking (array-like)
217
- eval_group: Group sizes for evaluation sets (list of array-like)
218
- eval_qid: Query IDs for evaluation sets (list of array-like)
219
"""
220
221
def predict(self, X, *, output_margin=False, validate_features=True,
222
base_margin=None, iteration_range=None):
223
"""
224
Predict ranking scores.
225
226
Returns: numpy.ndarray - Ranking scores
227
"""
228
229
def score(self, X, y):
230
"""
231
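Evaluate ranking quality on the given data using the last evaluation metric.
232
233
Parameters:
234
- X: Feature matrix containing a special `qid` column with query IDs (DataFrame)
235
- y: True relevance labels (array-like)
236
237
Returns: float - Value of the evaluation metric (e.g. NDCG), not an accuracy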
238
"""
239
```
240
241
### XGBRFClassifier - Random Forest Classifier
242
243
XGBoost-based random forest classifier that combines the speed of XGBoost with random forest's ensemble approach, using random feature subsets and random row subsampling.
244
245
```python { .api }
246
class XGBRFClassifier:
247
def __init__(self, *, max_depth=6, learning_rate=1.0, n_estimators=100,
248
verbosity=1, objective=None, booster='gbtree',
249
tree_method='auto', n_jobs=None, gamma=0, min_child_weight=1,
250
max_delta_step=0, subsample=0.8, sampling_method='uniform',
251
colsample_bytree=1, colsample_bylevel=1, colsample_bynode=0.8,
252
reg_alpha=0, reg_lambda=1e-05, scale_pos_weight=1,
253
base_score=None, random_state=None, missing=float('nan'),
254
num_parallel_tree=1, monotone_constraints=None,
255
interaction_constraints=None, importance_type='gain',
256
device=None, validate_parameters=None, enable_categorical=False,
257
feature_types=None, feature_weights=None, max_cat_to_onehot=4,
258
max_cat_threshold=64, multi_strategy='one_output_per_tree',
259
eval_metric=None, early_stopping_rounds=None, callbacks=None):
260
"""
261
XGBoost random forest classifier.
262
263
Parameters: Similar to XGBClassifier with RF-specific defaults:
264
- learning_rate: 1.0 (no shrinkage for RF)
265
- subsample: 0.8 (random row subsampling)
266
- colsample_bytree: 1 (not overridden for RF; per-split feature sampling is controlled by colsample_bynode)
267
- colsample_bynode: 0.8 (random feature subset per split)
268
- reg_lambda: 1e-05 (minimal regularization)
269
"""
270
```
271
272
### XGBRFRegressor - Random Forest Regressor
273
274
XGBoost-based random forest regressor for regression tasks, combining XGBoost's efficiency with random forest methodology.
275
276
```python { .api }
277
class XGBRFRegressor:
278
def __init__(self, *, max_depth=6, learning_rate=1.0, n_estimators=100,
279
verbosity=1, objective=None, booster='gbtree',
280
tree_method='auto', n_jobs=None, gamma=0, min_child_weight=1,
281
max_delta_step=0, subsample=0.8, sampling_method='uniform',
282
colsample_bytree=1, colsample_bylevel=1, colsample_bynode=0.8,
283
reg_alpha=0, reg_lambda=1e-05, scale_pos_weight=1,
284
base_score=None, random_state=None, missing=float('nan'),
285
num_parallel_tree=1, monotone_constraints=None,
286
interaction_constraints=None, importance_type='gain',
287
device=None, validate_parameters=None, enable_categorical=False,
288
feature_types=None, feature_weights=None, max_cat_to_onehot=4,
289
max_cat_threshold=64, multi_strategy='one_output_per_tree',
290
eval_metric=None, early_stopping_rounds=None, callbacks=None):
291
"""
292
XGBoost random forest regressor.
293
294
Parameters: Same as XGBRFClassifier
295
"""
296
```
297
298
### XGBModel - Base Estimator
299
300
Base class for all XGBoost scikit-learn estimators, providing common functionality and interface methods.
301
302
```python { .api }
303
class XGBModel:
304
def get_booster(self):
305
"""
306
Get the underlying XGBoost Booster.
307
308
Returns: Booster - The trained XGBoost model
309
"""
310
311
def get_params(self, deep=True):
312
"""
313
Get parameters for the estimator.
314
315
Parameters:
316
- deep: Whether to return parameters of sub-estimators (bool)
317
318
Returns: dict - Parameter names and values
319
"""
320
321
def set_params(self, **params):
322
"""
323
Set parameters for the estimator.
324
325
Parameters:
326
- **params: Estimator parameters as keyword arguments
327
328
Returns: self
329
"""
330
331
def get_xgb_params(self):
332
"""
333
Get XGBoost-specific parameters.
334
335
Returns: dict - XGBoost parameters
336
"""
337
338
def save_model(self, fname):
339
"""
340
Save the model to file.
341
342
Parameters:
343
- fname: Output file name (str)
344
"""
345
346
def load_model(self, fname):
347
"""
348
Load model from file.
349
350
Parameters:
351
- fname: Input file name (str)
352
"""
353
354
def apply(self, X, iteration_range=None):
355
"""
356
Return the predicted leaf index for each sample.
357
358
Parameters:
359
- X: Input data (array-like or DataFrame)
360
- iteration_range: Range of trees to use (tuple, optional)
361
362
Returns: numpy.ndarray - Leaf indices
363
"""
364
365
def evals_result(self):
366
"""
367
Get evaluation results from training.
368
369
Returns: dict - Evaluation history
370
"""
371
372
@property
373
def n_features_in_(self):
374
"""Number of features seen during fit. Returns: int"""
375
376
@property
377
def feature_names_in_(self):
378
"""Feature names seen during fit. Returns: numpy.ndarray"""
379
380
@property
381
def feature_importances_(self):
382
"""Feature importances. Returns: numpy.ndarray"""
383
384
@property
385
def best_score(self):
386
"""Best validation score. Returns: float"""
387
388
@property
389
def best_iteration(self):
390
"""Best iteration from early stopping. Returns: int"""
391
392
@property
393
def coef_(self):
394
"""Model coefficients (for linear booster). Returns: numpy.ndarray"""
395
396
@property
397
def intercept_(self):
398
"""Model intercept (for linear booster). Returns: numpy.ndarray"""
399
```
400
401
## Usage Examples
402
403
### Basic Classification
404
405
```python
406
from xgboost import XGBClassifier
407
from sklearn.datasets import make_classification
408
from sklearn.model_selection import train_test_split
409
from sklearn.metrics import accuracy_score, classification_report
410
411
# Create sample data
412
X, y = make_classification(n_samples=1000, n_features=20, n_classes=2,
413
n_informative=10, random_state=42)
414
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,
415
random_state=42)
416
417
# Train classifier
418
clf = XGBClassifier(
419
objective='binary:logistic',
420
max_depth=6,
421
learning_rate=0.1,
422
n_estimators=100,
423
early_stopping_rounds=10,
424
eval_metric='logloss',
425
random_state=42
426
)
427
428
clf.fit(X_train, y_train,
429
eval_set=[(X_test, y_test)],
430
verbose=False)
431
432
# Make predictions
433
y_pred = clf.predict(X_test)
434
y_pred_proba = clf.predict_proba(X_test)
435
436
print(f"Accuracy: {accuracy_score(y_test, y_pred):.4f}")
437
print(f"Best iteration: {clf.best_iteration}")
438
print(f"Best score: {clf.best_score:.4f}")
439
440
# Feature importance
441
import matplotlib.pyplot as plt
442
feature_importance = clf.feature_importances_
443
plt.figure(figsize=(10, 6))
444
plt.barh(range(len(feature_importance)), feature_importance)
445
plt.xlabel('Feature Importance')
446
plt.title('XGBoost Feature Importance')
447
plt.show()
448
```
449
450
### Regression Example
451
452
```python
453
from xgboost import XGBRegressor
454
from sklearn.datasets import make_regression
from sklearn.model_selection import train_test_split
455
from sklearn.metrics import mean_squared_error, r2_score
456
457
# Create regression data
458
X, y = make_regression(n_samples=1000, n_features=20, noise=0.1,
459
random_state=42)
460
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,
461
random_state=42)
462
463
# Train regressor
464
reg = XGBRegressor(
465
objective='reg:squarederror',
466
max_depth=6,
467
learning_rate=0.1,
468
n_estimators=100,
469
early_stopping_rounds=10,
470
eval_metric='rmse'
471
)
472
473
reg.fit(X_train, y_train,
474
eval_set=[(X_test, y_test)],
475
verbose=False)
476
477
# Make predictions
478
y_pred = reg.predict(X_test)
479
480
print(f"RMSE: {mean_squared_error(y_test, y_pred) ** 0.5:.4f}")
481
print(f"R²: {r2_score(y_test, y_pred):.4f}")
482
```
483
484
### Learning-to-Rank Example
485
486
```python
487
from xgboost import XGBRanker
488
import numpy as np
489
490
# Create ranking data (mock example)
491
n_samples_per_group = 50
492
n_groups = 20
493
n_features = 10
494
495
X = np.random.randn(n_samples_per_group * n_groups, n_features)
496
y = np.random.randint(0, 5, n_samples_per_group * n_groups) # Relevance scores 0-4
497
group = np.array([n_samples_per_group] * n_groups) # Group sizes
498
499
# Train ranker
500
ranker = XGBRanker(
501
objective='rank:ndcg',
502
max_depth=6,
503
learning_rate=0.1,
504
n_estimators=100,
505
eval_metric='ndcg@10'
506
)
507
508
ranker.fit(X, y, group=group)
509
510
# Make ranking predictions
511
ranking_scores = ranker.predict(X)
512
print(f"Ranking scores shape: {ranking_scores.shape}")
513
```
514
515
### Pipeline Integration
516
517
```python
518
from sklearn.pipeline import Pipeline
519
from sklearn.preprocessing import StandardScaler
520
from sklearn.model_selection import GridSearchCV
521
522
# Create pipeline with preprocessing
523
pipeline = Pipeline([
524
('scaler', StandardScaler()),
525
('xgb', XGBClassifier(random_state=42))
526
])
527
528
# Parameter grid for hyperparameter tuning
529
param_grid = {
530
'xgb__max_depth': [3, 6, 9],
531
'xgb__learning_rate': [0.01, 0.1, 0.2],
532
'xgb__n_estimators': [50, 100, 200]
533
}
534
535
# Grid search with cross-validation
536
grid_search = GridSearchCV(
537
pipeline,
538
param_grid,
539
cv=5,
540
scoring='accuracy',
541
n_jobs=-1
542
)
543
544
grid_search.fit(X_train, y_train)
545
546
print(f"Best parameters: {grid_search.best_params_}")
547
print(f"Best CV score: {grid_search.best_score_:.4f}")
548
549
# Use best model
550
best_model = grid_search.best_estimator_
551
y_pred = best_model.predict(X_test)
552
```
553
554
### Random Forest Usage
555
556
```python
557
from xgboost import XGBRFClassifier
558
559
# XGBoost Random Forest
560
rf_clf = XGBRFClassifier(
561
n_estimators=100,
562
max_depth=6,
563
learning_rate=1.0, # No shrinkage for RF
564
subsample=0.8, # Random row subsampling
565
colsample_bynode=0.8, # Random feature subset per split
566
random_state=42
567
)
568
569
rf_clf.fit(X_train, y_train)
570
rf_pred = rf_clf.predict(X_test)
571
rf_pred_proba = rf_clf.predict_proba(X_test)
572
573
print(f"RF Accuracy: {accuracy_score(y_test, rf_pred):.4f}")
574
```