0
# Scikit-Learn Interface
1
2
XGBoost provides scikit-learn compatible estimators that follow sklearn conventions for seamless integration with existing ML pipelines. These estimators provide familiar fit/predict interfaces while leveraging XGBoost's powerful gradient boosting algorithms.
3
4
## Capabilities
5
6
### Base Model Class
7
8
Base class for all XGBoost sklearn-compatible estimators.
9
10
```python { .api }
11
class XGBModel:
12
def __init__(
13
self,
14
n_estimators=100,
15
max_depth=None,
16
max_leaves=None,
17
max_bin=None,
18
grow_policy=None,
19
learning_rate=None,
20
verbosity=None,
21
objective=None,
22
booster=None,
23
tree_method=None,
24
n_jobs=None,
25
gamma=None,
26
min_child_weight=None,
27
max_delta_step=None,
28
subsample=None,
29
sampling_method=None,
30
colsample_bytree=None,
31
colsample_bylevel=None,
32
colsample_bynode=None,
33
reg_alpha=None,
34
reg_lambda=None,
35
scale_pos_weight=None,
36
base_score=None,
37
random_state=None,
38
missing=np.nan,
39
num_parallel_tree=None,
40
monotone_constraints=None,
41
interaction_constraints=None,
42
importance_type='gain',
43
device=None,
44
validate_parameters=None,
45
enable_categorical=False,
46
feature_types=None,
47
max_cat_to_onehot=None,
48
max_cat_threshold=None,
49
multi_strategy=None,
50
eval_metric=None,
51
early_stopping_rounds=None,
52
callbacks=None,
53
**kwargs
54
):
55
"""
56
Base XGBoost sklearn-compatible estimator.
57
58
Parameters:
59
- n_estimators: Number of boosting rounds
60
- max_depth: Maximum tree depth
61
- learning_rate: Boosting learning rate
62
- objective: Learning objective
63
- booster: Booster type ('gbtree', 'gblinear', 'dart')
64
- tree_method: Tree construction algorithm
65
- n_jobs: Number of parallel threads
66
- gamma: Minimum loss reduction for split
67
- min_child_weight: Minimum sum of instance weight in child
68
- subsample: Subsample ratio of training instances
69
- colsample_bytree: Subsample ratio of columns per tree
70
- reg_alpha: L1 regularization term
71
- reg_lambda: L2 regularization term
72
- random_state: Random seed
73
- enable_categorical: Enable categorical feature support
74
"""
75
76
def fit(
77
self,
78
X,
79
y,
80
sample_weight=None,
81
base_margin=None,
82
eval_set=None,
83
eval_metric=None,
84
early_stopping_rounds=None,
85
verbose=True,
86
xgb_model=None,
87
sample_weight_eval_set=None,
88
base_margin_eval_set=None,
89
feature_weights=None,
90
callbacks=None
91
):
92
"""
93
Fit the model to training data.
94
95
Parameters:
96
- X: Training features
97
- y: Training labels
98
- sample_weight: Sample weights
99
- base_margin: Base margin for each sample
100
- eval_set: Evaluation sets as list of (X, y) tuples
101
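- eval_metric: Evaluation metric(s); deprecated in fit() since XGBoost 1.6, set it in the constructor or via set_params() instead
102
- early_stopping_rounds: Early stopping rounds; deprecated in fit() since XGBoost 1.6, set it in the constructor or via set_params() instead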
103
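- verbose: If True, print the evaluation metric on eval_set at every boosting round; if an integer, print it every `verbose` rounds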
104
- xgb_model: Existing model to continue training
105
- sample_weight_eval_set: Sample weights for eval sets
106
- base_margin_eval_set: Base margins for eval sets
107
- feature_weights: Feature weights
108
- callbacks: Callback functions; deprecated in fit() since XGBoost 1.6, set them in the constructor or via set_params() instead
109
110
Returns:
111
Self
112
"""
113
114
def predict(
115
self,
116
X,
117
output_margin=False,
118
validate_features=True,
119
base_margin=None,
120
iteration_range=None
121
):
122
"""
123
Make predictions on input data.
124
125
Parameters:
126
- X: Input features
127
- output_margin: Output raw margins
128
- validate_features: Validate feature names/types
129
- base_margin: Base margin for each sample
130
- iteration_range: (start, end) tuple selecting the boosting rounds used for prediction; half-open, so (10, 20) uses rounds 10 through 19
131
132
Returns:
133
Predictions as numpy array
134
"""
135
136
def get_booster(self):
137
"""Get underlying Booster object."""
138
139
def save_model(self, fname):
140
"""Save model to file."""
141
142
def load_model(self, fname):
143
"""Load model from file."""
144
145
@property
146
def feature_importances_(self):
147
"""Feature importances as numpy array."""
148
149
def get_params(self, deep=True):
150
"""Get estimator parameters."""
151
152
def set_params(self, **params):
153
"""Set estimator parameters."""
154
```
155
156
### Regression
157
158
XGBoost regressor for continuous target variables.
159
160
```python { .api }
161
class XGBRegressor(XGBModel):
162
def __init__(self, **kwargs):
163
"""
164
XGBoost regressor.
165
166
Inherits all parameters from XGBModel.
167
Default objective: 'reg:squarederror'
168
"""
169
170
def fit(self, X, y, **kwargs):
171
"""Fit regressor to training data."""
172
173
def predict(self, X, **kwargs):
174
"""Predict continuous values."""
175
```
176
177
### Classification
178
179
XGBoost classifier for categorical target variables.
180
181
```python { .api }
182
class XGBClassifier(XGBModel):
183
def __init__(self, **kwargs):
184
"""
185
XGBoost classifier.
186
187
Inherits all parameters from XGBModel.
188
Default objective: 'binary:logistic' for two classes; switched automatically to 'multi:softprob' when more than two classes are present
189
"""
190
191
def fit(self, X, y, **kwargs):
192
"""Fit classifier to training data."""
193
194
def predict(self, X, **kwargs):
195
"""Predict class labels."""
196
197
def predict_proba(
198
self,
199
X,
200
validate_features=True,
201
base_margin=None,
202
iteration_range=None
203
):
204
"""
205
Predict class probabilities.
206
207
Parameters:
208
- X: Input features
209
- validate_features: Validate feature names/types
210
- base_margin: Base margin for each sample
211
- iteration_range: Range of boosting rounds
212
213
Returns:
214
Class probabilities as numpy array
215
"""
216
217
def predict_log_proba(self, X, **kwargs):
218
"""Predict log class probabilities."""
219
220
@property
221
def classes_(self):
222
"""Unique class labels."""
223
```
224
225
### Ranking
226
227
XGBoost ranker for learning-to-rank problems.
228
229
```python { .api }
230
class XGBRanker(XGBModel):
231
def __init__(self, **kwargs):
232
"""
233
XGBoost ranker for learning-to-rank.
234
235
Inherits all parameters from XGBModel.
236
Default objective: 'rank:ndcg' ('rank:pairwise' in XGBoost releases before 2.0)
237
"""
238
239
def fit(
240
self,
241
X,
242
y,
243
group=None,
244
qid=None,
245
sample_weight=None,
246
base_margin=None,
247
eval_set=None,
248
eval_group=None,
249
eval_qid=None,
250
eval_metric=None,
251
early_stopping_rounds=None,
252
verbose=True,
253
xgb_model=None,
254
sample_weight_eval_set=None,
255
base_margin_eval_set=None,
256
feature_weights=None,
257
callbacks=None
258
):
259
"""
260
Fit ranker to training data.
261
262
Parameters:
263
- X: Training features
264
- y: Training relevance scores
265
- group: Group sizes for queries
266
- qid: Query IDs for each sample
267
- (other parameters same as XGBModel.fit)
268
269
Returns:
270
Self
271
"""
272
273
def predict(self, X, **kwargs):
274
"""Predict ranking scores."""
275
```
276
277
### Random Forest Variants
278
279
XGBoost implementations of random forest algorithms.
280
281
```python { .api }
282
class XGBRFRegressor(XGBRegressor):
283
def __init__(self, **kwargs):
284
"""
285
XGBoost random forest regressor.
286
287
Configured with random forest defaults:
288
- colsample_bynode=0.8
289
- learning_rate=1.0
290
- max_depth=None
291
- n_estimators=100
292
- num_parallel_tree=100
293
- reg_lambda=1e-5
294
- subsample=0.8
295
"""
296
297
def fit(self, X, y, **kwargs):
298
"""Fit random forest regressor."""
299
300
def predict(self, X, **kwargs):
301
"""Predict using random forest."""
302
303
class XGBRFClassifier(XGBClassifier):
304
def __init__(self, **kwargs):
305
"""
306
XGBoost random forest classifier.
307
308
Same defaults as XGBRFRegressor with classification objective.
309
"""
310
311
def fit(self, X, y, **kwargs):
312
"""Fit random forest classifier."""
313
314
def predict(self, X, **kwargs):
315
"""Predict class labels using random forest."""
316
317
def predict_proba(self, X, **kwargs):
318
"""Predict class probabilities using random forest."""
319
```
320
321
## Usage Examples
322
323
### Basic Classification
324
325
```python
326
from xgboost import XGBClassifier
327
from sklearn.datasets import load_iris
328
from sklearn.model_selection import train_test_split
329
330
# Load data
331
X, y = load_iris(return_X_y=True)
332
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
333
334
# Train classifier
335
clf = XGBClassifier(n_estimators=100, max_depth=3, learning_rate=0.1)
336
clf.fit(X_train, y_train)
337
338
# Make predictions
339
y_pred = clf.predict(X_test)
340
y_proba = clf.predict_proba(X_test)
341
342
# Feature importance
343
importance = clf.feature_importances_
344
```
345
346
### Regression with Early Stopping
347
348
```python
349
from xgboost import XGBRegressor
350
from sklearn.datasets import load_diabetes
351
from sklearn.model_selection import train_test_split
352
353
# Load data (load_boston was removed in scikit-learn 1.2)
354
X, y = load_diabetes(return_X_y=True)
355
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
356
357
# Train with early stopping (for brevity the test split doubles as the eval set; use a separate validation split in practice)
358
reg = XGBRegressor(
359
n_estimators=1000,
360
max_depth=3,
361
learning_rate=0.1,
362
early_stopping_rounds=10
363
)
364
365
reg.fit(
366
X_train, y_train,
367
eval_set=[(X_test, y_test)],
368
verbose=False
369
)
370
371
# Predict
372
y_pred = reg.predict(X_test)
373
```