Tessl Tile for pypi/xgboost-cpu@3.0.0

or run

npx @tessl/cli init

Version

Tile

Overview

Evals

Files

docs

core-data-models.md distributed-computing.md index.md sklearn-interface.md training-evaluation.md utilities.md

sklearn-interface.md docs/

0
# Scikit-learn Interface
1

2
Drop-in replacements for scikit-learn estimators that provide the familiar fit/predict API while leveraging XGBoost's high-performance gradient boosting implementation. These estimators integrate seamlessly with scikit-learn pipelines, cross-validation, and model selection tools.
3

4
## Capabilities
5

6
### XGBClassifier - Classification Estimator
7

8
XGBoost classifier that follows the scikit-learn API for binary and multi-class classification tasks. Supports probability prediction and integrates with scikit-learn's model evaluation tools.
9

10
```python { .api }
11
class XGBClassifier:
12
    def __init__(self, *, max_depth=6, max_leaves=0, max_bin=256, 
13
                 grow_policy='depthwise', learning_rate=0.3, n_estimators=100, 
14
                 verbosity=1, objective=None, booster='gbtree', 
15
                 tree_method='auto', n_jobs=None, gamma=0, min_child_weight=1, 
16
                 max_delta_step=0, subsample=1, sampling_method='uniform', 
17
                 colsample_bytree=1, colsample_bylevel=1, colsample_bynode=1, 
18
                 reg_alpha=0, reg_lambda=1, scale_pos_weight=1, base_score=None, 
19
                 random_state=None, missing=float('nan'), num_parallel_tree=1, 
20
                 monotone_constraints=None, interaction_constraints=None, 
21
                 importance_type='gain', device=None, validate_parameters=None, 
22
                 enable_categorical=False, feature_types=None, 
23
                 feature_weights=None, max_cat_to_onehot=4, 
24
                 max_cat_threshold=64, multi_strategy='one_output_per_tree', 
25
                 eval_metric=None, early_stopping_rounds=None, callbacks=None):
26
        """
27
        XGBoost classifier following scikit-learn API.
28
        
29
        Parameters:
30
        - max_depth: Maximum tree depth (int)
31
        - max_leaves: Maximum number of leaves (int, 0 means no limit)
32
        - max_bin: Maximum number of discrete bins for features (int)
33
        - grow_policy: Tree growing policy ('depthwise', 'lossguide')
34
        - learning_rate: Boosting learning rate (float)
35
        - n_estimators: Number of boosting rounds (int)
36
        - verbosity: Verbosity level (0=silent, 1=warning, 2=info, 3=debug)
37
        - objective: Learning objective (str or None for auto-detection)
38
        - booster: Booster type ('gbtree', 'gblinear', 'dart')
39
        - tree_method: Tree construction algorithm ('auto', 'exact', 'approx', 'hist')
40
        - n_jobs: Number of parallel threads (int or None)
41
        - gamma: Minimum loss reduction required for split (float)
42
        - min_child_weight: Minimum sum of instance weight in child (float)
43
        - max_delta_step: Maximum delta step allowed for each leaf output (float)
44
        - subsample: Fraction of samples used for training each tree (float)
45
        - sampling_method: Sampling method ('uniform', 'gradient_based')
46
        - colsample_bytree: Fraction of features used per tree (float)
47
        - colsample_bylevel: Fraction of features used per level (float)
48
        - colsample_bynode: Fraction of features used per split (float)
49
        - reg_alpha: L1 regularization term (float)
50
        - reg_lambda: L2 regularization term (float)
51
        - scale_pos_weight: Balancing weight for positive class (float)
52
        - base_score: Global bias for all predictions (float)
53
        - random_state: Random seed (int)
54
        - missing: Value to be treated as missing (float)
55
        - num_parallel_tree: Number of parallel trees per round (int)
56
        - monotone_constraints: Monotonic constraints (dict or None)
57
        - interaction_constraints: Interaction constraints (list or None)
58
        - importance_type: Feature importance type ('gain', 'weight', 'cover', 'total_gain', 'total_cover')
59
        - device: Device to use for training ('cpu', 'cuda', 'gpu')
60
        - validate_parameters: Whether to validate parameters (bool)
61
        - enable_categorical: Enable categorical feature support (bool)
62
        - feature_types: Types for features (list or None)
63
        - feature_weights: Weights for features (array-like or None)
64
        - max_cat_to_onehot: Maximum categories to use one-hot encoding (int)
65
        - max_cat_threshold: Maximum number of categories considered for each split; used only by partition-based categorical splits (int)
66
        - multi_strategy: Strategy for training multi-target models, including multi-target regression and multi-class classification ('one_output_per_tree', 'multi_output_tree')
67
        - eval_metric: Evaluation metric (str, list, or callable)
68
        - early_stopping_rounds: Early stopping rounds (int)
69
        - callbacks: Callbacks for training (list)
70
        """
71
    
72
    def fit(self, X, y, *, sample_weight=None, base_margin=None, 
73
            eval_set=None, verbose=True, xgb_model=None, 
74
            sample_weight_eval_set=None, base_margin_eval_set=None, 
75
            feature_weights=None):
76
        """
77
        Fit the classifier to training data.
78
        
79
        Parameters:
80
        - X: Training data (array-like or DataFrame)
81
        - y: Target values (array-like)
82
        - sample_weight: Sample weights (array-like, optional)
83
        - base_margin: Base prediction margins (array-like, optional)
84
        - eval_set: Evaluation datasets as list of (X, y) tuples (list, optional)
85
        - verbose: Whether to print evaluation results (bool)
86
        - xgb_model: Existing model to continue training (Booster, optional)
87
        - sample_weight_eval_set: Sample weights for evaluation sets (list, optional)
88
        - base_margin_eval_set: Base margins for evaluation sets (list, optional)
89
        - feature_weights: Feature weights (array-like, optional)
90
        
91
        Returns: self
92
        """
93
    
94
    def predict(self, X, *, output_margin=False, validate_features=True, 
95
                base_margin=None, iteration_range=None):
96
        """
97
        Predict class labels.
98
        
99
        Parameters:
100
        - X: Input data (array-like or DataFrame)
101
        - output_margin: Whether to output margin values (bool)
102
        - validate_features: Whether to validate feature names (bool)
103
        - base_margin: Base prediction margins (array-like, optional)
104
        - iteration_range: Range of trees to use (tuple, optional)
105
        
106
        Returns: numpy.ndarray - Predicted class labels
107
        """
108
    
109
    def predict_proba(self, X, *, validate_features=True, base_margin=None, 
110
                      iteration_range=None):
111
        """
112
        Predict class probabilities.
113
        
114
        Parameters:
115
        - X: Input data (array-like or DataFrame)
116
        - validate_features: Whether to validate feature names (bool)
117
        - base_margin: Base prediction margins (array-like, optional)
118
        - iteration_range: Range of trees to use (tuple, optional)
119
        
120
        Returns: numpy.ndarray - Class probabilities
121
        """
122
    
123
    @property
124
    def classes_(self):
125
        """Unique class labels. Returns: numpy.ndarray"""
126
    
127
    @property
128
    def feature_importances_(self):
129
        """Feature importances. Returns: numpy.ndarray"""
130
    
131
    @property
132
    def best_score(self):
133
        """Best validation score. Returns: float"""
134
    
135
    @property
136
    def best_iteration(self):
137
        """Best iteration from early stopping. Returns: int"""
138
```
139

140
### XGBRegressor - Regression Estimator
141

142
XGBoost regressor for continuous target variables, providing high-performance gradient boosting for regression tasks with extensive hyperparameter control.
143

144
```python { .api }
145
class XGBRegressor:
146
    def __init__(self, *, max_depth=6, max_leaves=0, max_bin=256, 
147
                 grow_policy='depthwise', learning_rate=0.3, n_estimators=100, 
148
                 verbosity=1, objective=None, booster='gbtree', 
149
                 tree_method='auto', n_jobs=None, gamma=0, min_child_weight=1, 
150
                 max_delta_step=0, subsample=1, sampling_method='uniform', 
151
                 colsample_bytree=1, colsample_bylevel=1, colsample_bynode=1, 
152
                 reg_alpha=0, reg_lambda=1, scale_pos_weight=1, base_score=None, 
153
                 random_state=None, missing=float('nan'), num_parallel_tree=1, 
154
                 monotone_constraints=None, interaction_constraints=None, 
155
                 importance_type='gain', device=None, validate_parameters=None, 
156
                 enable_categorical=False, feature_types=None, 
157
                 feature_weights=None, max_cat_to_onehot=4, 
158
                 max_cat_threshold=64, multi_strategy='one_output_per_tree', 
159
                 eval_metric=None, early_stopping_rounds=None, callbacks=None):
160
        """
161
        XGBoost regressor following scikit-learn API.
162
        
163
        Parameters: Same as XGBClassifier
164
        """
165
    
166
    def fit(self, X, y, *, sample_weight=None, base_margin=None, 
167
            eval_set=None, verbose=True, xgb_model=None, 
168
            sample_weight_eval_set=None, base_margin_eval_set=None, 
169
            feature_weights=None):
170
        """Fit the regressor to training data. Same interface as XGBClassifier.fit()."""
171
    
172
    def predict(self, X, *, output_margin=False, validate_features=True, 
173
                base_margin=None, iteration_range=None):
174
        """
175
        Predict target values.
176
        
177
        Returns: numpy.ndarray - Predicted values
178
        """
179
```
180

181
### XGBRanker - Learning-to-Rank Estimator
182

183
XGBoost ranker for learning-to-rank tasks such as search result ranking, recommendation systems, and other applications where relative ordering matters more than absolute values.
184

185
```python { .api }
186
class XGBRanker:
187
    def __init__(self, *, max_depth=6, max_leaves=0, max_bin=256, 
188
                 grow_policy='depthwise', learning_rate=0.3, n_estimators=100, 
189
                 verbosity=1, objective='rank:ndcg', booster='gbtree', 
190
                 tree_method='auto', n_jobs=None, gamma=0, min_child_weight=1, 
191
                 max_delta_step=0, subsample=1, sampling_method='uniform', 
192
                 colsample_bytree=1, colsample_bylevel=1, colsample_bynode=1, 
193
                 reg_alpha=0, reg_lambda=1, scale_pos_weight=1, base_score=None, 
194
                 random_state=None, missing=float('nan'), num_parallel_tree=1, 
195
                 monotone_constraints=None, interaction_constraints=None, 
196
                 importance_type='gain', device=None, validate_parameters=None, 
197
                 enable_categorical=False, feature_types=None, 
198
                 feature_weights=None, max_cat_to_onehot=4, 
199
                 max_cat_threshold=64, multi_strategy='one_output_per_tree', 
200
                 eval_metric=None, early_stopping_rounds=None, callbacks=None):
201
        """
202
        XGBoost ranker for learning-to-rank tasks.
203
        
204
        Parameters: Same as XGBClassifier with default objective='rank:ndcg'
205
        """
206
    
207
    def fit(self, X, y, *, group=None, qid=None, sample_weight=None, 
208
            base_margin=None, eval_set=None, verbose=True, xgb_model=None, 
209
            sample_weight_eval_set=None, base_margin_eval_set=None, 
210
            feature_weights=None, eval_group=None, eval_qid=None):
211
        """
212
        Fit the ranker to training data.
213
        
214
        Parameters: Same as XGBClassifier.fit() with additional:
215
        - group: Group sizes for ranking (array-like)
216
        - qid: Query IDs for ranking (array-like)
217
        - eval_group: Group sizes for evaluation sets (list of array-like)
218
        - eval_qid: Query IDs for evaluation sets (list of array-like)
219
        """
220
    
221
    def predict(self, X, *, output_margin=False, validate_features=True, 
222
                base_margin=None, iteration_range=None):
223
        """
224
        Predict ranking scores.
225
        
226
        Returns: numpy.ndarray - Ranking scores
227
        """
228
    
229
    def score(self, X, y):
230
        """
231
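        Evaluate the ranker on the given data using its configured evaluation metric.
231
        
232
        Parameters:
233
        - X: Test data (DataFrame that includes a `qid` column identifying query groups)
234
        - y: True relevance labels (array-like)
235
        
236
        Returns: float - Evaluation metric value (not classification accuracy)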
238
        """
239
```
240

241
### XGBRFClassifier - Random Forest Classifier
242

243
XGBoost-based random forest classifier that combines the speed of XGBoost with random forest's ensemble approach, using random feature subsets and bootstrap sampling.
244

245
```python { .api }
246
class XGBRFClassifier:
247
    def __init__(self, *, max_depth=6, learning_rate=1.0, n_estimators=100, 
248
                 verbosity=1, objective=None, booster='gbtree', 
249
                 tree_method='auto', n_jobs=None, gamma=0, min_child_weight=1, 
250
                 max_delta_step=0, subsample=0.8, sampling_method='uniform', 
251
                 colsample_bytree=0.8, colsample_bylevel=1, colsample_bynode=0.8, 
252
                 reg_alpha=0, reg_lambda=1e-05, scale_pos_weight=1, 
253
                 base_score=None, random_state=None, missing=float('nan'), 
254
                 num_parallel_tree=1, monotone_constraints=None, 
255
                 interaction_constraints=None, importance_type='gain', 
256
                 device=None, validate_parameters=None, enable_categorical=False, 
257
                 feature_types=None, feature_weights=None, max_cat_to_onehot=4, 
258
                 max_cat_threshold=64, multi_strategy='one_output_per_tree', 
259
                 eval_metric=None, early_stopping_rounds=None, callbacks=None):
260
        """
261
        XGBoost random forest classifier.
262
        
263
        Parameters: Similar to XGBClassifier with RF-specific defaults:
264
        - learning_rate: 1.0 (no shrinkage for RF)
265
        - subsample: 0.8 (bootstrap sampling)
266
        - colsample_bytree: 0.8 (random feature subset per tree)
267
        - colsample_bynode: 0.8 (random feature subset per split)
268
        - reg_lambda: 1e-05 (minimal regularization)
269
        """
270
```
271

272
### XGBRFRegressor - Random Forest Regressor
273

274
XGBoost-based random forest regressor for regression tasks, combining XGBoost's efficiency with random forest methodology.
275

276
```python { .api }
277
class XGBRFRegressor:
278
    def __init__(self, *, max_depth=6, learning_rate=1.0, n_estimators=100, 
279
                 verbosity=1, objective=None, booster='gbtree', 
280
                 tree_method='auto', n_jobs=None, gamma=0, min_child_weight=1, 
281
                 max_delta_step=0, subsample=0.8, sampling_method='uniform', 
282
                 colsample_bytree=0.8, colsample_bylevel=1, colsample_bynode=0.8, 
283
                 reg_alpha=0, reg_lambda=1e-05, scale_pos_weight=1, 
284
                 base_score=None, random_state=None, missing=float('nan'), 
285
                 num_parallel_tree=1, monotone_constraints=None, 
286
                 interaction_constraints=None, importance_type='gain', 
287
                 device=None, validate_parameters=None, enable_categorical=False, 
288
                 feature_types=None, feature_weights=None, max_cat_to_onehot=4, 
289
                 max_cat_threshold=64, multi_strategy='one_output_per_tree', 
290
                 eval_metric=None, early_stopping_rounds=None, callbacks=None):
291
        """
292
        XGBoost random forest regressor.
293
        
294
        Parameters: Same as XGBRFClassifier
295
        """
296
```
297

298
### XGBModel - Base Estimator
299

300
Base class for all XGBoost scikit-learn estimators, providing common functionality and interface methods.
301

302
```python { .api }
303
class XGBModel:
304
    def get_booster(self):
305
        """
306
        Get the underlying XGBoost Booster.
307
        
308
        Returns: Booster - The trained XGBoost model
309
        """
310
    
311
    def get_params(self, deep=True):
312
        """
313
        Get parameters for the estimator.
314
        
315
        Parameters:
316
        - deep: Whether to return parameters of sub-estimators (bool)
317
        
318
        Returns: dict - Parameter names and values
319
        """
320
    
321
    def set_params(self, **params):
322
        """
323
        Set parameters for the estimator.
324
        
325
        Parameters:
326
        - **params: Estimator parameters as keyword arguments
327
        
328
        Returns: self
329
        """
330
    
331
    def get_xgb_params(self):
332
        """
333
        Get XGBoost-specific parameters.
334
        
335
        Returns: dict - XGBoost parameters
336
        """
337
    
338
    def save_model(self, fname):
339
        """
340
        Save the model to file.
341
        
342
        Parameters:
343
        - fname: Output file name (str)
344
        """
345
    
346
    def load_model(self, fname):
347
        """
348
        Load model from file.
349
        
350
        Parameters:
351
        - fname: Input file name (str)
352
        """
353
    
354
    def apply(self, X, iteration_range=None):
355
        """
356
        Return the predicted leaf index for each sample.
357
        
358
        Parameters:
359
        - X: Input data (array-like or DataFrame)
360
        - iteration_range: Range of trees to use (tuple, optional)
361
        
362
        Returns: numpy.ndarray - Leaf indices
363
        """
364
    
365
    def evals_result(self):
366
        """
367
        Get evaluation results from training.
368
        
369
        Returns: dict - Evaluation history
370
        """
371
    
372
    @property
373
    def n_features_in_(self):
374
        """Number of features seen during fit. Returns: int"""
375
    
376
    @property
377
    def feature_names_in_(self):
378
        """Feature names seen during fit. Returns: numpy.ndarray"""
379
    
380
    @property
381
    def feature_importances_(self):
382
        """Feature importances. Returns: numpy.ndarray"""
383
    
384
    @property
385
    def best_score(self):
386
        """Best validation score. Returns: float"""
387
    
388
    @property
389
    def best_iteration(self):
390
        """Best iteration from early stopping. Returns: int"""
391
    
392
    @property
393
    def coef_(self):
394
        """Model coefficients (for linear booster). Returns: numpy.ndarray"""
395
    
396
    @property
397
    def intercept_(self):
398
        """Model intercept (for linear booster). Returns: float"""
399
```
400

401
## Usage Examples
402

403
### Basic Classification
404

405
```python
406
from xgboost import XGBClassifier
407
from sklearn.datasets import make_classification
408
from sklearn.model_selection import train_test_split
409
from sklearn.metrics import accuracy_score, classification_report
410

411
# Create sample data
412
X, y = make_classification(n_samples=1000, n_features=20, n_classes=2, 
413
                          n_informative=10, random_state=42)
414
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, 
415
                                                   random_state=42)
416

417
# Train classifier
418
clf = XGBClassifier(
419
    objective='binary:logistic',
420
    max_depth=6,
421
    learning_rate=0.1,
422
    n_estimators=100,
423
    early_stopping_rounds=10,
424
    eval_metric='logloss',
425
    random_state=42
426
)
427

428
clf.fit(X_train, y_train, 
429
        eval_set=[(X_test, y_test)], 
430
        verbose=False)
431

432
# Make predictions
433
y_pred = clf.predict(X_test)
434
y_pred_proba = clf.predict_proba(X_test)
435

436
print(f"Accuracy: {accuracy_score(y_test, y_pred):.4f}")
437
print(f"Best iteration: {clf.best_iteration}")
438
print(f"Best score: {clf.best_score:.4f}")
439

440
# Feature importance
441
import matplotlib.pyplot as plt
442
feature_importance = clf.feature_importances_
443
plt.figure(figsize=(10, 6))
444
plt.barh(range(len(feature_importance)), feature_importance)
445
plt.xlabel('Feature Importance')
446
plt.title('XGBoost Feature Importance')
447
plt.show()
448
```
449

450
### Regression Example
451

452
```python
453
from xgboost import XGBRegressor
454
from sklearn.datasets import make_regression
455
from sklearn.metrics import mean_squared_error, r2_score
456

457
# Create regression data
458
X, y = make_regression(n_samples=1000, n_features=20, noise=0.1, 
459
                      random_state=42)
460
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, 
461
                                                   random_state=42)
462

463
# Train regressor
464
reg = XGBRegressor(
465
    objective='reg:squarederror',
466
    max_depth=6,
467
    learning_rate=0.1,
468
    n_estimators=100,
469
    early_stopping_rounds=10,
470
    eval_metric='rmse'
471
)
472

473
reg.fit(X_train, y_train, 
474
        eval_set=[(X_test, y_test)], 
475
        verbose=False)
476

477
# Make predictions
478
y_pred = reg.predict(X_test)
479

480
print(f"RMSE: {mean_squared_error(y_test, y_pred) ** 0.5:.4f}")
481
print(f"R²: {r2_score(y_test, y_pred):.4f}")
482
```
483

484
### Learning-to-Rank Example
485

486
```python
487
from xgboost import XGBRanker
488
import numpy as np
489

490
# Create ranking data (mock example)
491
n_samples_per_group = 50
492
n_groups = 20
493
n_features = 10
494

495
X = np.random.randn(n_samples_per_group * n_groups, n_features)
496
y = np.random.randint(0, 5, n_samples_per_group * n_groups)  # Relevance scores 0-4
497
group = np.array([n_samples_per_group] * n_groups)  # Group sizes
498

499
# Train ranker
500
ranker = XGBRanker(
501
    objective='rank:ndcg',
502
    max_depth=6,
503
    learning_rate=0.1,
504
    n_estimators=100,
505
    eval_metric='ndcg@10'
506
)
507

508
ranker.fit(X, y, group=group)
509

510
# Make ranking predictions
511
ranking_scores = ranker.predict(X)
512
print(f"Ranking scores shape: {ranking_scores.shape}")
513
```
514

515
### Pipeline Integration
516

517
```python
518
from sklearn.pipeline import Pipeline
519
from sklearn.preprocessing import StandardScaler
520
from sklearn.model_selection import GridSearchCV
521

522
# Create pipeline with preprocessing
523
pipeline = Pipeline([
524
    ('scaler', StandardScaler()),
525
    ('xgb', XGBClassifier(random_state=42))
526
])
527

528
# Parameter grid for hyperparameter tuning
529
param_grid = {
530
    'xgb__max_depth': [3, 6, 9],
531
    'xgb__learning_rate': [0.01, 0.1, 0.2],
532
    'xgb__n_estimators': [50, 100, 200]
533
}
534

535
# Grid search with cross-validation
536
grid_search = GridSearchCV(
537
    pipeline, 
538
    param_grid, 
539
    cv=5, 
540
    scoring='accuracy',
541
    n_jobs=-1
542
)
543

544
grid_search.fit(X_train, y_train)
545

546
print(f"Best parameters: {grid_search.best_params_}")
547
print(f"Best CV score: {grid_search.best_score_:.4f}")
548

549
# Use best model
550
best_model = grid_search.best_estimator_
551
y_pred = best_model.predict(X_test)
552
```
553

554
### Random Forest Usage
555

556
```python
557
from xgboost import XGBRFClassifier
558

559
# XGBoost Random Forest
560
rf_clf = XGBRFClassifier(
561
    n_estimators=100,
562
    max_depth=6,
563
    learning_rate=1.0,  # No shrinkage for RF
564
    subsample=0.8,      # Bootstrap sampling
565
    colsample_bynode=0.8,  # Random feature subset per split
566
    random_state=42
567
)
568

569
rf_clf.fit(X_train, y_train)
570
rf_pred = rf_clf.predict(X_test)
571
rf_pred_proba = rf_clf.predict_proba(X_test)
572

573
print(f"RF Accuracy: {accuracy_score(y_test, rf_pred):.4f}")
574
```

Version

Tile

Files

sklearn-interface.md docs/

sklearn-interface.md docs/