or run

npx @tessl/cli init
Log in

Version

Tile

Overview

Evals

Files

Files

docs

core-data-models.md distributed-computing.md index.md sklearn-interface.md training-evaluation.md utilities.md

sklearn-interface.md docs/

0

# Scikit-learn Interface

1

2

Drop-in replacements for scikit-learn estimators that provide the familiar fit/predict API while leveraging XGBoost's high-performance gradient boosting implementation. These estimators integrate seamlessly with scikit-learn pipelines, cross-validation, and model selection tools.

3

4

## Capabilities

5

6

### XGBClassifier - Classification Estimator

7

8

XGBoost classifier that follows the scikit-learn API for binary and multi-class classification tasks. Supports probability prediction and integrates with scikit-learn's model evaluation tools.

9

10

```python { .api }

11

class XGBClassifier:

12

def __init__(self, *, max_depth=6, max_leaves=0, max_bin=256,

13

grow_policy='depthwise', learning_rate=0.3, n_estimators=100,

14

verbosity=1, objective=None, booster='gbtree',

15

tree_method='auto', n_jobs=None, gamma=0, min_child_weight=1,

16

max_delta_step=0, subsample=1, sampling_method='uniform',

17

colsample_bytree=1, colsample_bylevel=1, colsample_bynode=1,

18

reg_alpha=0, reg_lambda=1, scale_pos_weight=1, base_score=None,

19

random_state=None, missing=float('nan'), num_parallel_tree=1,

20

monotone_constraints=None, interaction_constraints=None,

21

importance_type='gain', device=None, validate_parameters=None,

22

enable_categorical=False, feature_types=None,

23

feature_weights=None, max_cat_to_onehot=4,

24

max_cat_threshold=64, multi_strategy='one_output_per_tree',

25

eval_metric=None, early_stopping_rounds=None, callbacks=None):

26

"""

27

XGBoost classifier following scikit-learn API.

28

29

Parameters:

30

- max_depth: Maximum tree depth (int)

31

- max_leaves: Maximum number of leaves (int, 0 means no limit)

32

- max_bin: Maximum number of discrete bins for features (int)

33

- grow_policy: Tree growing policy ('depthwise', 'lossguide')

34

- learning_rate: Boosting learning rate (float)

35

- n_estimators: Number of boosting rounds (int)

36

- verbosity: Verbosity level (0=silent, 1=warning, 2=info, 3=debug)

37

- objective: Learning objective (str or None for auto-detection)

38

- booster: Booster type ('gbtree', 'gblinear', 'dart')

39

- tree_method: Tree construction algorithm ('auto', 'exact', 'approx', 'hist')

40

- n_jobs: Number of parallel threads (int or None)

41

- gamma: Minimum loss reduction required for split (float)

42

- min_child_weight: Minimum sum of instance weight in child (float)

43

- max_delta_step: Maximum delta step allowed for each leaf output (float)

44

- subsample: Fraction of samples used for training each tree (float)

45

- sampling_method: Sampling method ('uniform', 'gradient_based')

46

- colsample_bytree: Fraction of features used per tree (float)

47

- colsample_bylevel: Fraction of features used per level (float)

48

- colsample_bynode: Fraction of features used per split (float)

49

- reg_alpha: L1 regularization term (float)

50

- reg_lambda: L2 regularization term (float)

51

- scale_pos_weight: Balancing weight for positive class (float)

52

- base_score: Global bias for all predictions (float)

53

- random_state: Random seed (int)

54

- missing: Value to be treated as missing (float)

55

- num_parallel_tree: Number of parallel trees per round (int)

56

- monotone_constraints: Monotonic constraints (dict or None)

57

- interaction_constraints: Interaction constraints (list or None)

58

- importance_type: Feature importance type ('gain', 'weight', 'cover', 'total_gain', 'total_cover')

59

- device: Device to use for training ('cpu', 'cuda', 'gpu')

60

- validate_parameters: Whether to validate parameters (bool)

61

- enable_categorical: Enable categorical feature support (bool)

62

- feature_types: Types for features (list or None)

63

- feature_weights: Weights for features (array-like or None)

64

- max_cat_to_onehot: Maximum categories to use one-hot encoding (int)

65

- max_cat_threshold: Maximum number of categories considered for each partition-based split (int)

66

- multi_strategy: Strategy for training multi-target models ('one_output_per_tree', 'multi_output_tree')

67

- eval_metric: Evaluation metric (str, list, or callable)

68

- early_stopping_rounds: Early stopping rounds (int)

69

- callbacks: Callbacks for training (list)

70

"""

71

72

def fit(self, X, y, *, sample_weight=None, base_margin=None,

73

eval_set=None, verbose=True, xgb_model=None,

74

sample_weight_eval_set=None, base_margin_eval_set=None,

75

feature_weights=None):

76

"""

77

Fit the classifier to training data.

78

79

Parameters:

80

- X: Training data (array-like or DataFrame)

81

- y: Target values (array-like)

82

- sample_weight: Sample weights (array-like, optional)

83

- base_margin: Base prediction margins (array-like, optional)

84

- eval_set: Evaluation datasets as list of (X, y) tuples (list, optional)

85

- verbose: Whether to print evaluation results (bool)

86

- xgb_model: Existing model to continue training (Booster, optional)

87

- sample_weight_eval_set: Sample weights for evaluation sets (list, optional)

88

- base_margin_eval_set: Base margins for evaluation sets (list, optional)

89

- feature_weights: Feature weights (array-like, optional)

90

91

Returns: self

92

"""

93

94

def predict(self, X, *, output_margin=False, validate_features=True,

95

base_margin=None, iteration_range=None):

96

"""

97

Predict class labels.

98

99

Parameters:

100

- X: Input data (array-like or DataFrame)

101

- output_margin: Whether to output margin values (bool)

102

- validate_features: Whether to validate feature names (bool)

103

- base_margin: Base prediction margins (array-like, optional)

104

- iteration_range: Range of trees to use (tuple, optional)

105

106

Returns: numpy.ndarray - Predicted class labels

107

"""

108

109

def predict_proba(self, X, *, validate_features=True, base_margin=None,

110

iteration_range=None):

111

"""

112

Predict class probabilities.

113

114

Parameters:

115

- X: Input data (array-like or DataFrame)

116

- validate_features: Whether to validate feature names (bool)

117

- base_margin: Base prediction margins (array-like, optional)

118

- iteration_range: Range of trees to use (tuple, optional)

119

120

Returns: numpy.ndarray - Class probabilities

121

"""

122

123

@property

124

def classes_(self):

125

"""Unique class labels. Returns: numpy.ndarray"""

126

127

@property

128

def feature_importances_(self):

129

"""Feature importances. Returns: numpy.ndarray"""

130

131

@property

132

def best_score(self):

133

"""Best validation score. Returns: float"""

134

135

@property

136

def best_iteration(self):

137

"""Best iteration from early stopping. Returns: int"""

138

```

139

140

### XGBRegressor - Regression Estimator

141

142

XGBoost regressor for continuous target variables, providing high-performance gradient boosting for regression tasks with extensive hyperparameter control.

143

144

```python { .api }

145

class XGBRegressor:

146

def __init__(self, *, max_depth=6, max_leaves=0, max_bin=256,

147

grow_policy='depthwise', learning_rate=0.3, n_estimators=100,

148

verbosity=1, objective=None, booster='gbtree',

149

tree_method='auto', n_jobs=None, gamma=0, min_child_weight=1,

150

max_delta_step=0, subsample=1, sampling_method='uniform',

151

colsample_bytree=1, colsample_bylevel=1, colsample_bynode=1,

152

reg_alpha=0, reg_lambda=1, scale_pos_weight=1, base_score=None,

153

random_state=None, missing=float('nan'), num_parallel_tree=1,

154

monotone_constraints=None, interaction_constraints=None,

155

importance_type='gain', device=None, validate_parameters=None,

156

enable_categorical=False, feature_types=None,

157

feature_weights=None, max_cat_to_onehot=4,

158

max_cat_threshold=64, multi_strategy='one_output_per_tree',

159

eval_metric=None, early_stopping_rounds=None, callbacks=None):

160

"""

161

XGBoost regressor following scikit-learn API.

162

163

Parameters: Same as XGBClassifier

164

"""

165

166

def fit(self, X, y, *, sample_weight=None, base_margin=None,

167

eval_set=None, verbose=True, xgb_model=None,

168

sample_weight_eval_set=None, base_margin_eval_set=None,

169

feature_weights=None):

170

"""Fit the regressor to training data. Same interface as XGBClassifier.fit()."""

171

172

def predict(self, X, *, output_margin=False, validate_features=True,

173

base_margin=None, iteration_range=None):

174

"""

175

Predict target values.

176

177

Returns: numpy.ndarray - Predicted values

178

"""

179

```

180

181

### XGBRanker - Learning-to-Rank Estimator

182

183

XGBoost ranker for learning-to-rank tasks such as search result ranking, recommendation systems, and other applications where relative ordering matters more than absolute values.

184

185

```python { .api }

186

class XGBRanker:

187

def __init__(self, *, max_depth=6, max_leaves=0, max_bin=256,

188

grow_policy='depthwise', learning_rate=0.3, n_estimators=100,

189

verbosity=1, objective='rank:ndcg', booster='gbtree',

190

tree_method='auto', n_jobs=None, gamma=0, min_child_weight=1,

191

max_delta_step=0, subsample=1, sampling_method='uniform',

192

colsample_bytree=1, colsample_bylevel=1, colsample_bynode=1,

193

reg_alpha=0, reg_lambda=1, scale_pos_weight=1, base_score=None,

194

random_state=None, missing=float('nan'), num_parallel_tree=1,

195

monotone_constraints=None, interaction_constraints=None,

196

importance_type='gain', device=None, validate_parameters=None,

197

enable_categorical=False, feature_types=None,

198

feature_weights=None, max_cat_to_onehot=4,

199

max_cat_threshold=64, multi_strategy='one_output_per_tree',

200

eval_metric=None, early_stopping_rounds=None, callbacks=None):

201

"""

202

XGBoost ranker for learning-to-rank tasks.

203

204

Parameters: Same as XGBClassifier with default objective='rank:ndcg'

205

"""

206

207

def fit(self, X, y, *, group=None, qid=None, sample_weight=None,

208

base_margin=None, eval_set=None, verbose=True, xgb_model=None,

209

sample_weight_eval_set=None, base_margin_eval_set=None,

210

feature_weights=None, eval_group=None, eval_qid=None):

211

"""

212

Fit the ranker to training data.

213

214

Parameters: Same as XGBClassifier.fit() with additional:

215

- group: Group sizes for ranking (array-like)

216

- qid: Query IDs for ranking (array-like)

217

- eval_group: Group sizes for evaluation sets (list of array-like)

218

- eval_qid: Query IDs for evaluation sets (list of array-like)

219

"""

220

221

def predict(self, X, *, output_margin=False, validate_features=True,

222

base_margin=None, iteration_range=None):

223

"""

224

Predict ranking scores.

225

226

Returns: numpy.ndarray - Ranking scores

227

"""

228

229

def score(self, X, y):

230

"""

231

Return the ranking score on the given data, computed with the model's last evaluation metric.

232

233

Parameters:

234

- X: Test data (array-like)

235

- y: True labels (array-like)

236

237

Returns: float - Score under the last evaluation metric

238

"""

239

```

240

241

### XGBRFClassifier - Random Forest Classifier

242

243

XGBoost-based random forest classifier that combines the speed of XGBoost with random forest's ensemble approach, using random feature subsets and bootstrap sampling.

244

245

```python { .api }

246

class XGBRFClassifier:

247

def __init__(self, *, max_depth=6, learning_rate=1.0, n_estimators=100,

248

verbosity=1, objective=None, booster='gbtree',

249

tree_method='auto', n_jobs=None, gamma=0, min_child_weight=1,

250

max_delta_step=0, subsample=0.8, sampling_method='uniform',

251

colsample_bytree=0.8, colsample_bylevel=1, colsample_bynode=0.8,

252

reg_alpha=0, reg_lambda=1e-05, scale_pos_weight=1,

253

base_score=None, random_state=None, missing=float('nan'),

254

num_parallel_tree=1, monotone_constraints=None,

255

interaction_constraints=None, importance_type='gain',

256

device=None, validate_parameters=None, enable_categorical=False,

257

feature_types=None, feature_weights=None, max_cat_to_onehot=4,

258

max_cat_threshold=64, multi_strategy='one_output_per_tree',

259

eval_metric=None, early_stopping_rounds=None, callbacks=None):

260

"""

261

XGBoost random forest classifier.

262

263

Parameters: Similar to XGBClassifier with RF-specific defaults:

264

- learning_rate: 1.0 (no shrinkage for RF)

265

- subsample: 0.8 (bootstrap sampling)

266

- colsample_bytree: 0.8 (random feature subset per tree)

267

- colsample_bynode: 0.8 (random feature subset per split)

268

- reg_lambda: 1e-05 (minimal regularization)

269

"""

270

```

271

272

### XGBRFRegressor - Random Forest Regressor

273

274

XGBoost-based random forest regressor for regression tasks, combining XGBoost's efficiency with random forest methodology.

275

276

```python { .api }

277

class XGBRFRegressor:

278

def __init__(self, *, max_depth=6, learning_rate=1.0, n_estimators=100,

279

verbosity=1, objective=None, booster='gbtree',

280

tree_method='auto', n_jobs=None, gamma=0, min_child_weight=1,

281

max_delta_step=0, subsample=0.8, sampling_method='uniform',

282

colsample_bytree=0.8, colsample_bylevel=1, colsample_bynode=0.8,

283

reg_alpha=0, reg_lambda=1e-05, scale_pos_weight=1,

284

base_score=None, random_state=None, missing=float('nan'),

285

num_parallel_tree=1, monotone_constraints=None,

286

interaction_constraints=None, importance_type='gain',

287

device=None, validate_parameters=None, enable_categorical=False,

288

feature_types=None, feature_weights=None, max_cat_to_onehot=4,

289

max_cat_threshold=64, multi_strategy='one_output_per_tree',

290

eval_metric=None, early_stopping_rounds=None, callbacks=None):

291

"""

292

XGBoost random forest regressor.

293

294

Parameters: Same as XGBRFClassifier

295

"""

296

```

297

298

### XGBModel - Base Estimator

299

300

Base class for all XGBoost scikit-learn estimators, providing common functionality and interface methods.

301

302

```python { .api }

303

class XGBModel:

304

def get_booster(self):

305

"""

306

Get the underlying XGBoost Booster.

307

308

Returns: Booster - The trained XGBoost model

309

"""

310

311

def get_params(self, deep=True):

312

"""

313

Get parameters for the estimator.

314

315

Parameters:

316

- deep: Whether to return parameters of sub-estimators (bool)

317

318

Returns: dict - Parameter names and values

319

"""

320

321

def set_params(self, **params):

322

"""

323

Set parameters for the estimator.

324

325

Parameters:

326

- **params: Estimator parameters as keyword arguments

327

328

Returns: self

329

"""

330

331

def get_xgb_params(self):

332

"""

333

Get XGBoost-specific parameters.

334

335

Returns: dict - XGBoost parameters

336

"""

337

338

def save_model(self, fname):

339

"""

340

Save the model to file.

341

342

Parameters:

343

- fname: Output file name (str)

344

"""

345

346

def load_model(self, fname):

347

"""

348

Load model from file.

349

350

Parameters:

351

- fname: Input file name (str)

352

"""

353

354

def apply(self, X, iteration_range=None):

355

"""

356

Return the predicted leaf index for each sample.

357

358

Parameters:

359

- X: Input data (array-like or DataFrame)

360

- iteration_range: Range of trees to use (tuple, optional)

361

362

Returns: numpy.ndarray - Leaf indices

363

"""

364

365

def evals_result(self):

366

"""

367

Get evaluation results from training.

368

369

Returns: dict - Evaluation history

370

"""

371

372

@property

373

def n_features_in_(self):

374

"""Number of features seen during fit. Returns: int"""

375

376

@property

377

def feature_names_in_(self):

378

"""Feature names seen during fit. Returns: numpy.ndarray"""

379

380

@property

381

def feature_importances_(self):

382

"""Feature importances. Returns: numpy.ndarray"""

383

384

@property

385

def best_score(self):

386

"""Best validation score. Returns: float"""

387

388

@property

389

def best_iteration(self):

390

"""Best iteration from early stopping. Returns: int"""

391

392

@property

393

def coef_(self):

394

"""Model coefficients (for linear booster). Returns: numpy.ndarray"""

395

396

@property

397

def intercept_(self):

398

"""Model intercept (for linear booster). Returns: float"""

399

```

400

401

## Usage Examples

402

403

### Basic Classification

404

405

```python

406

from xgboost import XGBClassifier

407

from sklearn.datasets import make_classification

408

from sklearn.model_selection import train_test_split

409

from sklearn.metrics import accuracy_score, classification_report

410

411

# Create sample data

412

X, y = make_classification(n_samples=1000, n_features=20, n_classes=2,

413

n_informative=10, random_state=42)

414

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,

415

random_state=42)

416

417

# Train classifier

418

clf = XGBClassifier(

419

objective='binary:logistic',

420

max_depth=6,

421

learning_rate=0.1,

422

n_estimators=100,

423

early_stopping_rounds=10,

424

eval_metric='logloss',

425

random_state=42

426

)

427

428

clf.fit(X_train, y_train,

429

eval_set=[(X_test, y_test)],

430

verbose=False)

431

432

# Make predictions

433

y_pred = clf.predict(X_test)

434

y_pred_proba = clf.predict_proba(X_test)

435

436

print(f"Accuracy: {accuracy_score(y_test, y_pred):.4f}")

437

print(f"Best iteration: {clf.best_iteration}")

438

print(f"Best score: {clf.best_score:.4f}")

439

440

# Feature importance

441

import matplotlib.pyplot as plt

442

feature_importance = clf.feature_importances_

443

plt.figure(figsize=(10, 6))

444

plt.barh(range(len(feature_importance)), feature_importance)

445

plt.xlabel('Feature Importance')

446

plt.title('XGBoost Feature Importance')

447

plt.show()

448

```

449

450

### Regression Example

451

452

```python

453

from xgboost import XGBRegressor

454

from sklearn.datasets import make_regression

455

from sklearn.metrics import mean_squared_error, r2_score

456

457

# Create regression data

458

X, y = make_regression(n_samples=1000, n_features=20, noise=0.1,

459

random_state=42)

460

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,

461

random_state=42)

462

463

# Train regressor

464

reg = XGBRegressor(

465

objective='reg:squarederror',

466

max_depth=6,

467

learning_rate=0.1,

468

n_estimators=100,

469

early_stopping_rounds=10,

470

eval_metric='rmse'

471

)

472

473

reg.fit(X_train, y_train,

474

eval_set=[(X_test, y_test)],

475

verbose=False)

476

477

# Make predictions

478

y_pred = reg.predict(X_test)

479

480

print(f"RMSE: {mean_squared_error(y_test, y_pred, squared=False):.4f}")

481

print(f"R²: {r2_score(y_test, y_pred):.4f}")

482

```

483

484

### Learning-to-Rank Example

485

486

```python

487

from xgboost import XGBRanker

488

import numpy as np

489

490

# Create ranking data (mock example)

491

n_samples_per_group = 50

492

n_groups = 20

493

n_features = 10

494

495

X = np.random.randn(n_samples_per_group * n_groups, n_features)

496

y = np.random.randint(0, 5, n_samples_per_group * n_groups) # Relevance scores 0-4

497

group = np.array([n_samples_per_group] * n_groups) # Group sizes

498

499

# Train ranker

500

ranker = XGBRanker(

501

objective='rank:ndcg',

502

max_depth=6,

503

learning_rate=0.1,

504

n_estimators=100,

505

eval_metric='ndcg@10'

506

)

507

508

ranker.fit(X, y, group=group)

509

510

# Make ranking predictions

511

ranking_scores = ranker.predict(X)

512

print(f"Ranking scores shape: {ranking_scores.shape}")

513

```

514

515

### Pipeline Integration

516

517

```python

518

from sklearn.pipeline import Pipeline

519

from sklearn.preprocessing import StandardScaler

520

from sklearn.model_selection import GridSearchCV

521

522

# Create pipeline with preprocessing

523

pipeline = Pipeline([

524

('scaler', StandardScaler()),

525

('xgb', XGBClassifier(random_state=42))

526

])

527

528

# Parameter grid for hyperparameter tuning

529

param_grid = {

530

'xgb__max_depth': [3, 6, 9],

531

'xgb__learning_rate': [0.01, 0.1, 0.2],

532

'xgb__n_estimators': [50, 100, 200]

533

}

534

535

# Grid search with cross-validation

536

grid_search = GridSearchCV(

537

pipeline,

538

param_grid,

539

cv=5,

540

scoring='accuracy',

541

n_jobs=-1

542

)

543

544

grid_search.fit(X_train, y_train)

545

546

print(f"Best parameters: {grid_search.best_params_}")

547

print(f"Best CV score: {grid_search.best_score_:.4f}")

548

549

# Use best model

550

best_model = grid_search.best_estimator_

551

y_pred = best_model.predict(X_test)

552

```

553

554

### Random Forest Usage

555

556

```python

557

from xgboost import XGBRFClassifier

558

559

# XGBoost Random Forest

560

rf_clf = XGBRFClassifier(

561

n_estimators=100,

562

max_depth=6,

563

learning_rate=1.0, # No shrinkage for RF

564

subsample=0.8, # Bootstrap sampling

565

colsample_bynode=0.8, # Random feature subset per split

566

random_state=42

567

)

568

569

rf_clf.fit(X_train, y_train)

570

rf_pred = rf_clf.predict(X_test)

571

rf_pred_proba = rf_clf.predict_proba(X_test)

572

573

print(f"RF Accuracy: {accuracy_score(y_test, rf_pred):.4f}")

574

```