XGBoost Python Package (CPU only) - A minimal installation with no support for GPU algorithms or federated learning, providing optimized distributed gradient boosting for machine learning
Drop-in replacements for scikit-learn estimators that provide the familiar fit/predict API while leveraging XGBoost's high-performance gradient boosting implementation. These estimators integrate seamlessly with scikit-learn pipelines, cross-validation, and model selection tools.
XGBoost classifier that follows the scikit-learn API for binary and multi-class classification tasks. Supports probability prediction and integrates with scikit-learn's model evaluation tools.
class XGBClassifier:
    """XGBoost classifier following the scikit-learn API.

    Supports binary and multi-class classification with probability
    prediction, and integrates with scikit-learn's model evaluation,
    pipeline, and model-selection tools.
    """

    def __init__(self, *, max_depth=6, max_leaves=0, max_bin=256,
                 grow_policy='depthwise', learning_rate=0.3, n_estimators=100,
                 verbosity=1, objective=None, booster='gbtree',
                 tree_method='auto', n_jobs=None, gamma=0, min_child_weight=1,
                 max_delta_step=0, subsample=1, sampling_method='uniform',
                 colsample_bytree=1, colsample_bylevel=1, colsample_bynode=1,
                 reg_alpha=0, reg_lambda=1, scale_pos_weight=1, base_score=None,
                 random_state=None, missing=float('nan'), num_parallel_tree=1,
                 monotone_constraints=None, interaction_constraints=None,
                 importance_type='gain', device=None, validate_parameters=None,
                 enable_categorical=False, feature_types=None,
                 feature_weights=None, max_cat_to_onehot=4,
                 max_cat_threshold=64, multi_strategy='one_output_per_tree',
                 eval_metric=None, early_stopping_rounds=None, callbacks=None):
        """Create an XGBoost classifier.

        Parameters:
        - max_depth: Maximum tree depth (int)
        - max_leaves: Maximum number of leaves (int, 0 means no limit)
        - max_bin: Maximum number of discrete bins for features (int)
        - grow_policy: Tree growing policy ('depthwise', 'lossguide')
        - learning_rate: Boosting learning rate (float)
        - n_estimators: Number of boosting rounds (int)
        - verbosity: Verbosity level (0=silent, 1=warning, 2=info, 3=debug)
        - objective: Learning objective (str or None for auto-detection)
        - booster: Booster type ('gbtree', 'gblinear', 'dart')
        - tree_method: Tree construction algorithm ('auto', 'exact', 'approx', 'hist')
        - n_jobs: Number of parallel threads (int or None)
        - gamma: Minimum loss reduction required for split (float)
        - min_child_weight: Minimum sum of instance weight in child (float)
        - max_delta_step: Maximum delta step allowed for each leaf output (float)
        - subsample: Fraction of samples used for training each tree (float)
        - sampling_method: Sampling method ('uniform', 'gradient_based')
        - colsample_bytree: Fraction of features used per tree (float)
        - colsample_bylevel: Fraction of features used per level (float)
        - colsample_bynode: Fraction of features used per split (float)
        - reg_alpha: L1 regularization term (float)
        - reg_lambda: L2 regularization term (float)
        - scale_pos_weight: Balancing weight for positive class (float)
        - base_score: Global bias for all predictions (float)
        - random_state: Random seed (int)
        - missing: Value to be treated as missing (float)
        - num_parallel_tree: Number of parallel trees per round (int)
        - monotone_constraints: Monotonic constraints (dict or None)
        - interaction_constraints: Interaction constraints (list or None)
        - importance_type: Feature importance type ('gain', 'weight', 'cover', 'total_gain', 'total_cover')
        - device: Device to use for training ('cpu', 'cuda', 'gpu')
        - validate_parameters: Whether to validate parameters (bool)
        - enable_categorical: Enable categorical feature support (bool)
        - feature_types: Types for features (list or None)
        - feature_weights: Weights for features (array-like or None)
        - max_cat_to_onehot: Maximum categories to use one-hot encoding (int)
        - max_cat_threshold: Maximum categories before switching to partitioning (int)
        - multi_strategy: Strategy for multi-class ('one_output_per_tree', 'multi_output_tree')
        - eval_metric: Evaluation metric (str, list, or callable)
        - early_stopping_rounds: Early stopping rounds (int)
        - callbacks: Callbacks for training (list)
        """

    def fit(self, X, y, *, sample_weight=None, base_margin=None,
            eval_set=None, verbose=True, xgb_model=None,
            sample_weight_eval_set=None, base_margin_eval_set=None,
            feature_weights=None):
        """Fit the classifier to training data.

        Parameters:
        - X: Training data (array-like or DataFrame)
        - y: Target values (array-like)
        - sample_weight: Sample weights (array-like, optional)
        - base_margin: Base prediction margins (array-like, optional)
        - eval_set: Evaluation datasets as list of (X, y) tuples (list, optional)
        - verbose: Whether to print evaluation results (bool)
        - xgb_model: Existing model to continue training (Booster, optional)
        - sample_weight_eval_set: Sample weights for evaluation sets (list, optional)
        - base_margin_eval_set: Base margins for evaluation sets (list, optional)
        - feature_weights: Feature weights (array-like, optional)
        Returns: self
        """

    def predict(self, X, *, output_margin=False, validate_features=True,
                base_margin=None, iteration_range=None):
        """Predict class labels.

        Parameters:
        - X: Input data (array-like or DataFrame)
        - output_margin: Whether to output margin values (bool)
        - validate_features: Whether to validate feature names (bool)
        - base_margin: Base prediction margins (array-like, optional)
        - iteration_range: Range of trees to use (tuple, optional)
        Returns: numpy.ndarray - Predicted class labels
        """

    def predict_proba(self, X, *, validate_features=True, base_margin=None,
                      iteration_range=None):
        """Predict class probabilities.

        Parameters:
        - X: Input data (array-like or DataFrame)
        - validate_features: Whether to validate feature names (bool)
        - base_margin: Base prediction margins (array-like, optional)
        - iteration_range: Range of trees to use (tuple, optional)
        Returns: numpy.ndarray - Class probabilities
        """

    @property
    def classes_(self):
        """Unique class labels. Returns: numpy.ndarray"""

    @property
    def feature_importances_(self):
        """Feature importances. Returns: numpy.ndarray"""

    @property
    def best_score(self):
        """Best validation score. Returns: float"""

    @property
    def best_iteration(self):
        """Best iteration from early stopping. Returns: int"""

# XGBoost regressor for continuous target variables, providing
# high-performance gradient boosting for regression tasks with extensive
# hyperparameter control.
class XGBRegressor:
    """XGBoost regressor following the scikit-learn API.

    Provides gradient boosting for continuous targets with the same
    hyperparameter surface as XGBClassifier.
    """

    def __init__(self, *, max_depth=6, max_leaves=0, max_bin=256,
                 grow_policy='depthwise', learning_rate=0.3, n_estimators=100,
                 verbosity=1, objective=None, booster='gbtree',
                 tree_method='auto', n_jobs=None, gamma=0, min_child_weight=1,
                 max_delta_step=0, subsample=1, sampling_method='uniform',
                 colsample_bytree=1, colsample_bylevel=1, colsample_bynode=1,
                 reg_alpha=0, reg_lambda=1, scale_pos_weight=1, base_score=None,
                 random_state=None, missing=float('nan'), num_parallel_tree=1,
                 monotone_constraints=None, interaction_constraints=None,
                 importance_type='gain', device=None, validate_parameters=None,
                 enable_categorical=False, feature_types=None,
                 feature_weights=None, max_cat_to_onehot=4,
                 max_cat_threshold=64, multi_strategy='one_output_per_tree',
                 eval_metric=None, early_stopping_rounds=None, callbacks=None):
        """Create an XGBoost regressor.

        Parameters: Same as XGBClassifier.
        """

    def fit(self, X, y, *, sample_weight=None, base_margin=None,
            eval_set=None, verbose=True, xgb_model=None,
            sample_weight_eval_set=None, base_margin_eval_set=None,
            feature_weights=None):
        """Fit the regressor to training data.

        Same interface as XGBClassifier.fit().
        Returns: self
        """

    def predict(self, X, *, output_margin=False, validate_features=True,
                base_margin=None, iteration_range=None):
        """Predict target values.

        Same interface as XGBClassifier.predict().
        Returns: numpy.ndarray - Predicted values
        """

# XGBoost ranker for learning-to-rank tasks such as search result ranking,
# recommendation systems, and other applications where relative ordering
# matters more than absolute values.
class XGBRanker:
    """XGBoost ranker for learning-to-rank tasks.

    Same hyperparameter surface as XGBClassifier, with a ranking
    objective ('rank:ndcg') by default. Training data is organized into
    query groups via ``group`` or ``qid``.
    """

    def __init__(self, *, max_depth=6, max_leaves=0, max_bin=256,
                 grow_policy='depthwise', learning_rate=0.3, n_estimators=100,
                 verbosity=1, objective='rank:ndcg', booster='gbtree',
                 tree_method='auto', n_jobs=None, gamma=0, min_child_weight=1,
                 max_delta_step=0, subsample=1, sampling_method='uniform',
                 colsample_bytree=1, colsample_bylevel=1, colsample_bynode=1,
                 reg_alpha=0, reg_lambda=1, scale_pos_weight=1, base_score=None,
                 random_state=None, missing=float('nan'), num_parallel_tree=1,
                 monotone_constraints=None, interaction_constraints=None,
                 importance_type='gain', device=None, validate_parameters=None,
                 enable_categorical=False, feature_types=None,
                 feature_weights=None, max_cat_to_onehot=4,
                 max_cat_threshold=64, multi_strategy='one_output_per_tree',
                 eval_metric=None, early_stopping_rounds=None, callbacks=None):
        """Create an XGBoost ranker.

        Parameters: Same as XGBClassifier with default objective='rank:ndcg'.
        """

    def fit(self, X, y, *, group=None, qid=None, sample_weight=None,
            base_margin=None, eval_set=None, verbose=True, xgb_model=None,
            sample_weight_eval_set=None, base_margin_eval_set=None,
            feature_weights=None, eval_group=None, eval_qid=None):
        """Fit the ranker to training data.

        Parameters: Same as XGBClassifier.fit() with additional:
        - group: Group sizes for ranking (array-like)
        - qid: Query IDs for ranking (array-like)
        - eval_group: Group sizes for evaluation sets (list of array-like)
        - eval_qid: Query IDs for evaluation sets (list of array-like)
        Returns: self
        """

    def predict(self, X, *, output_margin=False, validate_features=True,
                base_margin=None, iteration_range=None):
        """Predict ranking scores.

        Returns: numpy.ndarray - Ranking scores
        """

    def score(self, X, y):
        """Evaluate the ranker on the given test data and labels.

        NOTE(review): the original documentation said "mean accuracy",
        which is classifier wording; for a ranker this is presumably a
        ranking metric (e.g. the configured eval_metric) — confirm
        against the upstream xgboost documentation.

        Parameters:
        - X: Test data (array-like)
        - y: True relevance labels (array-like)
        Returns: float - Evaluation score
        """

# XGBoost-based random forest classifier that combines the speed of XGBoost
# with random forest's ensemble approach, using random feature subsets and
# bootstrap sampling.
class XGBRFClassifier:
    """XGBoost random forest classifier.

    Parameters: Similar to XGBClassifier with RF-specific defaults:
    - learning_rate: 1.0 (no shrinkage for RF)
    - subsample: 0.8 (bootstrap sampling)
    - colsample_bytree: 0.8 (random feature subset per tree)
    - colsample_bynode: 0.8 (random feature subset per split)
    - reg_lambda: 1e-05 (minimal regularization)
    """

    def __init__(self, *, max_depth=6, learning_rate=1.0, n_estimators=100,
                 verbosity=1, objective=None, booster='gbtree',
                 tree_method='auto', n_jobs=None, gamma=0, min_child_weight=1,
                 max_delta_step=0, subsample=0.8, sampling_method='uniform',
                 colsample_bytree=0.8, colsample_bylevel=1, colsample_bynode=0.8,
                 reg_alpha=0, reg_lambda=1e-05, scale_pos_weight=1,
                 base_score=None, random_state=None, missing=float('nan'),
                 num_parallel_tree=1, monotone_constraints=None,
                 interaction_constraints=None, importance_type='gain',
                 device=None, validate_parameters=None, enable_categorical=False,
                 feature_types=None, feature_weights=None, max_cat_to_onehot=4,
                 max_cat_threshold=64, multi_strategy='one_output_per_tree',
                 eval_metric=None, early_stopping_rounds=None, callbacks=None):
        """Create an XGBoost random forest classifier.

        All parameters are keyword-only; see the class docstring for the
        random-forest-specific defaults.
        """

# XGBoost-based random forest regressor for regression tasks, combining
# XGBoost's efficiency with random forest methodology.
class XGBRFRegressor:
    """XGBoost random forest regressor.

    Parameters: Same as XGBRFClassifier (random-forest defaults:
    learning_rate=1.0, subsample=0.8, colsample_bytree=0.8,
    colsample_bynode=0.8, reg_lambda=1e-05).
    """

    def __init__(self, *, max_depth=6, learning_rate=1.0, n_estimators=100,
                 verbosity=1, objective=None, booster='gbtree',
                 tree_method='auto', n_jobs=None, gamma=0, min_child_weight=1,
                 max_delta_step=0, subsample=0.8, sampling_method='uniform',
                 colsample_bytree=0.8, colsample_bylevel=1, colsample_bynode=0.8,
                 reg_alpha=0, reg_lambda=1e-05, scale_pos_weight=1,
                 base_score=None, random_state=None, missing=float('nan'),
                 num_parallel_tree=1, monotone_constraints=None,
                 interaction_constraints=None, importance_type='gain',
                 device=None, validate_parameters=None, enable_categorical=False,
                 feature_types=None, feature_weights=None, max_cat_to_onehot=4,
                 max_cat_threshold=64, multi_strategy='one_output_per_tree',
                 eval_metric=None, early_stopping_rounds=None, callbacks=None):
        """Create an XGBoost random forest regressor.

        Parameters: Same as XGBRFClassifier.
        """

# Base class for all XGBoost scikit-learn estimators, providing common
# functionality and interface methods.
class XGBModel:
    """Base class for all XGBoost scikit-learn estimators.

    Provides common functionality shared by the classifier, regressor,
    ranker, and random-forest variants: parameter access, model
    persistence, and fitted-model attributes.
    """

    def get_booster(self):
        """Get the underlying XGBoost Booster.

        Returns: Booster - The trained XGBoost model
        """

    def get_params(self, deep=True):
        """Get parameters for the estimator.

        Parameters:
        - deep: Whether to return parameters of sub-estimators (bool)
        Returns: dict - Parameter names and values
        """

    def set_params(self, **params):
        """Set parameters for the estimator.

        Parameters:
        - **params: Estimator parameters as keyword arguments
        Returns: self
        """

    def get_xgb_params(self):
        """Get XGBoost-specific parameters.

        Returns: dict - XGBoost parameters
        """

    def save_model(self, fname):
        """Save the model to file.

        Parameters:
        - fname: Output file name (str)
        """

    def load_model(self, fname):
        """Load model from file.

        Parameters:
        - fname: Input file name (str)
        """

    def apply(self, X, iteration_range=None):
        """Return the predicted leaf index for each sample.

        Parameters:
        - X: Input data (array-like or DataFrame)
        - iteration_range: Range of trees to use (tuple, optional)
        Returns: numpy.ndarray - Leaf indices
        """

    def evals_result(self):
        """Get evaluation results from training.

        Returns: dict - Evaluation history
        """

    @property
    def n_features_in_(self):
        """Number of features seen during fit. Returns: int"""

    @property
    def feature_names_in_(self):
        """Feature names seen during fit. Returns: numpy.ndarray"""

    @property
    def feature_importances_(self):
        """Feature importances. Returns: numpy.ndarray"""

    @property
    def best_score(self):
        """Best validation score. Returns: float"""

    @property
    def best_iteration(self):
        """Best iteration from early stopping. Returns: int"""

    @property
    def coef_(self):
        """Model coefficients (for linear booster). Returns: numpy.ndarray"""

    @property
    def intercept_(self):
        """Model intercept (for linear booster). Returns: float"""

from xgboost import XGBClassifier
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
import matplotlib.pyplot as plt

# Create sample data
X, y = make_classification(n_samples=1000, n_features=20, n_classes=2,
                           n_informative=10, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,
                                                    random_state=42)

# Train a classifier with early stopping on a held-out evaluation set
clf = XGBClassifier(
    objective='binary:logistic',
    max_depth=6,
    learning_rate=0.1,
    n_estimators=100,
    early_stopping_rounds=10,
    eval_metric='logloss',
    random_state=42
)
clf.fit(X_train, y_train,
        eval_set=[(X_test, y_test)],
        verbose=False)

# Make predictions
y_pred = clf.predict(X_test)
y_pred_proba = clf.predict_proba(X_test)
print(f"Accuracy: {accuracy_score(y_test, y_pred):.4f}")
print(f"Best iteration: {clf.best_iteration}")
print(f"Best score: {clf.best_score:.4f}")

# Plot feature importance
feature_importance = clf.feature_importances_
plt.figure(figsize=(10, 6))
plt.barh(range(len(feature_importance)), feature_importance)
plt.xlabel('Feature Importance')
plt.title('XGBoost Feature Importance')
plt.show()

from xgboost import XGBRegressor
from sklearn.datasets import make_regression
from sklearn.metrics import mean_squared_error, r2_score

# Create regression data
X, y = make_regression(n_samples=1000, n_features=20, noise=0.1,
                       random_state=42)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,
                                                    random_state=42)

# Train regressor
reg = XGBRegressor(
    objective='reg:squarederror',
    max_depth=6,
    learning_rate=0.1,
    n_estimators=100,
    early_stopping_rounds=10,
    eval_metric='rmse'
)
reg.fit(X_train, y_train,
        eval_set=[(X_test, y_test)],
        verbose=False)

# Make predictions
y_pred = reg.predict(X_test)
# NOTE: mean_squared_error(..., squared=False) was deprecated in
# scikit-learn 1.4 and removed in 1.6 — take the square root explicitly.
print(f"RMSE: {mean_squared_error(y_test, y_pred) ** 0.5:.4f}")
print(f"R²: {r2_score(y_test, y_pred):.4f}")

from xgboost import XGBRanker
import numpy as np

# Create ranking data (mock example)
np.random.seed(42)  # seed so the mock example is reproducible
n_samples_per_group = 50
n_groups = 20
n_features = 10
X = np.random.randn(n_samples_per_group * n_groups, n_features)
y = np.random.randint(0, 5, n_samples_per_group * n_groups)  # Relevance scores 0-4
group = np.array([n_samples_per_group] * n_groups)  # Group sizes

# Train ranker
ranker = XGBRanker(
    objective='rank:ndcg',
    max_depth=6,
    learning_rate=0.1,
    n_estimators=100,
    eval_metric='ndcg@10'
)
ranker.fit(X, y, group=group)

# Make ranking predictions
ranking_scores = ranker.predict(X)
print(f"Ranking scores shape: {ranking_scores.shape}")

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV

# Create pipeline with preprocessing
pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('xgb', XGBClassifier(random_state=42))
])

# Parameter grid for hyperparameter tuning
param_grid = {
    'xgb__max_depth': [3, 6, 9],
    'xgb__learning_rate': [0.01, 0.1, 0.2],
    'xgb__n_estimators': [50, 100, 200]
}

# Grid search with cross-validation
grid_search = GridSearchCV(
    pipeline,
    param_grid,
    cv=5,
    scoring='accuracy',
    n_jobs=-1
)
grid_search.fit(X_train, y_train)
print(f"Best parameters: {grid_search.best_params_}")
print(f"Best CV score: {grid_search.best_score_:.4f}")

# Use best model
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)

from xgboost import XGBRFClassifier
# XGBoost Random Forest
rf_clf = XGBRFClassifier(
    n_estimators=100,
    max_depth=6,
    learning_rate=1.0,     # No shrinkage for RF
    subsample=0.8,         # Bootstrap sampling
    colsample_bynode=0.8,  # Random feature subset per split
    random_state=42
)
rf_clf.fit(X_train, y_train)
rf_pred = rf_clf.predict(X_test)
rf_pred_proba = rf_clf.predict_proba(X_test)
print(f"RF Accuracy: {accuracy_score(y_test, rf_pred):.4f}")

# Install with Tessl CLI
npx tessl i tessl/pypi-xgboost-cpu