XGBoost Python Package (CPU only) - A minimal installation with no support for GPU algorithms or federated learning, providing optimized distributed gradient boosting for machine learning
npx @tessl/cli install tessl/pypi-xgboost-cpu@3.0.0

XGBoost is an optimized distributed gradient boosting library designed for high efficiency, flexibility, and portability, implementing machine learning algorithms under the Gradient Boosting framework.

pip install xgboost-cpu

import xgboost as xgb

Common imports for different use cases:
# Core functionality
from xgboost import DMatrix, Booster, train, cv
# Scikit-learn interface
from xgboost import XGBClassifier, XGBRegressor, XGBRanker
# Distributed computing
from xgboost import dask as dxgb # Dask integration
from xgboost import spark as spark_xgb # Spark integration
# Utilities
from xgboost import plot_importance, plot_tree
from xgboost import get_config, set_config

import xgboost as xgb
import numpy as np
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
# Create sample data
X, y = make_classification(n_samples=1000, n_features=20, n_classes=2, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Method 1: Using XGBoost's native API
dtrain = xgb.DMatrix(X_train, label=y_train)
dtest = xgb.DMatrix(X_test, label=y_test)
params = {
    'objective': 'binary:logistic',
    'eval_metric': 'logloss',
    'max_depth': 6,
    'learning_rate': 0.1
}
model = xgb.train(params, dtrain, num_boost_round=100,
                  evals=[(dtrain, 'train'), (dtest, 'test')])
# Make predictions
y_pred = model.predict(dtest)
# Method 2: Using scikit-learn interface
from xgboost import XGBClassifier
clf = XGBClassifier(objective='binary:logistic', max_depth=6,
                    learning_rate=0.1, n_estimators=100)
clf.fit(X_train, y_train)
y_pred_sklearn = clf.predict_proba(X_test)[:, 1]
# Visualize feature importance
xgb.plot_importance(model, max_num_features=10)

XGBoost provides multiple interfaces and deployment options, detailed in the sections below. This design enables XGBoost to serve as both a high-performance gradient boosting engine and an accessible machine learning library that integrates seamlessly with the Python data science ecosystem.
Core Data Structures and Models

Fundamental XGBoost data structures and model objects that provide the foundation for training and prediction. These include DMatrix for efficient data handling, Booster for trained models, and specialized variants for memory optimization.
class DMatrix:
    def __init__(self, data, label=None, *, weight=None, base_margin=None,
                 missing=None, silent=False, feature_names=None,
                 feature_types=None, nthread=None, group=None, qid=None,
                 label_lower_bound=None, label_upper_bound=None,
                 feature_weights=None, enable_categorical=False):
        """Optimized data matrix for XGBoost training and prediction."""

class Booster:
    def __init__(self, params=None, cache=(), model_file=None):
        """XGBoost model containing training, prediction, and evaluation routines."""

    def predict(self, data, *, output_margin=False, pred_leaf=False,
                pred_contribs=False, approx_contribs=False,
                pred_interactions=False, validate_features=True,
                training=False, iteration_range=(0, 0), strict_shape=False):
        """Make predictions using the trained model."""

class QuantileDMatrix:
    def __init__(self, data, label=None, *, ref=None, **kwargs):
        """Memory-efficient DMatrix variant using quantized data."""
Core training functions and cross-validation for model development. These functions provide the primary interface for training XGBoost models with extensive configuration options and evaluation capabilities.
def train(params, dtrain, num_boost_round=10, evals=(), obj=None,
          maximize=None, early_stopping_rounds=None, evals_result=None,
          verbose_eval=True, xgb_model=None, callbacks=None, custom_metric=None):
    """Train a booster with given parameters."""

def cv(params, dtrain, num_boost_round=10, nfold=3, stratified=False,
       folds=None, metrics=(), obj=None, maximize=None,
       early_stopping_rounds=None, fpreproc=None, as_pandas=True,
       verbose_eval=None, show_stdv=True, seed=0, callbacks=None,
       shuffle=True, custom_metric=None):
    """Cross-validation with given parameters."""
Drop-in replacements for scikit-learn estimators providing a familiar fit/predict API with XGBoost's performance. Includes classifiers, regressors, rankers, and random forest variants.

class XGBClassifier:
    def __init__(self, *, max_depth=6, learning_rate=0.3, n_estimators=100,
                 objective=None, booster='gbtree', tree_method='auto',
                 n_jobs=None, gamma=0, min_child_weight=1, max_delta_step=0,
                 subsample=1, colsample_bytree=1, reg_alpha=0, reg_lambda=1,
                 scale_pos_weight=1, base_score=None, random_state=None,
                 missing=np.nan, **kwargs):
        """XGBoost classifier following scikit-learn API."""

    def fit(self, X, y, *, sample_weight=None, base_margin=None,
            eval_set=None, verbose=True, xgb_model=None,
            sample_weight_eval_set=None, base_margin_eval_set=None,
            feature_weights=None):
        """Fit the model to training data."""

    def predict_proba(self, X, *, validate_features=True, base_margin=None,
                      iteration_range=None):
        """Predict class probabilities."""
class XGBRegressor:
    """XGBoost regressor following scikit-learn API."""

class XGBRanker:
    """XGBoost ranker for learning-to-rank tasks."""
Native support for distributed training across Dask and Spark ecosystems, enabling scalable machine learning on large datasets and compute clusters.

# Dask integration (xgboost.dask)
from xgboost import dask as dxgb

def train(client, params, dtrain, num_boost_round=10, evals=(),
          obj=None, maximize=None, early_stopping_rounds=None,
          evals_result=None, verbose_eval=True, xgb_model=None,
          callbacks=None):
    """Train XGBoost model using Dask (xgboost.dask.train)."""

class DaskXGBClassifier:
    """Dask-distributed XGBoost classifier."""

# Spark integration (xgboost.spark)
from xgboost import spark as spark_xgb

class SparkXGBClassifier:
    """PySpark XGBoost classifier."""
Utility functions for model interpretation, configuration management, and visualization. These tools help understand model behavior and manage XGBoost settings.

def plot_importance(booster, ax=None, height=0.2, xlim=None, ylim=None,
                    title='Feature importance', xlabel='F score',
                    ylabel='Features', fmap='', importance_type='weight',
                    max_num_features=None, grid=True, show_values=True,
                    values_format='{v}'):
    """Plot feature importance based on fitted trees."""

def plot_tree(booster, fmap='', num_trees=0, rankdir=None, ax=None, **kwargs):
    """Plot specified tree using matplotlib."""

def set_config(**new_config):
    """Set global XGBoost configuration."""

def get_config():
    """Get current global configuration values."""
import os

import numpy as np
import pandas as pd

from xgboost import DMatrix
# Common type aliases used throughout XGBoost
ArrayLike = Union[List, np.ndarray, pd.DataFrame, pd.Series]
PathLike = Union[str, os.PathLike]
Metric = Union[str, List[str], Callable]
Objective = Union[str, Callable]
EvalSet = List[Tuple[DMatrix, str]]
FeatureNames = List[str]
FeatureTypes = List[str]
FloatCompatible = Union[float, np.float32, np.float64]
# Callback types
from xgboost.callback import TrainingCallback
EvalsLog = Dict[str, Dict[str, List[float]]]
CallbackList = Optional[List[TrainingCallback]]
# Data splitting modes
from enum import IntEnum
class DataSplitMode(IntEnum):
    """Supported data split mode for DMatrix."""
    ROW = 0  # Split by rows (default)
    COL = 1  # Split by columns

# Collective communication operations
class Op(IntEnum):
    """Supported operations for allreduce."""
    MAX = 0
    MIN = 1
    SUM = 2
    BITWISE_AND = 3
    BITWISE_OR = 4
    BITWISE_XOR = 5
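To make the callback types above concrete, a minimal custom TrainingCallback (the class name and output format are invented for the example) shows the EvalsLog shape that after_iteration receives:

import numpy as np
import xgboost as xgb
from xgboost.callback import TrainingCallback

class PrintLastMetric(TrainingCallback):
    """Toy callback: print the newest value of every tracked metric."""

    def after_iteration(self, model, epoch, evals_log):
        # evals_log matches EvalsLog: {dataset name: {metric name: [values...]}}
        for data_name, metrics in evals_log.items():
            for metric_name, history in metrics.items():
                print(f"[{epoch}] {data_name}-{metric_name}: {history[-1]:.5f}")
        return False  # returning True would stop training early

X = np.random.rand(200, 5)
y = np.random.randint(0, 2, size=200)
dtrain = xgb.DMatrix(X, label=y)
xgb.train({"objective": "binary:logistic", "eval_metric": "logloss"},
          dtrain, num_boost_round=5,
          evals=[(dtrain, "train")],
          callbacks=[PrintLastMetric()])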