XGBoost Python Package (CPU only) - A minimal installation with no support for GPU algorithms or federated learning, providing optimized distributed gradient boosting for machine learning
Drop-in replacements for scikit-learn estimators that provide the familiar fit/predict API while leveraging XGBoost's high-performance gradient boosting implementation. These estimators integrate seamlessly with scikit-learn pipelines, cross-validation, and model selection tools.
XGBoost classifier that follows the scikit-learn API for binary and multi-class classification tasks. Supports probability prediction and integrates with scikit-learn's model evaluation tools.
class XGBClassifier:
    """XGBoost classifier following the scikit-learn API.

    Supports binary and multi-class classification with probability
    prediction, and integrates with scikit-learn's model evaluation,
    pipeline, and model-selection tools.
    """

    def __init__(self, *, max_depth=6, max_leaves=0, max_bin=256,
                 grow_policy='depthwise', learning_rate=0.3, n_estimators=100,
                 verbosity=1, objective=None, booster='gbtree',
                 tree_method='auto', n_jobs=None, gamma=0, min_child_weight=1,
                 max_delta_step=0, subsample=1, sampling_method='uniform',
                 colsample_bytree=1, colsample_bylevel=1, colsample_bynode=1,
                 reg_alpha=0, reg_lambda=1, scale_pos_weight=1, base_score=None,
                 random_state=None, missing=float('nan'), num_parallel_tree=1,
                 monotone_constraints=None, interaction_constraints=None,
                 importance_type='gain', device=None, validate_parameters=None,
                 enable_categorical=False, feature_types=None,
                 feature_weights=None, max_cat_to_onehot=4,
                 max_cat_threshold=64, multi_strategy='one_output_per_tree',
                 eval_metric=None, early_stopping_rounds=None, callbacks=None):
        """Create an XGBoost classifier.

        Parameters:
        - max_depth: Maximum tree depth (int)
        - max_leaves: Maximum number of leaves (int, 0 means no limit)
        - max_bin: Maximum number of discrete bins for features (int)
        - grow_policy: Tree growing policy ('depthwise', 'lossguide')
        - learning_rate: Boosting learning rate (float)
        - n_estimators: Number of boosting rounds (int)
        - verbosity: Verbosity level (0=silent, 1=warning, 2=info, 3=debug)
        - objective: Learning objective (str or None for auto-detection)
        - booster: Booster type ('gbtree', 'gblinear', 'dart')
        - tree_method: Tree construction algorithm ('auto', 'exact', 'approx', 'hist')
        - n_jobs: Number of parallel threads (int or None)
        - gamma: Minimum loss reduction required for split (float)
        - min_child_weight: Minimum sum of instance weight in child (float)
        - max_delta_step: Maximum delta step allowed for each leaf output (float)
        - subsample: Fraction of samples used for training each tree (float)
        - sampling_method: Sampling method ('uniform', 'gradient_based')
        - colsample_bytree: Fraction of features used per tree (float)
        - colsample_bylevel: Fraction of features used per level (float)
        - colsample_bynode: Fraction of features used per split (float)
        - reg_alpha: L1 regularization term (float)
        - reg_lambda: L2 regularization term (float)
        - scale_pos_weight: Balancing weight for positive class (float)
        - base_score: Global bias for all predictions (float)
        - random_state: Random seed (int)
        - missing: Value to be treated as missing (float)
        - num_parallel_tree: Number of parallel trees per round (int)
        - monotone_constraints: Monotonic constraints (dict or None)
        - interaction_constraints: Interaction constraints (list or None)
        - importance_type: Feature importance type ('gain', 'weight', 'cover', 'total_gain', 'total_cover')
        - device: Device to use for training ('cpu', 'cuda', 'gpu')
        - validate_parameters: Whether to validate parameters (bool)
        - enable_categorical: Enable categorical feature support (bool)
        - feature_types: Types for features (list or None)
        - feature_weights: Weights for features (array-like or None)
        - max_cat_to_onehot: Maximum categories to use one-hot encoding (int)
        - max_cat_threshold: Maximum categories before switching to partitioning (int)
        - multi_strategy: Strategy for multi-class ('one_output_per_tree', 'multi_output_tree')
        - eval_metric: Evaluation metric (str, list, or callable)
        - early_stopping_rounds: Early stopping rounds (int)
        - callbacks: Callbacks for training (list)
        """

    def fit(self, X, y, *, sample_weight=None, base_margin=None,
            eval_set=None, verbose=True, xgb_model=None,
            sample_weight_eval_set=None, base_margin_eval_set=None,
            feature_weights=None):
        """Fit the classifier to training data.

        Parameters:
        - X: Training data (array-like or DataFrame)
        - y: Target values (array-like)
        - sample_weight: Sample weights (array-like, optional)
        - base_margin: Base prediction margins (array-like, optional)
        - eval_set: Evaluation datasets as list of (X, y) tuples (list, optional)
        - verbose: Whether to print evaluation results (bool)
        - xgb_model: Existing model to continue training (Booster, optional)
        - sample_weight_eval_set: Sample weights for evaluation sets (list, optional)
        - base_margin_eval_set: Base margins for evaluation sets (list, optional)
        - feature_weights: Feature weights (array-like, optional)
        Returns: self
        """

    def predict(self, X, *, output_margin=False, validate_features=True,
                base_margin=None, iteration_range=None):
        """Predict class labels.

        Parameters:
        - X: Input data (array-like or DataFrame)
        - output_margin: Whether to output margin values (bool)
        - validate_features: Whether to validate feature names (bool)
        - base_margin: Base prediction margins (array-like, optional)
        - iteration_range: Range of trees to use (tuple, optional)
        Returns: numpy.ndarray - Predicted class labels
        """

    def predict_proba(self, X, *, validate_features=True, base_margin=None,
                      iteration_range=None):
        """Predict class probabilities.

        Parameters:
        - X: Input data (array-like or DataFrame)
        - validate_features: Whether to validate feature names (bool)
        - base_margin: Base prediction margins (array-like, optional)
        - iteration_range: Range of trees to use (tuple, optional)
        Returns: numpy.ndarray - Class probabilities
        """

    @property
    def classes_(self):
        """Unique class labels. Returns: numpy.ndarray"""

    @property
    def feature_importances_(self):
        """Feature importances. Returns: numpy.ndarray"""

    @property
    def best_score(self):
        """Best validation score. Returns: float"""

    @property
    def best_iteration(self):
        """Best iteration from early stopping. Returns: int"""

# XGBoost regressor for continuous target variables, providing
# high-performance gradient boosting for regression tasks with extensive
# hyperparameter control.
class XGBRegressor:
    """XGBoost regressor following the scikit-learn API.

    Provides gradient boosting for continuous targets with the same
    hyperparameter surface as XGBClassifier.
    """

    def __init__(self, *, max_depth=6, max_leaves=0, max_bin=256,
                 grow_policy='depthwise', learning_rate=0.3, n_estimators=100,
                 verbosity=1, objective=None, booster='gbtree',
                 tree_method='auto', n_jobs=None, gamma=0, min_child_weight=1,
                 max_delta_step=0, subsample=1, sampling_method='uniform',
                 colsample_bytree=1, colsample_bylevel=1, colsample_bynode=1,
                 reg_alpha=0, reg_lambda=1, scale_pos_weight=1, base_score=None,
                 random_state=None, missing=float('nan'), num_parallel_tree=1,
                 monotone_constraints=None, interaction_constraints=None,
                 importance_type='gain', device=None, validate_parameters=None,
                 enable_categorical=False, feature_types=None,
                 feature_weights=None, max_cat_to_onehot=4,
                 max_cat_threshold=64, multi_strategy='one_output_per_tree',
                 eval_metric=None, early_stopping_rounds=None, callbacks=None):
        """Create an XGBoost regressor.

        Parameters: Same as XGBClassifier.
        """

    def fit(self, X, y, *, sample_weight=None, base_margin=None,
            eval_set=None, verbose=True, xgb_model=None,
            sample_weight_eval_set=None, base_margin_eval_set=None,
            feature_weights=None):
        """Fit the regressor to training data.

        Same interface as XGBClassifier.fit().
        Returns: self
        """

    def predict(self, X, *, output_margin=False, validate_features=True,
                base_margin=None, iteration_range=None):
        """Predict target values.

        Same interface as XGBClassifier.predict().
        Returns: numpy.ndarray - Predicted values
        """

# XGBoost ranker for learning-to-rank tasks such as search result ranking,
# recommendation systems, and other applications where relative ordering
# matters more than absolute values.
class XGBRanker:
    """XGBoost ranker for learning-to-rank tasks.

    Same hyperparameter surface as XGBClassifier, with a ranking
    objective ('rank:ndcg') by default. Training data is organized into
    query groups via ``group`` or ``qid``.
    """

    def __init__(self, *, max_depth=6, max_leaves=0, max_bin=256,
                 grow_policy='depthwise', learning_rate=0.3, n_estimators=100,
                 verbosity=1, objective='rank:ndcg', booster='gbtree',
                 tree_method='auto', n_jobs=None, gamma=0, min_child_weight=1,
                 max_delta_step=0, subsample=1, sampling_method='uniform',
                 colsample_bytree=1, colsample_bylevel=1, colsample_bynode=1,
                 reg_alpha=0, reg_lambda=1, scale_pos_weight=1, base_score=None,
                 random_state=None, missing=float('nan'), num_parallel_tree=1,
                 monotone_constraints=None, interaction_constraints=None,
                 importance_type='gain', device=None, validate_parameters=None,
                 enable_categorical=False, feature_types=None,
                 feature_weights=None, max_cat_to_onehot=4,
                 max_cat_threshold=64, multi_strategy='one_output_per_tree',
                 eval_metric=None, early_stopping_rounds=None, callbacks=None):
        """Create an XGBoost ranker.

        Parameters: Same as XGBClassifier with default objective='rank:ndcg'.
        """

    def fit(self, X, y, *, group=None, qid=None, sample_weight=None,
            base_margin=None, eval_set=None, verbose=True, xgb_model=None,
            sample_weight_eval_set=None, base_margin_eval_set=None,
            feature_weights=None, eval_group=None, eval_qid=None):
        """Fit the ranker to training data.

        Parameters: Same as XGBClassifier.fit() with additional:
        - group: Group sizes for ranking (array-like)
        - qid: Query IDs for ranking (array-like)
        - eval_group: Group sizes for evaluation sets (list of array-like)
        - eval_qid: Query IDs for evaluation sets (list of array-like)
        Returns: self
        """

    def predict(self, X, *, output_margin=False, validate_features=True,
                base_margin=None, iteration_range=None):
        """Predict ranking scores.

        Returns: numpy.ndarray - Ranking scores
        """

    def score(self, X, y):
        """Evaluate the ranker on the given test data and labels.

        NOTE(review): the original documentation said "mean accuracy",
        which is classifier wording; for a ranker this is presumably a
        ranking metric (e.g. the configured eval_metric) — confirm
        against the upstream xgboost documentation.

        Parameters:
        - X: Test data (array-like)
        - y: True relevance labels (array-like)
        Returns: float - Evaluation score
        """

# XGBoost-based random forest classifier that combines the speed of XGBoost
# with random forest's ensemble approach, using random feature subsets and
# bootstrap sampling.
class XGBRFClassifier:
    """XGBoost random forest classifier.

    Parameters: Similar to XGBClassifier with RF-specific defaults:
    - learning_rate: 1.0 (no shrinkage for RF)
    - subsample: 0.8 (bootstrap sampling)
    - colsample_bytree: 0.8 (random feature subset per tree)
    - colsample_bynode: 0.8 (random feature subset per split)
    - reg_lambda: 1e-05 (minimal regularization)
    """

    def __init__(self, *, max_depth=6, learning_rate=1.0, n_estimators=100,
                 verbosity=1, objective=None, booster='gbtree',
                 tree_method='auto', n_jobs=None, gamma=0, min_child_weight=1,
                 max_delta_step=0, subsample=0.8, sampling_method='uniform',
                 colsample_bytree=0.8, colsample_bylevel=1, colsample_bynode=0.8,
                 reg_alpha=0, reg_lambda=1e-05, scale_pos_weight=1,
                 base_score=None, random_state=None, missing=float('nan'),
                 num_parallel_tree=1, monotone_constraints=None,
                 interaction_constraints=None, importance_type='gain',
                 device=None, validate_parameters=None, enable_categorical=False,
                 feature_types=None, feature_weights=None, max_cat_to_onehot=4,
                 max_cat_threshold=64, multi_strategy='one_output_per_tree',
                 eval_metric=None, early_stopping_rounds=None, callbacks=None):
        """Create an XGBoost random forest classifier.

        All parameters are keyword-only; see the class docstring for the
        random-forest-specific defaults.
        """

# XGBoost-based random forest regressor for regression tasks, combining
# XGBoost's efficiency with random forest methodology.
class XGBRFRegressor:
    """XGBoost random forest regressor.

    Parameters: Same as XGBRFClassifier (random-forest defaults:
    learning_rate=1.0, subsample=0.8, colsample_bytree=0.8,
    colsample_bynode=0.8, reg_lambda=1e-05).
    """

    def __init__(self, *, max_depth=6, learning_rate=1.0, n_estimators=100,
                 verbosity=1, objective=None, booster='gbtree',
                 tree_method='auto', n_jobs=None, gamma=0, min_child_weight=1,
                 max_delta_step=0, subsample=0.8, sampling_method='uniform',
                 colsample_bytree=0.8, colsample_bylevel=1, colsample_bynode=0.8,
                 reg_alpha=0, reg_lambda=1e-05, scale_pos_weight=1,
                 base_score=None, random_state=None, missing=float('nan'),
                 num_parallel_tree=1, monotone_constraints=None,
                 interaction_constraints=None, importance_type='gain',
                 device=None, validate_parameters=None, enable_categorical=False,
                 feature_types=None, feature_weights=None, max_cat_to_onehot=4,
                 max_cat_threshold=64, multi_strategy='one_output_per_tree',
                 eval_metric=None, early_stopping_rounds=None, callbacks=None):
        """Create an XGBoost random forest regressor.

        Parameters: Same as XGBRFClassifier.
        """

# Base class for all XGBoost scikit-learn estimators, providing common
# functionality and interface methods.
class XGBModel:
    """Base class for all XGBoost scikit-learn estimators.

    Provides common functionality shared by the classifier, regressor,
    ranker, and random-forest variants: parameter access, model
    persistence, and fitted-model attributes.
    """

    def get_booster(self):
        """Get the underlying XGBoost Booster.

        Returns: Booster - The trained XGBoost model
        """

    def get_params(self, deep=True):
        """Get parameters for the estimator.

        Parameters:
        - deep: Whether to return parameters of sub-estimators (bool)
        Returns: dict - Parameter names and values
        """

    def set_params(self, **params):
        """Set parameters for the estimator.

        Parameters:
        - **params: Estimator parameters as keyword arguments
        Returns: self
        """

    def get_xgb_params(self):
        """Get XGBoost-specific parameters.

        Returns: dict - XGBoost parameters
        """

    def save_model(self, fname):
        """Save the model to file.

        Parameters:
        - fname: Output file name (str)
        """

    def load_model(self, fname):
        """Load model from file.

        Parameters:
        - fname: Input file name (str)
        """

    def apply(self, X, iteration_range=None):
        """Return the predicted leaf index for each sample.

        Parameters:
        - X: Input data (array-like or DataFrame)
        - iteration_range: Range of trees to use (tuple, optional)
        Returns: numpy.ndarray - Leaf indices
        """

    def evals_result(self):
        """Get evaluation results from training.

        Returns: dict - Evaluation history
        """

    @property
    def n_features_in_(self):
        """Number of features seen during fit. Returns: int"""

    @property
    def feature_names_in_(self):
        """Feature names seen during fit. Returns: numpy.ndarray"""

    @property
    def feature_importances_(self):
        """Feature importances. Returns: numpy.ndarray"""

    @property
    def best_score(self):
        """Best validation score. Returns: float"""

    @property
    def best_iteration(self):
        """Best iteration from early stopping. Returns: int"""

    @property
    def coef_(self):
        """Model coefficients (for linear booster). Returns: numpy.ndarray"""

    @property
    def intercept_(self):
        """Model intercept (for linear booster). Returns: float"""

from xgboost import XGBClassifier
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
import matplotlib.pyplot as plt

# Create sample data
X, y = make_classification(n_samples=1000, n_features=20, n_classes=2,
                           n_informative=10, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,
                                                    random_state=42)

# Train a classifier with early stopping on a held-out evaluation set
clf = XGBClassifier(
    objective='binary:logistic',
    max_depth=6,
    learning_rate=0.1,
    n_estimators=100,
    early_stopping_rounds=10,
    eval_metric='logloss',
    random_state=42
)
clf.fit(X_train, y_train,
        eval_set=[(X_test, y_test)],
        verbose=False)

# Make predictions
y_pred = clf.predict(X_test)
y_pred_proba = clf.predict_proba(X_test)
print(f"Accuracy: {accuracy_score(y_test, y_pred):.4f}")
print(f"Best iteration: {clf.best_iteration}")
print(f"Best score: {clf.best_score:.4f}")

# Plot feature importance
feature_importance = clf.feature_importances_
plt.figure(figsize=(10, 6))
plt.barh(range(len(feature_importance)), feature_importance)
plt.xlabel('Feature Importance')
plt.title('XGBoost Feature Importance')
plt.show()

from xgboost import XGBRegressor
from sklearn.datasets import make_regression
from sklearn.metrics import mean_squared_error, r2_score

# Create regression data
X, y = make_regression(n_samples=1000, n_features=20, noise=0.1,
                       random_state=42)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,
                                                    random_state=42)

# Train regressor
reg = XGBRegressor(
    objective='reg:squarederror',
    max_depth=6,
    learning_rate=0.1,
    n_estimators=100,
    early_stopping_rounds=10,
    eval_metric='rmse'
)
reg.fit(X_train, y_train,
        eval_set=[(X_test, y_test)],
        verbose=False)

# Make predictions
y_pred = reg.predict(X_test)
# NOTE: mean_squared_error(..., squared=False) was deprecated in
# scikit-learn 1.4 and removed in 1.6 — take the square root explicitly.
print(f"RMSE: {mean_squared_error(y_test, y_pred) ** 0.5:.4f}")
print(f"R²: {r2_score(y_test, y_pred):.4f}")

from xgboost import XGBRanker
import numpy as np

# Create ranking data (mock example)
np.random.seed(42)  # seed so the mock example is reproducible
n_samples_per_group = 50
n_groups = 20
n_features = 10
X = np.random.randn(n_samples_per_group * n_groups, n_features)
y = np.random.randint(0, 5, n_samples_per_group * n_groups)  # Relevance scores 0-4
group = np.array([n_samples_per_group] * n_groups)  # Group sizes

# Train ranker
ranker = XGBRanker(
    objective='rank:ndcg',
    max_depth=6,
    learning_rate=0.1,
    n_estimators=100,
    eval_metric='ndcg@10'
)
ranker.fit(X, y, group=group)

# Make ranking predictions
ranking_scores = ranker.predict(X)
print(f"Ranking scores shape: {ranking_scores.shape}")

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV

# Create pipeline with preprocessing
pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('xgb', XGBClassifier(random_state=42))
])

# Parameter grid for hyperparameter tuning
param_grid = {
    'xgb__max_depth': [3, 6, 9],
    'xgb__learning_rate': [0.01, 0.1, 0.2],
    'xgb__n_estimators': [50, 100, 200]
}

# Grid search with cross-validation
grid_search = GridSearchCV(
    pipeline,
    param_grid,
    cv=5,
    scoring='accuracy',
    n_jobs=-1
)
grid_search.fit(X_train, y_train)
print(f"Best parameters: {grid_search.best_params_}")
print(f"Best CV score: {grid_search.best_score_:.4f}")

# Use best model
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)

from xgboost import XGBRFClassifier
# XGBoost Random Forest
rf_clf = XGBRFClassifier(
    n_estimators=100,
    max_depth=6,
    learning_rate=1.0,     # No shrinkage for RF
    subsample=0.8,         # Bootstrap sampling
    colsample_bynode=0.8,  # Random feature subset per split
    random_state=42
)
rf_clf.fit(X_train, y_train)
rf_pred = rf_clf.predict(X_test)
rf_pred_proba = rf_clf.predict_proba(X_test)
print(f"RF Accuracy: {accuracy_score(y_test, rf_pred):.4f}")

# Install with Tessl CLI
npx tessl i tessl/pypi-xgboost-cpu