# XGBoost Python Package (CPU only) - a minimal installation with no support
# for GPU algorithms or federated learning, providing optimized distributed
# gradient boosting for machine learning.
#
# Fundamental XGBoost data structures and model objects that provide the
# foundation for training and prediction. These components handle data
# ingestion, optimization, and model storage with support for various data
# formats and memory-optimization strategies.
#
# The core data structure for XGBoost is DMatrix, which optimizes data storage
# and access patterns for gradient boosting. DMatrix handles various input
# formats including NumPy arrays, pandas DataFrames, scipy sparse matrices,
# and supports missing values, categorical features, and external-memory
# datasets.
class DMatrix:
    """Optimized data matrix for XGBoost training and prediction.

    Handles multiple input formats (NumPy arrays, pandas DataFrames, scipy
    sparse matrices, file paths) and supports missing values, categorical
    features, and external-memory datasets.
    """

    def __init__(self, data, label=None, *, weight=None, base_margin=None,
                 missing=None, silent=False, feature_names=None,
                 feature_types=None, nthread=None, group=None, qid=None,
                 label_lower_bound=None, label_upper_bound=None,
                 feature_weights=None, enable_categorical=False,
                 data_split_mode=DataSplitMode.ROW):
        """Construct a DMatrix.

        Parameters:
        - data: Input data (array-like, DataFrame, sparse matrix, or file path)
        - label: Target values (array-like)
        - weight: Instance weights (array-like)
        - base_margin: Base prediction margins (array-like)
        - missing: Value to be treated as missing (float, default: NaN)
        - silent: Whether to suppress loading messages (bool)
        - feature_names: Names for features (list of str)
        - feature_types: Types for features ('int', 'float', 'c' for categorical)
        - nthread: Number of threads for loading data (int)
        - group: Group sizes for ranking (array-like)
        - qid: Query IDs for ranking (array-like)
        - label_lower_bound: Lower bound for labels in ranking (array-like)
        - label_upper_bound: Upper bound for labels in ranking (array-like)
        - feature_weights: Weights for features (array-like)
        - enable_categorical: Enable categorical feature support (bool)
        - data_split_mode: How to split data for distributed training
        """

    def set_info(self, *, label=None, weight=None, base_margin=None,
                 group=None, qid=None, label_lower_bound=None,
                 label_upper_bound=None, feature_names=None,
                 feature_types=None, feature_weights=None):
        """Set meta-information for the DMatrix.

        Parameters mirror the constructor; only the supplied fields are updated.
        """

    def get_label(self):
        """Get the labels of the DMatrix. Returns: numpy.ndarray"""

    def get_weight(self):
        """Get the weights of the DMatrix. Returns: numpy.ndarray"""

    def get_base_margin(self):
        """Get the base margins of the DMatrix. Returns: numpy.ndarray"""

    def get_group(self):
        """Get the group sizes of the DMatrix. Returns: numpy.ndarray"""

    def set_label(self, label):
        """Set labels for the DMatrix. Parameters: label (array-like)"""

    def set_weight(self, weight):
        """Set instance weights for the DMatrix. Parameters: weight (array-like)"""

    def set_base_margin(self, margin):
        """Set base prediction margins. Parameters: margin (array-like)"""

    def set_group(self, group):
        """Set group sizes for ranking. Parameters: group (array-like)"""

    def get_float_info(self, field):
        """Get float information by field name. Returns: numpy.ndarray"""

    def get_uint_info(self, field):
        """Get unsigned integer information by field name. Returns: numpy.ndarray"""

    def set_float_info(self, field, data):
        """Set float information. Parameters: field (str), data (array-like)"""

    def set_uint_info(self, field, data):
        """Set unsigned integer information. Parameters: field (str), data (array-like)"""

    def save_binary(self, fname, silent=True):
        """Save DMatrix to binary format. Parameters: fname (str), silent (bool)"""

    def load_model(self, fname):
        """Load DMatrix from file. Parameters: fname (str)"""

    def get_data(self):
        """Get the data matrix. Returns: CSR matrix representation"""

    def num_row(self):
        """Get number of rows. Returns: int"""

    def num_col(self):
        """Get number of columns. Returns: int"""

    def num_nonmissing(self):
        """Get number of non-missing values. Returns: int"""

    def slice(self, rindex, allow_groups=False):
        """Slice DMatrix by row indices.

        Parameters:
        - rindex: Row indices to select (array-like)
        - allow_groups: Whether to allow slicing with groups (bool)
        Returns: DMatrix
        """

    @property
    def feature_names(self):
        """Feature names. Returns: list of str or None"""

    @property
    def feature_types(self):
        """Feature types. Returns: list of str or None"""


# Memory-efficient variant of DMatrix that uses quantized data representation,
# designed specifically for the hist tree method. Reduces memory usage while
# maintaining accuracy for large datasets.
class QuantileDMatrix:
    """Memory-efficient DMatrix using quantized data for the hist tree method."""

    def __init__(self, data, label=None, *, ref=None, weight=None,
                 base_margin=None, missing=None, silent=False,
                 feature_names=None, feature_types=None, nthread=None,
                 max_bin=256, group=None, qid=None, label_lower_bound=None,
                 label_upper_bound=None, feature_weights=None,
                 enable_categorical=False):
        """Construct a QuantileDMatrix.

        Parameters are the same as DMatrix, with the additions:
        - ref: Reference QuantileDMatrix for validation data (QuantileDMatrix)
        - max_bin: Maximum number of bins for quantization (int)
        """

    @property
    def ref(self):
        """Reference to training QuantileDMatrix. Returns: QuantileDMatrix or None"""


# External memory version of QuantileDMatrix for datasets that don't fit in
# memory. Enables training on very large datasets by streaming data from disk.
class ExtMemQuantileDMatrix:
    """External-memory QuantileDMatrix for datasets that don't fit in memory."""

    def __init__(self, data, *, missing=None, nthread=None, max_bin=None,
                 ref=None, enable_categorical=False, max_num_device_pages=None,
                 max_quantile_batches=None):
        """Construct an ExtMemQuantileDMatrix.

        Parameters:
        - data: Iterator that yields data chunks (DataIter)
        - missing: Value representing missing data (float, optional)
        - nthread: Number of threads for processing (int, optional)
        - max_bin: Number of histogram bins for quantization (int, optional)
        - ref: Reference DMatrix for validation data (DMatrix, optional)
        - enable_categorical: Enable categorical feature support (bool)
        - max_num_device_pages: GPU device memory page limit (int, optional)
        - max_quantile_batches: Maximum quantile batches for processing (int, optional)
        """

    @property
    def ref(self):
        """Reference to training DMatrix. Returns: DMatrix or None"""


# The core XGBoost model class that contains the trained ensemble of decision
# trees. Provides methods for prediction, evaluation, model persistence, and
# introspection.
class Booster:
    """XGBoost model containing training, prediction, and evaluation routines."""

    def __init__(self, params=None, cache=(), model_file=None):
        """Construct a Booster.

        Parameters:
        - params: Training parameters (dict)
        - cache: List of DMatrix objects to cache (list)
        - model_file: Path to load existing model (str)
        """

    def update(self, dtrain, iteration, fobj=None):
        """Update the model for one iteration.

        Parameters:
        - dtrain: Training DMatrix (DMatrix)
        - iteration: Current iteration number (int)
        - fobj: Custom objective function (callable, optional)
        """

    def boost(self, dtrain, iteration, grad, hess):
        """Boost the model for one iteration with custom gradients.

        Parameters:
        - dtrain: Training DMatrix (DMatrix)
        - iteration: Current iteration number (int)
        - grad: Gradient values (array-like)
        - hess: Hessian values (array-like)
        """

    def predict(self, data, *, output_margin=False, pred_leaf=False,
                pred_contribs=False, approx_contribs=False,
                pred_interactions=False, validate_features=True,
                training=False, iteration_range=(0, 0), strict_shape=False):
        """Make predictions using the trained model.

        Parameters:
        - data: Input data (DMatrix, array-like, or DataFrame)
        - output_margin: Whether to output margin values (bool)
        - pred_leaf: Whether to output leaf indices (bool)
        - pred_contribs: Whether to output feature contributions (bool)
        - approx_contribs: Whether to use approximate feature contributions (bool)
        - pred_interactions: Whether to output interaction contributions (bool)
        - validate_features: Whether to validate feature names (bool)
        - training: Whether to use training mode (bool)
        - iteration_range: Range of trees to use for prediction (tuple)
        - strict_shape: Whether to enforce strict shape checking (bool)
        Returns: numpy.ndarray - Predictions
        """

    def inplace_predict(self, data, *, iteration_range=(0, 0),
                        predict_type='value', missing=float('nan'),
                        validate_features=True, base_margin=None,
                        strict_shape=False):
        """Inplace prediction without creating a DMatrix.

        Parameters:
        - data: Input data (array-like or DataFrame)
        - iteration_range: Range of trees to use (tuple)
        - predict_type: Type of prediction ('value', 'margin', 'contrib', 'leaf')
        - missing: Value to treat as missing (float)
        - validate_features: Whether to validate features (bool)
        - base_margin: Base prediction margins (array-like)
        - strict_shape: Whether to enforce strict shape checking (bool)
        Returns: numpy.ndarray - Predictions
        """

    def eval(self, data, name='eval', iteration=0):
        """Evaluate model on given data.

        Parameters:
        - data: Evaluation data (DMatrix)
        - name: Name for evaluation (str)
        - iteration: Iteration to evaluate (int)
        Returns: str - Evaluation result
        """

    def eval_set(self, evals, iteration=0, feval=None, output_margin=True):
        """Evaluate model on multiple datasets.

        Parameters:
        - evals: List of (DMatrix, name) tuples (list)
        - iteration: Iteration to evaluate (int)
        - feval: Custom evaluation function (callable)
        - output_margin: Whether to output margins (bool)
        Returns: str - Evaluation results
        """

    def save_model(self, fname):
        """Save model to file. Parameters: fname (str)"""

    def load_model(self, fname):
        """Load model from file. Parameters: fname (str)"""

    def save_raw(self, raw_format='ubj'):
        """Save model to raw-format bytes.

        Parameters:
        - raw_format: Format ('json', 'ubj', 'deprecated') (str)
        Returns: bytes - Serialized model
        """

    def load_config(self, config):
        """Load configuration. Parameters: config (str)"""

    def save_config(self):
        """Save current configuration. Returns: str - JSON configuration"""

    def get_dump(self, fmap='', with_stats=False, dump_format='text'):
        """Get model dump as a list of strings.

        Parameters:
        - fmap: Feature map file (str)
        - with_stats: Whether to include statistics (bool)
        - dump_format: Output format ('text', 'json') (str)
        Returns: list of str - Model trees
        """

    def get_fscore(self, fmap=''):
        """Get feature importance scores.

        Parameters:
        - fmap: Feature map file (str)
        Returns: dict - Feature importance scores
        """

    def get_score(self, fmap='', importance_type='weight'):
        """Get feature importance scores by type.

        Parameters:
        - fmap: Feature map file (str)
        - importance_type: Type ('weight', 'gain', 'cover', 'total_gain', 'total_cover')
        Returns: dict - Feature importance scores
        """

    def trees_to_dataframe(self, fmap=''):
        """Convert trees to a pandas DataFrame.

        Parameters:
        - fmap: Feature map file (str)
        Returns: pandas.DataFrame - Tree structure
        """

    def num_boosted_rounds(self):
        """Get number of boosted rounds. Returns: int"""

    def num_features(self):
        """Get number of features. Returns: int"""

    def copy(self):
        """Create a copy of the booster. Returns: Booster"""

    def attr(self, key):
        """Get attribute by key. Parameters: key (str). Returns: str or None"""

    def attributes(self):
        """Get all attributes. Returns: dict"""

    def set_attr(self, **kwargs):
        """Set attributes. Parameters: **kwargs - Key-value pairs"""

    def set_param(self, params, value=None):
        """Set parameter(s).

        Parameters:
        - params: Parameter name (str) or parameter dict (dict)
        - value: Parameter value (any, optional)
        """

    @property
    def feature_names(self):
        """Feature names. Returns: list of str or None"""

    @property
    def feature_types(self):
        """Feature types. Returns: list of str or None"""

    @property
    def best_iteration(self):
        """Best iteration from early stopping. Returns: int"""

    @property
    def best_score(self):
        """Best score from early stopping. Returns: float"""


# Abstract base class for implementing custom data iterators, enabling
# external-memory training and custom data-loading strategies for very large
# datasets.
class DataIter:
    """Abstract base class for user-defined data iteration (external memory)."""

    def __init__(self, cache_prefix=None, release_data=True, *, on_host=True,
                 min_cache_page_bytes=None):
        """Construct a DataIter.

        Parameters:
        - cache_prefix: Prefix for cache files (str, optional)
        - release_data: Whether to release data during iteration (bool)
        - on_host: Cache on host memory vs file system for GPU (bool)
        - min_cache_page_bytes: Minimum bytes per cache page (int, optional)
        """

    def reset(self):
        """Reset iterator to the beginning. Must be implemented by subclasses."""

    def next(self, input_data):
        """Set the next batch of data. Must be implemented by subclasses.

        Parameters:
        - input_data: Callback function with data fields like DMatrix (callable)
          Should be called as: input_data(data=X, label=y, weight=w, ...)
        Returns: bool - False if no more batches, True if more data available
        """

    def get_callbacks(self, enable_categorical):
        """Get callback functions for iterating in C.

        Parameters:
        - enable_categorical: Enable categorical feature support (bool)
        Returns: tuple - (reset_callback, next_callback)
        """

    def reraise(self):
        """Reraise any exception thrown during iteration."""

    @property
    def proxy(self):
        """Handle of DMatrix proxy for internal use. Returns: _ProxyDMatrix"""


class DataSplitMode:
    """Data splitting mode for distributed training."""
    ROW = 0  # Split by rows
    COL = 1  # Split by columns
# Usage examples for the CPU-only XGBoost package.
import xgboost as xgb
import numpy as np
import pandas as pd

# From NumPy arrays
X = np.random.randn(1000, 10)
y = np.random.randint(0, 2, 1000)
dtrain = xgb.DMatrix(X, label=y, feature_names=[f'f{i}' for i in range(10)])

# From pandas DataFrame
df = pd.DataFrame(X, columns=[f'feature_{i}' for i in range(10)])
dtrain = xgb.DMatrix(df, label=y)

# With additional information
weights = np.random.uniform(0.5, 2.0, 1000)
dtrain = xgb.DMatrix(X, label=y, weight=weights,
                     feature_names=[f'f{i}' for i in range(10)],
                     feature_types=['float'] * 10)

# Use QuantileDMatrix for large datasets.
# NOTE: X_train/y_train/X_test/y_test are illustrative placeholders.
dtrain = xgb.QuantileDMatrix(X_train, label=y_train, max_bin=512)
dtest = xgb.QuantileDMatrix(X_test, label=y_test, ref=dtrain)

# For external memory training
class CustomDataIter(xgb.DataIter):
    """Streams batches from a list of data files."""

    def __init__(self, data_files):
        self.data_files = data_files
        self.file_idx = 0
        super().__init__()

    def reset(self):
        """Rewind to the first file."""
        self.file_idx = 0

    def next(self, input_data):
        """Feed one batch; return 0 when exhausted, 1 otherwise.

        (The original example had these return values inverted, which would
        terminate iteration immediately and then loop forever on real data.)
        """
        if self.file_idx >= len(self.data_files):
            return 0  # no more batches
        # load_data_from_file is a user-supplied loader (placeholder).
        X, y = load_data_from_file(self.data_files[self.file_idx])
        input_data(data=X, label=y)
        self.file_idx += 1
        return 1  # more data available

data_iter = CustomDataIter(['data1.csv', 'data2.csv', 'data3.csv'])
dtrain = xgb.ExtMemQuantileDMatrix(data_iter)

# Train model
params = {'objective': 'binary:logistic', 'max_depth': 6}
model = xgb.train(params, dtrain, num_boost_round=100)

# Make predictions
predictions = model.predict(dtest)

# Get feature importance
importance = model.get_score(importance_type='gain')
print(importance)

# Save and load model
model.save_model('model.json')
loaded_model = xgb.Booster()
loaded_model.load_model('model.json')

# Model introspection
print(f"Number of trees: {model.num_boosted_rounds()}")
print(f"Number of features: {model.num_features()}")

# Install with Tessl CLI
#   npx tessl i tessl/pypi-xgboost-cpu