CtrlK
Blog · Docs · Log in · Get started
Tessl Logo

tessl/pypi-autogluon

AutoGluon automates machine learning tasks enabling you to easily achieve strong predictive performance in your applications.

Pending

Quality

Pending

Does it follow best practices?

Impact

Pending

No eval scenarios have been run

Overview
Eval results
Files

docs/tabular.md

Tabular Machine Learning

Automated machine learning for structured/tabular data supporting binary classification, multiclass classification, and regression tasks. TabularPredictor automatically handles feature engineering, model selection, hyperparameter tuning, and intelligent ensembling to achieve strong predictive performance with minimal configuration.

Capabilities

TabularPredictor Class

Main predictor class for tabular/structured data that automates the entire ML pipeline from data preprocessing to model deployment.

class TabularPredictor:
    def __init__(
        self,
        label: str,
        problem_type: str | None = None,
        eval_metric: str | None = None,
        path: str | None = None,
        verbosity: int = 2,
        sample_weight: str | None = None,
        weight_evaluation: bool = False,
        groups: str | None = None,
        **kwargs
    ):
        """
        Initialize TabularPredictor for automated machine learning on tabular data.
        
        Parameters:
        - label: Name of the target column to predict
        - problem_type: Type of problem ('binary', 'multiclass', 'regression', 'quantile');
          inferred from the label column if None
        - eval_metric: Evaluation metric ('accuracy', 'roc_auc', 'rmse', etc.);
          a problem-type-appropriate default is used if None
        - path: Directory to save models and artifacts (auto-generated if None)
        - verbosity: Logging verbosity level (0 = silent … 4 = most detailed)
        - sample_weight: Column name for sample weights
        - weight_evaluation: Whether to weight evaluation metrics by sample_weight
        - groups: Column name for group information (for grouped CV)
        """

Model Training

Train and automatically tune machine learning models on tabular data with intelligent preprocessing and model selection.

def fit(
    self,
    train_data,
    tuning_data=None,
    time_limit: float | None = None,
    presets: str | None = None,
    hyperparameters=None,
    feature_metadata=None,
    infer_limit: float | None = None,
    infer_limit_batch_size: int | None = None,
    fit_weighted_ensemble: bool = True,
    dynamic_stacking: bool = False,
    calibrate_decision_threshold: bool | str = "auto",
    num_cpus: int | str = "auto",
    num_gpus: int | str = "auto",
    fit_strategy: str = "sequential",
    memory_limit: float | str = "auto",
    excluded_model_types: list | None = None,
    included_model_types: list | None = None,
    holdout_frac: float | None = None,
    callbacks: list | None = None,
    **kwargs
):
    """
    Fit TabularPredictor on training data.
    
    Parameters:
    - train_data: Training data (DataFrame, file path, or TabularDataset)
    - tuning_data: Validation data for hyperparameter tuning
    - time_limit: Maximum training time in seconds
    - presets: Quality/speed presets ('best_quality', 'high_quality', 'medium_quality', 'optimize_for_deployment')
    - hyperparameters: Custom hyperparameter configurations
    - feature_metadata: Manual feature type specifications or 'infer'
    - infer_limit: Inference-time constraint in seconds per row that the final
      model should satisfy (e.g. 0.05 = 50 ms per row)
    - infer_limit_batch_size: Batch size assumed when measuring per-row inference
      time for infer_limit
    - fit_weighted_ensemble: Whether to fit weighted ensemble models
    - dynamic_stacking: Enable dynamic stacking for ensemble models
    - calibrate_decision_threshold: Auto-calibrate decision threshold ('auto', True, False)
    - num_cpus: Number of CPU cores ('auto' or int)
    - num_gpus: Number of GPUs ('auto' or int)
    - fit_strategy: Model fitting strategy ('sequential', 'parallel')
    - memory_limit: Memory limit for training ('auto' or float)
    - excluded_model_types: List of model types to exclude
    - included_model_types: List of model types to include only
    - holdout_frac: Fraction of data to hold out for validation
    - callbacks: List of callback functions for training
    
    Returns:
    TabularPredictor: Fitted predictor instance
    """

Prediction

Generate predictions and prediction probabilities for new data using the trained model ensemble.

def predict(
    self,
    data,
    model: str | None = None,
    as_pandas: bool = True,
    transform_features: bool = True
):
    """
    Generate predictions for new data.
    
    Parameters:
    - data: Input data (DataFrame, file path, or TabularDataset)
    - model: Specific model name to use for prediction (best model if None)
    - as_pandas: Return results as pandas Series (numpy array if False)
    - transform_features: Apply feature transformations before prediction
    
    Returns:
    Predictions as pandas Series or numpy array
    """

def predict_proba(
    self,
    data,
    model: str | None = None,
    as_pandas: bool = True,
    as_multiclass: bool = True,
    transform_features: bool = True
):
    """
    Generate prediction probabilities for classification tasks.
    
    Parameters:
    - data: Input data (DataFrame, file path, or TabularDataset)
    - model: Specific model name to use for prediction (best model if None)
    - as_pandas: Return results as pandas DataFrame (numpy array if False)
    - as_multiclass: Return all class probabilities vs just positive class
    - transform_features: Apply feature transformations before prediction
    
    Returns:
    Prediction probabilities as pandas DataFrame or numpy array
    """

Model Evaluation

Evaluate model performance and analyze results with comprehensive metrics and model comparison capabilities.

def evaluate(
    self,
    data,
    model: str | None = None,
    auxiliary_metrics: bool = True,
    detailed_report: bool = False,
    silent: bool = False
):
    """
    Evaluate predictor performance on test data.
    
    Parameters:
    - data: Test data (DataFrame, file path, or TabularDataset); must contain the label column
    - model: Specific model to evaluate (best model if None)
    - auxiliary_metrics: Include additional evaluation metrics beyond eval_metric
    - detailed_report: Generate detailed evaluation report
    - silent: Suppress output
    
    Returns:
    dict: Dictionary of evaluation metrics
    """

def leaderboard(
    self,
    data=None,
    extra_info: bool = False,
    only_pareto_frontier: bool = False,
    skip_score: bool = False,
    silent: bool = False
):
    """
    Display model leaderboard with performance rankings.
    
    Parameters:
    - data: Test data for evaluation (validation scores only if None)
    - extra_info: Include additional model information
    - only_pareto_frontier: Show only Pareto optimal models (score vs inference speed)
    - skip_score: Skip performance scoring
    - silent: Suppress output
    
    Returns:
    DataFrame: Model leaderboard with performance metrics
    """

Feature Analysis

Analyze feature importance and understand model behavior through interpretability tools.

def feature_importance(
    self,
    data=None,
    model: str | None = None,
    features: list | None = None,
    feature_stage: str = 'original',
    subsample_size: int = 5000,
    silent: bool = False
):
    """
    Calculate feature importance scores.
    
    Parameters:
    - data: Data for importance calculation (training data if None)
    - model: Specific model to analyze (best model if None)
    - features: Specific features to analyze (all features if None)
    - feature_stage: Feature processing stage ('original' or 'transformed')
    - subsample_size: Sample size for efficient computation
    - silent: Suppress output
    
    Returns:
    DataFrame: Feature importance scores
    """

def fit_summary(self, verbosity: int = 1, show_plot: bool = False) -> dict:
    """
    Display summary of training process and results.
    
    Parameters:
    - verbosity: Detail level (0-4)
    - show_plot: Show training plots
    
    Returns:
    dict: Training summary information
    """

Model Persistence

Save and load trained predictors for deployment and reuse.

def save(self, path: str | None = None):
    """
    Save trained predictor to disk.
    
    Parameters:
    - path: Directory to save predictor (predictor's existing path if None)
    """

@classmethod
def load(cls, path: str, verbosity: int = 2):
    """
    Load saved predictor from disk.
    
    Parameters:
    - path: Directory containing saved predictor
    - verbosity: Logging verbosity level (0-4)
    
    Returns:
    TabularPredictor: Loaded predictor instance
    """

Advanced Features

Advanced model configuration and specialized functionality for power users.

def refit_full(self, model: str = 'best'):
    """
    Refit model on full dataset (train + validation).
    
    Parameters:
    - model: Model to refit ('best', 'all', or specific model name)
    
    Returns:
    dict: Refit results
    """

def distill(
    self,
    train_data=None,
    tuning_data=None,
    time_limit: int | None = None,
    hyperparameters=None,
    **kwargs
):
    """
    Create distilled (compressed) version of ensemble model.
    
    Parameters:
    - train_data: Training data for distillation (original training data if None)
    - tuning_data: Validation data for distillation
    - time_limit: Maximum distillation time in seconds
    - hyperparameters: Distillation hyperparameters
    
    Returns:
    dict: Distillation results
    """

def persist_models(self, models: list | None = None, with_ancestors: bool = True):
    """
    Persist models in memory to disk for memory optimization.
    
    Parameters:
    - models: List of model names to persist (default selection if None)
    - with_ancestors: Include ancestor models in persistence
    """

def unpersist_models(self, models: list | None = None):
    """
    Load persisted models back into memory.
    
    Parameters:
    - models: List of model names to unpersist (all persisted models if None)
    """

def calibrate_decision_threshold(
    self,
    data=None,
    metric: str | None = None,
    return_optimization_curve: bool = False,
    verbose: bool = True
):
    """
    Calibrate decision threshold for binary classification to optimize specified metric.
    
    Parameters:
    - data: Data to use for threshold calibration (validation data if None)
    - metric: Metric to optimize ('f1', 'balanced_accuracy', 'mcc', etc.;
      the predictor's eval_metric if None)
    - return_optimization_curve: Return threshold vs metric curve
    - verbose: Print optimization results
    
    Returns:
    dict or tuple: Calibration results, optionally with optimization curve
    """

def clone(self, path: str, *, return_clone: bool = False, dirs_exist_ok: bool = False):
    """
    Create a copy of the predictor at a new location.
    
    Parameters:
    - path: Directory path for the cloned predictor
    - return_clone: Return the cloned predictor instance instead of its path
    - dirs_exist_ok: Allow overwriting existing directory
    
    Returns:
    str or TabularPredictor: Path to clone or cloned predictor instance
    """

def clone_for_deployment(
    self, 
    path: str, 
    *, 
    model: str = "best", 
    return_clone: bool = False, 
    dirs_exist_ok: bool = False
):
    """
    Create optimized copy of predictor for deployment with minimal storage footprint.
    
    Parameters:
    - path: Directory path for deployment clone
    - model: Model to include in deployment clone ('best' or specific model name)
    - return_clone: Return the cloned predictor instance instead of its path
    - dirs_exist_ok: Allow overwriting existing directory
    
    Returns:
    str or TabularPredictor: Path to clone or cloned predictor instance
    """

InterpretableTabularPredictor Class

[EXPERIMENTAL] Specialized TabularPredictor subclass focused on interpretable models with simple, human-readable rules. Trades accuracy for interpretability by limiting to simple models and disabling complex ensemble techniques.

class InterpretableTabularPredictor(TabularPredictor):
    def __init__(self, *args, **kwargs):
        """
        Initialize InterpretableTabularPredictor with same parameters as TabularPredictor.
        Automatically restricts to interpretable models and preprocessing.
        """
    
    def fit(
        self,
        train_data,
        tuning_data=None,
        time_limit: float | None = None,
        *,
        presets: str = "interpretable",
        **kwargs
    ):
        """
        Fit interpretable models with automatic preset selection for interpretability.
        
        Parameters:
        - train_data: Training data (same as TabularPredictor)
        - tuning_data: Validation data (optional)
        - time_limit: Maximum training time in seconds
        - presets: Defaults to "interpretable" preset
        
        Note: Bagging, stacking, and complex ensembles are disabled for interpretability
        """
    
    def leaderboard_interpretable(self, verbose: bool = False, **kwargs):
        """
        Leaderboard with model complexity scores for interpretable model selection.
        
        Parameters:
        - verbose: Print detailed leaderboard
        
        Returns:
        DataFrame: Leaderboard with additional 'complexity' column showing rule count
        """
    
    def print_interpretable_rules(
        self, 
        complexity_threshold: int = 10, 
        model_name: str | None = None
    ):
        """
        Print human-readable rules from the best interpretable model.
        
        Parameters:
        - complexity_threshold: Maximum rule complexity to display
        - model_name: Specific model to show rules for (best model if None)
        """

Usage Examples

Basic Classification

from autogluon.tabular import TabularPredictor

# Binary classification: 'target' is the label column in train.csv
predictor = TabularPredictor(label='target')
predictor.fit('train.csv', presets='best_quality', time_limit=3600)

# Make predictions (labels and class probabilities)
predictions = predictor.predict('test.csv')
probabilities = predictor.predict_proba('test.csv')

# Evaluate performance on held-out test data
scores = predictor.evaluate('test.csv')
print(f"Accuracy: {scores['accuracy']:.3f}")

# View model leaderboard ranking all trained models
leaderboard = predictor.leaderboard('test.csv')
print(leaderboard)

Custom Configuration

# Custom hyperparameters and model selection
# (keys are AutoGluon model-type names: GBM=LightGBM, RF=Random Forest, XGB=XGBoost)
hyperparameters = {
    'GBM': {'num_boost_round': 1000, 'learning_rate': 0.01},
    'RF': {'n_estimators': 500, 'max_depth': 20},
    'XGB': {'n_estimators': 1000, 'learning_rate': 0.01}
}

# Regression task optimizing RMSE, saving artifacts under ./models
predictor = TabularPredictor(
    label='price',
    problem_type='regression',
    eval_metric='rmse',
    path='./models'
)

predictor.fit(
    train_data,
    hyperparameters=hyperparameters,
    excluded_model_types=['KNN', 'LR'],  # Exclude certain model types
    time_limit=7200,
    presets='high_quality'
)

# Feature importance analysis (top 10 features)
importance = predictor.feature_importance(train_data)
print(importance.head(10))

Install with Tessl CLI

npx tessl i tessl/pypi-autogluon

docs

core.md

features.md

index.md

multimodal.md

tabular.md

timeseries.md

tile.json