CtrlK
Blog · Docs · Log in · Get started
Tessl Logo

tessl/pypi-autogluon--tabular

AutoGluon TabularPredictor for automated machine learning on tabular datasets

Pending

Quality

Pending

Does it follow best practices?

Impact

Pending

No eval scenarios have been run

Overview
Eval results
Files

predictor.mddocs/

Core Prediction Interface

The TabularPredictor class provides the main interface for automated machine learning on tabular datasets. It handles the complete ML pipeline from data preprocessing to model training, evaluation, and deployment with minimal user configuration required.

Capabilities

Predictor Initialization

Creates a new TabularPredictor instance configured for a specific prediction task with automatic problem type detection and evaluation metric selection.

class TabularPredictor:
    def __init__(
        self,
        label: str,
        problem_type: str = None,
        eval_metric: str | Scorer = None,
        path: str = None,
        verbosity: int = 2,
        log_to_file: bool = False,
        log_file_path: str = "auto",
        sample_weight: str = None,
        weight_evaluation: bool = False,
        groups: str = None,
        positive_class: int | str | None = None,
        **kwargs
    ):
        """
        Initialize TabularPredictor for automated machine learning.
        
        Parameters:
        - label: Name of the target column to predict
        - problem_type: Type of problem ('binary', 'multiclass', 'regression', 'quantile')
        - eval_metric: Metric for model evaluation and selection
        - path: Directory to save models and outputs
        - verbosity: Logging level (0-4)
        - log_to_file: Whether to save logs to file
        - log_file_path: Path for log file (auto for default)
        - sample_weight: Column name for sample weights or 'auto_weight'/'balance_weight'
        - weight_evaluation: Whether to use sample weights in evaluation
        - groups: Column for custom data splitting in bagging
        - positive_class: Positive class for binary classification metrics
        """

Model Training

Trains multiple machine learning models with automatic hyperparameter optimization, ensemble creation, and model selection using advanced techniques like bagging and stacking.

def fit(
    self,
    train_data: pd.DataFrame | str,
    tuning_data: pd.DataFrame | str = None,
    time_limit: float = None,
    presets: list[str] | str = None,
    hyperparameters: dict | str = None,
    feature_metadata: str | FeatureMetadata = "infer",
    infer_limit: float = None,
    infer_limit_batch_size: int = None,
    fit_weighted_ensemble: bool = True,
    fit_full_last_level_weighted_ensemble: bool = True,
    full_weighted_ensemble_additionally: bool = False,
    dynamic_stacking: bool | str = False,
    calibrate_decision_threshold: bool | str = "auto",
    num_cpus: int | str = "auto",
    num_gpus: int | str = "auto",
    fit_strategy: Literal["sequential", "parallel"] = "sequential",
    memory_limit: float | str = "auto",
    callbacks: list[AbstractCallback] = None,
    **kwargs
) -> 'TabularPredictor':
    """
    Train machine learning models on the provided dataset.
    
    Parameters:
    - train_data: Training dataset as DataFrame or file path string
    - tuning_data: Optional validation dataset as DataFrame or file path string
    - time_limit: Maximum training time in seconds (float)
    - presets: Pre-configured settings list or single preset ('best_quality', 'high_quality', etc.)
    - hyperparameters: Custom hyperparameter configurations as dict or preset string
    - feature_metadata: Feature metadata configuration or "infer" for automatic detection
    - infer_limit: Target inference time limit in seconds per row; constrains which models are trained/kept
    - infer_limit_batch_size: Batch size assumed when estimating per-row inference time
    - fit_weighted_ensemble: Whether to fit weighted ensemble models
    - fit_full_last_level_weighted_ensemble: Whether to fit full last level weighted ensemble
    - full_weighted_ensemble_additionally: Whether to fit additional full weighted ensemble
    - dynamic_stacking: Whether to use dynamic stacking (bool or strategy string)
    - calibrate_decision_threshold: Whether to calibrate decision threshold ("auto", True, False)
    - num_cpus: Number of CPUs to use ("auto" or integer)
    - num_gpus: Number of GPUs to use ("auto" or integer)
    - fit_strategy: Strategy for fitting models ("sequential" or "parallel")
    - memory_limit: Memory limit ("auto" or float in GB)
    - callbacks: List of callback functions for training monitoring
    
    Returns:
    Self (TabularPredictor instance)
    """

Predictions

Generates predictions using trained models with options for single model or ensemble predictions, automatic feature transformation, and flexible output formats.

def predict(
    self,
    data: pd.DataFrame | str,
    model: str = None,
    as_pandas: bool = True,
    transform_features: bool = True,
    *,
    decision_threshold: float = None,
    **kwargs
) -> pd.Series | np.ndarray:
    """
    Generate predictions for new data.
    
    Parameters:
    - data: Input data or path to data file
    - model: Specific model to use (default: best model)
    - as_pandas: Return pandas Series (True) or numpy array (False)
    - transform_features: Apply feature preprocessing
    - decision_threshold: Decision threshold for binary classification
    
    Returns:
    Predictions as pandas Series or numpy array
    """

def predict_proba(
    self,
    data: pd.DataFrame | str,
    model: str = None,
    as_pandas: bool = True,
    as_multiclass: bool = True,
    transform_features: bool = True,
    **kwargs
) -> pd.DataFrame | pd.Series | np.ndarray:
    """
    Generate prediction probabilities for classification tasks.
    
    Parameters:
    - data: Input data or path to data file
    - model: Specific model to use (default: best model)
    - as_pandas: Return pandas DataFrame (True) or numpy array (False)
    - as_multiclass: Return multiclass format for binary classification
    - transform_features: Apply feature preprocessing
    
    Returns:
    Prediction probabilities as pandas DataFrame or numpy array
    """

def predict_from_proba(
    self,
    y_pred_proba: pd.DataFrame | np.ndarray,
    decision_threshold: float = None
) -> pd.Series | np.ndarray:
    """
    Convert prediction probabilities to class predictions.
    
    Parameters:
    - y_pred_proba: Prediction probabilities
    - decision_threshold: Custom threshold for binary classification
    
    Returns:
    Class predictions
    """

Multi-Model Predictions

Generates predictions from multiple models simultaneously for model comparison, uncertainty estimation, and ensemble analysis.

def predict_multi(
    self,
    data: pd.DataFrame = None,
    models: list[str] = None,
    as_pandas: bool = True,
    transform_features: bool = True,
    **kwargs
) -> pd.DataFrame | dict:
    """
    Generate predictions from multiple models.
    
    Parameters:
    - data: Input data
    - models: List of model names (default: all models)
    - as_pandas: Return pandas objects (True) or numpy arrays (False)
    - transform_features: Apply feature preprocessing
    
    Returns:
    Multi-model predictions
    """

def predict_proba_multi(
    self,
    data: pd.DataFrame = None,  
    models: list[str] = None,
    as_pandas: bool = True,
    as_multiclass: bool = True,
    **kwargs
) -> dict:
    """
    Generate prediction probabilities from multiple models.
    
    Parameters:
    - data: Input data
    - models: List of model names (default: all models)
    - as_pandas: Return pandas objects (True) or numpy arrays (False) as dict values
    - as_multiclass: Multiclass format for binary classification
    
    Returns:
    Multi-model prediction probabilities
    """

Model Evaluation

Comprehensive model evaluation with multiple metrics, detailed performance analysis, and comparison across different models and datasets.

def evaluate(
    self,
    data: pd.DataFrame | str,
    model: str = None,
    silent: bool = False,
    auxiliary_metrics: bool = True,
    detailed_report: bool = False,
    **kwargs
) -> dict:
    """
    Evaluate model performance on provided dataset.
    
    Parameters:
    - data: Evaluation data or path to data file
    - model: Specific model to evaluate (default: best model)
    - silent: Suppress printed output
    - auxiliary_metrics: Include additional metrics beyond eval_metric
    - detailed_report: Generate detailed evaluation report
    
    Returns:
    Dictionary of evaluation metrics and scores
    """

def evaluate_predictions(
    self,
    y_true: pd.Series | np.ndarray,
    y_pred: pd.Series | np.ndarray,
    sample_weight: pd.Series | np.ndarray = None,
    decision_threshold: float = None,
    display: bool = False,
    auxiliary_metrics: bool = True,
    detailed_report: bool = False,
    **kwargs
) -> dict:
    """
    Evaluate predictions directly without requiring predictor or data.
    
    Parameters:
    - y_true: Ground truth labels
    - y_pred: Model predictions
    - sample_weight: Sample weights for evaluation
    - decision_threshold: Threshold for binary classification
    - display: Print evaluation results
    - auxiliary_metrics: Include additional metrics
    - detailed_report: Generate detailed report
    
    Returns:
    Dictionary of evaluation metrics
    """

def leaderboard(
    self,
    data: pd.DataFrame | str = None,
    extra_info: bool = False,
    only_pareto_frontier: bool = False,
    skip_score: bool = False,
    **kwargs
) -> pd.DataFrame:
    """
    Generate model leaderboard with performance rankings.
    
    Parameters:
    - data: Evaluation data (default: validation data)
    - extra_info: Include additional model information
    - only_pareto_frontier: Show only Pareto optimal models
    - skip_score: Skip scoring models (faster)
    
    Returns:
    DataFrame with model rankings and performance metrics
    """

Out-of-Fold Predictions

Advanced functionality for accessing out-of-fold predictions from cross-validation, useful for stacking, analysis, and debugging model performance.

def predict_oof(
    self,
    model: str = None,
    transformed: bool = False,
    train_data: pd.DataFrame = None,
    internal_oof: bool = False,
    decision_threshold: float = None,
    **kwargs
) -> pd.Series:
    """
    Get out-of-fold predictions for training data.
    
    Parameters:
    - model: Model name (default: best model)
    - transformed: Use transformed feature representation
    - train_data: Training data (default: original training data)
    - internal_oof: Use internal OOF format
    - decision_threshold: Threshold for binary classification
    
    Returns:
    Out-of-fold predictions for training data
    """

def predict_proba_oof(
    self,
    model: str = None,
    transformed: bool = False,
    as_multiclass: bool = True,
    train_data: pd.DataFrame = None,
    internal_oof: bool = False,
    **kwargs
) -> pd.DataFrame | pd.Series:
    """
    Get out-of-fold prediction probabilities for training data.
    
    Parameters:
    - model: Model name (default: best model)
    - transformed: Use transformed feature representation
    - as_multiclass: Multiclass format for binary classification
    - train_data: Training data (default: original training data)
    - internal_oof: Use internal OOF format
    
    Returns:
    Out-of-fold prediction probabilities
    """

Model Management

Comprehensive model lifecycle management including saving, loading, cloning, and optimization for deployment scenarios.

def save(self, silent: bool = False) -> str:
    """
    Save predictor to disk.
    
    Parameters:
    - silent: Suppress output messages
    
    Returns:
    Path where predictor was saved
    """

@classmethod
def load(
    cls,
    path: str,
    verbosity: int = None,
    require_version_match: bool = True,
    require_py_version_match: bool = True
) -> 'TabularPredictor':
    """
    Load a saved predictor from disk.
    
    Parameters:
    - path: Path to saved predictor
    - verbosity: Logging level override
    - require_version_match: Require AutoGluon version match
    - require_py_version_match: Require Python version match
    
    Returns:
    Loaded TabularPredictor instance
    """

def clone(
    self,
    path: str,
    return_clone: bool = False,
    dirs_exist_ok: bool = False
) -> str | 'TabularPredictor':
    """
    Create a copy of the predictor at a new location.
    
    Parameters:
    - path: Destination path for cloned predictor
    - return_clone: Return cloned predictor object
    - dirs_exist_ok: Allow overwriting existing directory
    
    Returns:
    Path to cloned predictor or cloned predictor object
    """

def clone_for_deployment(
    self,
    path: str,
    model: str = "best",
    return_clone: bool = False,
    dirs_exist_ok: bool = False
) -> str | 'TabularPredictor':
    """
    Create a deployment-optimized copy with minimal footprint.
    
    Parameters:
    - path: Destination path for deployment clone
    - model: Specific model to include in deployment
    - return_clone: Return cloned predictor object
    - dirs_exist_ok: Allow overwriting existing directory
    
    Returns:
    Path to deployment clone or cloned predictor object
    """

def save_space(
    self,
    remove_data: bool = True,
    remove_fit_stack: bool = True,
    requires_save: bool = True,
    reduce_children: bool = False
) -> str:
    """
    Reduce predictor disk usage by removing non-essential files.
    
    Parameters:
    - remove_data: Remove cached training data
    - remove_fit_stack: Remove intermediate stacking models
    - requires_save: Save predictor after space reduction
    - reduce_children: Apply space reduction to child models
    
    Returns:
    Path to optimized predictor
    """

Properties and Inspection

Access to predictor metadata, model information, and internal state for analysis and debugging.

@property
def classes_(self) -> list:
    """Available classes for classification problems."""

@property  
def class_labels(self) -> list:
    """Class labels in original format."""

@property
def problem_type(self) -> str:
    """Type of ML problem (binary, multiclass, regression, etc.)."""

@property
def eval_metric(self) -> str:
    """Evaluation metric used for model selection."""

@property
def label(self) -> str:
    """Name of the target column."""

@property
def path(self) -> str:
    """Path where predictor is saved."""

@property
def original_features(self) -> list[str]:
    """List of original feature names from training data."""

def features(self, feature_stage: str = "original") -> list[str]:
    """
    Get feature names at different processing stages.
    
    Parameters:
    - feature_stage: Stage of feature processing ('original', 'transformed')
    
    Returns:
    List of feature names
    """

@property
def feature_metadata(self) -> FeatureMetadata:
    """Metadata about features including types and preprocessing."""

def set_decision_threshold(self, decision_threshold: float) -> None:
    """
    Set custom decision threshold for binary classification.
    
    Parameters:
    - decision_threshold: New threshold value (0.0 to 1.0)
    """

@property  
def decision_threshold(self) -> float | None:
    """Current decision threshold for binary classification."""

Usage Examples

Basic Classification

from autogluon.tabular import TabularPredictor
import pandas as pd

# Load data
train_data = pd.read_csv('train.csv')
test_data = pd.read_csv('test.csv')

# Create predictor for binary classification
predictor = TabularPredictor(
    label='target',
    problem_type='binary',
    eval_metric='roc_auc'
)

# Train with time limit
predictor.fit(
    train_data,
    time_limit=600,  # 10 minutes
    presets='good_quality'
)

# Make predictions
predictions = predictor.predict(test_data)
probabilities = predictor.predict_proba(test_data)

# Evaluate performance
results = predictor.evaluate(test_data)
print(f"ROC-AUC: {results['roc_auc']:.4f}")

# View model leaderboard
leaderboard = predictor.leaderboard(test_data, extra_info=True)
print(leaderboard)

Advanced Configuration

# Custom hyperparameters
hyperparameters = {
    'LGB': {'num_leaves': [26, 66, 176]},
    'XGB': {'n_estimators': [50, 100, 200]},
    'CAT': {'iterations': [100, 200, 500]}
}

# Advanced training with custom settings
predictor = TabularPredictor(
    label='target',
    sample_weight='weights',
    path='./models/'
)

predictor.fit(
    train_data,
    hyperparameters=hyperparameters,
    num_bag_folds=5,
    num_stack_levels=2,
    ag_args_fit={'num_cpus': 8},
    excluded_model_types=['KNN', 'XT']
)

# Multi-model predictions for ensemble analysis  
multi_preds = predictor.predict_multi(test_data)
model_comparison = pd.DataFrame(multi_preds)

Install with Tessl CLI

npx tessl i tessl/pypi-autogluon--tabular

docs

configurations.md

experimental.md

index.md

models.md

predictor.md

tile.json