AutoGluon TabularPredictor for automated machine learning on tabular datasets
—
Quality: Pending — a best-practices review has not yet been performed.
Impact: Pending — no eval scenarios have been run.
The TabularPredictor class provides the main interface for automated machine learning on tabular datasets. It handles the complete ML pipeline from data preprocessing to model training, evaluation, and deployment with minimal user configuration required.
Creates a new TabularPredictor instance configured for a specific prediction task with automatic problem type detection and evaluation metric selection.
class TabularPredictor:
    """Main interface for automated machine learning on tabular datasets.

    Handles the ML pipeline from data preprocessing through model training,
    evaluation, and deployment with minimal user configuration.

    NOTE: complex type annotations are written as string forward references
    because ``pd``, ``np``, ``Scorer``, ``FeatureMetadata``, ``Literal`` and
    ``AbstractCallback`` are imported elsewhere in the package; the strings
    keep this stub importable on its own.
    """

    def __init__(
        self,
        label: str,
        problem_type: "str | None" = None,
        eval_metric: "str | Scorer | None" = None,
        path: "str | None" = None,
        verbosity: int = 2,
        log_to_file: bool = False,
        log_file_path: str = "auto",
        sample_weight: "str | None" = None,
        weight_evaluation: bool = False,
        groups: "str | None" = None,
        positive_class: "int | str | None" = None,
        **kwargs,
    ):
        """Initialize a TabularPredictor for a specific prediction task.

        Problem type and evaluation metric are auto-detected when not given.

        Parameters:
        - label: Name of the target column to predict.
        - problem_type: 'binary', 'multiclass', 'regression', or 'quantile'.
        - eval_metric: Metric used for model evaluation and selection.
        - path: Directory in which models and outputs are saved.
        - verbosity: Logging level (0-4).
        - log_to_file: Whether to also write logs to a file.
        - log_file_path: Path for the log file ("auto" for the default).
        - sample_weight: Column name holding sample weights, or
          'auto_weight' / 'balance_weight'.
        - weight_evaluation: Whether to use sample weights in evaluation.
        - groups: Column used for custom data splitting during bagging.
        - positive_class: Positive class for binary-classification metrics.
        """

    # Trains multiple machine learning models with automatic hyperparameter
    # optimization, ensemble creation, and model selection using advanced
    # techniques like bagging and stacking.
    def fit(
        self,
        train_data: "pd.DataFrame | str",
        tuning_data: "pd.DataFrame | str | None" = None,
        time_limit: "float | None" = None,
        presets: "list[str] | str | None" = None,
        hyperparameters: "dict | str | None" = None,
        feature_metadata: "str | FeatureMetadata" = "infer",
        infer_limit: "float | None" = None,
        infer_limit_batch_size: "int | None" = None,
        fit_weighted_ensemble: bool = True,
        fit_full_last_level_weighted_ensemble: bool = True,
        full_weighted_ensemble_additionally: bool = False,
        dynamic_stacking: "bool | str" = False,
        calibrate_decision_threshold: "bool | str" = "auto",
        num_cpus: "int | str" = "auto",
        num_gpus: "int | str" = "auto",
        fit_strategy: "Literal['sequential', 'parallel']" = "sequential",
        memory_limit: "float | str" = "auto",
        callbacks: "list[AbstractCallback] | None" = None,
        **kwargs,
    ) -> "TabularPredictor":
        """Train machine learning models on the provided dataset.

        Parameters:
        - train_data: Training dataset as DataFrame or file path string.
        - tuning_data: Optional validation dataset (DataFrame or path).
        - time_limit: Maximum training time in seconds.
        - presets: Preset name(s), e.g. 'best_quality', 'high_quality'.
        - hyperparameters: Custom hyperparameter dict or preset string.
        - feature_metadata: Feature metadata, or "infer" for auto-detection.
        - infer_limit: Time limit for feature inference in seconds.
        - infer_limit_batch_size: Batch size for feature inference.
        - fit_weighted_ensemble: Whether to fit weighted ensemble models.
        - fit_full_last_level_weighted_ensemble: Whether to fit the full
          last-level weighted ensemble.
        - full_weighted_ensemble_additionally: Whether to additionally fit a
          full weighted ensemble.
        - dynamic_stacking: Whether to use dynamic stacking (bool or
          strategy string).
        - calibrate_decision_threshold: "auto", True, or False.
        - num_cpus: Number of CPUs to use ("auto" or integer).
        - num_gpus: Number of GPUs to use ("auto" or integer).
        - fit_strategy: "sequential" or "parallel" model fitting.
        - memory_limit: Memory limit ("auto" or float, in GB).
        - callbacks: Callback objects for training monitoring.

        Returns:
        Self (TabularPredictor instance), enabling fluent chaining.
        """

    # Generates predictions using trained models with options for single
    # model or ensemble predictions, automatic feature transformation, and
    # flexible output formats.
    def predict(
        self,
        data: "pd.DataFrame | str",
        model: "str | None" = None,
        as_pandas: bool = True,
        transform_features: bool = True,
        *,
        decision_threshold: "float | None" = None,
        **kwargs,
    ) -> "pd.Series | np.ndarray":
        """Generate predictions for new data.

        Parameters:
        - data: Input data or path to a data file.
        - model: Specific model to use (default: best model).
        - as_pandas: Return a pandas Series (True) or numpy array (False).
        - transform_features: Apply feature preprocessing.
        - decision_threshold: Decision threshold for binary classification.

        Returns:
        Predictions as a pandas Series or numpy array.
        """

    def predict_proba(
        self,
        data: "pd.DataFrame | str",
        model: "str | None" = None,
        as_pandas: bool = True,
        as_multiclass: bool = True,
        transform_features: bool = True,
        **kwargs,
    ) -> "pd.DataFrame | pd.Series | np.ndarray":
        """Generate prediction probabilities for classification tasks.

        Parameters:
        - data: Input data or path to a data file.
        - model: Specific model to use (default: best model).
        - as_pandas: Return a pandas object (True) or numpy array (False).
        - as_multiclass: Use multiclass format for binary classification.
        - transform_features: Apply feature preprocessing.

        Returns:
        Prediction probabilities as a pandas DataFrame or numpy array.
        """

    def predict_from_proba(
        self,
        y_pred_proba: "pd.DataFrame | np.ndarray",
        decision_threshold: "float | None" = None,
    ) -> "pd.Series | np.ndarray":
        """Convert prediction probabilities to class predictions.

        Parameters:
        - y_pred_proba: Prediction probabilities.
        - decision_threshold: Custom threshold for binary classification.

        Returns:
        Class predictions.
        """

    # Generates predictions from multiple models simultaneously for model
    # comparison, uncertainty estimation, and ensemble analysis.
    def predict_multi(
        self,
        data: "pd.DataFrame | None" = None,
        models: "list[str] | None" = None,
        as_pandas: bool = True,
        transform_features: bool = True,
        **kwargs,
    ) -> "pd.DataFrame | dict":
        """Generate predictions from multiple models.

        Parameters:
        - data: Input data.
        - models: Model names (default: all models).
        - as_pandas: Return format.
        - transform_features: Apply feature preprocessing.

        Returns:
        Multi-model predictions.
        """

    def predict_proba_multi(
        self,
        data: "pd.DataFrame | None" = None,
        models: "list[str] | None" = None,
        as_pandas: bool = True,
        as_multiclass: bool = True,
        **kwargs,
    ) -> dict:
        """Generate prediction probabilities from multiple models.

        Parameters:
        - data: Input data.
        - models: Model names (default: all models).
        - as_pandas: Return format.
        - as_multiclass: Use multiclass format for binary classification.

        Returns:
        Multi-model prediction probabilities.
        """

    # Comprehensive model evaluation with multiple metrics, detailed
    # performance analysis, and comparison across models and datasets.
    def evaluate(
        self,
        data: "pd.DataFrame | str",
        model: "str | None" = None,
        silent: bool = False,
        auxiliary_metrics: bool = True,
        detailed_report: bool = False,
        **kwargs,
    ) -> dict:
        """Evaluate model performance on the provided dataset.

        Parameters:
        - data: Evaluation data or path to a data file.
        - model: Specific model to evaluate (default: best model).
        - silent: Suppress printed output.
        - auxiliary_metrics: Include metrics beyond eval_metric.
        - detailed_report: Generate a detailed evaluation report.

        Returns:
        Dictionary of evaluation metrics and scores.
        """

    def evaluate_predictions(
        self,
        y_true: "pd.Series | np.ndarray",
        y_pred: "pd.Series | np.ndarray",
        sample_weight: "pd.Series | np.ndarray | None" = None,
        decision_threshold: "float | None" = None,
        display: bool = False,
        auxiliary_metrics: bool = True,
        detailed_report: bool = False,
        **kwargs,
    ) -> dict:
        """Evaluate predictions directly, without needing predictor data.

        Parameters:
        - y_true: Ground-truth labels.
        - y_pred: Model predictions.
        - sample_weight: Sample weights for evaluation.
        - decision_threshold: Threshold for binary classification.
        - display: Print evaluation results.
        - auxiliary_metrics: Include additional metrics.
        - detailed_report: Generate a detailed report.

        Returns:
        Dictionary of evaluation metrics.
        """

    def leaderboard(
        self,
        data: "pd.DataFrame | str | None" = None,
        extra_info: bool = False,
        only_pareto_frontier: bool = False,
        skip_score: bool = False,
        **kwargs,
    ) -> "pd.DataFrame":
        """Generate a model leaderboard with performance rankings.

        Parameters:
        - data: Evaluation data (default: validation data).
        - extra_info: Include additional model information.
        - only_pareto_frontier: Show only Pareto-optimal models.
        - skip_score: Skip scoring models (faster).

        Returns:
        DataFrame with model rankings and performance metrics.
        """

    # Advanced functionality for accessing out-of-fold predictions from
    # cross-validation; useful for stacking, analysis, and debugging.
    def predict_oof(
        self,
        model: "str | None" = None,
        transformed: bool = False,
        train_data: "pd.DataFrame | None" = None,
        internal_oof: bool = False,
        decision_threshold: "float | None" = None,
        **kwargs,
    ) -> "pd.Series":
        """Get out-of-fold predictions for the training data.

        Parameters:
        - model: Model name (default: best model).
        - transformed: Use the transformed feature representation.
        - train_data: Training data (default: original training data).
        - internal_oof: Use the internal OOF format.
        - decision_threshold: Threshold for binary classification.

        Returns:
        Out-of-fold predictions for the training data.
        """

    def predict_proba_oof(
        self,
        model: "str | None" = None,
        transformed: bool = False,
        as_multiclass: bool = True,
        train_data: "pd.DataFrame | None" = None,
        internal_oof: bool = False,
        **kwargs,
    ) -> "pd.DataFrame | pd.Series":
        """Get out-of-fold prediction probabilities for the training data.

        Parameters:
        - model: Model name (default: best model).
        - transformed: Use the transformed feature representation.
        - as_multiclass: Use multiclass format for binary classification.
        - train_data: Training data (default: original training data).
        - internal_oof: Use the internal OOF format.

        Returns:
        Out-of-fold prediction probabilities.
        """

    # Model lifecycle management: saving, loading, cloning, and optimization
    # for deployment scenarios.
    def save(self, silent: bool = False) -> str:
        """Save the predictor to disk.

        Parameters:
        - silent: Suppress output messages.

        Returns:
        Path where the predictor was saved.
        """

    @classmethod
    def load(
        cls,
        path: str,
        verbosity: "int | None" = None,
        require_version_match: bool = True,
        require_py_version_match: bool = True,
    ) -> "TabularPredictor":
        """Load a saved predictor from disk.

        Parameters:
        - path: Path to the saved predictor.
        - verbosity: Logging-level override.
        - require_version_match: Require AutoGluon version match.
        - require_py_version_match: Require Python version match.

        Returns:
        Loaded TabularPredictor instance.
        """

    def clone(
        self,
        path: str,
        return_clone: bool = False,
        dirs_exist_ok: bool = False,
    ) -> "str | TabularPredictor":
        """Create a copy of the predictor at a new location.

        Parameters:
        - path: Destination path for the cloned predictor.
        - return_clone: Return the cloned predictor object.
        - dirs_exist_ok: Allow overwriting an existing directory.

        Returns:
        Path to the cloned predictor, or the cloned predictor object.
        """

    def clone_for_deployment(
        self,
        path: str,
        model: str = "best",
        return_clone: bool = False,
        dirs_exist_ok: bool = False,
    ) -> "str | TabularPredictor":
        """Create a deployment-optimized copy with a minimal footprint.

        Parameters:
        - path: Destination path for the deployment clone.
        - model: Specific model to include in the deployment.
        - return_clone: Return the cloned predictor object.
        - dirs_exist_ok: Allow overwriting an existing directory.

        Returns:
        Path to the deployment clone, or the cloned predictor object.
        """

    def save_space(
        self,
        remove_data: bool = True,
        remove_fit_stack: bool = True,
        requires_save: bool = True,
        reduce_children: bool = False,
    ) -> str:
        """Reduce predictor disk usage by removing non-essential files.

        Parameters:
        - remove_data: Remove cached training data.
        - remove_fit_stack: Remove intermediate stacking models.
        - requires_save: Save the predictor after space reduction.
        - reduce_children: Apply space reduction to child models.

        Returns:
        Path to the optimized predictor.
        """

    # Access to predictor metadata, model information, and internal state
    # for analysis and debugging.
    @property
    def classes_(self) -> list:
        """Available classes for classification problems."""

    @property
    def class_labels(self) -> list:
        """Class labels in their original format."""

    @property
    def problem_type(self) -> str:
        """Type of ML problem (binary, multiclass, regression, etc.)."""

    @property
    def eval_metric(self) -> str:
        """Evaluation metric used for model selection."""

    @property
    def label(self) -> str:
        """Name of the target column."""

    @property
    def path(self) -> str:
        """Path where the predictor is saved."""

    @property
    def original_features(self) -> "list[str]":
        """List of original feature names from the training data."""

    # NOTE: the scraped page also listed a ``features`` @property; it was
    # silently shadowed by this method of the same name, so only the method
    # is kept (matching the AutoGluon API, where ``features`` is callable).
    def features(self, feature_stage: str = "original") -> "list[str]":
        """Get feature names at different processing stages.

        Parameters:
        - feature_stage: Processing stage ('original', 'transformed').

        Returns:
        List of feature names.
        """

    @property
    def feature_metadata(self) -> "FeatureMetadata":
        """Metadata about features, including types and preprocessing."""

    def set_decision_threshold(self, decision_threshold: float) -> None:
        """Set a custom decision threshold for binary classification.

        Parameters:
        - decision_threshold: New threshold value (0.0 to 1.0).
        """

    @property
    def decision_threshold(self) -> "float | None":
        """Current decision threshold for binary classification."""


# Example usage:
# from autogluon.tabular import TabularPredictor
# Example 1: basic binary-classification workflow.
from autogluon.tabular import TabularPredictor
import pandas as pd

# Load data
train_data = pd.read_csv('train.csv')
test_data = pd.read_csv('test.csv')

# Create predictor for binary classification
predictor = TabularPredictor(
    label='target',
    problem_type='binary',
    eval_metric='roc_auc',
)

# Train with time limit
predictor.fit(
    train_data,
    time_limit=600,  # 10 minutes
    presets='good_quality',
)

# Make predictions
predictions = predictor.predict(test_data)
probabilities = predictor.predict_proba(test_data)

# Evaluate performance
results = predictor.evaluate(test_data)
print(f"ROC-AUC: {results['roc_auc']:.4f}")

# View model leaderboard
leaderboard = predictor.leaderboard(test_data, extra_info=True)
print(leaderboard)
# Example 2: advanced training with custom hyperparameters, bagging,
# stacking, and multi-model prediction analysis.

# Custom hyperparameters
hyperparameters = {
    'LGB': {'num_leaves': [26, 66, 176]},
    'XGB': {'n_estimators': [50, 100, 200]},
    'CAT': {'iterations': [100, 200, 500]},
}

# Advanced training with custom settings
predictor = TabularPredictor(
    label='target',
    sample_weight='weights',
    path='./models/',
)
predictor.fit(
    train_data,
    hyperparameters=hyperparameters,
    num_bag_folds=5,
    num_stack_levels=2,
    ag_args_fit={'num_cpus': 8},
    excluded_model_types=['KNN', 'XT'],
)

# Multi-model predictions for ensemble analysis
multi_preds = predictor.predict_multi(test_data)
model_comparison = pd.DataFrame(multi_preds)

# Install with Tessl CLI:
#   npx tessl i tessl/pypi-autogluon--tabular