AutoGluon automates machine learning tasks enabling you to easily achieve strong predictive performance in your applications.
Quality: Pending — does it follow best practices?
Impact: Pending — no eval scenarios have been run.
Shared utilities for metrics, constants, and data structures used across all AutoGluon predictors. These components provide the foundational infrastructure for consistent evaluation, problem type handling, and data management throughout the AutoGluon ecosystem.
Standard constants for different machine learning problem types and configurations.
# Core problem type constants
BINARY = "binary"
MULTICLASS = "multiclass"
REGRESSION = "regression"
SOFTCLASS = "softclass"  # Classification with soft targets (probabilities)
QUANTILE = "quantile"  # Quantile regression

# Problem type groupings.
# Fix: reference the constants above instead of repeating the string
# literals, so the groupings can never drift out of sync with them.
PROBLEM_TYPES_CLASSIFICATION = [BINARY, MULTICLASS]
PROBLEM_TYPES_REGRESSION = [REGRESSION]
PROBLEM_TYPES = [BINARY, MULTICLASS, REGRESSION, SOFTCLASS, QUANTILE]

# Model configuration constants
AG_ARGS = "ag_args"  # AutoGluon model arguments
AG_ARGS_FIT = "ag_args_fit"  # Training-specific arguments
AG_ARGS_ENSEMBLE = "ag_args_ensemble"  # Ensemble-specific arguments

# Weight handling constants
AUTO_WEIGHT = "auto_weight"
BALANCE_WEIGHT = "balance_weight"

# Problem type inference thresholds
MULTICLASS_UPPER_LIMIT = 1000  # Max unique values for classification
LARGE_DATA_THRESHOLD = 1000
REGRESS_THRESHOLD_LARGE_DATA = 0.05
REGRESS_THRESHOLD_SMALL_DATA = 0.1

Core data structures for handling tabular data and feature metadata.
class TabularDataset:
    """DataFrame wrapper adding AutoGluon-specific load/save helpers."""

    def __init__(self, df: pd.DataFrame):
        """
        Enhanced DataFrame wrapper with AutoGluon-specific utilities.

        Parameters:
        - df: Input pandas DataFrame
        """

    @classmethod
    def load(cls, file_path: str, **kwargs):
        """
        Load tabular data from a file on disk.

        Parameters:
        - file_path: Path to data file (CSV, TSV, Parquet, etc.)
        - **kwargs: Additional pandas read parameters

        Returns:
        TabularDataset: Loaded dataset
        """

    def save(self, file_path: str, **kwargs):
        """
        Write the tabular data out to a file.

        Parameters:
        - file_path: Output file path
        - **kwargs: Additional pandas save parameters
        """
# Reference stub: container describing per-feature type information used by
# AutoGluon feature preprocessing. No implementation bodies are shown here.
class FeatureMetadata:
def __init__(
self,
type_map_raw: dict = None,
type_group_map_special: dict = None,
**kwargs
):
"""
Metadata container for feature type information and processing hints.
Parameters:
- type_map_raw: Mapping of feature names to raw data types
- type_group_map_special: Mapping of features to special type groups
"""
# Filters the known features by raw dtype using include and/or exclude lists
# (per the docstring below; implementation not shown in this stub).
def get_features(self, valid_raw_types: list = None, invalid_raw_types: list = None):
"""
Get features filtered by data types.
Parameters:
- valid_raw_types: Include only these raw types
- invalid_raw_types: Exclude these raw types
Returns:
list: Filtered feature names
"""Comprehensive metric system for model evaluation across different problem types.
class Scorer:
    """Callable wrapper pairing a metric function with its metadata."""

    def __init__(
        self,
        name: str,
        score_func: callable,
        optimum: float = None,
        sign: int = None,
        greater_is_better: bool = None,
        **kwargs
    ):
        """
        Scorer wrapper for evaluation metrics.

        Parameters:
        - name: Metric name
        - score_func: Function to compute metric
        - optimum: Optimal score value
        - sign: Sign for score interpretation (-1 or 1)
        - greater_is_better: Whether higher scores are better
        """

    def __call__(self, y_true, y_pred, **kwargs):
        """
        Compute the metric score for the given targets and predictions.

        Parameters:
        - y_true: True target values
        - y_pred: Predicted values

        Returns:
        float: Computed metric score
        """
# Classification metrics
def accuracy_score(y_true, y_pred, **kwargs) -> float:
    """Fraction of predictions that exactly match the true labels."""

def balanced_accuracy_score(y_true, y_pred, **kwargs) -> float:
    """Per-class-averaged accuracy, suited to imbalanced datasets."""

def f1_score(y_true, y_pred, **kwargs) -> float:
    """F1 score: harmonic mean of precision and recall."""

def precision_score(y_true, y_pred, **kwargs) -> float:
    """Precision: fraction of positive predictions that are correct."""

def recall_score(y_true, y_pred, **kwargs) -> float:
    """Recall: fraction of true positives that were recovered."""

def roc_auc_score(y_true, y_pred_proba, **kwargs) -> float:
    """Area under the receiver operating characteristic curve."""

def log_loss(y_true, y_pred_proba, **kwargs) -> float:
    """Logistic loss of the predicted probabilities."""
# Regression metrics
def mean_squared_error(y_true, y_pred, **kwargs) -> float:
    """Mean of squared prediction errors."""

def root_mean_squared_error(y_true, y_pred, **kwargs) -> float:
    """Square root of the mean squared error."""

def mean_absolute_error(y_true, y_pred, **kwargs) -> float:
    """Mean of absolute prediction errors."""

def mean_absolute_percentage_error(y_true, y_pred, **kwargs) -> float:
    """Mean of absolute prediction errors expressed as percentages."""

def r2_score(y_true, y_pred, **kwargs) -> float:
    """R-squared coefficient of determination."""
# Metric computation utilities
# Reference stub: per the docstring, looks up a metric by name and evaluates
# it on the given targets/predictions — presumably dispatching to one of the
# metric functions above; implementation not shown here (TODO confirm).
def compute_metric(metric_name: str, y_true, y_pred, **kwargs) -> float:
"""
Compute metric by name with automatic type handling.
Parameters:
- metric_name: Name of metric to compute
- y_true: True target values
- y_pred: Predictions or prediction probabilities
Returns:
float: Computed metric value
"""State management and utilities for exploratory data analysis workflows.
class AnalysisState:
    """Dict-like state container supporting attribute (dot) access."""

    def __init__(self, *args, **kwargs):
        """
        Dictionary-like state container with dot notation access.

        Enables dynamic attribute access for analysis results, e.g.
        state.model_performance instead of state['model_performance'].
        """

    def __getattr__(self, item):
        """Resolve attribute reads against the stored values."""

    def __setattr__(self, name: str, value):
        """Store values via attribute assignment, converting nested dicts."""
class StateCheckMixin:
    """Mixin with validation helpers for analysis-state contents."""

    def at_least_one_key_must_be_present(self, state: AnalysisState, *keys) -> bool:
        """
        Report whether any of the given keys exists in the analysis state.

        Parameters:
        - state: Analysis state to check
        - *keys: Required keys to check for

        Returns:
        bool: True if at least one key is present
        """
# NOTE(review): declared without `self`, so despite appearing after
# StateCheckMixin this reads as a module-level helper — confirm placement
# against the real source. Per the docstring it searches nested state dicts;
# implementation not shown in this stub.
def is_key_present_in_state(state: dict, key: str) -> bool:
"""
Check if a key exists in nested state dictionary.
Parameters:
- state: State dictionary to search
- key: Key to search for
Returns:
bool: True if key is present
"""Helper functions for common operations across AutoGluon components.
def infer_problem_type(y: pd.Series, silent: bool = False) -> str:
    """
    Automatically infer the problem type from a target variable.

    Parameters:
    - y: Target variable values
    - silent: Suppress logging output

    Returns:
    str: Inferred problem type ('binary', 'multiclass', 'regression')
    """
def get_pred_from_proba(y_pred_proba, problem_type: str) -> np.ndarray:
    """
    Convert prediction probabilities into hard class predictions.

    Parameters:
    - y_pred_proba: Prediction probabilities
    - problem_type: Type of ML problem

    Returns:
    numpy.ndarray: Class predictions
    """
def normalize_pred_probas(y_pred_proba, problem_type: str) -> np.ndarray:
    """
    Normalize raw prediction probabilities into valid distributions.

    Parameters:
    - y_pred_proba: Raw prediction probabilities
    - problem_type: Type of ML problem

    Returns:
    numpy.ndarray: Normalized probabilities
    """
# Reference stub: per the docstring, validates (and presumably creates) the
# output directory used for models/artifacts — implementation not shown here,
# so creation behavior is unconfirmed.
def setup_outputdir(path: str, warn_if_exist: bool = True) -> str:
"""
Set up output directory for saving models and artifacts.
Parameters:
- path: Desired output directory path
- warn_if_exist: Whether to warn if directory exists
Returns:
str: Validated output directory path
"""from autogluon.core import constants, metrics
import pandas as pd
import numpy as np

# Infer problem type from target variable
target_binary = pd.Series([0, 1, 1, 0, 1])
target_multiclass = pd.Series(['A', 'B', 'C', 'A', 'B'])
target_regression = pd.Series([1.5, 2.3, 3.7, 4.1, 5.2])
print(f"Binary: {constants.infer_problem_type(target_binary)}")
print(f"Multiclass: {constants.infer_problem_type(target_multiclass)}")
print(f"Regression: {constants.infer_problem_type(target_regression)}")

# Use problem type constants to pick an evaluation metric.
# Fix: the original example read `problem_type` without ever defining it
# (NameError when run) — infer it from a target first.
problem_type = constants.infer_problem_type(target_binary)
if problem_type == constants.BINARY:
    eval_metric = 'roc_auc'
elif problem_type == constants.MULTICLASS:
    eval_metric = 'accuracy'
elif problem_type == constants.REGRESSION:
    eval_metric = 'rmse'

# Compute metrics
y_true = [0, 1, 1, 0, 1]
y_pred = [0, 1, 0, 0, 1]
y_pred_proba = [[0.8, 0.2], [0.3, 0.7], [0.6, 0.4], [0.9, 0.1], [0.2, 0.8]]
accuracy = metrics.compute_metric('accuracy', y_true, y_pred)
auc = metrics.compute_metric('roc_auc', y_true, y_pred_proba)
print(f"Accuracy: {accuracy:.3f}, AUC: {auc:.3f}")from autogluon.common import TabularDataset
import pandas as pd

# Build a TabularDataset directly from an in-memory DataFrame
raw_records = {
    'feature1': [1, 2, 3, 4, 5],
    'feature2': ['A', 'B', 'A', 'C', 'B'],
    'target': [0, 1, 0, 1, 1],
}
df = pd.DataFrame(raw_records)
dataset = TabularDataset(df)

# Or load one straight from disk
dataset = TabularDataset.load('data.csv')

# The wrapper exposes the underlying DataFrame API
print(f"Shape: {dataset.shape}")
print(f"Columns: {list(dataset.columns)}")

# Persist the processed data
dataset.save('processed_data.csv')

from autogluon.common import FeatureMetadata
# Describe each column's raw dtype plus any special roles
raw_type_map = {
    'numerical_col': 'float',
    'categorical_col': 'object',
    'text_col': 'object',
    'datetime_col': 'datetime',
}
special_type_groups = {
    'text_col': 'text',
    'datetime_col': 'datetime',
}
feature_metadata = FeatureMetadata(
    type_map_raw=raw_type_map,
    type_group_map_special=special_type_groups,
)

# Slice the feature list by raw dtype
numerical_features = feature_metadata.get_features(valid_raw_types=['float', 'int'])
categorical_features = feature_metadata.get_features(valid_raw_types=['object'])
text_features = feature_metadata.get_features(invalid_raw_types=['float', 'int'])
print(f"Numerical features: {numerical_features}")
print(f"Categorical features: {categorical_features}")
print(f"Text features: {text_features}")from autogluon.core.metrics import Scorer
import numpy as np
# Define custom metric function
def custom_weighted_accuracy(y_true, y_pred, sample_weight=None):
    """Accuracy with optional per-sample weights.

    Parameters:
    - y_true: true labels (array-like)
    - y_pred: predicted labels (array-like)
    - sample_weight: optional per-sample weights; plain accuracy when None

    Returns:
    float: (weighted) fraction of correct predictions
    """
    # Fix: coerce to arrays first. With plain Python lists (as used below),
    # `y_true == y_pred` is a single bool, which has no `.astype`, so the
    # original raised AttributeError instead of comparing elementwise.
    correct = (np.asarray(y_true) == np.asarray(y_pred)).astype(float)
    if sample_weight is not None:
        return float(np.average(correct, weights=sample_weight))
    return float(np.mean(correct))
# Wrap the metric in a Scorer so AutoGluon can consume it
custom_scorer = Scorer(
    name='weighted_accuracy',
    score_func=custom_weighted_accuracy,
    optimum=1.0,
    greater_is_better=True,
)

# Score a small hand-made prediction set with per-sample weights
y_true = [0, 1, 1, 0, 1]
y_pred = [0, 1, 0, 0, 1]
weights = [1, 2, 1, 1, 2]
score = custom_scorer(y_true, y_pred, sample_weight=weights)
print(f"Custom weighted accuracy: {score:.3f}")from autogluon.eda import AnalysisState
# Build an empty analysis state
state = AnalysisState()

# Dot-notation assignment works like dict insertion
state.training_time = 120.5
state.model_performance = {'accuracy': 0.85, 'f1': 0.82}
state.feature_importance = ['feature1', 'feature2', 'feature3']

# Dot-notation reads mirror the assignments above
print(f"Best accuracy: {state.model_performance['accuracy']}")
print(f"Training time: {state.training_time}s")
print(f"Top feature: {state.feature_importance[0]}")

# States nest, so validation results can live under their own namespace
state.validation = AnalysisState()
state.validation.scores = {'val_acc': 0.83, 'val_f1': 0.80}
state.validation.fold_results = [0.82, 0.84, 0.83, 0.85, 0.81]
print(f"Validation accuracy: {state.validation.scores['val_acc']}")Install with Tessl CLI
npx tessl i tessl/pypi-autogluon