fastai simplifies training fast and accurate neural nets using modern best practices.
—
Comprehensive tabular data processing and modeling, including preprocessing transforms, neural network architectures optimized for structured data, and utilities for working with pandas DataFrames.
The main entry point is `tabular_learner`, which creates a learner wrapping a neural network optimized for structured data.
def tabular_learner(dls, layers=None, emb_szs=None, n_out=None, y_range=None,
                    use_bn=True, emb_drop=0.0, bn_final=False, bn_cont=True,
                    act_cls=nn.ReLU(inplace=True), lin_first=False, ps=None,
                    concat_pool=True, first_bn=True, bn_drop_out=False,
                    lin_drop_out=0.0, embed_p=0.0, **kwargs):
    """Build a learner wired up for tabular (structured) data.

    Args:
        dls: TabularDataLoaders holding preprocessed tabular data.
        layers: Sizes of the hidden layers (list of ints).
        emb_szs: Embedding sizes for the categorical variables
            (dict or list).
        n_out: Number of outputs; auto-detected from the data when None.
        y_range: Target-value range for regression.
        use_bn: Whether to use batch normalization.
        emb_drop: Dropout probability for the embeddings.
        bn_final: Whether to batch-normalize the final layer.
        bn_cont: Whether to batch-normalize the continuous inputs.
        act_cls: Activation used between layers.
            NOTE(review): the default `nn.ReLU(inplace=True)` is one module
            instance created at import time and shared across calls —
            confirm this matches the intended API.
        lin_first: Whether the linear layer is applied before the embeddings.
        ps: Per-layer dropout probabilities for the hidden layers.
        concat_pool: Whether to use concatenated pooling.
        first_bn: Whether to batch-normalize the first layer.
        bn_drop_out: Whether to apply dropout after batch norm.
        lin_drop_out: Dropout probability for the linear layers.
        embed_p: Dropout probability inside the embedding layer.
        **kwargs: Extra arguments forwarded to the underlying Learner.

    Returns:
        A Learner instance for tabular data.
    """
class TabularLearner(Learner):
    """Learner subclass with conveniences for tabular data."""

    def predict(self, row, with_input=False):
        """Run the model on one row of data.

        Args:
            row: Mapping or pandas Series of input features.
            with_input: If True, also return the processed input.

        Returns:
            Tuple of (decoded prediction, prediction index, raw outputs).
        """

    def show_results(self, ds_idx=1, dl=None, max_n=10, **kwargs):
        """Display model predictions alongside the actual target values."""


# Specialized data loaders and processing for structured/tabular datasets.
class TabularDataLoaders(DataLoaders):
    """DataLoaders factory for tabular datasets."""

    @classmethod
    def from_csv(cls, path, csv_name='train.csv', header='infer', delimiter=None,
                 y_names=None, y_block=None, cat_names=None, cont_names=None,
                 procs=None, valid_col=None, valid_pct=0.2, seed=None, **kwargs):
        """Build TabularDataLoaders from a CSV file on disk.

        Args:
            path: Path to the data directory.
            csv_name: File name of the CSV within `path`.
            header: How the CSV header is handled.
            delimiter: Field delimiter of the CSV.
            y_names: Target column name(s).
            y_block: Transform block applied to the targets.
            cat_names: Names of the categorical columns.
            cont_names: Names of the continuous columns.
            procs: Preprocessing transforms to apply.
            valid_col: Column marking the validation split.
            valid_pct: Fraction of rows held out for validation.
            seed: Random seed used for the split.

        Returns:
            A TabularDataLoaders instance.
        """

    @classmethod
    def from_df(cls, df, path='.', y_names=None, cat_names=None, cont_names=None,
                procs=None, valid_col=None, valid_pct=0.2, seed=None, **kwargs):
        """Build TabularDataLoaders from an in-memory pandas DataFrame."""
class TabularPandas:
    """Wraps a pandas DataFrame together with tabular preprocessing state."""

    def __init__(self, df, procs=None, cat_names=None, cont_names=None,
                 y_names=None, y_block=None, splits=None, do_setup=True,
                 device=None, inplace=False): ...

    def process(self):
        """Run every configured preprocessing transform."""

    def setup(self, train_setup=True):
        """Prepare the transforms for training (or inference) use."""

    @property
    def train(self):
        """The training subset."""

    @property
    def valid(self):
        """The validation subset."""

    def new(self, df):
        """Return a fresh TabularPandas that reuses this one's setup."""
class Tabular:
    """Core in-memory representation of tabular data."""

    def __init__(self, cats, conts, classes, names): ...
    def show(self, ctx=None): ...


# Preprocessing transforms for handling categorical and continuous variables.
class Categorify(TabularProc):
    """Map categorical variables onto contiguous integer codes."""

    def __init__(self, cat_names, add_na=False): ...
    def setup(self, to=None, train_setup=True, **kwargs): ...
    def process(self, to): ...
class FillMissing(TabularProc):
    """Impute missing values: median for continuous, mode for categorical."""

    def __init__(self, fill_strategy=FillStrategy.MEDIAN, add_col=True,
                 fill_vals=None): ...
    def setup(self, to=None, train_setup=True, **kwargs): ...
    def process(self, to): ...
class Normalize(TabularProc):
    """Scale continuous variables to zero mean and unit variance."""

    def __init__(self, cont_names): ...
    def setup(self, to=None, train_setup=True, **kwargs): ...
    def process(self, to): ...
def add_datepart(df, field_name, prefix=None, drop=True, time=False):
    """Expand a datetime column into calendar-based feature columns.

    Args:
        df: DataFrame to modify.
        field_name: Name of the datetime column to expand.
        prefix: Prefix for the generated column names.
        drop: Whether to drop the original column afterwards.
        time: Whether to also generate time-of-day features.

    Returns:
        The DataFrame with the added date features.
    """
def make_date(df, date_field):
    """Convert one or more columns to datetime dtype.

    Args:
        df: DataFrame to modify.
        date_field: Column name, or list of column names, to convert.

    Returns:
        The DataFrame with datetime columns.
    """


# Neural network architecture optimized for tabular data with embeddings and mixed data types.
class TabularModel(nn.Module):
    """Feed-forward network for mixed categorical/continuous inputs."""

    def __init__(self, emb_szs, n_cont, out_sz, layers, ps=None,
                 emb_drop=0.0, y_range=None, use_bn=True, bn_final=False,
                 bn_cont=True, act_cls=nn.ReLU(inplace=True), lin_first=False):
        """Configure the tabular model.

        Args:
            emb_szs: List of (vocab_size, embedding_size) pairs, one per
                categorical variable.
            n_cont: Number of continuous variables.
            out_sz: Number of outputs.
            layers: Sizes of the hidden layers.
            ps: Per-layer dropout probabilities.
            emb_drop: Dropout probability for the embeddings.
            y_range: Output range for regression targets.
            use_bn: Whether to use batch normalization.
            bn_final: Whether to batch-normalize the final layer.
            bn_cont: Whether to batch-normalize the continuous inputs.
            act_cls: Activation module used between layers.
                NOTE(review): the default is a single shared module
                instance created at class-definition time — confirm.
            lin_first: Whether the linear layer precedes the embeddings.
        """

    def forward(self, x_cat, x_cont=None): ...
def emb_sz_rule(n_cat):
    """Heuristic for choosing an embedding size.

    Args:
        n_cat: Number of categories in the variable.

    Returns:
        The recommended embedding size.
    """
def get_emb_sz(to, sz_dict=None):
    """Compute embedding sizes for every categorical variable.

    Args:
        to: TabularPandas object describing the data.
        sz_dict: Optional overrides mapping variable name to size.

    Returns:
        List of (vocab_size, embedding_size) tuples.
    """


# Transform blocks for different types of tabular data.
class TabularBlock(TransformBlock):
    """TransformBlock for tabular inputs."""

    def __init__(self, cat_names=None, cont_names=None, procs=None, y_block=None): ...
class CategoryBlock(TransformBlock):
    """TransformBlock for single-label categorical targets."""

    def __init__(self, vocab=None, sort=True, add_na=False): ...
class MultiCategoryBlock(TransformBlock):
    """TransformBlock for multi-label categorical targets."""

    def __init__(self, encoded=False, vocab=None, add_na=False): ...
class RegressionBlock(TransformBlock):
    """TransformBlock for continuous (regression) targets."""

    def __init__(self): ...


# Utility functions for working with tabular data and pandas DataFrames.
def cont_cat_split(df, max_card=20, dep_var=None):
    """Partition DataFrame columns into continuous and categorical sets.

    Args:
        df: DataFrame to analyze.
        max_card: Maximum cardinality for a column to count as categorical.
        dep_var: Dependent variable to exclude from both sets.

    Returns:
        Tuple of (continuous_names, categorical_names).
    """
def tabular_config(**kwargs):
    """Return the default configuration dict for tabular models,
    updated with any keyword overrides."""
class TabularLine:
    """Represents a single row of tabular data, with display helpers."""

    def __init__(self, cats, conts, classes, names): ...
    def show(self): ...

    def show_batch(self, max_n=10, ctxs=None, show=True, **kwargs):
        """Display a batch of tabular rows."""

    def predict(self, row):
        """Run a prediction for a single row."""


# Advanced feature engineering functions for tabular data.
class Discretize(TabularProc):
    """Bucket continuous variables into discrete bins."""

    def __init__(self, cont_names, n_bins=5): ...
def cyclic_dt_features(df, field_name, time=True, drop=True):
    """Add sin/cos (cyclic) encodings of a datetime column.

    Args:
        df: DataFrame to modify.
        field_name: Name of the datetime column.
        time: Whether to include time-of-day features.
        drop: Whether to drop the original column afterwards.

    Returns:
        The DataFrame with the cyclic datetime features added.
    """
def get_correlation_clusters(df, cluster_threshold=0.95):
    """Group features whose pairwise correlation exceeds a threshold.

    Args:
        df: DataFrame containing the features.
        cluster_threshold: Correlation level above which two features
            land in the same cluster.

    Returns:
        Dict mapping cluster ID to the list of feature names in it.
    """


# Install with Tessl CLI
npx tessl i tessl/pypi-fastai