# Tabular Data

Comprehensive tabular data processing and modeling, including preprocessing transforms, neural network architectures optimized for structured data, and utilities for working with pandas DataFrames.

## Capabilities

### Tabular Learner

Main entry point for creating tabular data learners with neural networks optimized for structured data.
```python { .api }
def tabular_learner(dls, layers=None, emb_szs=None, n_out=None, y_range=None,
                    use_bn=True, emb_drop=0.0, bn_final=False, bn_cont=True,
                    act_cls=nn.ReLU(inplace=True), lin_first=False, ps=None,
                    concat_pool=True, first_bn=True, bn_drop_out=False,
                    lin_drop_out=0.0, embed_p=0.0, **kwargs):
    """
    Create a tabular data learner.

    Parameters:
    - dls: TabularDataLoaders with preprocessed tabular data
    - layers: List of hidden layer sizes
    - emb_szs: Dictionary or list of embedding sizes for categorical variables
    - n_out: Number of outputs (auto-detected from the data if None)
    - y_range: Range of target values for regression
    - use_bn: Use batch normalization
    - emb_drop: Embedding dropout probability
    - bn_final: Apply batch norm to the final layer
    - bn_cont: Apply batch norm to continuous variables
    - act_cls: Activation function class
    - lin_first: Apply the linear layer before batch norm and dropout
    - ps: Dropout probabilities for hidden layers
    - concat_pool: Use concatenated pooling
    - first_bn: Apply batch norm to the first layer
    - bn_drop_out: Apply dropout after batch norm
    - lin_drop_out: Linear layer dropout
    - embed_p: Embedding layer dropout

    Returns:
    - Learner instance for tabular data
    """

class TabularLearner(Learner):
    """Learner specialized for tabular data."""

    def predict(self, row, with_input=False):
        """
        Make a prediction on a single row.

        Parameters:
        - row: Dictionary or pandas Series with input features
        - with_input: Return the processed input along with the prediction

        Returns:
        - Decoded prediction, prediction index, raw model outputs
        """

    def show_results(self, ds_idx=1, dl=None, max_n=10, **kwargs):
        """Show model predictions vs actual values."""
```
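To make the `predict` contract concrete, here is a minimal, framework-free sketch of the single-row prediction flow. The `OneRowPredictor` class, its feature names, and its fixed weights are purely illustrative toys, not part of the library:

```python
# Sketch of the TabularLearner.predict contract: given one row of features,
# return (decoded class, class index, raw model outputs). The "model" here
# is a toy linear scorer standing in for a trained network.

class OneRowPredictor:
    def __init__(self, classes, weights):
        self.classes = classes    # decoded labels, e.g. ['<50k', '>=50k']
        self.weights = weights    # one weight vector per class

    def predict(self, row, with_input=False):
        feats = [row['age'], row['hours_per_week']]
        # Raw outputs: one score per class.
        raw = [sum(w * f for w, f in zip(ws, feats)) for ws in self.weights]
        idx = max(range(len(raw)), key=raw.__getitem__)
        decoded = self.classes[idx]
        if with_input:
            return feats, decoded, idx, raw
        return decoded, idx, raw

learner = OneRowPredictor(['<50k', '>=50k'], [[0.1, 0.0], [0.0, 0.1]])
pred, idx, raw = learner.predict({'age': 25, 'hours_per_week': 40})
```

The real learner decodes through the DataLoaders' transforms rather than a class list, but the three-part return shape is the same.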
### Tabular Data Processing

Specialized data loaders and processing for structured/tabular datasets.
```python { .api }
class TabularDataLoaders(DataLoaders):
    """DataLoaders for tabular datasets."""

    @classmethod
    def from_csv(cls, path, csv_name='train.csv', header='infer', delimiter=None,
                 y_names=None, y_block=None, cat_names=None, cont_names=None,
                 procs=None, valid_col=None, valid_pct=0.2, seed=None, **kwargs):
        """
        Create TabularDataLoaders from a CSV file.

        Parameters:
        - path: Path to the data directory
        - csv_name: Name of the CSV file
        - header: CSV header handling
        - delimiter: CSV delimiter
        - y_names: Target column name(s)
        - y_block: Transform block for targets
        - cat_names: Categorical column names
        - cont_names: Continuous column names
        - procs: List of preprocessing transforms
        - valid_col: Column indicating the validation split
        - valid_pct: Fraction of the data used for validation
        - seed: Random seed for splitting

        Returns:
        - TabularDataLoaders instance
        """

    @classmethod
    def from_df(cls, df, path='.', y_names=None, cat_names=None, cont_names=None,
                procs=None, valid_col=None, valid_pct=0.2, seed=None, **kwargs):
        """Create from a pandas DataFrame."""

class TabularPandas:
    """Pandas DataFrame integration for tabular data."""

    def __init__(self, df, procs=None, cat_names=None, cont_names=None,
                 y_names=None, y_block=None, splits=None, do_setup=True,
                 device=None, inplace=False): ...

    def process(self):
        """Apply preprocessing transforms."""

    def setup(self, train_setup=True):
        """Set up transforms for training or inference."""

    @property
    def train(self):
        """Training subset."""

    @property
    def valid(self):
        """Validation subset."""

    def new(self, df):
        """Create a new TabularPandas with the same setup."""

class Tabular:
    """Core tabular data representation."""

    def __init__(self, cats, conts, classes, names): ...

    def show(self, ctx=None): ...
```
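The `valid_pct`/`seed` split semantics can be sketched without the library. This is a standalone sketch assuming a shuffled index split (the `random_split` helper name is illustrative):

```python
import random

def random_split(n_rows, valid_pct=0.2, seed=None):
    """Return (train_idxs, valid_idxs): a shuffled split of range(n_rows)."""
    idxs = list(range(n_rows))
    rng = random.Random(seed)   # a fixed seed makes the split reproducible
    rng.shuffle(idxs)
    n_valid = int(n_rows * valid_pct)
    return idxs[n_valid:], idxs[:n_valid]

train, valid = random_split(100, valid_pct=0.2, seed=42)
```

Passing `valid_col` instead selects rows by a boolean column rather than at random, which is the right choice when the validation set must be fixed (e.g. time-based splits).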
### Tabular Preprocessing

Preprocessing transforms for handling categorical and continuous variables.
```python { .api }
class Categorify(TabularProc):
    """Convert categorical variables to integer codes."""

    def __init__(self, cat_names, add_na=False): ...

    def setup(self, to=None, train_setup=True, **kwargs): ...

    def process(self, to): ...

class FillMissing(TabularProc):
    """Fill missing values with the median (continuous) or mode (categorical)."""

    def __init__(self, fill_strategy=FillStrategy.MEDIAN, add_col=True,
                 fill_vals=None): ...

    def setup(self, to=None, train_setup=True, **kwargs): ...

    def process(self, to): ...

class Normalize(TabularProc):
    """Normalize continuous variables to zero mean and unit variance."""

    def __init__(self, cont_names): ...

    def setup(self, to=None, train_setup=True, **kwargs): ...

    def process(self, to): ...

def add_datepart(df, field_name, prefix=None, drop=True, time=False):
    """
    Add date-based features derived from a datetime column.

    Parameters:
    - df: DataFrame to modify
    - field_name: Name of the datetime column
    - prefix: Prefix for the new columns
    - drop: Drop the original column
    - time: Include time-based features

    Returns:
    - DataFrame with added date features
    """

def make_date(df, date_field):
    """
    Convert columns to datetime.

    Parameters:
    - df: DataFrame to modify
    - date_field: Column name or list of column names

    Returns:
    - DataFrame with datetime columns
    """
```
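The three core procs have simple pandas equivalents. The following is a behavioral sketch in plain pandas, not the library implementation (the column names are illustrative):

```python
import pandas as pd

df = pd.DataFrame({'color': ['red', 'blue', 'red', None],
                   'size':  [1.0, None, 3.0, 5.0]})

# Categorify: map category labels to integer codes; missing values become -1.
codes = df['color'].astype('category').cat.codes

# FillMissing: fill continuous NaNs with the median, flagging filled rows
# (the flag column mirrors add_col=True).
df['size_na'] = df['size'].isna()
df['size'] = df['size'].fillna(df['size'].median())

# Normalize: zero mean, unit variance. In practice the statistics come from
# the training split only, then are reused for validation and inference.
norm = (df['size'] - df['size'].mean()) / df['size'].std()
```

The crucial detail the sketch glosses over is that each proc records its state (category vocabularies, medians, means/stds) at `setup` time, so the identical mapping is applied to new data at inference.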
### Tabular Model Architecture

Neural network architecture optimized for tabular data with embeddings and mixed data types.
```python { .api }
class TabularModel(nn.Module):
    """Neural network model for tabular data."""

    def __init__(self, emb_szs, n_cont, out_sz, layers, ps=None,
                 emb_drop=0.0, y_range=None, use_bn=True, bn_final=False,
                 bn_cont=True, act_cls=nn.ReLU(inplace=True), lin_first=False):
        """
        Initialize the tabular model.

        Parameters:
        - emb_szs: List of (vocab_size, embedding_size) pairs for categorical variables
        - n_cont: Number of continuous variables
        - out_sz: Number of outputs
        - layers: List of hidden layer sizes
        - ps: Dropout probabilities for the layers
        - emb_drop: Embedding dropout probability
        - y_range: Output range for regression
        - use_bn: Use batch normalization
        - bn_final: Batch norm on the final layer
        - bn_cont: Batch norm on continuous inputs
        - act_cls: Activation function
        - lin_first: Apply the linear layer before batch norm and dropout
        """

    def forward(self, x_cat, x_cont=None): ...

def emb_sz_rule(n_cat):
    """
    Rule of thumb for embedding sizes.

    Parameters:
    - n_cat: Number of categories

    Returns:
    - Recommended embedding size
    """

def get_emb_sz(to, sz_dict=None):
    """
    Get embedding sizes for the categorical variables.

    Parameters:
    - to: TabularPandas object
    - sz_dict: Custom size dictionary

    Returns:
    - List of (vocab_size, embedding_size) tuples
    """
```
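A widely used rule of thumb (the one fastai ships) grows the embedding width sublinearly with cardinality and caps it; whether this module's `emb_sz_rule` uses exactly these constants is an assumption, so treat the formula below as a sketch:

```python
def emb_sz_rule(n_cat):
    """Heuristic embedding width: sublinear in cardinality, capped at 600.
    Constants taken from fastai's published rule of thumb (assumed here)."""
    return min(600, round(1.6 * n_cat ** 0.56))

# Small vocabularies get tiny embeddings; huge ones saturate at the cap.
sizes = [emb_sz_rule(n) for n in (2, 100, 1_000_000)]
```

`get_emb_sz` applies this rule per categorical column, letting `sz_dict` override individual columns.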
### Tabular Transform Blocks

Transform blocks for different types of tabular data.
```python { .api }
class TabularBlock(TransformBlock):
    """Transform block for tabular data."""

    def __init__(self, cat_names=None, cont_names=None, procs=None, y_block=None): ...

class CategoryBlock(TransformBlock):
    """Transform block for categorical targets."""

    def __init__(self, vocab=None, sort=True, add_na=False): ...

class MultiCategoryBlock(TransformBlock):
    """Transform block for multi-label targets."""

    def __init__(self, encoded=False, vocab=None, add_na=False): ...

class RegressionBlock(TransformBlock):
    """Transform block for regression targets."""

    def __init__(self): ...
```
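`MultiCategoryBlock` with `encoded=True` expects targets already expressed as multi-hot vectors over a vocabulary. A minimal sketch of that encoding (the `multi_hot` helper is an illustrative name, not library API):

```python
def multi_hot(labels, vocab):
    """Encode a list of labels as a 0/1 vector over a fixed vocabulary."""
    present = set(labels)
    return [1 if v in present else 0 for v in vocab]

vocab = ['cat', 'dog', 'horse']
encoded = multi_hot(['dog', 'cat'], vocab)
```

`CategoryBlock`, by contrast, maps each target to a single integer index, and `RegressionBlock` passes floats through unchanged.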
### Tabular Utilities

Utility functions for working with tabular data and pandas DataFrames.
```python { .api }
def cont_cat_split(df, max_card=20, dep_var=None):
    """
    Split DataFrame columns into continuous and categorical.

    Parameters:
    - df: DataFrame to analyze
    - max_card: Maximum cardinality for a column to count as categorical
    - dep_var: Dependent variable to exclude

    Returns:
    - (continuous_names, categorical_names)
    """

def tabular_config(**kwargs):
    """Get the default configuration for tabular models."""

class TabularLine:
    """Single-row representation for tabular data."""

    def __init__(self, cats, conts, classes, names): ...
    def show(self): ...

    def show_batch(self, max_n=10, ctxs=None, show=True, **kwargs):
        """Display a batch of tabular data."""

    def predict(self, row):
        """Make a prediction on a single row."""
```
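The `cont_cat_split` heuristic is easy to sketch in plain pandas: numeric columns with more than `max_card` distinct values are treated as continuous, everything else as categorical. This is a sketch of the documented behavior, not the library source:

```python
import pandas as pd

def cont_cat_split(df, max_card=20, dep_var=None):
    """Split columns into (continuous, categorical) by dtype and cardinality."""
    cont, cat = [], []
    for col in df.columns:
        if col == dep_var:          # never include the target itself
            continue
        is_num = pd.api.types.is_numeric_dtype(df[col])
        if is_num and df[col].nunique() > max_card:
            cont.append(col)        # high-cardinality numeric -> continuous
        else:
            cat.append(col)         # low-cardinality or non-numeric -> categorical
    return cont, cat

df = pd.DataFrame({'age': range(100), 'grade': [1, 2] * 50,
                   'city': ['a', 'b'] * 50, 'target': range(100)})
cont, cat = cont_cat_split(df, max_card=20, dep_var='target')
```

Note that `max_card` deliberately routes low-cardinality integer columns (like `grade`) into the categorical list, where they benefit from embeddings.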
### Feature Engineering

Advanced feature engineering functions for tabular data.
```python { .api }
class Discretize(TabularProc):
    """Discretize continuous variables into bins."""

    def __init__(self, cont_names, n_bins=5): ...

def cyclic_dt_features(df, field_name, time=True, drop=True):
    """
    Create cyclic features from a datetime column (sin/cos encoding).

    Parameters:
    - df: DataFrame to modify
    - field_name: Datetime column name
    - time: Include time features
    - drop: Drop the original column

    Returns:
    - DataFrame with cyclic datetime features
    """

def get_correlation_clusters(df, cluster_threshold=0.95):
    """
    Find clusters of highly correlated features.

    Parameters:
    - df: DataFrame with features
    - cluster_threshold: Correlation threshold for clustering

    Returns:
    - Dictionary mapping cluster ID to a list of features
    """
```
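The sin/cos encoding behind `cyclic_dt_features` maps a cyclic quantity onto the unit circle, so period boundaries (December to January, 23:00 to 00:00) stay adjacent in feature space instead of being maximally far apart. A minimal sketch of the idea (the `cyclic_encode` helper is illustrative):

```python
import math

def cyclic_encode(value, period):
    """Map a cyclic quantity (month, hour, weekday) onto the unit circle."""
    angle = 2 * math.pi * value / period
    return math.sin(angle), math.cos(angle)

dec = cyclic_encode(12, 12)   # December
jan = cyclic_encode(1, 12)    # January
jun = cyclic_encode(6, 12)    # June
```

With a raw month number, December (12) and January (1) differ by 11; on the circle they are neighbors, while June stays on the opposite side, which is what a model should see.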