# Tabular Data

Comprehensive tabular data processing and modeling, including preprocessing transforms, neural network architectures optimized for structured data, and utilities for working with pandas DataFrames.

## Capabilities

### Tabular Learner

Main entry point for creating tabular data learners with neural networks optimized for structured data.

```python { .api }
def tabular_learner(dls, layers=None, emb_szs=None, n_out=None, y_range=None,
                    use_bn=True, emb_drop=0.0, bn_final=False, bn_cont=True,
                    act_cls=nn.ReLU(inplace=True), lin_first=False, ps=None,
                    concat_pool=True, first_bn=True, bn_drop_out=False,
                    lin_drop_out=0.0, embed_p=0.0, **kwargs):
    """
    Create a tabular data learner.

    Parameters:
    - dls: TabularDataLoaders with preprocessed tabular data
    - layers: List of hidden layer sizes
    - emb_szs: Dictionary or list of embedding sizes for categorical variables
    - n_out: Number of outputs (auto-detected from data if None)
    - y_range: Range of target values for regression
    - use_bn: Use batch normalization
    - emb_drop: Embedding dropout probability
    - bn_final: Apply batch norm to the final layer
    - bn_cont: Apply batch norm to continuous variables
    - act_cls: Activation function class
    - lin_first: Place the linear layer before batch norm and dropout in each block
    - ps: Dropout probabilities for hidden layers
    - concat_pool: Use concatenated pooling
    - first_bn: Apply batch norm to the first layer
    - bn_drop_out: Apply dropout after batch norm
    - lin_drop_out: Linear layer dropout
    - embed_p: Embedding layer dropout

    Returns:
    - Learner instance for tabular data
    """

class TabularLearner(Learner):
    """Learner specialized for tabular data."""

    def predict(self, row, with_input=False):
        """
        Make a prediction on a single row.

        Parameters:
        - row: Dictionary or pandas Series with input features
        - with_input: Return the processed input along with the prediction

        Returns:
        - Prediction class, prediction index, raw outputs
        """

    def show_results(self, ds_idx=1, dl=None, max_n=10, **kwargs):
        """Show model predictions vs actual values."""
```
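
A minimal usage sketch, not part of the API above: it assumes an adult-census-style CSV with a `salary` target; the file name and column names are illustrative only.

```python
from fastai.tabular.all import *
import pandas as pd

# Hypothetical dataset: any DataFrame with categorical/continuous columns works.
df = pd.read_csv('adult.csv')
dls = TabularDataLoaders.from_df(
    df, y_names='salary',
    cat_names=['workclass', 'education', 'occupation'],
    cont_names=['age', 'hours-per-week'],
    procs=[Categorify, FillMissing, Normalize])

learn = tabular_learner(dls, layers=[200, 100], metrics=accuracy)
learn.fit_one_cycle(3)

# Single-row inference: predict() accepts a dict or a pandas Series.
row, clas, probs = learn.predict(df.iloc[0])
```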

### Tabular Data Processing

Specialized data loaders and processing for structured/tabular datasets.

```python { .api }
class TabularDataLoaders(DataLoaders):
    """DataLoaders for tabular datasets."""

    @classmethod
    def from_csv(cls, path, csv_name='train.csv', header='infer', delimiter=None,
                 y_names=None, y_block=None, cat_names=None, cont_names=None,
                 procs=None, valid_col=None, valid_pct=0.2, seed=None, **kwargs):
        """
        Create TabularDataLoaders from a CSV file.

        Parameters:
        - path: Path to data directory
        - csv_name: Name of CSV file
        - header: CSV header handling
        - delimiter: CSV delimiter
        - y_names: Target column name(s)
        - y_block: Transform block for targets
        - cat_names: Categorical column names
        - cont_names: Continuous column names
        - procs: List of preprocessing transforms
        - valid_col: Column indicating validation split
        - valid_pct: Validation percentage
        - seed: Random seed for splitting

        Returns:
        - TabularDataLoaders instance
        """

    @classmethod
    def from_df(cls, df, path='.', y_names=None, cat_names=None, cont_names=None,
                procs=None, valid_col=None, valid_pct=0.2, seed=None, **kwargs):
        """Create from a pandas DataFrame."""

class TabularPandas:
    """Pandas DataFrame integration for tabular data."""

    def __init__(self, df, procs=None, cat_names=None, cont_names=None,
                 y_names=None, y_block=None, splits=None, do_setup=True,
                 device=None, inplace=False): ...

    def process(self):
        """Apply preprocessing transforms."""

    def setup(self, train_setup=True):
        """Set up transforms for training or inference."""

    @property
    def train(self):
        """Training subset."""

    @property
    def valid(self):
        """Validation subset."""

    def new(self, df):
        """Create a new TabularPandas with the same setup."""

class Tabular:
    """Core tabular data representation."""

    def __init__(self, cats, conts, classes, names): ...

    def show(self, ctx=None): ...
```
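
For illustration, a hedged sketch of the lower-level `TabularPandas` route; the file name, column names, and split seed are all hypothetical.

```python
from fastai.tabular.all import *

df = pd.read_csv('train.csv')  # hypothetical file
splits = RandomSplitter(valid_pct=0.2, seed=42)(range_of(df))
to = TabularPandas(df, procs=[Categorify, FillMissing, Normalize],
                   cat_names=['color'], cont_names=['size'],
                   y_names='price', splits=splits)
dls = to.dataloaders(bs=64)  # a TabularPandas feeds straight into DataLoaders
```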

### Tabular Preprocessing

Preprocessing transforms for handling categorical and continuous variables.

```python { .api }
class Categorify(TabularProc):
    """Convert categorical variables to integer codes."""

    def __init__(self, cat_names, add_na=False): ...

    def setup(self, to=None, train_setup=True, **kwargs): ...

    def process(self, to): ...

class FillMissing(TabularProc):
    """Fill missing values with median (continuous) or mode (categorical)."""

    def __init__(self, fill_strategy=FillStrategy.MEDIAN, add_col=True,
                 fill_vals=None): ...

    def setup(self, to=None, train_setup=True, **kwargs): ...

    def process(self, to): ...

class Normalize(TabularProc):
    """Normalize continuous variables to zero mean and unit variance."""

    def __init__(self, cont_names): ...

    def setup(self, to=None, train_setup=True, **kwargs): ...

    def process(self, to): ...

def add_datepart(df, field_name, prefix=None, drop=True, time=False):
    """
    Add date-based features from a datetime column.

    Parameters:
    - df: DataFrame to modify
    - field_name: Name of datetime column
    - prefix: Prefix for new columns
    - drop: Drop original column
    - time: Include time-based features

    Returns:
    - DataFrame with added date features
    """

def make_date(df, date_field):
    """
    Convert columns to datetime.

    Parameters:
    - df: DataFrame to modify
    - date_field: Column name or list of column names

    Returns:
    - DataFrame with datetime columns
    """
```
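
A quick sketch of the datetime helpers, with an invented column name; `add_datepart` expands one datetime column into calendar features such as year, month, and day of week.

```python
import pandas as pd
from fastai.tabular.all import make_date, add_datepart

df = pd.DataFrame({'sold_on': ['2023-01-15', '2023-06-30'], 'units': [3, 7]})
make_date(df, 'sold_on')          # ensure the column has a datetime dtype
df = add_datepart(df, 'sold_on')  # adds e.g. sold_onYear, sold_onMonth, ...
```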

### Tabular Model Architecture

Neural network architecture optimized for tabular data with embeddings and mixed data types.

```python { .api }
class TabularModel(nn.Module):
    """Neural network model for tabular data."""

    def __init__(self, emb_szs, n_cont, out_sz, layers, ps=None,
                 emb_drop=0.0, y_range=None, use_bn=True, bn_final=False,
                 bn_cont=True, act_cls=nn.ReLU(inplace=True), lin_first=False):
        """
        Initialize tabular model.

        Parameters:
        - emb_szs: List of (vocab_size, embedding_size) for categorical vars
        - n_cont: Number of continuous variables
        - out_sz: Number of outputs
        - layers: List of hidden layer sizes
        - ps: Dropout probabilities for layers
        - emb_drop: Embedding dropout probability
        - y_range: Output range for regression
        - use_bn: Use batch normalization
        - bn_final: Batch norm on final layer
        - bn_cont: Batch norm on continuous inputs
        - act_cls: Activation function
        - lin_first: Linear layer before batch norm and dropout
        """

    def forward(self, x_cat, x_cont=None): ...

def emb_sz_rule(n_cat):
    """
    Rule of thumb for embedding sizes.

    Parameters:
    - n_cat: Number of categories

    Returns:
    - Recommended embedding size
    """

def get_emb_sz(to, sz_dict=None):
    """
    Get embedding sizes for categorical variables.

    Parameters:
    - to: TabularPandas object
    - sz_dict: Custom size dictionary

    Returns:
    - List of (vocab_size, embedding_size) tuples
    """
```
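
To connect the pieces: `get_emb_sz` reads the categorical vocab sizes off a `TabularPandas` object so the model can be built by hand. A hedged sketch (the `to` object is assumed from the earlier example; fastai's published rule of thumb for `emb_sz_rule` is `min(600, round(1.6 * n_cat**0.56))`):

```python
from fastai.tabular.all import *

emb_szs = get_emb_sz(to)  # e.g. [(8, 5), (42, 13)]: one tuple per categorical column
model = TabularModel(emb_szs, n_cont=len(to.cont_names),
                     out_sz=1, layers=[200, 100],
                     y_range=(0, 1000))  # clamp regression outputs (illustrative)
```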

### Tabular Transform Blocks

Transform blocks for different types of tabular data.

```python { .api }
class TabularBlock(TransformBlock):
    """Transform block for tabular data."""

    def __init__(self, cat_names=None, cont_names=None, procs=None, y_block=None): ...

class CategoryBlock(TransformBlock):
    """Transform block for categorical targets."""

    def __init__(self, vocab=None, sort=True, add_na=False): ...

class MultiCategoryBlock(TransformBlock):
    """Transform block for multi-label targets."""

    def __init__(self, encoded=False, vocab=None, add_na=False): ...

class RegressionBlock(TransformBlock):
    """Transform block for regression targets."""

    def __init__(self): ...
```
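
The target blocks plug into the loader constructors via `y_block`, overriding automatic target-type inference. A small sketch with invented file and column names:

```python
from fastai.tabular.all import *

# Force a regression target even if the 'price' column looks categorical.
dls = TabularDataLoaders.from_csv(
    '.', csv_name='houses.csv', y_names='price',
    cat_names=['zipcode'], cont_names=['area'],
    procs=[Categorify, Normalize], y_block=RegressionBlock())
```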

### Tabular Utilities

Utility functions for working with tabular data and pandas DataFrames.

```python { .api }
def cont_cat_split(df, max_card=20, dep_var=None):
    """
    Split DataFrame columns into continuous and categorical.

    Parameters:
    - df: DataFrame to analyze
    - max_card: Maximum cardinality for categorical
    - dep_var: Dependent variable to exclude

    Returns:
    - (continuous_names, categorical_names)
    """

def tabular_config(**kwargs):
    """Get default configuration for tabular models."""

class TabularLine:
    """Single row representation for tabular data."""

    def __init__(self, cats, conts, classes, names): ...

    def show(self): ...

    def show_batch(self, max_n=10, ctxs=None, show=True, **kwargs):
        """Display batch of tabular data."""

    def predict(self, row):
        """Make prediction on single row."""
```
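
`cont_cat_split` is handy before constructing loaders; note that it returns the continuous names first. A hedged example with an invented DataFrame and target name:

```python
import pandas as pd
from fastai.tabular.all import cont_cat_split

df = pd.DataFrame({'age': [23.5, 54.0, 31.2], 'city': ['NYC', 'LA', 'NYC'],
                   'target': [0, 1, 0]})
cont_names, cat_names = cont_cat_split(df, max_card=20, dep_var='target')
# cont_names -> ['age'], cat_names -> ['city'] (the target is excluded)
```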

### Feature Engineering

Advanced feature engineering functions for tabular data.

```python { .api }
class Discretize(TabularProc):
    """Discretize continuous variables into bins."""

    def __init__(self, cont_names, n_bins=5): ...

def cyclic_dt_features(df, field_name, time=True, drop=True):
    """
    Create cyclic features from datetime (sin/cos encoding).

    Parameters:
    - df: DataFrame to modify
    - field_name: Datetime column name
    - time: Include time features
    - drop: Drop original column

    Returns:
    - DataFrame with cyclic datetime features
    """

def get_correlation_clusters(df, cluster_threshold=0.95):
    """
    Find clusters of highly correlated features.

    Parameters:
    - df: DataFrame with features
    - cluster_threshold: Correlation threshold for clustering

    Returns:
    - Dictionary mapping cluster ID to feature list
    """
```
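
To make the sin/cos idea concrete, here is a hand-rolled sketch of cyclic encoding in plain pandas/numpy (not the library call itself): mapping month-of-year onto the unit circle keeps December and January adjacent, which a raw 1-12 integer does not.

```python
import numpy as np
import pandas as pd

df = pd.DataFrame({'when': pd.to_datetime(['2023-01-05', '2023-12-28'])})
month = df['when'].dt.month  # 1..12, but 12 and 1 are really neighbors
df['month_sin'] = np.sin(2 * np.pi * month / 12)
df['month_cos'] = np.cos(2 * np.pi * month / 12)
```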