or run

npx @tessl/cli init
Log in

Version

Tile

Overview

Evals

Files

Files

docs

core.md · features.md · index.md · multimodal.md · tabular.md · timeseries.md

features.mddocs/

0

# Feature Engineering

1

2

This module provides comprehensive feature generation and transformation capabilities for automated feature engineering across different data types. AutoGluon's feature engineering system offers modular, composable feature generators that handle text, categorical, numerical, and datetime data through intelligent preprocessing pipelines.

3

4

## Capabilities

5

6

### AutoML Pipeline Feature Generators

7

8

High-level feature generation pipelines that automatically select and configure appropriate feature transformations.

9

10

```python { .api }

11

class AutoMLPipelineFeatureGenerator:

12

def __init__(

13

self,

14

enable_numeric_features: bool = True,

15

enable_categorical_features: bool = True,

16

enable_datetime_features: bool = True,

17

enable_text_special_features: bool = True,

18

enable_text_ngram_features: bool = True,

19

enable_raw_text_features: bool = False,

20

enable_vision_features: bool = True,

21

**kwargs

22

):

23

"""

24

Initialize automated feature generation pipeline.

25

26

Parameters:

27

- enable_numeric_features: Generate numerical feature transformations

28

- enable_categorical_features: Generate categorical encodings

29

- enable_datetime_features: Generate datetime-based features

30

- enable_text_special_features: Generate text special character features

31

- enable_text_ngram_features: Generate text n-gram features

32

- enable_raw_text_features: Keep raw text features

33

- enable_vision_features: Generate image-based features

34

"""

35

36

def fit_transform(self, X, y=None, **kwargs):

37

"""

38

Fit feature generators and transform input data.

39

40

Parameters:

41

- X: Input DataFrame with raw features

42

- y: Target values (optional)

43

44

Returns:

45

Transformed DataFrame with engineered features

46

"""

47

48

def transform(self, X, **kwargs):

49

"""

50

Transform input data using fitted feature generators.

51

52

Parameters:

53

- X: Input DataFrame to transform

54

55

Returns:

56

Transformed DataFrame with engineered features

57

"""

58

59

class AutoMLInterpretablePipelineFeatureGenerator:

60

def __init__(self, **kwargs):

61

"""

62

Initialize interpretable feature generation pipeline.

63

64

Similar to AutoMLPipelineFeatureGenerator but focuses on

65

interpretable transformations suitable for model explanation.

66

"""

67

```

68

69

### Core Feature Generators

70

71

Base classes and fundamental feature transformation components.

72

73

```python { .api }

74

class AbstractFeatureGenerator:

75

def __init__(self, **kwargs):

76

"""Base class for all feature generators."""

77

78

def fit_transform(self, X, y=None, **kwargs):

79

"""Fit generator and transform data in one step."""

80

81

def fit(self, X, y=None, **kwargs):

82

"""Fit feature generator to training data."""

83

84

def transform(self, X, **kwargs):

85

"""Transform data using fitted generator."""

86

87

class PipelineFeatureGenerator(AbstractFeatureGenerator):

88

def __init__(self, generators: list, **kwargs):

89

"""

90

Chain multiple feature generators in sequence.

91

92

Parameters:

93

- generators: List of feature generator instances

94

"""

95

96

class BulkFeatureGenerator(AbstractFeatureGenerator):

97

def __init__(self, generators: list, **kwargs):

98

"""

99

Apply multiple feature generators in parallel.

100

101

Parameters:

102

- generators: List of feature generator instances

103

"""

104

```

105

106

### Categorical Feature Processing

107

108

Feature generators for categorical data encoding and transformation.

109

110

```python { .api }

111

class CategoryFeatureGenerator(AbstractFeatureGenerator):

112

def __init__(

113

self,

114

cat_order: str = 'count',

115

maximum_num_cat: int = 10000,

116

verbosity: int = 0,

117

**kwargs

118

):

119

"""

120

Generate categorical features with label encoding.

121

122

Parameters:

123

- cat_order: Category ordering method ('count', 'alphabetic')

124

- maximum_num_cat: Maximum number of categories to process

125

- verbosity: Logging verbosity level

126

"""

127

128

class OneHotEncoderFeatureGenerator(AbstractFeatureGenerator):

129

def __init__(

130

self,

131

maximum_num_cat: int = 10,

132

minimum_cat_count: int = 30,

133

**kwargs

134

):

135

"""

136

Generate one-hot encoded features for categorical data.

137

138

Parameters:

139

- maximum_num_cat: Maximum categories for one-hot encoding

140

- minimum_cat_count: Minimum category frequency for inclusion

141

"""

142

143

class LabelEncoderFeatureGenerator(AbstractFeatureGenerator):

144

def __init__(self, verbosity: int = 0, **kwargs):

145

"""

146

Generate label encoded features for categorical data.

147

148

Parameters:

149

- verbosity: Logging verbosity level

150

"""

151

```

152

153

### Numerical Feature Processing

154

155

Feature generators for numerical data transformation and binning.

156

157

```python { .api }

158

class BinnedFeatureGenerator(AbstractFeatureGenerator):

159

def __init__(

160

self,

161

num_bins: int = 10,

162

quantile_bin: bool = True,

163

**kwargs

164

):

165

"""

166

Generate binned features from numerical data.

167

168

Parameters:

169

- num_bins: Number of bins to create

170

- quantile_bin: Use quantile-based binning

171

"""

172

173

class NumericMemoryMinimizeFeatureGenerator(AbstractFeatureGenerator):

174

def __init__(self, **kwargs):

175

"""

176

Minimize memory usage of numerical features through dtype optimization.

177

"""

178

179

class CategoryMemoryMinimizeFeatureGenerator(AbstractFeatureGenerator):

180

def __init__(self, **kwargs):

181

"""

182

Minimize memory usage of categorical features through dtype optimization.

183

"""

184

```

185

186

### Text Feature Processing

187

188

Feature generators specialized for text data processing and transformation.

189

190

```python { .api }

191

class TextNgramFeatureGenerator(AbstractFeatureGenerator):

192

def __init__(

193

self,

194

vectorizer_strategy: str = 'tf-idf',

195

max_features: int = 10000,

196

ngram_range: tuple = (1, 3),

197

**kwargs

198

):

199

"""

200

Generate n-gram features from text data.

201

202

Parameters:

203

- vectorizer_strategy: Vectorization method ('tf-idf', 'count')

204

- max_features: Maximum number of features to generate

205

- ngram_range: Range of n-gram sizes (min_n, max_n)

206

"""

207

208

class TextSpecialFeatureGenerator(AbstractFeatureGenerator):

209

def __init__(self, **kwargs):

210

"""

211

Generate special character and text statistics features.

212

213

Creates features like text length, number of words,

214

special character counts, etc.

215

"""

216

```

217

218

### Datetime Feature Processing

219

220

Feature generators for datetime and temporal data transformation.

221

222

```python { .api }

223

class DatetimeFeatureGenerator(AbstractFeatureGenerator):

224

def __init__(

225

self,

226

features_to_extract: list = None,

227

**kwargs

228

):

229

"""

230

Generate datetime-based features from timestamp columns.

231

232

Parameters:

233

- features_to_extract: List of datetime features to extract

234

Options: ['year', 'month', 'day', 'dayofweek', 'hour', 'minute', 'second']

235

236

Generates features like:

237

- Year, month, day components

238

- Day of week, hour of day

239

- Is weekend, is business hour

240

- Cyclical encodings for periodic features

241

"""

242

```

243

244

### Data Cleaning and Preprocessing

245

246

Feature generators for data cleaning and basic preprocessing operations.

247

248

```python { .api }

249

class FillNaFeatureGenerator(AbstractFeatureGenerator):

250

def __init__(

251

self,

252

inplace: bool = True,

253

fillna_map: dict = None,

254

**kwargs

255

):

256

"""

257

Handle missing values through various imputation strategies.

258

259

Parameters:

260

- inplace: Modify features in place

261

- fillna_map: Custom fill values for specific columns

262

"""

263

264

class DropUniqueFeatureGenerator(AbstractFeatureGenerator):

265

def __init__(self, **kwargs):

266

"""

267

Remove features with only one unique value (constant features).

268

"""

269

270

class DropDuplicatesFeatureGenerator(AbstractFeatureGenerator):

271

def __init__(self, **kwargs):

272

"""

273

Remove duplicate features (identical columns).

274

"""

275

276

class IsNanFeatureGenerator(AbstractFeatureGenerator):

277

def __init__(self, **kwargs):

278

"""

279

Generate binary indicator features for missing values.

280

"""

281

```

282

283

### Utility Feature Generators

284

285

Helper feature generators for type conversion and feature management.

286

287

```python { .api }

288

class AsTypeFeatureGenerator(AbstractFeatureGenerator):

289

def __init__(

290

self,

291

convert_map: dict,

292

**kwargs

293

):

294

"""

295

Convert feature data types.

296

297

Parameters:

298

- convert_map: Dictionary mapping column names to target dtypes

299

"""

300

301

class IdentityFeatureGenerator(AbstractFeatureGenerator):

302

def __init__(self, **kwargs):

303

"""

304

Pass-through generator that returns features unchanged.

305

"""

306

307

class RenameFeatureGenerator(AbstractFeatureGenerator):

308

def __init__(

309

self,

310

rename_map: dict,

311

**kwargs

312

):

313

"""

314

Rename features according to mapping.

315

316

Parameters:

317

- rename_map: Dictionary mapping old names to new names

318

"""

319

320

class DummyFeatureGenerator(AbstractFeatureGenerator):

321

def __init__(self, **kwargs):

322

"""

323

Placeholder generator for testing and debugging.

324

"""

325

```

326

327

## Usage Examples

328

329

### Basic Feature Engineering Pipeline

330

331

```python

332

from autogluon.features import AutoMLPipelineFeatureGenerator

333

import pandas as pd

334

335

# Sample dataset with mixed data types

336

df = pd.DataFrame({

337

'numerical_col': [1.5, 2.3, 3.1, 4.7],

338

'categorical_col': ['A', 'B', 'A', 'C'],

339

'text_col': ['hello world', 'goodbye moon', 'hello again', 'farewell sun'],

340

'datetime_col': pd.date_range('2023-01-01', periods=4, freq='D'),

341

'target': [0, 1, 0, 1]

342

})

343

344

# Initialize automated feature generator

345

feature_generator = AutoMLPipelineFeatureGenerator(

346

enable_text_ngram_features=True,

347

enable_datetime_features=True,

348

enable_categorical_features=True

349

)

350

351

# Fit and transform features

352

X = df.drop('target', axis=1)

353

y = df['target']

354

355

X_transformed = feature_generator.fit_transform(X, y)

356

print(f"Original features: {X.shape[1]}")

357

print(f"Engineered features: {X_transformed.shape[1]}")

358

print(f"New columns: {list(X_transformed.columns)}")

359

360

# Transform new data (`new_data` is a placeholder: a DataFrame with the same columns as X)

361

X_new_transformed = feature_generator.transform(new_data)

362

```

363

364

### Custom Feature Engineering Pipeline

365

366

```python

367

from autogluon.features import (

368

PipelineFeatureGenerator,

369

DatetimeFeatureGenerator,

370

CategoryFeatureGenerator,

371

TextNgramFeatureGenerator,

372

FillNaFeatureGenerator

373

)

374

375

# Build custom pipeline

376

custom_pipeline = PipelineFeatureGenerator([

377

FillNaFeatureGenerator(), # Handle missing values first

378

DatetimeFeatureGenerator(

379

features_to_extract=['year', 'month', 'dayofweek', 'hour']

380

),

381

CategoryFeatureGenerator(maximum_num_cat=1000),

382

TextNgramFeatureGenerator(

383

max_features=5000,

384

ngram_range=(1, 2),

385

vectorizer_strategy='tf-idf'

386

)

387

])

388

389

# Apply custom pipeline (`raw_data` and `target_data` are placeholders for your feature DataFrame and target values)

390

X_custom = custom_pipeline.fit_transform(raw_data, target_data)

391

```

392

393

### Specialized Text Processing

394

395

```python

396

from autogluon.features import TextSpecialFeatureGenerator, TextNgramFeatureGenerator

397

from autogluon.features import BulkFeatureGenerator

398

399

# Combine multiple text feature generators

400

text_features = BulkFeatureGenerator([

401

TextSpecialFeatureGenerator(), # Text statistics

402

TextNgramFeatureGenerator(

403

ngram_range=(1, 3),

404

max_features=10000,

405

vectorizer_strategy='tf-idf'

406

)

407

])

408

409

# Process text data (requires `import pandas as pd`)

410

text_df = pd.DataFrame({

411

'review_text': ['Great product!', 'Not bad', 'Excellent quality', 'Poor service'],

412

'description': ['Short desc', 'Longer description here', 'Brief', 'Detailed info']

413

})

414

415

text_features_generated = text_features.fit_transform(text_df)

416

print(f"Generated {text_features_generated.shape[1]} text features")

417

```

418

419

### Memory-Optimized Feature Processing

420

421

```python

422

from autogluon.features import (

423

AutoMLPipelineFeatureGenerator,

424

NumericMemoryMinimizeFeatureGenerator,

425

CategoryMemoryMinimizeFeatureGenerator,

426

PipelineFeatureGenerator

427

)

428

429

# Memory-optimized pipeline for large datasets

430

memory_optimized = PipelineFeatureGenerator([

431

AutoMLPipelineFeatureGenerator(),

432

NumericMemoryMinimizeFeatureGenerator(),

433

CategoryMemoryMinimizeFeatureGenerator()

434

])

435

436

# Process large dataset with memory optimization (`large_dataset` is a placeholder for your input DataFrame)

437

large_data_processed = memory_optimized.fit_transform(large_dataset)

438

print(f"Memory usage reduced by dtype optimization")

439

```