0
# Feature Creation
1
2
Transformers for generating new features through mathematical combinations, cyclical transformations, and reference feature combinations to enrich the dataset and improve model performance.
3
4
## Capabilities
5
6
### Mathematical Combination
7
8
Applies basic mathematical operations to multiple features, returning additional features.
9
10
```python { .api }
11
class MathematicalCombination:
12
def __init__(self, variables_to_combine, math_operations=None, new_variables_names=None,
13
missing_values='raise', drop_original=False):
14
"""
15
Initialize MathematicalCombination.
16
17
Parameters:
18
- variables_to_combine (list): List of numerical variables to combine mathematically
19
- math_operations (list): Operations to perform - 'sum', 'prod', 'mean', 'std', 'max', 'min'
20
- new_variables_names (list): Names for new variables. If None, auto-generated
21
- missing_values (str): How to handle missing values - 'raise' or 'ignore'
22
- drop_original (bool): Whether to drop original variables after combination
23
"""
24
25
def fit(self, X, y=None):
26
"""
27
Validate input and create operation dictionary.
28
29
Parameters:
30
- X (pandas.DataFrame): Training dataset
31
- y (pandas.Series, optional): Target variable (not used)
32
33
Returns:
34
- self
35
"""
36
37
def transform(self, X):
38
"""
39
Combine variables with mathematical operations and add new features.
40
41
Parameters:
42
- X (pandas.DataFrame): Dataset to transform
43
44
Returns:
45
- pandas.DataFrame: Dataset with additional combined features
46
"""
47
48
def fit_transform(self, X, y=None):
49
"""Fit to data, then transform it."""
50
```
51
52
**Usage Example**:
53
```python
54
from feature_engine.creation import MathematicalCombination
55
import pandas as pd
56
57
# Sample numerical data
58
data = {
59
'height': [170, 175, 180, 165, 190],
60
'weight': [70, 80, 85, 60, 95],
61
'age': [25, 30, 35, 22, 45]
62
}
63
df = pd.DataFrame(data)
64
65
# Create combinations of height and weight
66
combiner = MathematicalCombination(
67
variables_to_combine=['height', 'weight'],
68
math_operations=['sum', 'mean', 'prod'],
69
new_variables_names=['height_weight_sum', 'height_weight_mean', 'height_weight_prod']
70
)
71
df_combined = combiner.fit_transform(df)
72
73
# Auto-generate variable names
74
combiner = MathematicalCombination(
75
variables_to_combine=['height', 'weight', 'age'],
76
math_operations=['mean', 'std', 'max', 'min']
77
)
78
df_combined = combiner.fit_transform(df)
79
# Creates: mean(height-weight-age), std(height-weight-age), etc.
80
81
# Access operation mappings
82
print(combiner.combination_dict_) # Maps each new variable name to the operation that produces it
83
```
84
85
### Combine with Reference Feature
86
87
Combines multiple features with a reference feature using mathematical operations.
88
89
```python { .api }
90
class CombineWithReferenceFeature:
91
def __init__(self, variables_to_combine, reference_variables, operations_list,
92
new_variables_names=None, missing_values='raise', drop_original=False):
93
"""
94
Initialize CombineWithReferenceFeature.
95
96
Parameters:
97
- variables_to_combine (list): List of variables to combine with reference
98
- reference_variables (list): List of reference variables for combination
99
- operations_list (list): Mathematical operations - 'sub', 'div', 'add', 'mul'
100
- new_variables_names (list): Names for new variables. If None, auto-generated
101
- missing_values (str): How to handle missing values - 'raise' or 'ignore'
102
- drop_original (bool): Whether to drop original variables
103
"""
104
105
def fit(self, X, y=None):
106
"""
107
Validate input variables and operations.
108
109
Parameters:
110
- X (pandas.DataFrame): Training dataset
111
- y (pandas.Series, optional): Target variable (not used)
112
113
Returns:
114
- self
115
"""
116
117
def transform(self, X):
118
"""
119
Combine variables with reference features using specified operations.
120
121
Parameters:
122
- X (pandas.DataFrame): Dataset to transform
123
124
Returns:
125
- pandas.DataFrame: Dataset with additional combined features
126
"""
127
128
def fit_transform(self, X, y=None):
129
"""Fit to data, then transform it."""
130
```
131
132
**Usage Example**:
133
```python
134
from feature_engine.creation import CombineWithReferenceFeature
135
136
# Combine features with reference features
137
combiner = CombineWithReferenceFeature(
138
variables_to_combine=['height', 'weight'],
139
reference_variables=['age'],
140
operations_list=['div', 'mul'],
141
new_variables_names=['height_per_age', 'weight_per_age', 'height_times_age', 'weight_times_age']
142
)
143
df_combined = combiner.fit_transform(df)
144
145
# Multiple reference variables
146
combiner = CombineWithReferenceFeature(
147
variables_to_combine=['height'],
148
reference_variables=['weight', 'age'],
149
operations_list=['div', 'sub']
150
)
151
df_combined = combiner.fit_transform(df)
152
# Creates: height_div_weight, height_div_age, height_sub_weight, height_sub_age
153
```
154
155
### Cyclical Transformer
156
157
Creates cyclical features from numerical variables to capture periodic patterns.
158
159
```python { .api }
160
class CyclicalTransformer:
161
def __init__(self, variables=None, max_values=None, drop_original=False):
162
"""
163
Initialize CyclicalTransformer.
164
165
Parameters:
166
- variables (list): List of numerical variables to transform. If None, selects all numerical variables
167
- max_values (dict/int/float): Maximum values for each variable to define cycle. Auto-detected if None
168
- drop_original (bool): Whether to drop original variables after transformation
169
"""
170
171
def fit(self, X, y=None):
172
"""
173
Learn maximum values for cyclical transformation if not provided.
174
175
Parameters:
176
- X (pandas.DataFrame): Training dataset
177
- y (pandas.Series, optional): Target variable (not used)
178
179
Returns:
180
- self
181
"""
182
183
def transform(self, X):
184
"""
185
Create sine and cosine features from numerical variables.
186
187
Parameters:
188
- X (pandas.DataFrame): Dataset to transform
189
190
Returns:
191
- pandas.DataFrame: Dataset with sine and cosine cyclical features
192
"""
193
194
def fit_transform(self, X, y=None):
195
"""Fit to data, then transform it."""
196
```
197
198
**Usage Example**:
199
```python
200
from feature_engine.creation import CyclicalTransformer
201
import numpy as np
202
203
# Sample cyclical data (e.g., time-based)
204
data = {
205
'hour': np.random.randint(0, 24, 100),
206
'day_of_week': np.random.randint(0, 7, 100),
207
'month': np.random.randint(1, 13, 100)
208
}
209
df = pd.DataFrame(data)
210
211
# Auto-detect maximum values
212
transformer = CyclicalTransformer()
213
df_cyclical = transformer.fit_transform(df)
214
# Creates: hour_sin, hour_cos, day_of_week_sin, day_of_week_cos, etc.
215
216
# Specify maximum values for proper cycles
217
transformer = CyclicalTransformer(
218
max_values={'hour': 24, 'day_of_week': 7, 'month': 12}
219
)
220
df_cyclical = transformer.fit_transform(df)
221
222
# Transform specific variables only
223
transformer = CyclicalTransformer(
224
variables=['hour', 'month'],
225
max_values={'hour': 24, 'month': 12},
226
drop_original=True
227
)
228
df_cyclical = transformer.fit_transform(df)
229
230
# Access learned max values
231
print(transformer.max_values_) # Maximum values per variable
232
```
233
234
## Usage Patterns
235
236
### Feature Engineering Pipeline
237
238
```python
239
from sklearn.pipeline import Pipeline
240
from feature_engine.creation import MathematicalCombination, CyclicalTransformer
241
from feature_engine.imputation import MeanMedianImputer
242
243
# Multi-step feature creation pipeline
244
creation_pipeline = Pipeline([
245
('imputer', MeanMedianImputer()),
246
('math_combinations', MathematicalCombination(
247
variables_to_combine=['var1', 'var2'],
248
math_operations=['sum', 'prod', 'mean']
249
)),
250
('cyclical_features', CyclicalTransformer(
251
variables=['hour', 'day'],
252
max_values={'hour': 24, 'day': 365}
253
))
254
])
255
256
df_enhanced = creation_pipeline.fit_transform(df)
257
```
258
259
### Domain-Specific Feature Creation
260
261
```python
262
# Financial ratios
263
financial_combiner = CombineWithReferenceFeature(
264
variables_to_combine=['assets'], reference_variables=['liabilities'],
265
operations_list=['sub'], # Assets - Liabilities = Equity
266
new_variables_names=['equity']
267
)
268
269
# BMI calculation
270
bmi_combiner = CombineWithReferenceFeature(
271
variables_to_combine=['weight'],
272
reference_variables=['height'],
273
operations_list=['div'],
274
new_variables_names=['weight_per_height'] # Weight / Height only; for true BMI (weight / height**2), use a pre-squared height column as the reference
275
)
276
277
# Time-based cyclical features for seasonality
278
time_transformer = CyclicalTransformer(
279
variables=['month', 'hour', 'day_of_week'],
280
max_values={'month': 12, 'hour': 24, 'day_of_week': 7}
281
)
282
```
283
284
### Advanced Mathematical Combinations
285
286
```python
287
import numpy as np
288
289
# Custom data with multiple numerical variables
290
data = {
291
'x1': np.random.normal(10, 2, 1000),
292
'x2': np.random.normal(20, 5, 1000),
293
'x3': np.random.normal(5, 1, 1000),
294
'x4': np.random.normal(100, 15, 1000)
295
}
296
df = pd.DataFrame(data)
297
298
# Create comprehensive feature combinations
299
combiner = MathematicalCombination(
300
variables_to_combine=['x1', 'x2', 'x3', 'x4'],
301
math_operations=['sum', 'prod', 'mean', 'std', 'max', 'min'],
302
new_variables_names=[
303
'total_sum', 'total_product', 'average_value',
304
'value_std', 'max_value', 'min_value'
305
]
306
)
307
308
df_enhanced = combiner.fit_transform(df)
309
print(f"Original features: {len(df.columns)}")
310
print(f"Enhanced features: {len(df_enhanced.columns)}")
311
print(f"New features: {list(df_enhanced.columns[-6:])}") # Last 6 are new features
312
```
313
314
### Handling Missing Values in Feature Creation
315
316
```python
317
# Data with missing values
318
data_with_na = {
319
'feature1': [1, 2, None, 4, 5],
320
'feature2': [10, None, 30, 40, 50],
321
'feature3': [100, 200, 300, None, 500]
322
}
323
df_na = pd.DataFrame(data_with_na)
324
325
# Ignore missing values in calculations
326
combiner_ignore = MathematicalCombination(
327
variables_to_combine=['feature1', 'feature2', 'feature3'],
328
math_operations=['mean', 'sum'],
329
missing_values='ignore' # Skip NaN values in calculations
330
)
331
332
df_combined_ignore = combiner_ignore.fit_transform(df_na)
333
334
# Raise error on missing values (default)
335
try:
336
combiner_raise = MathematicalCombination(
337
variables_to_combine=['feature1', 'feature2'],
338
math_operations=['sum'],
339
missing_values='raise'
340
)
341
df_combined_raise = combiner_raise.fit_transform(df_na)
342
except ValueError as e:
343
print(f"Error with missing values: {e}")
344
```
345
346
## Common Attributes
347
348
All creation transformers share these fitted attributes:
349
350
- `variables_` (list): Variables that will be used for feature creation
351
- `n_features_in_` (int): Number of features in training set
352
353
Transformer-specific attributes:
354
- `combination_dict_` (dict): Mapping of each new variable name to the operation that creates it (MathematicalCombination)
355
- `max_values_` (dict): Maximum values used for cyclical transformation (CyclicalTransformer)
356
- `math_operations_` (list): Mathematical operations applied (MathematicalCombination)
357
- `operations_list_` (list): Operations applied between variables and references (CombineWithReferenceFeature)