0
# Feature Engineering
1
2
Tools for selecting optimal feature subsets and extracting new features through dimensionality reduction techniques. All transformers follow scikit-learn's fit/transform API and can be used in scikit-learn pipelines.
3
4
## Capabilities
5
6
### Sequential Feature Selector
7
8
Forward or backward sequential feature selection to find optimal feature subsets based on cross-validation performance.
9
10
```python { .api }
11
class SequentialFeatureSelector:
    """Sequential (forward or backward) feature-subset selector.

    API stub: method bodies are omitted. Names ending in an underscore are
    shown as read-only properties and follow the scikit-learn naming
    convention for values learned during ``fit``.
    """

    def __init__(self, estimator, k_features=1, forward=True, floating=False,
                 verbose=0, scoring=None, cv=5, n_jobs=1,
                 pre_dispatch='2*n_jobs', clone_estimator=True):
        """
        Sequential Feature Selector for optimal feature subset selection.

        Parameters:
        - estimator: sklearn-compatible estimator
        - k_features: int or tuple, number of features to select
          (a tuple presumably gives a (min, max) range; confirm against
          the mlxtend reference)
        - forward: bool, forward (True) or backward (False) selection
        - floating: bool, use floating selection
        - verbose: int, verbosity level
        - scoring: str or callable, scoring metric
        - cv: int, cross-validation folds
        - n_jobs: int, number of parallel jobs
        - pre_dispatch: str, pre-dispatch parameter for joblib
        - clone_estimator: bool, clone the estimator
        """

    def fit(self, X, y, custom_feature_names=None):
        """Fit feature selector"""

    def transform(self, X):
        """Transform features by selecting optimal subset"""

    def fit_transform(self, X, y):
        """Fit and transform features"""

    def get_metric_dict(self):
        """Get performance metrics for each subset size"""

    @property
    def k_feature_names_(self):
        """Names of selected features"""

    @property
    def k_feature_idx_(self):
        """Indices of selected features"""

    @property
    def k_score_(self):
        """Cross-validation score of selected feature subset"""

    @property
    def subsets_(self):
        """Dictionary with subset information for each step"""
54
```
55
56
### Exhaustive Feature Selector
57
58
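Evaluates every feature combination whose size lies between `min_features` and `max_features` and keeps the subset with the best cross-validation score. Both bounds default to 1, so raise `max_features` to search subsets of more than one feature.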
59
60
```python { .api }
61
class ExhaustiveFeatureSelector:
    """Exhaustive feature-subset selector.

    API stub: method bodies are omitted. Names ending in an underscore are
    shown as read-only properties and follow the scikit-learn naming
    convention for values learned during ``fit``.
    """

    def __init__(self, estimator, min_features=1, max_features=1,
                 print_progress=True, scoring='accuracy', cv=5, n_jobs=1,
                 pre_dispatch='2*n_jobs', clone_estimator=True):
        """
        Exhaustive Feature Selector evaluating all combinations.

        Parameters:
        - estimator: sklearn-compatible estimator
        - min_features: int, minimum number of features
        - max_features: int, maximum number of features
        - print_progress: bool, print progress
        - scoring: str or callable, scoring metric
        - cv: int, cross-validation folds
        - n_jobs: int, number of parallel jobs
        - pre_dispatch: str, pre-dispatch parameter
        - clone_estimator: bool, clone the estimator
        """

    def fit(self, X, y, custom_feature_names=None):
        """Fit exhaustive feature selector"""

    def transform(self, X):
        """Transform features using best subset"""

    def fit_transform(self, X, y):
        """Fit and transform features"""

    @property
    def best_idx_(self):
        """Indices of best feature subset"""

    @property
    def best_feature_names_(self):
        """Names of best features"""

    @property
    def best_score_(self):
        """Cross-validation score of best subset"""

    @property
    def subsets_(self):
        """Dictionary with all evaluated subsets"""
100
```
101
102
### Column Selector
103
104
Simple transformer for selecting specific columns by index or name.
105
106
```python { .api }
107
class ColumnSelector:
    """Transformer that keeps only the requested columns of a feature matrix.

    API stub: method bodies are omitted.
    """

    def __init__(self, cols=None, drop_axis=False):
        """
        Column selector for feature matrices.

        Parameters:
        - cols: list, column indices or names to select
        - drop_axis: bool, drop axis if single column selected
        """

    def fit(self, X, y=None):
        """Fit column selector (no-op)"""

    def transform(self, X):
        """Select specified columns"""

    def fit_transform(self, X, y=None):
        """Fit and transform columns"""
125
```
126
127
### Principal Component Analysis
128
129
Principal Component Analysis for dimensionality reduction and feature extraction.
130
131
```python { .api }
132
class PrincipalComponentAnalysis:
    """Principal Component Analysis transformer.

    API stub: method bodies are omitted. Names ending in an underscore are
    shown as read-only properties and follow the scikit-learn naming
    convention for values learned during ``fit``.
    """

    # NOTE(review): `eta`, `epochs` and `minibatches` describe a gradient-based
    # solver, yet `solver` only lists 'svd' and 'eigen'. The upstream mlxtend
    # reference appears to document the constructor as
    # (n_components=None, solver='svd', whitening=False), only fit/transform,
    # and the fitted attributes w_, e_vals_, e_vecs_, e_vals_normalized_ and
    # loadings_ -- confirm this stub against the installed mlxtend version.
    def __init__(self, n_components=None, solver='svd', eta=0.01, epochs=100,
                 minibatches=None, random_seed=None, print_progress=0):
        """
        Principal Component Analysis implementation.

        Parameters:
        - n_components: int, number of components to keep
        - solver: str, solver algorithm ('svd' or 'eigen')
        - eta: float, learning rate (for gradient-based solver)
        - epochs: int, number of epochs (for gradient-based solver)
        - minibatches: int, number of minibatches
        - random_seed: int, random seed
        - print_progress: int, print progress frequency
        """

    def fit(self, X, y=None):
        """Fit PCA model"""

    def transform(self, X):
        """Apply dimensionality reduction"""

    def fit_transform(self, X, y=None):
        """Fit and transform data"""

    @property
    def components_(self):
        """Principal axes in feature space"""

    @property
    def explained_variance_ratio_(self):
        """Share of the total variance explained by each component"""

    @property
    def mean_(self):
        """Per-feature empirical mean"""

    @property
    def eigenvalues_(self):
        """Eigenvalues of the covariance matrix"""

    @property
    def loadings_(self):
        """The loadings matrix"""
171
```
172
173
### Linear Discriminant Analysis
174
175
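Linear Discriminant Analysis for supervised dimensionality reduction. The class documented here is a transformer (`fit`/`transform`); it does not expose a `predict` method for classification.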
176
177
```python { .api }
178
class LinearDiscriminantAnalysis:
    """Linear Discriminant Analysis transformer (supervised projection).

    API stub: method bodies are omitted. Names ending in an underscore are
    shown as read-only properties and follow the scikit-learn naming
    convention for values learned during ``fit``.
    """

    # NOTE(review): the upstream mlxtend reference appears to document only
    # fit/transform and the fitted attributes w_, e_vals_ and e_vecs_ for this
    # class -- confirm the method and attribute names below against the
    # installed mlxtend version.
    def __init__(self, n_discriminants=None):
        """
        Linear Discriminant Analysis implementation.

        Parameters:
        - n_discriminants: int, number of discriminants to keep
        """

    def fit(self, X, y):
        """Fit LDA model"""

    def transform(self, X):
        """Apply LDA transformation"""

    def fit_transform(self, X, y):
        """Fit and transform data"""

    @property
    def scalings_(self):
        """Scaling factors for each discriminant"""

    @property
    def explained_variance_ratio_(self):
        """Share of the total variance explained by each discriminant"""

    @property
    def mean_(self):
        """Overall mean of the data"""

    @property
    def means_(self):
        """Class means"""

    @property
    def eigenvalues_(self):
        """Eigenvalues in descending order"""
210
```
211
212
### RBF Kernel PCA
213
214
Kernel PCA using Radial Basis Function (RBF) kernel for non-linear dimensionality reduction.
215
216
```python { .api }
217
class RBFKernelPCA:
    """Kernel PCA transformer using a Radial Basis Function (RBF) kernel.

    API stub: method bodies are omitted. Names ending in an underscore are
    shown as read-only properties and follow the scikit-learn naming
    convention for values learned during ``fit``.
    """

    # NOTE(review): the upstream mlxtend reference appears to document only
    # fit/transform and the fitted attributes e_vals_, e_vecs_ and
    # X_projected_ for this class -- confirm the method and attribute names
    # below against the installed mlxtend version.
    def __init__(self, gamma=15.0, n_components=None, copy_X=True):
        """
        RBF Kernel PCA for non-linear dimensionality reduction.

        Parameters:
        - gamma: float, RBF kernel parameter
        - n_components: int, number of components to keep
        - copy_X: bool, copy input data
        """

    def fit(self, X, y=None):
        """Fit RBF Kernel PCA model"""

    def transform(self, X):
        """Apply kernel PCA transformation"""

    def fit_transform(self, X, y=None):
        """Fit and transform data"""

    @property
    def alphas_(self):
        """Eigenvectors of the kernel matrix"""

    @property
    def eigenvals_(self):
        """Eigenvalues of the kernel matrix"""

    @property
    def X_fit_(self):
        """Training data used for kernel computation"""
245
```
246
247
## Usage Examples
248
249
### Sequential Feature Selection Example
250
251
```python
252
from mlxtend.feature_selection import SequentialFeatureSelector
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

# One seed shared by the data generator, the split and the forest keeps the
# example reproducible.
SEED = 42

# Synthetic binary problem: 1000 samples x 20 features, 20% held out for testing.
X, y = make_classification(
    n_samples=1000, n_features=20, n_classes=2, random_state=SEED
)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=SEED
)

# Forward selection of 10 features, scored by accuracy with cv=5.
clf = RandomForestClassifier(random_state=SEED)
sfs = SequentialFeatureSelector(
    clf,
    k_features=10,
    forward=True,
    scoring='accuracy',
    cv=5,
)

# Fit on the training split only, then apply the fitted selector to both splits.
sfs.fit(X_train, y_train)
X_train_selected, X_test_selected = (
    sfs.transform(split) for split in (X_train, X_test)
)

# Report what was selected and how well it scored.
print("Selected features:", sfs.k_feature_names_)
print("Best score:", sfs.k_score_)
273
```
274
275
### PCA Example
276
277
```python
278
import matplotlib.pyplot as plt
from mlxtend.feature_extraction import PrincipalComponentAnalysis
from sklearn.datasets import make_classification

# Synthetic data: 1000 samples x 20 features; the labels only colour the plot.
X, y = make_classification(n_samples=1000, n_features=20, random_state=42)

# Reduce the data to two principal components.
# NOTE(review): the upstream mlxtend reference appears to document fit() and
# transform() but no fit_transform(), and to expose the normalised eigenvalues
# as e_vals_normalized_ rather than explained_variance_ratio_ -- confirm this
# snippet runs against the installed mlxtend version.
pca = PrincipalComponentAnalysis(n_components=2)
X_pca = pca.fit_transform(X)

# Scatter the first component against the second, coloured by class label.
first_pc, second_pc = X_pca[:, 0], X_pca[:, 1]
plt.figure(figsize=(8, 6))
plt.scatter(first_pc, second_pc, c=y, cmap='viridis')
plt.xlabel('First Principal Component')
plt.ylabel('Second Principal Component')
plt.title('PCA Visualization')
plt.colorbar()
plt.show()

# Show how much of the variance each kept component accounts for.
print("Explained variance ratio:", pca.explained_variance_ratio_)
300
```
301
302
### LDA Example
303
304
```python
305
import matplotlib.pyplot as plt
from mlxtend.feature_extraction import LinearDiscriminantAnalysis
from sklearn.datasets import make_classification

# Synthetic three-class problem: 1000 samples x 10 features, 3 of them informative.
dataset_options = dict(
    n_samples=1000,
    n_features=10,
    n_classes=3,
    n_informative=3,
    random_state=42,
)
X, y = make_classification(**dataset_options)

# Project onto two linear discriminants, using the class labels.
# NOTE(review): the upstream mlxtend reference appears to document fit() and
# transform() but no fit_transform(), and lists w_, e_vals_ and e_vecs_ as the
# fitted attributes (no explained_variance_ratio_) -- confirm this snippet
# runs against the installed mlxtend version.
lda = LinearDiscriminantAnalysis(n_discriminants=2)
X_lda = lda.fit_transform(X, y)

# Scatter the first discriminant against the second, coloured by class label.
first_ld, second_ld = X_lda[:, 0], X_lda[:, 1]
plt.figure(figsize=(8, 6))
plt.scatter(first_ld, second_ld, c=y, cmap='viridis')
plt.xlabel('First Linear Discriminant')
plt.ylabel('Second Linear Discriminant')
plt.title('LDA Visualization')
plt.colorbar()
plt.show()

# Show how much of the variance each kept discriminant accounts for.
print("Explained variance ratio:", lda.explained_variance_ratio_)
328
```