0
# Feature Engineering
1
2
Smile Core provides a comprehensive preprocessing toolkit for preparing data for machine learning algorithms, covering dimensionality reduction, feature selection, scaling and normalization, missing value imputation, text and categorical feature encoding, and feature importance.
3
4
## Capabilities
5
6
### Core Transformation Interface
7
8
All feature transformations implement the `Transform` interface for consistent data preprocessing.
9
10
```java { .api }
11
/**
12
* Base interface for feature transformations
13
*/
14
interface Transform extends Function<double[], double[]> {
15
/** Apply transformation to feature vector */
16
double[] apply(double[] x);
17
18
/** Transform multiple samples */
19
default double[][] apply(double[][] x) {
20
return Arrays.stream(x).map(this::apply).toArray(double[][]::new);
21
}
22
}
23
```
24
25
### Dimensionality Reduction
26
27
Algorithms for reducing the number of features while preserving important information.
28
29
```java { .api }
30
/**
31
* Principal Component Analysis for dimensionality reduction
32
*/
33
class PCA extends Projection {
34
/** Fit PCA with default number of components */
35
public static PCA fit(double[][] data);
36
37
/** Fit PCA with correlation matrix instead of covariance */
38
public static PCA cor(double[][] data);
39
40
/** Fit PCA with correlation matrix from DataFrame */
41
public static PCA cor(DataFrame data);
42
43
/** Transform data to principal component space */
44
public double[] apply(double[] x);
45
46
/** Get principal components (eigenvectors) */
47
public double[][] loadings();
48
49
/** Get eigenvalues (explained variance) */
50
public double[] variance();
51
52
/** Get explained variance proportion */
53
public double[] varianceProportion();
54
55
/** Get cumulative explained variance proportion */
56
public double[] cumulativeVarianceProportion();
57
58
/** Get projection to k dimensions */
59
public Projection getProjection(int k);
60
61
/** Get projection by variance threshold */
62
public Projection getProjection(double varianceThreshold);
63
}
64
65
/**
66
* Kernel PCA for non-linear dimensionality reduction
67
*/
68
class KernelPCA extends Projection {
69
/** Fit Kernel PCA with RBF kernel */
70
public static KernelPCA fit(double[][] data, int k, double sigma);
71
72
/** Fit with custom kernel */
73
public static KernelPCA fit(double[][] data, int k, Kernel kernel);
74
75
/** Transform data to kernel principal component space */
76
public double[] apply(double[] x);
77
78
/** Get eigenvalues */
79
public double[] eigenvalues();
80
81
/** Get kernel matrix */
82
public double[][] kernelMatrix();
83
}
84
85
/**
86
* Probabilistic PCA with missing value handling
87
*/
88
class ProbabilisticPCA extends Projection {
89
/** Fit Probabilistic PCA */
90
public static ProbabilisticPCA fit(double[][] data, int k);
91
92
/** Transform data */
93
public double[] apply(double[] x);
94
95
/** Get noise variance */
96
public double noiseVariance();
97
98
/** Get log-likelihood */
99
public double logLikelihood();
100
}
101
102
/**
103
* Random Projection for fast dimensionality reduction
104
*/
105
class RandomProjection extends Projection {
106
/** Create random projection matrix */
107
public static RandomProjection of(int d, int k);
108
109
/** Create with specified sparsity */
110
public static RandomProjection of(int d, int k, double density);
111
112
/** Transform data */
113
public double[] apply(double[] x);
114
115
/** Get projection matrix */
116
public double[][] matrix();
117
}
118
119
/**
120
* Generalized Hebbian Algorithm for online PCA
121
*/
122
class GHA extends Projection {
123
/** Fit GHA with specified learning rate */
124
public static GHA fit(double[][] data, int k, double learningRate);
125
126
/** Transform data */
127
public double[] apply(double[] x);
128
129
/** Online update with new sample */
130
public void update(double[] x);
131
132
/** Get learned weights */
133
public double[][] weights();
134
}
135
```
136
137
**Usage Example:**
138
139
```java
140
import smile.feature.extraction.PCA;
141
import smile.feature.extraction.KernelPCA;
142
143
// Basic PCA
144
PCA pca = PCA.fit(data); // Fit PCA; use pca.getProjection(10) to reduce to 10 dimensions
145
double[] transformed = pca.apply(newSample);
146
double[] variance = pca.varianceProportion();
147
148
// Kernel PCA for non-linear reduction
149
KernelPCA kpca = KernelPCA.fit(data, 5, 1.0); // RBF kernel with sigma=1.0
150
double[] nonLinearTransform = kpca.apply(newSample);
151
```
152
153
### Feature Selection
154
155
Methods for selecting the most relevant features for machine learning models.
156
157
```java { .api }
158
/**
159
* Genetic Algorithm based Feature Selection (GAFE)
160
*/
161
class GAFE {
162
/** Perform feature selection using genetic algorithm */
163
public static GAFE fit(double[][] x, int[] y, int populationSize, int maxGeneration);
164
165
/** Get selected feature indices */
166
public int[] features();
167
168
/** Get fitness score */
169
public double fitness();
170
171
/** Transform data using selected features */
172
public double[][] apply(double[][] x);
173
}
174
175
/**
176
* Signal-to-Noise Ratio for feature ranking
177
*/
178
class SignalNoiseRatio implements Comparable<SignalNoiseRatio> {
179
/** Calculate SNR for all features */
180
public static SignalNoiseRatio[] fit(double[][] x, int[] y);
181
182
/** Feature index */
183
public final int feature;
184
185
/** SNR score */
186
public final double score;
187
188
/** Compare by score for ranking */
189
public int compareTo(SignalNoiseRatio other);
190
}
191
192
/**
193
* Sum of Squares Ratio for feature ranking
194
*/
195
class SumSquaresRatio implements Comparable<SumSquaresRatio> {
196
/** Calculate SSR for all features */
197
public static SumSquaresRatio[] fit(double[][] x, int[] y);
198
199
/** Feature index */
200
public final int feature;
201
202
/** SSR score */
203
public final double score;
204
}
205
206
/**
207
* Information Value for feature selection
208
*/
209
class InformationValue implements Comparable<InformationValue> {
210
/** Calculate IV for all features */
211
public static InformationValue[] fit(double[][] x, int[] y);
212
213
/** Feature index */
214
public final int feature;
215
216
/** Information value score */
217
public final double score;
218
}
219
```
220
221
### Feature Scaling and Normalization
222
223
Transformations for scaling features to appropriate ranges and distributions.
224
225
```java { .api }
226
/**
227
* Z-score standardization (mean=0, std=1)
228
*/
229
class Standardizer implements Transform {
230
/** Fit standardizer from training data */
231
public static Standardizer fit(double[][] data);
232
233
/** Fit with robust statistics (median, MAD) */
234
public static Standardizer fit(double[][] data, boolean robust);
235
236
/** Transform feature vector */
237
public double[] apply(double[] x);
238
239
/** Get feature means */
240
public double[] mean();
241
242
/** Get feature standard deviations */
243
public double[] std();
244
}
245
246
/**
247
* Robust standardization using median and MAD
248
*/
249
class RobustStandardizer implements Transform {
250
/** Fit robust standardizer */
251
public static RobustStandardizer fit(double[][] data);
252
253
/** Transform feature vector */
254
public double[] apply(double[] x);
255
256
/** Get feature medians */
257
public double[] median();
258
259
/** Get median absolute deviations */
260
public double[] mad();
261
}
262
263
/**
264
* Min-Max scaling to specified range
265
*/
266
class Scaler implements Transform {
267
/** Fit scaler to [0, 1] range */
268
public static Scaler fit(double[][] data);
269
270
/** Fit scaler to custom range */
271
public static Scaler fit(double[][] data, double lo, double hi);
272
273
/** Transform feature vector */
274
public double[] apply(double[] x);
275
276
/** Get minimum values */
277
public double[] lo();
278
279
/** Get maximum values */
280
public double[] hi();
281
}
282
283
/**
284
* Maximum absolute scaling
285
*/
286
class MaxAbsScaler implements Transform {
287
/** Fit max absolute scaler */
288
public static MaxAbsScaler fit(double[][] data);
289
290
/** Transform feature vector */
291
public double[] apply(double[] x);
292
293
/** Get maximum absolute values */
294
public double[] scale();
295
}
296
297
/**
298
* Winsor scaling with outlier clipping
299
*/
300
class WinsorScaler implements Transform {
301
/** Fit Winsor scaler with default percentiles (5%, 95%) */
302
public static WinsorScaler fit(double[][] data);
303
304
/** Fit with custom percentiles */
305
public static WinsorScaler fit(double[][] data, double lower, double upper);
306
307
/** Transform feature vector */
308
public double[] apply(double[] x);
309
310
/** Get lower bounds */
311
public double[] lower();
312
313
/** Get upper bounds */
314
public double[] upper();
315
}
316
317
/**
318
* Unit vector normalization
319
*/
320
class Normalizer implements Transform {
321
/** L2 normalization */
322
public static final Normalizer L2 = new Normalizer(Norm.L2);
323
324
/** L1 normalization */
325
public static final Normalizer L1 = new Normalizer(Norm.L1);
326
327
/** L-infinity normalization */
328
public static final Normalizer Linf = new Normalizer(Norm.Linf);
329
330
/** Transform to unit vector */
331
public double[] apply(double[] x);
332
333
/** Normalization types */
334
enum Norm { L1, L2, Linf }
335
}
336
```
337
338
**Usage Example:**
339
340
```java
341
import smile.feature.transform.*;
342
343
// Standardization pipeline
344
Standardizer standardizer = Standardizer.fit(trainData);
345
double[][] standardizedTrain = standardizer.apply(trainData);
346
double[] standardizedTest = standardizer.apply(testSample);
347
348
// Min-max scaling to [0, 1]
349
Scaler scaler = Scaler.fit(trainData, 0.0, 1.0);
350
double[][] scaledData = scaler.apply(trainData);
351
352
// Robust scaling for outlier handling
353
RobustStandardizer robust = RobustStandardizer.fit(trainData);
354
double[][] robustScaled = robust.apply(trainData);
355
```
356
357
### Missing Value Imputation
358
359
Methods for handling missing values in datasets.
360
361
```java { .api }
362
/**
363
* Simple imputation strategies
364
*/
365
class SimpleImputer implements Transform {
366
/** Mean imputation for missing values */
367
public static SimpleImputer mean(double[][] data);
368
369
/** Median imputation for missing values */
370
public static SimpleImputer median(double[][] data);
371
372
/** Mode imputation for missing values */
373
public static SimpleImputer mode(double[][] data);
374
375
/** Constant value imputation */
376
public static SimpleImputer constant(double[][] data, double value);
377
378
/** Transform data with imputation */
379
public double[] apply(double[] x);
380
381
/** Get imputation values */
382
public double[] values();
383
}
384
385
/**
386
* K-Nearest Neighbors imputation
387
*/
388
class KNNImputer implements Transform {
389
/** Fit KNN imputer with specified k */
390
public static KNNImputer fit(double[][] data, int k);
391
392
/** Fit with custom distance metric */
393
public static KNNImputer fit(double[][] data, int k, Distance<double[]> distance);
394
395
/** Transform with KNN imputation */
396
public double[] apply(double[] x);
397
398
/** Get k value */
399
public int k();
400
}
401
402
/**
403
* K-Medoids imputation
404
*/
405
class KMedoidsImputer implements Transform {
406
/** Fit K-medoids imputer */
407
public static KMedoidsImputer fit(double[][] data, int k);
408
409
/** Transform with medoid imputation */
410
public double[] apply(double[] x);
411
412
/** Get medoid centers */
413
public double[][] medoids();
414
}
415
416
/**
417
* SVD-based imputation interface
418
*/
419
interface SVDImputer {
420
/** Impute missing values using SVD */
421
double[][] impute(double[][] data, int rank);
422
}
423
```
424
425
### Text Feature Extraction
426
427
Feature extraction methods for text and categorical data.
428
429
```java { .api }
430
/**
431
* Bag of Words transformation for text
432
*/
433
class BagOfWords implements Transform {
434
/** Fit vocabulary from text documents */
435
public static BagOfWords fit(String[] documents);
436
437
/** Fit with custom parameters */
438
public static BagOfWords fit(String[] documents, int maxFeatures, int minDF, int maxDF);
439
440
/** Transform text to feature vector */
441
public double[] apply(String text);
442
443
/** Get vocabulary */
444
public Map<String, Integer> vocabulary();
445
446
/** Get document frequencies */
447
public double[] documentFrequency();
448
}
449
450
/**
451
* Binary encoding for categorical features
452
*/
453
class BinaryEncoder implements Function<Tuple, int[]> {
454
/** Fit binary encoder from data */
455
public static BinaryEncoder fit(DataFrame data);
456
457
/** Encode tuple to binary features */
458
public int[] apply(Tuple tuple);
459
460
/** Get encoding dimension */
461
public int dimension();
462
}
463
464
/**
465
* Sparse encoding for high-dimensional categorical data
466
*/
467
class SparseEncoder implements Function<Tuple, SparseArray> {
468
/** Fit sparse encoder */
469
public static SparseEncoder fit(DataFrame data);
470
471
/** Encode tuple to sparse array */
472
public SparseArray apply(Tuple tuple);
473
474
/** Get feature dimension */
475
public int dimension();
476
}
477
478
/**
479
* Feature hashing for categorical features
480
*/
481
class HashEncoder implements Function<String, SparseArray> {
482
/** Create hash encoder with specified dimension */
483
public static HashEncoder of(int dimension);
484
485
/** Encode string to sparse hash features */
486
public SparseArray apply(String text);
487
488
/** Get hash dimension */
489
public int dimension();
490
}
491
```
492
493
### Feature Importance
494
495
Methods for measuring and interpreting feature importance.
496
497
```java { .api }
498
/**
499
* SHAP (SHapley Additive exPlanations) values interface
500
* @param <T> the type of input objects
501
*/
502
interface SHAP<T> {
503
/** Calculate SHAP values for feature importance */
504
double[] shap(T x);
505
506
/** Calculate SHAP values for multiple samples */
507
default double[][] shap(T[] x) {
508
return Arrays.stream(x).map(this::shap).toArray(double[][]::new);
509
}
510
}
511
512
/**
513
* Tree-specific SHAP implementation
514
*/
515
interface TreeSHAP extends SHAP<Tuple> {
516
/** Calculate SHAP values for tree-based models */
517
double[] shap(Tuple x);
518
519
/** Calculate SHAP interaction values */
520
double[][] shapInteraction(Tuple x);
521
}
522
```
523
524
### Base Classes
525
526
Abstract base classes for feature transformation implementations.
527
528
```java { .api }
529
/**
530
* Base class for projection-based dimensionality reduction
531
*/
532
abstract class Projection implements Transform {
533
/** Project data to lower-dimensional space */
534
public abstract double[] project(double[] x);
535
536
/** Apply transformation (same as project) */
537
public double[] apply(double[] x) {
538
return project(x);
539
}
540
541
/** Get projection dimension */
542
public abstract int dimension();
543
}
544
```
545
546
**Comprehensive Usage Example:**
547
548
```java
549
import smile.feature.extraction.PCA;
550
import smile.feature.transform.Standardizer;
551
import smile.feature.imputation.SimpleImputer;
552
import smile.feature.selection.SignalNoiseRatio;
553
554
// Complete preprocessing pipeline
555
public class FeaturePipeline {
556
private SimpleImputer imputer;
557
private Standardizer standardizer;
558
private PCA pca;
559
private int[] selectedFeatures;
560
561
public void fit(double[][] rawData, int[] labels) {
562
// 1. Handle missing values
563
imputer = SimpleImputer.mean(rawData);
564
double[][] imputedData = imputer.apply(rawData);
565
566
// 2. Standardize features
567
standardizer = Standardizer.fit(imputedData);
568
double[][] standardizedData = standardizer.apply(imputedData);
569
570
// 3. Feature selection
571
SignalNoiseRatio[] snr = SignalNoiseRatio.fit(standardizedData, labels);
572
Arrays.sort(snr, Collections.reverseOrder());
573
selectedFeatures = Arrays.stream(snr)
574
.limit(100) // Select top 100 features
575
.mapToInt(s -> s.feature)
576
.toArray();
577
578
// Select features
579
double[][] selectedData = selectFeatures(standardizedData, selectedFeatures);
580
581
// 4. Dimensionality reduction
582
pca = PCA.fit(selectedData); // Fit PCA; use pca.getProjection(50) to reduce to 50 dimensions
583
}
584
585
public double[] transform(double[] sample) {
586
double[] imputed = imputer.apply(sample);
587
double[] standardized = standardizer.apply(imputed);
588
double[] selected = selectFeatures(standardized, selectedFeatures);
589
return pca.apply(selected);
590
}
591
}
592
```
593
594
### Common Parameters
595
596
Feature engineering methods commonly support these parameters:
597
598
- **k**: Number of components/features to keep (for `KNNImputer` and `KMedoidsImputer`, the number of neighbors or medoids)
599
- **threshold**: Selection threshold for feature ranking
600
- **minDF/maxDF**: Minimum/maximum document frequency (text)
601
- **maxFeatures**: Maximum number of features to extract
602
- **learningRate**: Learning rate for online algorithms
603
- **sparse**: Whether to return sparse representations
604
- **random_state**: Random seed for reproducible results