Tessl Tile for maven/com.github.haifengl/smile-core@3.1.0

or run

npx @tessl/cli init

Version

Tile

Overview

Evals

Files

docs

advanced-analytics.md classification.md clustering.md deep-learning.md feature-engineering.md index.md regression.md validation-metrics.md

feature-engineering.md docs/

0
# Feature Engineering
1

2
Smile Core provides a comprehensive preprocessing toolkit for preparing data for machine learning algorithms, including dimensionality reduction, feature selection, transformation, scaling, and missing-value imputation utilities.
3

4
## Capabilities
5

6
### Core Transformation Interface
7

8
All feature transformations implement the `Transform` interface for consistent data preprocessing.
9

10
```java { .api }
11
/**
12
 * Base interface for feature transformations
13
 */
14
interface Transform extends Function<double[], double[]> {
15
    /** Apply transformation to feature vector */
16
    double[] apply(double[] x);
17
    
18
    /** Transform multiple samples */
19
    default double[][] apply(double[][] x) {
20
        return Arrays.stream(x).map(this::apply).toArray(double[][]::new);
21
    }
22
}
23
```
24

25
### Dimensionality Reduction
26

27
Algorithms for reducing the number of features while preserving important information.
28

29
```java { .api }
30
/**
31
 * Principal Component Analysis for dimensionality reduction
32
 */
33
class PCA extends Projection {
34
    /** Fit PCA with default number of components */
35
    public static PCA fit(double[][] data);
36
    
37
    /** Fit PCA with correlation matrix instead of covariance */
38
    public static PCA cor(double[][] data);
39
    
40
    /** Fit PCA with correlation matrix from DataFrame */
41
    public static PCA cor(DataFrame data);
42
    
43
    /** Transform data to principal component space */
44
    public double[] apply(double[] x);
45
    
46
    /** Get principal components (eigenvectors) */
47
    public double[][] loadings();
48
    
49
    /** Get eigenvalues (explained variance) */
50
    public double[] variance();
51
    
52
    /** Get explained variance proportion */
53
    public double[] varianceProportion();
54
    
55
    /** Get cumulative explained variance proportion */
56
    public double[] cumulativeVarianceProportion();
57
    
58
    /** Get projection to k dimensions */
59
    public Projection getProjection(int k);
60
    
61
    /** Get projection by variance threshold */
62
    public Projection getProjection(double varianceThreshold);
63
}
64

65
/**
66
 * Kernel PCA for non-linear dimensionality reduction
67
 */
68
class KernelPCA extends Projection {
69
    /** Fit Kernel PCA with RBF kernel */
70
    public static KernelPCA fit(double[][] data, int k, double sigma);
71
    
72
    /** Fit with custom kernel */
73
    public static KernelPCA fit(double[][] data, int k, Kernel kernel);
74
    
75
    /** Transform data to kernel principal component space */
76
    public double[] apply(double[] x);
77
    
78
    /** Get eigenvalues */
79
    public double[] eigenvalues();
80
    
81
    /** Get kernel matrix */
82
    public double[][] kernelMatrix();
83
}
84

85
/**
86
 * Probabilistic PCA with missing value handling
87
 */
88
class ProbabilisticPCA extends Projection {
89
    /** Fit Probabilistic PCA */
90
    public static ProbabilisticPCA fit(double[][] data, int k);
91
    
92
    /** Transform data */
93
    public double[] apply(double[] x);
94
    
95
    /** Get noise variance */
96
    public double noiseVariance();
97
    
98
    /** Get log-likelihood */
99
    public double logLikelihood();
100
}
101

102
/**
103
 * Random Projection for fast dimensionality reduction
104
 */
105
class RandomProjection extends Projection {
106
    /** Create random projection matrix */
107
    public static RandomProjection of(int d, int k);
108
    
109
    /** Create with specified sparsity */
110
    public static RandomProjection of(int d, int k, double density);
111
    
112
    /** Transform data */
113
    public double[] apply(double[] x);
114
    
115
    /** Get projection matrix */
116
    public double[][] matrix();
117
}
118

119
/**
120
 * Generalized Hebbian Algorithm for online PCA
121
 */
122
class GHA extends Projection {
123
    /** Fit GHA with specified learning rate */
124
    public static GHA fit(double[][] data, int k, double learningRate);
125
    
126
    /** Transform data */
127
    public double[] apply(double[] x);
128
    
129
    /** Online update with new sample */
130
    public void update(double[] x);
131
    
132
    /** Get learned weights */
133
    public double[][] weights();
134
}
135
```
136

137
**Usage Example:**
138

139
```java
140
import smile.feature.extraction.PCA;
141
import smile.feature.extraction.KernelPCA;
142

143
// Basic PCA
144
PCA pca = PCA.fit(data, 10); // Reduce to 10 dimensions
145
double[] transformed = pca.apply(newSample);
146
double[] variance = pca.varianceProportion();
147

148
// Kernel PCA for non-linear reduction
149
KernelPCA kpca = KernelPCA.fit(data, 5, 1.0); // RBF kernel with sigma=1.0
150
double[] nonLinearTransform = kpca.apply(newSample);
151
```
152

153
### Feature Selection
154

155
Methods for selecting the most relevant features for machine learning models.
156

157
```java { .api }
158
/**
159
 * Genetic algorithm based feature selection
160
 */
161
class GAFE {
162
    /** Perform feature selection using genetic algorithm */
163
    public static GAFE fit(double[][] x, int[] y, int populationSize, int maxGeneration);
164
    
165
    /** Get selected feature indices */
166
    public int[] features();
167
    
168
    /** Get fitness score */
169
    public double fitness();
170
    
171
    /** Transform data using selected features */
172
    public double[][] apply(double[][] x);
173
}
174

175
/**
176
 * Signal-to-Noise Ratio for feature ranking
177
 */
178
class SignalNoiseRatio implements Comparable<SignalNoiseRatio> {
179
    /** Calculate SNR for all features */
180
    public static SignalNoiseRatio[] fit(double[][] x, int[] y);
181
    
182
    /** Feature index */
183
    public final int feature;
184
    
185
    /** SNR score */
186
    public final double score;
187
    
188
    /** Compare by score for ranking */
189
    public int compareTo(SignalNoiseRatio other);
190
}
191

192
/**
193
 * Sum of Squares Ratio for feature ranking
194
 */
195
class SumSquaresRatio implements Comparable<SumSquaresRatio> {
196
    /** Calculate SSR for all features */
197
    public static SumSquaresRatio[] fit(double[][] x, int[] y);
198
    
199
    /** Feature index */
200
    public final int feature;
201
    
202
    /** SSR score */
203
    public final double score;
204
}
205

206
/**
207
 * Information Value for feature selection
208
 */
209
class InformationValue implements Comparable<InformationValue> {
210
    /** Calculate IV for all features */
211
    public static InformationValue[] fit(double[][] x, int[] y);
212
    
213
    /** Feature index */
214
    public final int feature;
215
    
216
    /** Information value score */
217
    public final double score;
218
}
219
```
220

221
### Feature Scaling and Normalization
222

223
Transformations for scaling features to appropriate ranges and distributions.
224

225
```java { .api }
226
/**
227
 * Z-score standardization (mean=0, std=1)
228
 */
229
class Standardizer implements Transform {
230
    /** Fit standardizer from training data */
231
    public static Standardizer fit(double[][] data);
232
    
233
    /** Fit with robust statistics (median, MAD) */
234
    public static Standardizer fit(double[][] data, boolean robust);
235
    
236
    /** Transform feature vector */
237
    public double[] apply(double[] x);
238
    
239
    /** Get feature means */
240
    public double[] mean();
241
    
242
    /** Get feature standard deviations */
243
    public double[] std();
244
}
245

246
/**
247
 * Robust standardization using median and MAD
248
 */
249
class RobustStandardizer implements Transform {
250
    /** Fit robust standardizer */
251
    public static RobustStandardizer fit(double[][] data);
252
    
253
    /** Transform feature vector */
254
    public double[] apply(double[] x);
255
    
256
    /** Get feature medians */
257
    public double[] median();
258
    
259
    /** Get median absolute deviations */
260
    public double[] mad();
261
}
262

263
/**
264
 * Min-Max scaling to specified range
265
 */
266
class Scaler implements Transform {
267
    /** Fit scaler to [0, 1] range */
268
    public static Scaler fit(double[][] data);
269
    
270
    /** Fit scaler to custom range */
271
    public static Scaler fit(double[][] data, double lo, double hi);
272
    
273
    /** Transform feature vector */
274
    public double[] apply(double[] x);
275
    
276
    /** Get minimum values */
277
    public double[] lo();
278
    
279
    /** Get maximum values */
280
    public double[] hi();
281
}
282

283
/**
284
 * Maximum absolute scaling
285
 */
286
class MaxAbsScaler implements Transform {
287
    /** Fit max absolute scaler */
288
    public static MaxAbsScaler fit(double[][] data);
289
    
290
    /** Transform feature vector */
291
    public double[] apply(double[] x);
292
    
293
    /** Get maximum absolute values */
294
    public double[] scale();
295
}
296

297
/**
298
 * Winsor scaling with outlier clipping
299
 */
300
class WinsorScaler implements Transform {
301
    /** Fit Winsor scaler with default percentiles (5%, 95%) */
302
    public static WinsorScaler fit(double[][] data);
303
    
304
    /** Fit with custom percentiles */
305
    public static WinsorScaler fit(double[][] data, double lower, double upper);
306
    
307
    /** Transform feature vector */
308
    public double[] apply(double[] x);
309
    
310
    /** Get lower bounds */
311
    public double[] lower();
312
    
313
    /** Get upper bounds */
314
    public double[] upper();
315
}
316

317
/**
318
 * Unit vector normalization
319
 */
320
class Normalizer implements Transform {
321
    /** L2 normalization */
322
    public static final Normalizer L2 = new Normalizer(Norm.L2);
323
    
324
    /** L1 normalization */
325
    public static final Normalizer L1 = new Normalizer(Norm.L1);
326
    
327
    /** L-infinity normalization */
328
    public static final Normalizer Linf = new Normalizer(Norm.Linf);
329
    
330
    /** Transform to unit vector */
331
    public double[] apply(double[] x);
332
    
333
    /** Normalization types */
334
    enum Norm { L1, L2, Linf }
335
}
336
```
337

338
**Usage Example:**
339

340
```java
341
import smile.feature.transform.*;
342

343
// Standardization pipeline
344
Standardizer standardizer = Standardizer.fit(trainData);
345
double[][] standardizedTrain = standardizer.apply(trainData);
346
double[] standardizedTest = standardizer.apply(testSample);
347

348
// Min-max scaling to [0, 1]
349
Scaler scaler = Scaler.fit(trainData, 0.0, 1.0);
350
double[][] scaledData = scaler.apply(trainData);
351

352
// Robust scaling for outlier handling
353
RobustStandardizer robust = RobustStandardizer.fit(trainData);
354
double[][] robustScaled = robust.apply(trainData);
355
```
356

357
### Missing Value Imputation
358

359
Methods for handling missing values in datasets.
360

361
```java { .api }
362
/**
363
 * Simple imputation strategies
364
 */
365
class SimpleImputer implements Transform {
366
    /** Mean imputation for missing values */
367
    public static SimpleImputer mean(double[][] data);
368
    
369
    /** Median imputation for missing values */
370
    public static SimpleImputer median(double[][] data);
371
    
372
    /** Mode imputation for missing values */
373
    public static SimpleImputer mode(double[][] data);
374
    
375
    /** Constant value imputation */
376
    public static SimpleImputer constant(double[][] data, double value);
377
    
378
    /** Transform data with imputation */
379
    public double[] apply(double[] x);
380
    
381
    /** Get imputation values */
382
    public double[] values();
383
}
384

385
/**
386
 * K-Nearest Neighbors imputation
387
 */
388
class KNNImputer implements Transform {
389
    /** Fit KNN imputer with specified k */
390
    public static KNNImputer fit(double[][] data, int k);
391
    
392
    /** Fit with custom distance metric */
393
    public static KNNImputer fit(double[][] data, int k, Distance<double[]> distance);
394
    
395
    /** Transform with KNN imputation */
396
    public double[] apply(double[] x);
397
    
398
    /** Get k value */
399
    public int k();
400
}
401

402
/**
403
 * K-Medoids imputation
404
 */
405
class KMedoidsImputer implements Transform {
406
    /** Fit K-medoids imputer */
407
    public static KMedoidsImputer fit(double[][] data, int k);
408
    
409
    /** Transform with medoid imputation */
410
    public double[] apply(double[] x);
411
    
412
    /** Get medoid centers */
413
    public double[][] medoids();
414
}
415

416
/**
417
 * SVD-based imputation interface
418
 */
419
interface SVDImputer {
420
    /** Impute missing values using SVD */
421
    double[][] impute(double[][] data, int rank);
422
}
423
```
424

425
### Text Feature Extraction
426

427
Feature extraction methods for text and categorical data.
428

429
```java { .api }
430
/**
431
 * Bag of Words transformation for text
432
 */
433
class BagOfWords implements Transform {
434
    /** Fit vocabulary from text documents */
435
    public static BagOfWords fit(String[] documents);
436
    
437
    /** Fit with custom parameters */
438
    public static BagOfWords fit(String[] documents, int maxFeatures, int minDF, int maxDF);
439
    
440
    /** Transform text to feature vector */
441
    public double[] apply(String text);
442
    
443
    /** Get vocabulary */
444
    public Map<String, Integer> vocabulary();
445
    
446
    /** Get document frequencies */
447
    public double[] documentFrequency();
448
}
449

450
/**
451
 * Binary encoding for categorical features
452
 */
453
class BinaryEncoder implements Function<Tuple, int[]> {
454
    /** Fit binary encoder from data */
455
    public static BinaryEncoder fit(DataFrame data);
456
    
457
    /** Encode tuple to binary features */
458
    public int[] apply(Tuple tuple);
459
    
460
    /** Get encoding dimension */
461
    public int dimension();
462
}
463

464
/**
465
 * Sparse encoding for high-dimensional categorical data
466
 */
467
class SparseEncoder implements Function<Tuple, SparseArray> {
468
    /** Fit sparse encoder */
469
    public static SparseEncoder fit(DataFrame data);
470
    
471
    /** Encode tuple to sparse array */
472
    public SparseArray apply(Tuple tuple);
473
    
474
    /** Get feature dimension */
475
    public int dimension();
476
}
477

478
/**
479
 * Feature hashing for categorical features
480
 */
481
class HashEncoder implements Function<String, SparseArray> {
482
    /** Create hash encoder with specified dimension */
483
    public static HashEncoder of(int dimension);
484
    
485
    /** Encode string to sparse hash features */
486
    public SparseArray apply(String text);
487
    
488
    /** Get hash dimension */
489
    public int dimension();
490
}
491
```
492

493
### Feature Importance
494

495
Methods for measuring and interpreting feature importance.
496

497
```java { .api }
498
/**
499
 * SHAP (SHapley Additive exPlanations) values interface
500
 * @param <T> the type of input objects
501
 */
502
interface SHAP<T> {
503
    /** Calculate SHAP values for feature importance */
504
    double[] shap(T x);
505
    
506
    /** Calculate SHAP values for multiple samples */
507
    default double[][] shap(T[] x) {
508
        return Arrays.stream(x).map(this::shap).toArray(double[][]::new);
509
    }
510
}
511

512
/**
513
 * Tree-specific SHAP implementation
514
 */
515
interface TreeSHAP extends SHAP<Tuple> {
516
    /** Calculate SHAP values for tree-based models */
517
    double[] shap(Tuple x);
518
    
519
    /** Calculate SHAP interaction values */
520
    double[][] shapInteraction(Tuple x);
521
}
522
```
523

524
### Base Classes
525

526
Abstract base classes for feature transformation implementations.
527

528
```java { .api }
529
/**
530
 * Base class for projection-based dimensionality reduction
531
 */
532
abstract class Projection implements Transform {
533
    /** Project data to lower-dimensional space */
534
    public abstract double[] project(double[] x);
535
    
536
    /** Apply transformation (same as project) */
537
    public double[] apply(double[] x) {
538
        return project(x);
539
    }
540
    
541
    /** Get projection dimension */
542
    public abstract int dimension();
543
}
544
```
545

546
**Comprehensive Usage Example:**
547

548
```java
549
import smile.feature.extraction.PCA;
550
import smile.feature.transform.Standardizer;
551
import smile.feature.imputation.SimpleImputer;
552
import smile.feature.selection.SignalNoiseRatio;
553

554
// Complete preprocessing pipeline
555
public class FeaturePipeline {
556
    private SimpleImputer imputer;
557
    private Standardizer standardizer;
558
    private PCA pca;
559
    private int[] selectedFeatures;
560
    
561
    public void fit(double[][] rawData, int[] labels) {
562
        // 1. Handle missing values
563
        imputer = SimpleImputer.mean(rawData);
564
        double[][] imputedData = imputer.apply(rawData);
565
        
566
        // 2. Standardize features
567
        standardizer = Standardizer.fit(imputedData);
568
        double[][] standardizedData = standardizer.apply(imputedData);
569
        
570
        // 3. Feature selection
571
        SignalNoiseRatio[] snr = SignalNoiseRatio.fit(standardizedData, labels);
572
        Arrays.sort(snr, Collections.reverseOrder());
573
        selectedFeatures = Arrays.stream(snr)
574
            .limit(100) // Select top 100 features
575
            .mapToInt(s -> s.feature)
576
            .toArray();
577
        
578
        // Select features
579
        double[][] selectedData = selectFeatures(standardizedData, selectedFeatures);
580
        
581
        // 4. Dimensionality reduction
582
        pca = PCA.fit(selectedData, 50); // Reduce to 50 dimensions
583
    }
584
    
585
    public double[] transform(double[] sample) {
586
        double[] imputed = imputer.apply(sample);
587
        double[] standardized = standardizer.apply(imputed);
588
        double[] selected = selectFeatures(standardized, selectedFeatures);
589
        return pca.apply(selected);
590
    }
591
}
592
```
593

594
### Common Parameters
595

596
Feature engineering methods commonly support these parameters:
597

598
- **k**: Number of components/features to keep
599
- **threshold**: Selection threshold for feature ranking
600
- **minDF/maxDF**: Minimum/maximum document frequency (text)
601
- **maxFeatures**: Maximum number of features to extract
602
- **learningRate**: Learning rate for online algorithms
603
- **sparse**: Whether to return sparse representations
604
- **random_state**: Random seed for reproducible results

Version

Tile

Files

feature-engineering.md docs/

feature-engineering.md docs/