Statistical Machine Intelligence and Learning Engine providing comprehensive machine learning algorithms for classification, regression, clustering, and feature engineering in Java
—
Comprehensive preprocessing pipeline including dimensionality reduction, feature selection, transformation, scaling, and imputation utilities. Smile Core provides a complete toolkit for preparing data for machine learning algorithms.
All feature transformations implement the Transform interface for consistent data preprocessing.
/**
 * Base interface for feature transformations. A transform maps a feature
 * vector to a new feature vector and composes as a {@code Function}.
 */
interface Transform extends Function<double[], double[]> {
/**
 * Apply transformation to a single feature vector.
 * @param x the input feature vector
 * @return the transformed feature vector
 */
double[] apply(double[] x);
/**
 * Transform multiple samples by applying {@link #apply(double[])} row-wise.
 * @param x the sample matrix, one row per sample
 * @return the transformed sample matrix
 */
default double[][] apply(double[][] x) {
return Arrays.stream(x).map(this::apply).toArray(double[][]::new);
}
}
Algorithms for reducing the number of features while preserving important information.
/**
 * Principal Component Analysis for dimensionality reduction.
 * Fit on training data, then either project samples with {@code apply}
 * or obtain a reduced-dimension projection via {@code getProjection}.
 */
class PCA extends Projection {
/**
 * Fit PCA with default number of components (covariance-based;
 * see {@code cor} for the correlation-matrix variant).
 * @param data the training samples, one row per sample
 */
public static PCA fit(double[][] data);
/** Fit PCA with correlation matrix instead of covariance */
public static PCA cor(double[][] data);
/** Fit PCA with correlation matrix from DataFrame */
public static PCA cor(DataFrame data);
/** Transform data to principal component space */
public double[] apply(double[] x);
/** Get principal components (eigenvectors) */
public double[][] loadings();
/** Get eigenvalues (explained variance) */
public double[] variance();
/** Get explained variance proportion per component */
public double[] varianceProportion();
/** Get cumulative explained variance proportion */
public double[] cumulativeVarianceProportion();
/**
 * Get projection to k dimensions.
 * @param k the number of principal components to keep
 */
public Projection getProjection(int k);
/**
 * Get projection keeping enough components to reach the given
 * cumulative variance proportion.
 * @param varianceThreshold the cumulative variance proportion threshold
 */
public Projection getProjection(double varianceThreshold);
}
/**
 * Kernel PCA for non-linear dimensionality reduction: performs PCA in a
 * kernel-induced feature space.
 */
class KernelPCA extends Projection {
/**
 * Fit Kernel PCA with RBF kernel.
 * @param data the training samples, one row per sample
 * @param k the number of components to keep
 * @param sigma the RBF kernel bandwidth
 */
public static KernelPCA fit(double[][] data, int k, double sigma);
/**
 * Fit with custom kernel.
 * @param kernel the kernel function to use
 */
public static KernelPCA fit(double[][] data, int k, Kernel kernel);
/** Transform data to kernel principal component space */
public double[] apply(double[] x);
/** Get eigenvalues */
public double[] eigenvalues();
/** Get kernel matrix */
public double[][] kernelMatrix();
}
/**
 * Probabilistic PCA with missing value handling.
 */
class ProbabilisticPCA extends Projection {
/**
 * Fit Probabilistic PCA.
 * @param data the training samples, one row per sample
 * @param k the number of latent dimensions
 */
public static ProbabilisticPCA fit(double[][] data, int k);
/** Transform data */
public double[] apply(double[] x);
/** Get the estimated noise variance of the fitted model */
public double noiseVariance();
/** Get the log-likelihood of the fitted model */
public double logLikelihood();
}
/**
 * Random Projection for fast dimensionality reduction.
 */
class RandomProjection extends Projection {
/**
 * Create random projection matrix.
 * @param d presumably the input dimension — confirm against library docs
 * @param k presumably the projected (output) dimension
 */
public static RandomProjection of(int d, int k);
/**
 * Create with specified sparsity.
 * @param density the density (fraction of non-zeros) of the projection matrix
 */
public static RandomProjection of(int d, int k, double density);
/** Transform data */
public double[] apply(double[] x);
/** Get projection matrix */
public double[][] matrix();
}
/**
 * Generalized Hebbian Algorithm for online (streaming) PCA.
 */
class GHA extends Projection {
/**
 * Fit GHA with specified learning rate.
 * @param data the training samples, one row per sample
 * @param k the number of components to learn
 * @param learningRate the Hebbian update step size
 */
public static GHA fit(double[][] data, int k, double learningRate);
/** Transform data */
public double[] apply(double[] x);
/** Online update with a new sample (incremental learning) */
public void update(double[] x);
/** Get learned weights */
public double[][] weights();
}
Usage Example:
import smile.feature.extraction.PCA;
import smile.feature.extraction.KernelPCA;
// Basic PCA
PCA pca = PCA.fit(data);
Projection projection = pca.getProjection(10); // Reduce to 10 dimensions
double[] transformed = projection.apply(newSample);
double[] variance = pca.varianceProportion();
// Kernel PCA for non-linear reduction
KernelPCA kpca = KernelPCA.fit(data, 5, 1.0); // RBF kernel with sigma=1.0
double[] nonLinearTransform = kpca.apply(newSample);
Methods for selecting the most relevant features for machine learning models.
/**
 * Genetic Algorithm for Feature Extraction (feature subset selection).
 */
class GAFE {
/**
 * Perform feature selection using genetic algorithm.
 * @param x the sample matrix, one row per sample
 * @param y the class labels
 * @param populationSize the GA population size
 * @param maxGeneration the maximum number of generations to evolve
 */
public static GAFE fit(double[][] x, int[] y, int populationSize, int maxGeneration);
/** Get selected feature indices */
public int[] features();
/** Get fitness score of the selected feature subset */
public double fitness();
/** Transform data by keeping only the selected feature columns */
public double[][] apply(double[][] x);
}
/**
 * Signal-to-Noise Ratio for feature ranking; higher scores indicate
 * features that better separate the classes.
 */
class SignalNoiseRatio implements Comparable<SignalNoiseRatio> {
/**
 * Calculate SNR for all features.
 * @param x the sample matrix, one row per sample
 * @param y the class labels
 * @return one score object per feature column
 */
public static SignalNoiseRatio[] fit(double[][] x, int[] y);
/** Feature index */
public final int feature;
/** SNR score */
public final double score;
/** Compare by score for ranking */
public int compareTo(SignalNoiseRatio other);
}
/**
 * Sum of Squares Ratio for feature ranking.
 */
class SumSquaresRatio implements Comparable<SumSquaresRatio> {
/**
 * Calculate SSR for all features.
 * @param x the sample matrix, one row per sample
 * @param y the class labels
 * @return one score object per feature column
 */
public static SumSquaresRatio[] fit(double[][] x, int[] y);
/** Feature index */
public final int feature;
/** SSR score */
public final double score;
/** Compare by score for ranking (required by the declared Comparable). */
public int compareTo(SumSquaresRatio other);
}
/**
 * Information Value (IV) for feature selection.
 */
class InformationValue implements Comparable<InformationValue> {
/**
 * Calculate IV for all features.
 * @param x the sample matrix, one row per sample
 * @param y the class labels
 * @return one score object per feature column
 */
public static InformationValue[] fit(double[][] x, int[] y);
/** Feature index */
public final int feature;
/** Information value score */
public final double score;
/** Compare by score for ranking (required by the declared Comparable). */
public int compareTo(InformationValue other);
}
Transformations for scaling features to appropriate ranges and distributions.
/**
 * Z-score standardization (mean=0, std=1), fitted per feature column.
 */
class Standardizer implements Transform {
/**
 * Fit standardizer from training data.
 * @param data the training samples, one row per sample
 */
public static Standardizer fit(double[][] data);
/**
 * Fit with robust statistics (median, MAD) when {@code robust} is true;
 * see also RobustStandardizer.
 */
public static Standardizer fit(double[][] data, boolean robust);
/** Transform feature vector */
public double[] apply(double[] x);
/** Get feature means */
public double[] mean();
/** Get feature standard deviations */
public double[] std();
}
/**
 * Robust standardization using median and MAD (median absolute deviation),
 * less sensitive to outliers than mean/std standardization.
 */
class RobustStandardizer implements Transform {
/**
 * Fit robust standardizer.
 * @param data the training samples, one row per sample
 */
public static RobustStandardizer fit(double[][] data);
/** Transform feature vector */
public double[] apply(double[] x);
/** Get feature medians */
public double[] median();
/** Get median absolute deviations */
public double[] mad();
}
/**
 * Min-Max scaling to specified range.
 */
class Scaler implements Transform {
/**
 * Fit scaler to [0, 1] range.
 * @param data the training samples, one row per sample
 */
public static Scaler fit(double[][] data);
/**
 * Fit scaler to custom range.
 * @param lo the lower bound of the target range
 * @param hi the upper bound of the target range
 */
public static Scaler fit(double[][] data, double lo, double hi);
/** Transform feature vector */
public double[] apply(double[] x);
/** Get minimum values */
public double[] lo();
/** Get maximum values */
public double[] hi();
}
/**
 * Maximum absolute scaling: divides each feature by its maximum absolute
 * value, so zeros remain zero (sparsity-preserving).
 */
class MaxAbsScaler implements Transform {
/**
 * Fit max absolute scaler.
 * @param data the training samples, one row per sample
 */
public static MaxAbsScaler fit(double[][] data);
/** Transform feature vector */
public double[] apply(double[] x);
/** Get maximum absolute values per feature */
public double[] scale();
}
/**
 * Winsor scaling with outlier clipping: values beyond the fitted
 * percentile bounds are clipped before scaling.
 */
class WinsorScaler implements Transform {
/** Fit Winsor scaler with default percentiles (5%, 95%) */
public static WinsorScaler fit(double[][] data);
/**
 * Fit with custom percentiles.
 * @param lower the lower percentile bound
 * @param upper the upper percentile bound
 */
public static WinsorScaler fit(double[][] data, double lower, double upper);
/** Transform feature vector */
public double[] apply(double[] x);
/** Get lower bounds */
public double[] lower();
/** Get upper bounds */
public double[] upper();
}
/**
 * Unit vector normalization. Stateless (no fit step) and applied per
 * sample rather than per feature column.
 */
class Normalizer implements Transform {
/** L2 (Euclidean) normalization */
public static final Normalizer L2 = new Normalizer(Norm.L2);
/** L1 normalization */
public static final Normalizer L1 = new Normalizer(Norm.L1);
/** L-infinity normalization */
public static final Normalizer Linf = new Normalizer(Norm.Linf);
/**
 * Transform to unit vector under the chosen norm.
 * @param x the input feature vector
 * @return the normalized vector
 */
public double[] apply(double[] x);
/** Normalization types */
enum Norm { L1, L2, Linf }
}
Usage Example:
import smile.feature.transform.*;
// Standardization pipeline
Standardizer standardizer = Standardizer.fit(trainData);
double[][] standardizedTrain = standardizer.apply(trainData);
double[] standardizedTest = standardizer.apply(testSample);
// Min-max scaling to [0, 1]
Scaler scaler = Scaler.fit(trainData, 0.0, 1.0);
double[][] scaledData = scaler.apply(trainData);
// Robust scaling for outlier handling
RobustStandardizer robust = RobustStandardizer.fit(trainData);
double[][] robustScaled = robust.apply(trainData);
Methods for handling missing values in datasets.
/**
 * Simple per-feature imputation strategies for missing values.
 * NOTE(review): missing entries are presumably encoded as NaN — confirm
 * against the library documentation.
 */
class SimpleImputer implements Transform {
/** Mean imputation for missing values */
public static SimpleImputer mean(double[][] data);
/** Median imputation for missing values */
public static SimpleImputer median(double[][] data);
/** Mode (most frequent value) imputation for missing values */
public static SimpleImputer mode(double[][] data);
/**
 * Constant value imputation.
 * @param value the fill value used for every missing entry
 */
public static SimpleImputer constant(double[][] data, double value);
/** Transform data with imputation */
public double[] apply(double[] x);
/** Get per-feature imputation values */
public double[] values();
}
/**
 * K-Nearest Neighbors imputation: fills missing entries using the
 * k nearest training samples.
 */
class KNNImputer implements Transform {
/**
 * Fit KNN imputer with specified k.
 * @param k the number of neighbors to use
 */
public static KNNImputer fit(double[][] data, int k);
/**
 * Fit with custom distance metric.
 * @param distance the distance used to find nearest neighbors
 */
public static KNNImputer fit(double[][] data, int k, Distance<double[]> distance);
/** Transform with KNN imputation */
public double[] apply(double[] x);
/** Get k value */
public int k();
}
/**
 * K-Medoids imputation: fills missing entries using cluster medoids.
 */
class KMedoidsImputer implements Transform {
/**
 * Fit K-medoids imputer.
 * @param k the number of medoids (clusters)
 */
public static KMedoidsImputer fit(double[][] data, int k);
/** Transform with medoid imputation */
public double[] apply(double[] x);
/** Get medoid centers */
public double[][] medoids();
}
/**
 * SVD-based imputation interface.
 */
interface SVDImputer {
/**
 * Impute missing values using SVD.
 * @param data the sample matrix containing missing values
 * @param rank the number of singular values/vectors used in the reconstruction
 * @return the matrix with missing entries imputed
 */
double[][] impute(double[][] data, int rank);
}
Feature extraction methods for text and categorical data.
/**
 * Bag of Words transformation for text. Declared as a Function from text
 * to a feature vector rather than Transform: Transform requires
 * apply(double[]), which a text encoder cannot satisfy.
 */
class BagOfWords implements Function<String, double[]> {
/**
 * Fit vocabulary from text documents.
 * @param documents the training corpus
 */
public static BagOfWords fit(String[] documents);
/**
 * Fit with custom parameters.
 * @param maxFeatures the maximum vocabulary size
 * @param minDF the minimum document frequency for a term to be kept
 * @param maxDF the maximum document frequency for a term to be kept
 */
public static BagOfWords fit(String[] documents, int maxFeatures, int minDF, int maxDF);
/** Transform text to feature vector */
public double[] apply(String text);
/** Get vocabulary */
public Map<String, Integer> vocabulary();
/** Get document frequencies */
public double[] documentFrequency();
}
/**
 * Binary encoding for categorical features: encodes each tuple as a
 * fixed-length array of binary features.
 */
class BinaryEncoder implements Function<Tuple, int[]> {
/** Fit binary encoder from data */
public static BinaryEncoder fit(DataFrame data);
/** Encode tuple to binary features */
public int[] apply(Tuple tuple);
/** Get encoding dimension (length of the output vector) */
public int dimension();
}
/**
 * Sparse encoding for high-dimensional categorical data; produces a
 * SparseArray rather than a dense vector.
 */
class SparseEncoder implements Function<Tuple, SparseArray> {
/** Fit sparse encoder */
public static SparseEncoder fit(DataFrame data);
/** Encode tuple to sparse array */
public SparseArray apply(Tuple tuple);
/** Get feature dimension */
public int dimension();
}
/**
 * Feature hashing (the "hashing trick") for categorical/text features:
 * maps tokens to indices by hashing, avoiding an explicit vocabulary.
 */
class HashEncoder implements Function<String, SparseArray> {
/**
 * Create hash encoder with specified dimension.
 * @param dimension the size of the hashed feature space
 */
public static HashEncoder of(int dimension);
/** Encode string to sparse hash features */
public SparseArray apply(String text);
/** Get hash dimension */
public int dimension();
}
Methods for measuring and interpreting feature importance.
/**
 * SHAP (SHapley Additive exPlanations) values interface. Implementations
 * attribute a model's prediction to individual input features.
 * @param <T> the type of input objects
 */
interface SHAP<T> {
/**
 * Calculate SHAP values for feature importance.
 * @param x the input object
 * @return one SHAP value per feature
 */
double[] shap(T x);
/**
 * Calculate SHAP values for multiple samples by applying
 * {@link #shap(Object)} element-wise.
 */
default double[][] shap(T[] x) {
return Arrays.stream(x).map(this::shap).toArray(double[][]::new);
}
}
/**
 * Tree-specific SHAP implementation for tree-based models.
 */
interface TreeSHAP extends SHAP<Tuple> {
/** Calculate SHAP values for tree-based models */
double[] shap(Tuple x);
/**
 * Calculate SHAP interaction values.
 * @return a feature-by-feature matrix of pairwise interaction attributions
 */
double[][] shapInteraction(Tuple x);
}
Abstract base classes for feature transformation implementations.
/**
 * Base class for projection-based dimensionality reduction.
 */
abstract class Projection implements Transform {
/**
 * Project data to lower-dimensional space.
 * @param x the input feature vector
 * @return the projected vector
 */
public abstract double[] project(double[] x);
/** Apply transformation; delegates to {@link #project(double[])}. */
public double[] apply(double[] x) {
return project(x);
}
/** Get the projection (output) dimension */
public abstract int dimension();
}
Comprehensive Usage Example:
import smile.feature.extraction.PCA;
import smile.feature.transform.Standardizer;
import smile.feature.imputation.SimpleImputer;
import smile.feature.selection.SignalNoiseRatio;
// Complete preprocessing pipeline
public class FeaturePipeline {
private SimpleImputer imputer;
private Standardizer standardizer;
private PCA pca;
private int[] selectedFeatures;
public void fit(double[][] rawData, int[] labels) {
// 1. Handle missing values
imputer = SimpleImputer.mean(rawData);
double[][] imputedData = imputer.apply(rawData);
// 2. Standardize features
standardizer = Standardizer.fit(imputedData);
double[][] standardizedData = standardizer.apply(imputedData);
// 3. Feature selection
SignalNoiseRatio[] snr = SignalNoiseRatio.fit(standardizedData, labels);
Arrays.sort(snr, Collections.reverseOrder());
selectedFeatures = Arrays.stream(snr)
.limit(100) // Select top 100 features
.mapToInt(s -> s.feature)
.toArray();
// Select features
double[][] selectedData = selectFeatures(standardizedData, selectedFeatures);
// 4. Dimensionality reduction
pca = PCA.fit(selectedData); // use pca.getProjection(50) to reduce to 50 dimensions
}
public double[] transform(double[] sample) {
double[] imputed = imputer.apply(sample);
double[] standardized = standardizer.apply(imputed);
double[] selected = selectFeatures(standardized, selectedFeatures);
return pca.apply(selected);
}
}
Feature engineering methods commonly support these parameters:
Install with Tessl CLI
npx tessl i tessl/maven-com-github-haifengl--smile-core