CtrlK
Blog · Docs · Log in · Get started
Tessl Logo

tessl/maven-com-github-haifengl--smile-core

Statistical Machine Intelligence and Learning Engine providing comprehensive machine learning algorithms for classification, regression, clustering, and feature engineering in Java

Pending
Overview
Eval results
Files

docs/feature-engineering.md

Feature Engineering

Comprehensive preprocessing pipeline including dimensionality reduction, feature selection, transformation, scaling, and imputation utilities. Smile Core provides a complete toolkit for preparing data for machine learning algorithms.

Capabilities

Core Transformation Interface

All feature transformations implement the Transform interface for consistent data preprocessing.

/**
 * Base interface for feature transformations
 */
interface Transform extends Function<double[], double[]> {
    /** Apply transformation to feature vector */
    double[] apply(double[] x);
    
    /** Transform multiple samples */
    default double[][] apply(double[][] x) {
        return Arrays.stream(x).map(this::apply).toArray(double[][]::new);
    }
}

Dimensionality Reduction

Algorithms for reducing the number of features while preserving important information.

/**
 * Principal Component Analysis for dimensionality reduction
 */
class PCA extends Projection {
    /** Fit PCA with default number of components */
    public static PCA fit(double[][] data);
    
    /** Fit PCA with correlation matrix instead of covariance */
    public static PCA cor(double[][] data);
    
    /** Fit PCA with correlation matrix from DataFrame */
    public static PCA cor(DataFrame data);
    
    /** Transform data to principal component space */
    public double[] apply(double[] x);
    
    /** Get principal components (eigenvectors) */
    public double[][] loadings();
    
    /** Get eigenvalues (explained variance) */
    public double[] variance();
    
    /** Get explained variance proportion */
    public double[] varianceProportion();
    
    /** Get cumulative explained variance proportion */
    public double[] cumulativeVarianceProportion();
    
    /** Get projection to k dimensions */
    public Projection getProjection(int k);
    
    /** Get projection by variance threshold */
    public Projection getProjection(double varianceThreshold);
}

/**
 * Kernel PCA for non-linear dimensionality reduction
 */
class KernelPCA extends Projection {
    /** Fit Kernel PCA with RBF kernel */
    public static KernelPCA fit(double[][] data, int k, double sigma);
    
    /** Fit with custom kernel */
    public static KernelPCA fit(double[][] data, int k, Kernel kernel);
    
    /** Transform data to kernel principal component space */
    public double[] apply(double[] x);
    
    /** Get eigenvalues */
    public double[] eigenvalues();
    
    /** Get kernel matrix */
    public double[][] kernelMatrix();
}

/**
 * Probabilistic PCA with missing value handling
 */
class ProbabilisticPCA extends Projection {
    /** Fit Probabilistic PCA */
    public static ProbabilisticPCA fit(double[][] data, int k);
    
    /** Transform data */
    public double[] apply(double[] x);
    
    /** Get noise variance */
    public double noiseVariance();
    
    /** Get log-likelihood */
    public double logLikelihood();
}

/**
 * Random Projection for fast dimensionality reduction
 */
class RandomProjection extends Projection {
    /** Create random projection matrix */
    public static RandomProjection of(int d, int k);
    
    /** Create with specified sparsity */
    public static RandomProjection of(int d, int k, double density);
    
    /** Transform data */
    public double[] apply(double[] x);
    
    /** Get projection matrix */
    public double[][] matrix();
}

/**
 * Generalized Hebbian Algorithm for online PCA
 */
class GHA extends Projection {
    /** Fit GHA with specified learning rate */
    public static GHA fit(double[][] data, int k, double learningRate);
    
    /** Transform data */
    public double[] apply(double[] x);
    
    /** Online update with new sample */
    public void update(double[] x);
    
    /** Get learned weights */
    public double[][] weights();
}

Usage Example:

import smile.feature.extraction.PCA;
import smile.feature.extraction.KernelPCA;

// Basic PCA: fit on the full data, then project to 10 dimensions.
// Note: fit(double[][]) takes no component count; use getProjection(k).
PCA pca = PCA.fit(data);
Projection projection = pca.getProjection(10); // Reduce to 10 dimensions
double[] transformed = projection.apply(newSample);
double[] variance = pca.varianceProportion();

// Kernel PCA for non-linear reduction
KernelPCA kpca = KernelPCA.fit(data, 5, 1.0); // RBF kernel with sigma=1.0
double[] nonLinearTransform = kpca.apply(newSample);

Feature Selection

Methods for selecting the most relevant features for machine learning models.

/**
 * Genetic Algorithm for Feature Extraction
 */
class GAFE {
    /** Perform feature selection using genetic algorithm */
    public static GAFE fit(double[][] x, int[] y, int populationSize, int maxGeneration);
    
    /** Get selected feature indices */
    public int[] features();
    
    /** Get fitness score */
    public double fitness();
    
    /** Transform data using selected features */
    public double[][] apply(double[][] x);
}

/**
 * Signal-to-Noise Ratio for feature ranking
 */
class SignalNoiseRatio implements Comparable<SignalNoiseRatio> {
    /** Calculate SNR for all features */
    public static SignalNoiseRatio[] fit(double[][] x, int[] y);
    
    /** Feature index */
    public final int feature;
    
    /** SNR score */
    public final double score;
    
    /** Compare by score for ranking */
    public int compareTo(SignalNoiseRatio other);
}

/**
 * Sum of Squares Ratio for feature ranking
 */
class SumSquaresRatio implements Comparable<SumSquaresRatio> {
    /** Calculate SSR for all features */
    public static SumSquaresRatio[] fit(double[][] x, int[] y);
    
    /** Feature index */
    public final int feature;
    
    /** SSR score */
    public final double score;
}

/**
 * Information Value for feature selection
 */
class InformationValue implements Comparable<InformationValue> {
    /** Calculate IV for all features */
    public static InformationValue[] fit(double[][] x, int[] y);
    
    /** Feature index */
    public final int feature;
    
    /** Information value score */
    public final double score;
}

Feature Scaling and Normalization

Transformations for scaling features to appropriate ranges and distributions.

/**
 * Z-score standardization (mean=0, std=1)
 */
class Standardizer implements Transform {
    /** Fit standardizer from training data */
    public static Standardizer fit(double[][] data);
    
    /** Fit with robust statistics (median, MAD) */
    public static Standardizer fit(double[][] data, boolean robust);
    
    /** Transform feature vector */
    public double[] apply(double[] x);
    
    /** Get feature means */
    public double[] mean();
    
    /** Get feature standard deviations */
    public double[] std();
}

/**
 * Robust standardization using median and MAD
 */
class RobustStandardizer implements Transform {
    /** Fit robust standardizer */
    public static RobustStandardizer fit(double[][] data);
    
    /** Transform feature vector */
    public double[] apply(double[] x);
    
    /** Get feature medians */
    public double[] median();
    
    /** Get median absolute deviations */
    public double[] mad();
}

/**
 * Min-Max scaling to specified range
 */
class Scaler implements Transform {
    /** Fit scaler to [0, 1] range */
    public static Scaler fit(double[][] data);
    
    /** Fit scaler to custom range */
    public static Scaler fit(double[][] data, double lo, double hi);
    
    /** Transform feature vector */
    public double[] apply(double[] x);
    
    /** Get minimum values */
    public double[] lo();
    
    /** Get maximum values */
    public double[] hi();
}

/**
 * Maximum absolute scaling
 */
class MaxAbsScaler implements Transform {
    /** Fit max absolute scaler */
    public static MaxAbsScaler fit(double[][] data);
    
    /** Transform feature vector */
    public double[] apply(double[] x);
    
    /** Get maximum absolute values */
    public double[] scale();
}

/**
 * Winsor scaling with outlier clipping
 */
class WinsorScaler implements Transform {
    /** Fit Winsor scaler with default percentiles (5%, 95%) */
    public static WinsorScaler fit(double[][] data);
    
    /** Fit with custom percentiles */
    public static WinsorScaler fit(double[][] data, double lower, double upper);
    
    /** Transform feature vector */
    public double[] apply(double[] x);
    
    /** Get lower bounds */
    public double[] lower();
    
    /** Get upper bounds */
    public double[] upper();
}

/**
 * Unit vector normalization
 */
class Normalizer implements Transform {
    /** L2 normalization */
    public static final Normalizer L2 = new Normalizer(Norm.L2);
    
    /** L1 normalization */
    public static final Normalizer L1 = new Normalizer(Norm.L1);
    
    /** L-infinity normalization */
    public static final Normalizer Linf = new Normalizer(Norm.Linf);
    
    /** Transform to unit vector */
    public double[] apply(double[] x);
    
    /** Normalization types */
    enum Norm { L1, L2, Linf }
}

Usage Example:

import smile.feature.transform.*;

// Standardization pipeline
Standardizer standardizer = Standardizer.fit(trainData);
double[][] standardizedTrain = standardizer.apply(trainData);
double[] standardizedTest = standardizer.apply(testSample);

// Min-max scaling to [0, 1]
Scaler scaler = Scaler.fit(trainData, 0.0, 1.0);
double[][] scaledData = scaler.apply(trainData);

// Robust scaling for outlier handling
RobustStandardizer robust = RobustStandardizer.fit(trainData);
double[][] robustScaled = robust.apply(trainData);

Missing Value Imputation

Methods for handling missing values in datasets.

/**
 * Simple imputation strategies
 */
class SimpleImputer implements Transform {
    /** Mean imputation for missing values */
    public static SimpleImputer mean(double[][] data);
    
    /** Median imputation for missing values */
    public static SimpleImputer median(double[][] data);
    
    /** Mode imputation for missing values */
    public static SimpleImputer mode(double[][] data);
    
    /** Constant value imputation */
    public static SimpleImputer constant(double[][] data, double value);
    
    /** Transform data with imputation */
    public double[] apply(double[] x);
    
    /** Get imputation values */
    public double[] values();
}

/**
 * K-Nearest Neighbors imputation
 */
class KNNImputer implements Transform {
    /** Fit KNN imputer with specified k */
    public static KNNImputer fit(double[][] data, int k);
    
    /** Fit with custom distance metric */
    public static KNNImputer fit(double[][] data, int k, Distance<double[]> distance);
    
    /** Transform with KNN imputation */
    public double[] apply(double[] x);
    
    /** Get k value */
    public int k();
}

/**
 * K-Medoids imputation
 */
class KMedoidsImputer implements Transform {
    /** Fit K-medoids imputer */
    public static KMedoidsImputer fit(double[][] data, int k);
    
    /** Transform with medoid imputation */
    public double[] apply(double[] x);
    
    /** Get medoid centers */
    public double[][] medoids();
}

/**
 * SVD-based imputation interface
 */
interface SVDImputer {
    /** Impute missing values using SVD */
    double[][] impute(double[][] data, int rank);
}

Text Feature Extraction

Feature extraction methods for text and categorical data.

/**
 * Bag of Words transformation for text
 */
class BagOfWords implements Transform {
    /** Fit vocabulary from text documents */
    public static BagOfWords fit(String[] documents);
    
    /** Fit with custom parameters */
    public static BagOfWords fit(String[] documents, int maxFeatures, int minDF, int maxDF);
    
    /** Transform text to feature vector */
    public double[] apply(String text);
    
    /** Get vocabulary */
    public Map<String, Integer> vocabulary();
    
    /** Get document frequencies */
    public double[] documentFrequency();
}

/**
 * Binary encoding for categorical features
 */
class BinaryEncoder implements Function<Tuple, int[]> {
    /** Fit binary encoder from data */
    public static BinaryEncoder fit(DataFrame data);
    
    /** Encode tuple to binary features */
    public int[] apply(Tuple tuple);
    
    /** Get encoding dimension */
    public int dimension();
}

/**
 * Sparse encoding for high-dimensional categorical data
 */
class SparseEncoder implements Function<Tuple, SparseArray> {
    /** Fit sparse encoder */
    public static SparseEncoder fit(DataFrame data);
    
    /** Encode tuple to sparse array */
    public SparseArray apply(Tuple tuple);
    
    /** Get feature dimension */
    public int dimension();
}

/**
 * Feature hashing for categorical features
 */
class HashEncoder implements Function<String, SparseArray> {
    /** Create hash encoder with specified dimension */
    public static HashEncoder of(int dimension);
    
    /** Encode string to sparse hash features */
    public SparseArray apply(String text);
    
    /** Get hash dimension */
    public int dimension();
}

Feature Importance

Methods for measuring and interpreting feature importance.

/**
 * SHAP (SHapley Additive exPlanations) values interface
 * @param <T> the type of input objects
 */
interface SHAP<T> {
    /** Calculate SHAP values for feature importance */
    double[] shap(T x);
    
    /** Calculate SHAP values for multiple samples */
    default double[][] shap(T[] x) {
        return Arrays.stream(x).map(this::shap).toArray(double[][]::new);
    }
}

/**
 * Tree-specific SHAP implementation
 */
interface TreeSHAP extends SHAP<Tuple> {
    /** Calculate SHAP values for tree-based models */
    double[] shap(Tuple x);
    
    /** Calculate SHAP interaction values */
    double[][] shapInteraction(Tuple x);
}

Base Classes

Abstract base classes for feature transformation implementations.

/**
 * Base class for projection-based dimensionality reduction
 */
abstract class Projection implements Transform {
    /** Project data to lower-dimensional space */
    public abstract double[] project(double[] x);
    
    /** Apply transformation (same as project) */
    public double[] apply(double[] x) {
        return project(x);
    }
    
    /** Get projection dimension */
    public abstract int dimension();
}

Comprehensive Usage Example:

import smile.feature.extraction.PCA;
import smile.feature.transform.Standardizer;
import smile.feature.imputation.SimpleImputer;
import smile.feature.selection.SignalNoiseRatio;

// Complete preprocessing pipeline
public class FeaturePipeline {
    private SimpleImputer imputer;
    private Standardizer standardizer;
    private PCA pca;
    private Projection projection;
    private int[] selectedFeatures;
    
    public void fit(double[][] rawData, int[] labels) {
        // 1. Handle missing values
        imputer = SimpleImputer.mean(rawData);
        double[][] imputedData = imputer.apply(rawData);
        
        // 2. Standardize features
        standardizer = Standardizer.fit(imputedData);
        double[][] standardizedData = standardizer.apply(imputedData);
        
        // 3. Feature selection: rank by signal-to-noise ratio, keep the best
        SignalNoiseRatio[] snr = SignalNoiseRatio.fit(standardizedData, labels);
        Arrays.sort(snr, Collections.reverseOrder());
        selectedFeatures = Arrays.stream(snr)
            .limit(100) // Select top 100 features
            .mapToInt(s -> s.feature)
            .toArray();
        
        // Keep only the selected columns
        double[][] selectedData = selectFeatures(standardizedData, selectedFeatures);
        
        // 4. Dimensionality reduction: fit PCA, then project to 50 components
        pca = PCA.fit(selectedData);
        projection = pca.getProjection(50);
    }
    
    public double[] transform(double[] sample) {
        double[] imputed = imputer.apply(sample);
        double[] standardized = standardizer.apply(imputed);
        double[] selected = selectFeatures(standardized, selectedFeatures);
        return projection.apply(selected);
    }
    
    /** Extract the given column indices from each row. */
    private static double[][] selectFeatures(double[][] data, int[] indices) {
        return Arrays.stream(data)
            .map(row -> selectFeatures(row, indices))
            .toArray(double[][]::new);
    }
    
    /** Extract the given indices from a single feature vector. */
    private static double[] selectFeatures(double[] row, int[] indices) {
        return Arrays.stream(indices).mapToDouble(i -> row[i]).toArray();
    }
}

Common Parameters

Feature engineering methods commonly support these parameters:

  • k: Number of components/features to keep
  • threshold: Selection threshold for feature ranking
  • minDF/maxDF: Minimum/maximum document frequency (text)
  • maxFeatures: Maximum number of features to extract
  • learningRate: Learning rate for online algorithms
  • sparse: Whether to return sparse representations
  • seed: Random seed for reproducible results

Install with Tessl CLI

npx tessl i tessl/maven-com-github-haifengl--smile-core

docs

advanced-analytics.md

classification.md

clustering.md

deep-learning.md

feature-engineering.md

index.md

regression.md

validation-metrics.md

tile.json