or run

npx @tessl/cli init
Log in

Version

Tile

Overview

Evals

Files

Files

docs

advanced-analytics.md classification.md clustering.md deep-learning.md feature-engineering.md index.md regression.md validation-metrics.md

docs/feature-engineering.md

0

# Feature Engineering

1

2

Comprehensive preprocessing pipeline including dimensionality reduction, feature selection, transformation, scaling, and imputation utilities. Smile Core provides a complete toolkit for preparing data for machine learning algorithms.

3

4

## Capabilities

5

6

### Core Transformation Interface

7

8

All feature transformations implement the `Transform` interface for consistent data preprocessing.

9

10

```java { .api }

11

/**

12

* Base interface for feature transformations

13

*/

14

interface Transform extends Function<double[], double[]> {

15

/** Apply transformation to feature vector */

16

double[] apply(double[] x);

17

18

/** Transform multiple samples */

19

default double[][] apply(double[][] x) {

20

return Arrays.stream(x).map(this::apply).toArray(double[][]::new);

21

}

22

}

23

```

24

25

### Dimensionality Reduction

26

27

Algorithms for reducing the number of features while preserving important information.

28

29

```java { .api }

30

/**

31

* Principal Component Analysis for dimensionality reduction

32

*/

33

class PCA extends Projection {

34

/** Fit PCA with default number of components */

35

public static PCA fit(double[][] data);

36

37

/** Fit PCA with correlation matrix instead of covariance */

38

public static PCA cor(double[][] data);

39

40

/** Fit PCA with correlation matrix from DataFrame */

41

public static PCA cor(DataFrame data);

42

43

/** Transform data to principal component space */

44

public double[] apply(double[] x);

45

46

/** Get principal components (eigenvectors) */

47

public double[][] loadings();

48

49

/** Get eigenvalues (explained variance) */

50

public double[] variance();

51

52

/** Get explained variance proportion */

53

public double[] varianceProportion();

54

55

/** Get cumulative explained variance proportion */

56

public double[] cumulativeVarianceProportion();

57

58

/** Get projection to k dimensions */

59

public Projection getProjection(int k);

60

61

/** Get projection by variance threshold */

62

public Projection getProjection(double varianceThreshold);

63

}

64

65

/**

66

* Kernel PCA for non-linear dimensionality reduction

67

*/

68

class KernelPCA extends Projection {

69

/** Fit Kernel PCA with RBF kernel */

70

public static KernelPCA fit(double[][] data, int k, double sigma);

71

72

/** Fit with custom kernel */

73

public static KernelPCA fit(double[][] data, int k, Kernel kernel);

74

75

/** Transform data to kernel principal component space */

76

public double[] apply(double[] x);

77

78

/** Get eigenvalues */

79

public double[] eigenvalues();

80

81

/** Get kernel matrix */

82

public double[][] kernelMatrix();

83

}

84

85

/**

86

* Probabilistic PCA with missing value handling

87

*/

88

class ProbabilisticPCA extends Projection {

89

/** Fit Probabilistic PCA */

90

public static ProbabilisticPCA fit(double[][] data, int k);

91

92

/** Transform data */

93

public double[] apply(double[] x);

94

95

/** Get noise variance */

96

public double noiseVariance();

97

98

/** Get log-likelihood */

99

public double logLikelihood();

100

}

101

102

/**

103

* Random Projection for fast dimensionality reduction

104

*/

105

class RandomProjection extends Projection {

106

/** Create random projection matrix */

107

public static RandomProjection of(int d, int k);

108

109

/** Create with specified sparsity */

110

public static RandomProjection of(int d, int k, double density);

111

112

/** Transform data */

113

public double[] apply(double[] x);

114

115

/** Get projection matrix */

116

public double[][] matrix();

117

}

118

119

/**

120

* Generalized Hebbian Algorithm for online PCA

121

*/

122

class GHA extends Projection {

123

/** Fit GHA with specified learning rate */

124

public static GHA fit(double[][] data, int k, double learningRate);

125

126

/** Transform data */

127

public double[] apply(double[] x);

128

129

/** Online update with new sample */

130

public void update(double[] x);

131

132

/** Get learned weights */

133

public double[][] weights();

134

}

135

```

136

137

**Usage Example:**

138

139

```java

140

import smile.feature.extraction.PCA;

141

import smile.feature.extraction.KernelPCA;

142

143

// Basic PCA

144

PCA pca = PCA.fit(data, 10); // Reduce to 10 dimensions

145

double[] transformed = pca.apply(newSample);

146

double[] variance = pca.varianceProportion();

147

148

// Kernel PCA for non-linear reduction

149

KernelPCA kpca = KernelPCA.fit(data, 5, 1.0); // RBF kernel with sigma=1.0

150

double[] nonLinearTransform = kpca.apply(newSample);

151

```

152

153

### Feature Selection

154

155

Methods for selecting the most relevant features for machine learning models.

156

157

```java { .api }

158

/**

159

* Genetic Algorithm based Feature Selection

160

*/

161

class GAFE {

162

/** Perform feature selection using genetic algorithm */

163

public static GAFE fit(double[][] x, int[] y, int populationSize, int maxGeneration);

164

165

/** Get selected feature indices */

166

public int[] features();

167

168

/** Get fitness score */

169

public double fitness();

170

171

/** Transform data using selected features */

172

public double[][] apply(double[][] x);

173

}

174

175

/**

176

* Signal-to-Noise Ratio for feature ranking

177

*/

178

class SignalNoiseRatio implements Comparable<SignalNoiseRatio> {

179

/** Calculate SNR for all features */

180

public static SignalNoiseRatio[] fit(double[][] x, int[] y);

181

182

/** Feature index */

183

public final int feature;

184

185

/** SNR score */

186

public final double score;

187

188

/** Compare by score for ranking */

189

public int compareTo(SignalNoiseRatio other);

190

}

191

192

/**

193

* Sum of Squares Ratio for feature ranking

194

*/

195

class SumSquaresRatio implements Comparable<SumSquaresRatio> {

196

/** Calculate SSR for all features */

197

public static SumSquaresRatio[] fit(double[][] x, int[] y);

198

199

/** Feature index */

200

public final int feature;

201

202

/** SSR score */

203

public final double score;

204

}

205

206

/**

207

* Information Value for feature selection

208

*/

209

class InformationValue implements Comparable<InformationValue> {

210

/** Calculate IV for all features */

211

public static InformationValue[] fit(double[][] x, int[] y);

212

213

/** Feature index */

214

public final int feature;

215

216

/** Information value score */

217

public final double score;

218

}

219

```

220

221

### Feature Scaling and Normalization

222

223

Transformations for scaling features to appropriate ranges and distributions.

224

225

```java { .api }

226

/**

227

* Z-score standardization (mean=0, std=1)

228

*/

229

class Standardizer implements Transform {

230

/** Fit standardizer from training data */

231

public static Standardizer fit(double[][] data);

232

233

/** Fit with robust statistics (median, MAD) */

234

public static Standardizer fit(double[][] data, boolean robust);

235

236

/** Transform feature vector */

237

public double[] apply(double[] x);

238

239

/** Get feature means */

240

public double[] mean();

241

242

/** Get feature standard deviations */

243

public double[] std();

244

}

245

246

/**

247

* Robust standardization using median and MAD

248

*/

249

class RobustStandardizer implements Transform {

250

/** Fit robust standardizer */

251

public static RobustStandardizer fit(double[][] data);

252

253

/** Transform feature vector */

254

public double[] apply(double[] x);

255

256

/** Get feature medians */

257

public double[] median();

258

259

/** Get median absolute deviations */

260

public double[] mad();

261

}

262

263

/**

264

* Min-Max scaling to specified range

265

*/

266

class Scaler implements Transform {

267

/** Fit scaler to [0, 1] range */

268

public static Scaler fit(double[][] data);

269

270

/** Fit scaler to custom range */

271

public static Scaler fit(double[][] data, double lo, double hi);

272

273

/** Transform feature vector */

274

public double[] apply(double[] x);

275

276

/** Get minimum values */

277

public double[] lo();

278

279

/** Get maximum values */

280

public double[] hi();

281

}

282

283

/**

284

* Maximum absolute scaling

285

*/

286

class MaxAbsScaler implements Transform {

287

/** Fit max absolute scaler */

288

public static MaxAbsScaler fit(double[][] data);

289

290

/** Transform feature vector */

291

public double[] apply(double[] x);

292

293

/** Get maximum absolute values */

294

public double[] scale();

295

}

296

297

/**

298

* Winsor scaling with outlier clipping

299

*/

300

class WinsorScaler implements Transform {

301

/** Fit Winsor scaler with default percentiles (5%, 95%) */

302

public static WinsorScaler fit(double[][] data);

303

304

/** Fit with custom percentiles */

305

public static WinsorScaler fit(double[][] data, double lower, double upper);

306

307

/** Transform feature vector */

308

public double[] apply(double[] x);

309

310

/** Get lower bounds */

311

public double[] lower();

312

313

/** Get upper bounds */

314

public double[] upper();

315

}

316

317

/**

318

* Unit vector normalization

319

*/

320

class Normalizer implements Transform {

321

/** L2 normalization */

322

public static final Normalizer L2 = new Normalizer(Norm.L2);

323

324

/** L1 normalization */

325

public static final Normalizer L1 = new Normalizer(Norm.L1);

326

327

/** L-infinity normalization */

328

public static final Normalizer Linf = new Normalizer(Norm.Linf);

329

330

/** Transform to unit vector */

331

public double[] apply(double[] x);

332

333

/** Normalization types */

334

enum Norm { L1, L2, Linf }

335

}

336

```

337

338

**Usage Example:**

339

340

```java

341

import smile.feature.transform.*;

342

343

// Standardization pipeline

344

Standardizer standardizer = Standardizer.fit(trainData);

345

double[][] standardizedTrain = standardizer.apply(trainData);

346

double[] standardizedTest = standardizer.apply(testSample);

347

348

// Min-max scaling to [0, 1]

349

Scaler scaler = Scaler.fit(trainData, 0.0, 1.0);

350

double[][] scaledData = scaler.apply(trainData);

351

352

// Robust scaling for outlier handling

353

RobustStandardizer robust = RobustStandardizer.fit(trainData);

354

double[][] robustScaled = robust.apply(trainData);

355

```

356

357

### Missing Value Imputation

358

359

Methods for handling missing values in datasets.

360

361

```java { .api }

362

/**

363

* Simple imputation strategies

364

*/

365

class SimpleImputer implements Transform {

366

/** Mean imputation for missing values */

367

public static SimpleImputer mean(double[][] data);

368

369

/** Median imputation for missing values */

370

public static SimpleImputer median(double[][] data);

371

372

/** Mode imputation for missing values */

373

public static SimpleImputer mode(double[][] data);

374

375

/** Constant value imputation */

376

public static SimpleImputer constant(double[][] data, double value);

377

378

/** Transform data with imputation */

379

public double[] apply(double[] x);

380

381

/** Get imputation values */

382

public double[] values();

383

}

384

385

/**

386

* K-Nearest Neighbors imputation

387

*/

388

class KNNImputer implements Transform {

389

/** Fit KNN imputer with specified k */

390

public static KNNImputer fit(double[][] data, int k);

391

392

/** Fit with custom distance metric */

393

public static KNNImputer fit(double[][] data, int k, Distance<double[]> distance);

394

395

/** Transform with KNN imputation */

396

public double[] apply(double[] x);

397

398

/** Get k value */

399

public int k();

400

}

401

402

/**

403

* K-Medoids imputation

404

*/

405

class KMedoidsImputer implements Transform {

406

/** Fit K-medoids imputer */

407

public static KMedoidsImputer fit(double[][] data, int k);

408

409

/** Transform with medoid imputation */

410

public double[] apply(double[] x);

411

412

/** Get medoid centers */

413

public double[][] medoids();

414

}

415

416

/**

417

* SVD-based imputation interface

418

*/

419

interface SVDImputer {

420

/** Impute missing values using SVD */

421

double[][] impute(double[][] data, int rank);

422

}

423

```

424

425

### Text Feature Extraction

426

427

Feature extraction methods for text and categorical data.

428

429

```java { .api }

430

/**

431

* Bag of Words transformation for text

432

*/

433

class BagOfWords implements Transform {

434

/** Fit vocabulary from text documents */

435

public static BagOfWords fit(String[] documents);

436

437

/** Fit with custom parameters */

438

public static BagOfWords fit(String[] documents, int maxFeatures, int minDF, int maxDF);

439

440

/** Transform text to feature vector */

441

public double[] apply(String text);

442

443

/** Get vocabulary */

444

public Map<String, Integer> vocabulary();

445

446

/** Get document frequencies */

447

public double[] documentFrequency();

448

}

449

450

/**

451

* Binary encoding for categorical features

452

*/

453

class BinaryEncoder implements Function<Tuple, int[]> {

454

/** Fit binary encoder from data */

455

public static BinaryEncoder fit(DataFrame data);

456

457

/** Encode tuple to binary features */

458

public int[] apply(Tuple tuple);

459

460

/** Get encoding dimension */

461

public int dimension();

462

}

463

464

/**

465

* Sparse encoding for high-dimensional categorical data

466

*/

467

class SparseEncoder implements Function<Tuple, SparseArray> {

468

/** Fit sparse encoder */

469

public static SparseEncoder fit(DataFrame data);

470

471

/** Encode tuple to sparse array */

472

public SparseArray apply(Tuple tuple);

473

474

/** Get feature dimension */

475

public int dimension();

476

}

477

478

/**

479

* Feature hashing for categorical features

480

*/

481

class HashEncoder implements Function<String, SparseArray> {

482

/** Create hash encoder with specified dimension */

483

public static HashEncoder of(int dimension);

484

485

/** Encode string to sparse hash features */

486

public SparseArray apply(String text);

487

488

/** Get hash dimension */

489

public int dimension();

490

}

491

```

492

493

### Feature Importance

494

495

Methods for measuring and interpreting feature importance.

496

497

```java { .api }

498

/**

499

* SHAP (SHapley Additive exPlanations) values interface

500

* @param <T> the type of input objects

501

*/

502

interface SHAP<T> {

503

/** Calculate SHAP values for feature importance */

504

double[] shap(T x);

505

506

/** Calculate SHAP values for multiple samples */

507

default double[][] shap(T[] x) {

508

return Arrays.stream(x).map(this::shap).toArray(double[][]::new);

509

}

510

}

511

512

/**

513

* Tree-specific SHAP implementation

514

*/

515

interface TreeSHAP extends SHAP<Tuple> {

516

/** Calculate SHAP values for tree-based models */

517

double[] shap(Tuple x);

518

519

/** Calculate SHAP interaction values */

520

double[][] shapInteraction(Tuple x);

521

}

522

```

523

524

### Base Classes

525

526

Abstract base classes for feature transformation implementations.

527

528

```java { .api }

529

/**

530

* Base class for projection-based dimensionality reduction

531

*/

532

abstract class Projection implements Transform {

533

/** Project data to lower-dimensional space */

534

public abstract double[] project(double[] x);

535

536

/** Apply transformation (same as project) */

537

public double[] apply(double[] x) {

538

return project(x);

539

}

540

541

/** Get projection dimension */

542

public abstract int dimension();

543

}

544

```

545

546

**Comprehensive Usage Example:**

547

548

```java

549

import smile.feature.extraction.PCA;

550

import smile.feature.transform.Standardizer;

551

import smile.feature.imputation.SimpleImputer;

552

import smile.feature.selection.SignalNoiseRatio;

553

554

// Complete preprocessing pipeline

555

public class FeaturePipeline {

556

private SimpleImputer imputer;

557

private Standardizer standardizer;

558

private PCA pca;

559

private int[] selectedFeatures;

560

561

public void fit(double[][] rawData, int[] labels) {

562

// 1. Handle missing values

563

imputer = SimpleImputer.mean(rawData);

564

double[][] imputedData = imputer.apply(rawData);

565

566

// 2. Standardize features

567

standardizer = Standardizer.fit(imputedData);

568

double[][] standardizedData = standardizer.apply(imputedData);

569

570

// 3. Feature selection

571

SignalNoiseRatio[] snr = SignalNoiseRatio.fit(standardizedData, labels);

572

Arrays.sort(snr, Collections.reverseOrder());

573

selectedFeatures = Arrays.stream(snr)

574

.limit(100) // Select top 100 features

575

.mapToInt(s -> s.feature)

576

.toArray();

577

578

// Select features

579

double[][] selectedData = selectFeatures(standardizedData, selectedFeatures);

580

581

// 4. Dimensionality reduction

582

pca = PCA.fit(selectedData, 50); // Reduce to 50 dimensions

583

}

584

585

public double[] transform(double[] sample) {

586

double[] imputed = imputer.apply(sample);

587

double[] standardized = standardizer.apply(imputed);

588

double[] selected = selectFeatures(standardized, selectedFeatures);

589

return pca.apply(selected);

590

}

591

}

592

```

593

594

### Common Parameters

595

596

Feature engineering methods commonly support these parameters:

597

598

- **k**: Number of components/features to keep

599

- **threshold**: Selection threshold for feature ranking

600

- **minDF/maxDF**: Minimum/maximum document frequency (text)

601

- **maxFeatures**: Maximum number of features to extract

602

- **learningRate**: Learning rate for online algorithms

603

- **sparse**: Whether to return sparse representations

604

- **random_state**: Random seed for reproducible results