or run

npx @tessl/cli init
Log in

Version

Tile

Overview

Evals

Files

Files

docs

classification.md clustering.md evaluation.md feature-processing.md frequent-pattern-mining.md index.md linear-algebra.md pipeline.md recommendation.md regression.md statistics.md

feature-processing.mddocs/

0

# Feature Processing

1

2

MLlib offers comprehensive capabilities for feature extraction, transformation, selection, and engineering. It provides over 60 feature processing methods, including text processing, scaling, dimensionality reduction, categorical encoding, and feature selection.

3

4

## Capabilities

5

6

### Vector Assembly and Manipulation

7

8

Core utilities for combining and manipulating feature vectors.

9

10

```scala { .api }

11

/**

12

* VectorAssembler - combines multiple columns into a single vector column

13

* Essential for preparing features for ML algorithms

14

*/

15

class VectorAssembler extends Transformer with HasInputCols with HasOutputCol with HasHandleInvalid {

16

def setInputCols(value: Array[String]): this.type

17

def setOutputCol(value: String): this.type

18

def setHandleInvalid(value: String): this.type

19

}

20

21

/**

22

* VectorSlicer - selects a subset of features from a vector

23

* Useful for feature selection and dimensionality reduction

24

*/

25

class VectorSlicer extends Transformer with HasInputCol with HasOutputCol {

26

def setInputCol(value: String): this.type

27

def setOutputCol(value: String): this.type

28

def setIndices(value: Array[Int]): this.type

29

def setNames(value: Array[String]): this.type

30

}

31

32

/**

33

* VectorSizeHint - provides size information for vectors

34

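* Adds the declared size to the vector column's metadata so that downstream stages that need it (e.g. VectorAssembler on streaming data) know the vector length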

35

*/

36

class VectorSizeHint extends Transformer with HasInputCol with HasOutputCol {

37

def setInputCol(value: String): this.type

38

def setOutputCol(value: String): this.type

39

def setSize(value: Int): this.type

40

def setHandleInvalid(value: String): this.type

41

}

42

```

43

44

**Usage Example:**

45

46

```scala

47

import org.apache.spark.ml.feature.VectorAssembler

48

49

val assembler = new VectorAssembler()

50

.setInputCols(Array("hour", "mobile", "userFeatures"))

51

.setOutputCol("features")

52

53

val output = assembler.transform(dataset)

54

```

55

56

### Feature Scaling and Normalization

57

58

Methods for scaling and normalizing feature values for improved algorithm performance.

59

60

```scala { .api }

61

/**

62

* StandardScaler - standardizes features by removing mean and scaling to unit variance

63

* By default only scales to unit standard deviation (withStd = true); zero-mean centering requires setWithMean(true), which is off by default

64

*/

65

class StandardScaler extends Estimator[StandardScalerModel] with StandardScalerParams {

66

def setInputCol(value: String): this.type

67

def setOutputCol(value: String): this.type

68

def setWithMean(value: Boolean): this.type

69

def setWithStd(value: Boolean): this.type

70

}

71

72

class StandardScalerModel extends Model[StandardScalerModel] with StandardScalerParams {

73

def mean: Vector

74

def std: Vector

75

}

76

77

/**

78

* MinMaxScaler - rescales features to a specified range [min, max]

79

* Transforms features to fit within the specified minimum and maximum values

80

*/

81

class MinMaxScaler extends Estimator[MinMaxScalerModel] with MinMaxScalerParams {

82

def setInputCol(value: String): this.type

83

def setOutputCol(value: String): this.type

84

def setMin(value: Double): this.type

85

def setMax(value: Double): this.type

86

}

87

88

class MinMaxScalerModel extends Model[MinMaxScalerModel] with MinMaxScalerParams {

89

def originalMin: Vector

90

def originalMax: Vector

91

}

92

93

/**

94

* MaxAbsScaler - scales features by the maximum absolute value

95

* Divides each feature by its maximum absolute value to scale to [-1, 1]

96

*/

97

class MaxAbsScaler extends Estimator[MaxAbsScalerModel] with MaxAbsScalerParams {

98

def setInputCol(value: String): this.type

99

def setOutputCol(value: String): this.type

100

}

101

102

class MaxAbsScalerModel extends Model[MaxAbsScalerModel] with MaxAbsScalerParams {

103

def maxAbs: Vector

104

}

105

106

/**

107

* RobustScaler - scales features using median and interquartile range

108

* More robust to outliers than StandardScaler

109

*/

110

class RobustScaler extends Estimator[RobustScalerModel] with RobustScalerParams {

111

def setInputCol(value: String): this.type

112

def setOutputCol(value: String): this.type

113

def setWithCentering(value: Boolean): this.type

114

def setWithScaling(value: Boolean): this.type

115

def setLower(value: Double): this.type

116

def setUpper(value: Double): this.type

117

}

118

119

class RobustScalerModel extends Model[RobustScalerModel] with RobustScalerParams {

120

def median: Vector

121

def range: Vector

122

}

123

124

/**

125

* Normalizer - normalizes vectors to have unit norm

126

* Scales each sample (row vector) to unit p-norm; p >= 1 is set with setP (default 2.0, Double.PositiveInfinity for the Inf norm)

127

*/

128

class Normalizer extends Transformer with HasInputCol with HasOutputCol {

129

def setInputCol(value: String): this.type

130

def setOutputCol(value: String): this.type

131

def setP(value: Double): this.type

132

}

133

```

134

135

### Categorical Feature Processing

136

137

Methods for handling categorical variables including indexing and encoding.

138

139

```scala { .api }

140

/**

141

* StringIndexer - maps string values to numeric indices

142

* Converts categorical string features to numeric indices for ML algorithms

143

*/

144

class StringIndexer extends Estimator[StringIndexerModel] with StringIndexerParams {

145

def setInputCol(value: String): this.type

146

def setOutputCol(value: String): this.type

147

def setHandleInvalid(value: String): this.type

148

def setStringOrderType(value: String): this.type

149

}

150

151

class StringIndexerModel extends Model[StringIndexerModel] with StringIndexerParams {

152

def labels: Array[String]

153

def labelsArray: Array[Array[String]]

154

}

155

156

/**

157

* IndexToString - maps numeric indices back to string values

158

* Reverse operation of StringIndexer for interpreting model outputs

159

*/

160

class IndexToString extends Transformer with HasInputCol with HasOutputCol {

161

def setInputCol(value: String): this.type

162

def setOutputCol(value: String): this.type

163

def setLabels(value: Array[String]): this.type

164

}

165

166

/**

167

* OneHotEncoder - converts categorical indices to binary vectors

168

* Maps each category index to a sparse binary vector with at most a single one-value (one output vector column per input column)

169

*/

170

class OneHotEncoder extends Estimator[OneHotEncoderModel] with OneHotEncoderParams {

171

def setInputCols(value: Array[String]): this.type

172

def setOutputCols(value: Array[String]): this.type

173

def setDropLast(value: Boolean): this.type

174

def setHandleInvalid(value: String): this.type

175

}

176

177

class OneHotEncoderModel extends Model[OneHotEncoderModel] with OneHotEncoderParams {

178

def categorySizes: Array[Int]

179

}

180

181

/**

182

* VectorIndexer - automatically identifies categorical features in vectors

183

* Treats features with <= maxCategories unique values as categorical

184

*/

185

class VectorIndexer extends Estimator[VectorIndexerModel] with VectorIndexerParams {

186

def setInputCol(value: String): this.type

187

def setOutputCol(value: String): this.type

188

def setMaxCategories(value: Int): this.type

189

def setHandleInvalid(value: String): this.type

190

}

191

192

class VectorIndexerModel extends Model[VectorIndexerModel] with VectorIndexerParams {

193

def numFeatures: Int

194

def categoryMaps: Map[Int, Map[Double, Int]]

195

}

196

```

197

198

### Text Processing and Feature Extraction

199

200

Comprehensive text processing capabilities for natural language data.

201

202

```scala { .api }

203

/**

204

* Tokenizer - splits text into individual words

205

* Converts the input string to lowercase and splits it on whitespace (punctuation is not removed; use RegexTokenizer for that)

206

*/

207

class Tokenizer extends Transformer with HasInputCol with HasOutputCol {

208

def setInputCol(value: String): this.type

209

def setOutputCol(value: String): this.type

210

}

211

212

/**

213

* RegexTokenizer - tokenizes text using regular expressions

214

* More flexible tokenization with configurable patterns

215

*/

216

class RegexTokenizer extends Transformer with HasInputCol with HasOutputCol {

217

def setInputCol(value: String): this.type

218

def setOutputCol(value: String): this.type

219

def setPattern(value: String): this.type

220

def setGaps(value: Boolean): this.type

221

def setToLowercase(value: Boolean): this.type

222

def setMinTokenLength(value: Int): this.type

223

}

224

225

/**

226

* StopWordsRemover - removes stop words from tokenized text

227

* Filters common words that don't contribute to text meaning

228

*/

229

class StopWordsRemover extends Transformer with HasInputCol with HasOutputCol {

230

def setInputCol(value: String): this.type

231

def setOutputCol(value: String): this.type

232

def setStopWords(value: Array[String]): this.type

233

def setCaseSensitive(value: Boolean): this.type

234

def setLocale(value: String): this.type

235

}

236

237

/**

238

* NGram - generates n-grams from token sequences

239

* Creates sequences of n consecutive tokens for text analysis

240

*/

241

class NGram extends Transformer with HasInputCol with HasOutputCol {

242

def setInputCol(value: String): this.type

243

def setOutputCol(value: String): this.type

244

def setN(value: Int): this.type

245

}

246

247

/**

248

* HashingTF - maps terms to term frequency vectors using hashing

249

* Fast text vectorization using hash functions

250

*/

251

class HashingTF extends Transformer with HasInputCol with HasOutputCol {

252

def setInputCol(value: String): this.type

253

def setOutputCol(value: String): this.type

254

def setNumFeatures(value: Int): this.type

255

def setBinary(value: Boolean): this.type

256

}

257

258

/**

259

* CountVectorizer - converts text to vectors based on term counts

260

* Creates vocabulary and maps documents to term frequency vectors

261

*/

262

class CountVectorizer extends Estimator[CountVectorizerModel] with CountVectorizerParams {

263

def setInputCol(value: String): this.type

264

def setOutputCol(value: String): this.type

265

def setVocabSize(value: Int): this.type

266

def setMinDF(value: Double): this.type

267

def setMaxDF(value: Double): this.type

268

def setMinTF(value: Double): this.type

269

def setBinary(value: Boolean): this.type

270

}

271

272

class CountVectorizerModel extends Model[CountVectorizerModel] with CountVectorizerParams {

273

def vocabulary: Array[String]

274

}

275

276

/**

277

* IDF - computes inverse document frequency for TF-IDF

278

* Weights term frequencies by their inverse document frequency

279

*/

280

class IDF extends Estimator[IDFModel] with IDFParams {

281

def setInputCol(value: String): this.type

282

def setOutputCol(value: String): this.type

283

def setMinDocFreq(value: Int): this.type

284

}

285

286

class IDFModel extends Model[IDFModel] with IDFParams {

287

def idf: Vector

288

def docFreq: Array[Long]

289

def numDocs: Long

290

}

291

292

/**

293

* Word2Vec - learns vector representations of words

294

* Trains word embeddings using the Word2Vec algorithm

295

*/

296

class Word2Vec extends Estimator[Word2VecModel] with Word2VecParams {

297

def setInputCol(value: String): this.type

298

def setOutputCol(value: String): this.type

299

def setVectorSize(value: Int): this.type

300

def setMinCount(value: Int): this.type

301

def setNumPartitions(value: Int): this.type

302

def setStepSize(value: Double): this.type

303

def setMaxIter(value: Int): this.type

304

def setSeed(value: Long): this.type

305

def setWindowSize(value: Int): this.type

306

}

307

308

class Word2VecModel extends Model[Word2VecModel] with Word2VecParams {

309

def getVectors: DataFrame

310

def findSynonyms(word: String, num: Int): DataFrame

311

def findSynonymsArray(word: String, num: Int): Array[(String, Double)]

312

def transform(word: String): Vector

313

}

314

```

315

316

### Feature Selection

317

318

Methods for selecting the most relevant features for modeling.

319

320

```scala { .api }

321

/**

322

* ChiSqSelector - selects features based on Chi-squared test

323

* Statistical selection of categorical features against a categorical label (deprecated in Spark 3.1 in favor of UnivariateFeatureSelector)

324

*/

325

class ChiSqSelector extends Estimator[ChiSqSelectorModel] with ChiSqSelectorParams {

326

def setFeaturesCol(value: String): this.type

327

def setOutputCol(value: String): this.type

328

def setLabelCol(value: String): this.type

329

def setSelectorType(value: String): this.type

330

def setNumTopFeatures(value: Int): this.type

331

def setPercentile(value: Double): this.type

332

def setFpr(value: Double): this.type

333

def setFdr(value: Double): this.type

334

def setFwe(value: Double): this.type

335

}

336

337

class ChiSqSelectorModel extends Model[ChiSqSelectorModel] with ChiSqSelectorParams {

338

def selectedFeatures: Array[Int]

339

}

340

341

/**

342

* UnivariateFeatureSelector - selects features using statistical tests

343

* Supports various statistical tests for feature selection

344

*/

345

class UnivariateFeatureSelector extends Estimator[UnivariateFeatureSelectorModel] with UnivariateFeatureSelectorParams {

346

def setFeaturesCol(value: String): this.type

347

def setOutputCol(value: String): this.type

348

def setLabelCol(value: String): this.type

349

def setFeatureType(value: String): this.type

350

def setLabelType(value: String): this.type

351

def setSelectionMode(value: String): this.type

352

def setSelectionThreshold(value: Double): this.type

353

}

354

355

class UnivariateFeatureSelectorModel extends Model[UnivariateFeatureSelectorModel] with UnivariateFeatureSelectorParams {

356

def selectedFeatures: Array[Int]

357

}

358

359

/**

360

* VarianceThresholdSelector - removes low-variance features

361

* Filters features with variance below specified threshold

362

*/

363

class VarianceThresholdSelector extends Estimator[VarianceThresholdSelectorModel] with VarianceThresholdSelectorParams {

364

def setFeaturesCol(value: String): this.type

365

def setOutputCol(value: String): this.type

366

def setVarianceThreshold(value: Double): this.type

367

}

368

369

class VarianceThresholdSelectorModel extends Model[VarianceThresholdSelectorModel] with VarianceThresholdSelectorParams {

370

def selectedFeatures: Array[Int]

371

}

372

```

373

374

### Dimensionality Reduction

375

376

Methods for reducing the number of features while preserving information.

377

378

```scala { .api }

379

/**

380

* PCA - Principal Component Analysis for dimensionality reduction

381

* Projects data onto lower dimensional space preserving maximum variance

382

*/

383

class PCA extends Estimator[PCAModel] with PCAParams {

384

def setInputCol(value: String): this.type

385

def setOutputCol(value: String): this.type

386

def setK(value: Int): this.type

387

}

388

389

class PCAModel extends Model[PCAModel] with PCAParams {

390

def pc: Matrix

391

def explainedVariance: Vector

392

}

393

394

/**

395

* DCT - Discrete Cosine Transform

396

* Takes the 1D discrete cosine transform (scaled DCT-II) of a real vector, returning a vector of the same length; setInverse(true) applies the inverse

397

*/

398

class DCT extends Transformer with HasInputCol with HasOutputCol {

399

def setInputCol(value: String): this.type

400

def setOutputCol(value: String): this.type

401

def setInverse(value: Boolean): this.type

402

}

403

```

404

405

### Bucketing and Discretization

406

407

Methods for converting continuous variables into discrete bins.

408

409

```scala { .api }

410

/**

411

* Bucketizer - maps continuous features to buckets

412

* Converts continuous values to discrete bins using thresholds

413

*/

414

class Bucketizer extends Transformer with HasInputCol with HasOutputCol with HasHandleInvalid {

415

def setInputCol(value: String): this.type

416

def setOutputCol(value: String): this.type

417

def setSplits(value: Array[Double]): this.type

418

def setInputCols(value: Array[String]): this.type

419

def setOutputCols(value: Array[String]): this.type

420

def setSplitsArray(value: Array[Array[Double]]): this.type

421

def setHandleInvalid(value: String): this.type

422

}

423

424

/**

425

* QuantileDiscretizer - discretizes continuous features using quantiles

426

* Automatically determines bucket boundaries based on data distribution

427

*/

428

class QuantileDiscretizer extends Estimator[QuantileDiscretizerModel] with QuantileDiscretizerParams {

429

def setInputCol(value: String): this.type

430

def setOutputCol(value: String): this.type

431

def setInputCols(value: Array[String]): this.type

432

def setOutputCols(value: Array[String]): this.type

433

def setNumBuckets(value: Int): this.type

434

def setNumBucketsArray(value: Array[Int]): this.type

435

def setRelativeError(value: Double): this.type

436

def setHandleInvalid(value: String): this.type

437

}

438

439

class QuantileDiscretizerModel extends Model[QuantileDiscretizerModel] with QuantileDiscretizerParams {

440

def getSplits: Array[Double]

441

def getSplitsArray: Array[Array[Double]]

442

}

443

```

444

445

### Feature Engineering

446

447

Advanced feature engineering techniques for creating new features.

448

449

```scala { .api }

450

/**

451

* PolynomialExpansion - generates polynomial features

452

* Creates interaction features by expanding polynomial terms

453

*/

454

class PolynomialExpansion extends Transformer with HasInputCol with HasOutputCol {

455

def setInputCol(value: String): this.type

456

def setOutputCol(value: String): this.type

457

def setDegree(value: Int): this.type

458

}

459

460

/**

461

* Interaction - creates interaction features between selected columns

462

* Generates cross-product features for modeling feature interactions

463

*/

464

class Interaction extends Transformer with HasInputCols with HasOutputCol {

465

def setInputCols(value: Array[String]): this.type

466

def setOutputCol(value: String): this.type

467

}

468

469

/**

470

* Binarizer - thresholds numerical features to binary values

471

* Converts continuous values to binary based on threshold

472

*/

473

class Binarizer extends Transformer with HasInputCol with HasOutputCol with HasThreshold {

474

def setInputCol(value: String): this.type

475

def setOutputCol(value: String): this.type

476

def setInputCols(value: Array[String]): this.type

477

def setOutputCols(value: Array[String]): this.type

478

def setThreshold(value: Double): this.type

479

def setThresholds(value: Array[Double]): this.type

480

}

481

482

/**

483

* FeatureHasher - hashes categorical features to sparse vectors

484

* Maps arbitrary features to fixed-length vectors using hashing

485

*/

486

class FeatureHasher extends Transformer with HasInputCols with HasOutputCol {

487

def setInputCols(value: Array[String]): this.type

488

def setOutputCol(value: String): this.type

489

def setNumFeatures(value: Int): this.type

490

def setCategoricalCols(value: Array[String]): this.type

491

}

492

493

/**

494

* SQLTransformer - applies SQL transformations to DataFrames

495

* Enables feature engineering with a SQL statement of the form "SELECT ... FROM __THIS__ ...", where __THIS__ stands for the input dataset

496

*/

497

class SQLTransformer extends Transformer {

498

def setStatement(value: String): this.type

499

def getStatement: String

500

}

501

502

/**

503

* RFormula - creates features using R-style formulas

504

* Provides R-like syntax for feature specification and encoding

505

*/

506

class RFormula extends Estimator[RFormulaModel] with RFormulaParams {

507

def setFormula(value: String): this.type

508

def setFeaturesCol(value: String): this.type

509

def setLabelCol(value: String): this.type

510

def setForceIndexLabel(value: Boolean): this.type

511

def setStringIndexerOrderType(value: String): this.type

512

def setHandleInvalid(value: String): this.type

513

}

514

515

class RFormulaModel extends Model[RFormulaModel] with RFormulaParams {

516

def pipelineModel: PipelineModel

517

}

518

```

519

520

## Usage Examples

521

522

### Complete Feature Processing Pipeline

523

524

```scala

525

import org.apache.spark.ml.feature._

526

import org.apache.spark.ml.Pipeline

527

528

// Text processing pipeline

529

val tokenizer = new Tokenizer()

530

.setInputCol("text")

531

.setOutputCol("words")

532

533

val stopWordsRemover = new StopWordsRemover()

534

.setInputCol("words")

535

.setOutputCol("filtered")

536

537

val hashingTF = new HashingTF()

538

.setInputCol("filtered")

539

.setOutputCol("rawFeatures")

540

.setNumFeatures(1000)

541

542

val idf = new IDF()

543

.setInputCol("rawFeatures")

544

.setOutputCol("textFeatures")

545

546

// Categorical feature processing

547

val stringIndexer = new StringIndexer()

548

.setInputCol("category")

549

.setOutputCol("categoryIndex")

550

551

val oneHotEncoder = new OneHotEncoder()

552

.setInputCols(Array("categoryIndex"))

553

.setOutputCols(Array("categoryVec"))

554

555

// Numerical feature processing

556

val scaler = new StandardScaler()

557

.setInputCol("numericFeatures")

558

.setOutputCol("scaledNumeric")

559

.setWithMean(true)

560

.setWithStd(true)

561

562

// Feature assembly

563

val assembler = new VectorAssembler()

564

.setInputCols(Array("textFeatures", "categoryVec", "scaledNumeric"))

565

.setOutputCol("features")

566

567

// Create pipeline

568

val pipeline = new Pipeline()

569

.setStages(Array(

570

tokenizer, stopWordsRemover, hashingTF, idf,

571

stringIndexer, oneHotEncoder,

572

scaler, assembler

573

))

574

575

val model = pipeline.fit(trainingData)

576

val processedData = model.transform(rawData)

577

```

578

579

### Feature Selection Example

580

581

```scala

582

import org.apache.spark.ml.feature.ChiSqSelector

583

584

val selector = new ChiSqSelector()

585

.setNumTopFeatures(50)

586

.setFeaturesCol("features")

587

.setLabelCol("label")

588

.setOutputCol("selectedFeatures")

589

590

val result = selector.fit(data).transform(data)

591

```

592

593

### Dimensionality Reduction Example

594

595

```scala

596

import org.apache.spark.ml.feature.PCA

597

598

val pca = new PCA()

599

.setInputCol("features")

600

.setOutputCol("pcaFeatures")

601

.setK(3)

602

603

val model = pca.fit(data)

604

val result = model.transform(data)

605

606

// Access explained variance

607

println(s"Explained variance: ${model.explainedVariance}")

608

```