# Feature Processing

Comprehensive feature extraction, transformation, selection, and engineering capabilities. MLlib provides over 60 feature processing methods, including text processing, scaling, dimensionality reduction, categorical encoding, and feature selection.

## Capabilities

### Vector Assembly and Manipulation

Core utilities for combining and manipulating feature vectors.

```scala { .api }
/**
 * VectorAssembler - combines multiple columns into a single vector column
 * Essential for preparing features for ML algorithms
 */
class VectorAssembler extends Transformer with HasInputCols with HasOutputCol with HasHandleInvalid {
  def setInputCols(value: Array[String]): this.type
  def setOutputCol(value: String): this.type
  def setHandleInvalid(value: String): this.type
}

/**
 * VectorSlicer - selects a subset of features from a vector
 * Useful for feature selection and dimensionality reduction
 */
class VectorSlicer extends Transformer with HasInputCol with HasOutputCol {
  def setInputCol(value: String): this.type
  def setOutputCol(value: String): this.type
  def setIndices(value: Array[Int]): this.type
  def setNames(value: Array[String]): this.type
}

/**
 * VectorSizeHint - adds size metadata to an existing vector column
 * Needed when downstream stages (e.g. VectorAssembler on streaming data) require a known vector size
 */
class VectorSizeHint extends Transformer with HasInputCol with HasHandleInvalid {
  def setInputCol(value: String): this.type
  def setSize(value: Int): this.type
  def setHandleInvalid(value: String): this.type
}
```

**Usage Example:**

```scala
import org.apache.spark.ml.feature.VectorAssembler

val assembler = new VectorAssembler()
  .setInputCols(Array("hour", "mobile", "userFeatures"))
  .setOutputCol("features")

val output = assembler.transform(dataset)
```

### Feature Scaling and Normalization

Methods for scaling and normalizing feature values for improved algorithm performance.

```scala { .api }
/**
 * StandardScaler - standardizes features by removing mean and scaling to unit variance
 * Transforms features to have zero mean and unit standard deviation
 */
class StandardScaler extends Estimator[StandardScalerModel] with StandardScalerParams {
  def setInputCol(value: String): this.type
  def setOutputCol(value: String): this.type
  def setWithMean(value: Boolean): this.type
  def setWithStd(value: Boolean): this.type
}

class StandardScalerModel extends Model[StandardScalerModel] with StandardScalerParams {
  def mean: Vector
  def std: Vector
}

/**
 * MinMaxScaler - rescales features to a specified range [min, max]
 * Transforms features to fit within the specified minimum and maximum values
 */
class MinMaxScaler extends Estimator[MinMaxScalerModel] with MinMaxScalerParams {
  def setInputCol(value: String): this.type
  def setOutputCol(value: String): this.type
  def setMin(value: Double): this.type
  def setMax(value: Double): this.type
}

class MinMaxScalerModel extends Model[MinMaxScalerModel] with MinMaxScalerParams {
  def originalMin: Vector
  def originalMax: Vector
}

/**
 * MaxAbsScaler - scales features by the maximum absolute value
 * Divides each feature by its maximum absolute value to scale to [-1, 1]
 */
class MaxAbsScaler extends Estimator[MaxAbsScalerModel] with MaxAbsScalerParams {
  def setInputCol(value: String): this.type
  def setOutputCol(value: String): this.type
}

class MaxAbsScalerModel extends Model[MaxAbsScalerModel] with MaxAbsScalerParams {
  def maxAbs: Vector
}

/**
 * RobustScaler - scales features using median and interquartile range
 * More robust to outliers than StandardScaler
 */
class RobustScaler extends Estimator[RobustScalerModel] with RobustScalerParams {
  def setInputCol(value: String): this.type
  def setOutputCol(value: String): this.type
  def setWithCentering(value: Boolean): this.type
  def setWithScaling(value: Boolean): this.type
  def setLower(value: Double): this.type
  def setUpper(value: Double): this.type
}

class RobustScalerModel extends Model[RobustScalerModel] with RobustScalerParams {
  def median: Vector
  def range: Vector
}

/**
 * Normalizer - normalizes vectors to have unit norm
 * Scales individual samples to have unit norm (L1, L2, or Inf norm)
 */
class Normalizer extends Transformer with HasInputCol with HasOutputCol {
  def setInputCol(value: String): this.type
  def setOutputCol(value: String): this.type
  def setP(value: Double): this.type
}
```
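
**Usage Example** (a minimal sketch, assuming `dataset` already contains a vector column named `"features"`):

```scala
import org.apache.spark.ml.feature.MinMaxScaler

// Learn per-feature min/max from the data, then rescale every feature to [0, 1]
val scaler = new MinMaxScaler()
  .setInputCol("features")
  .setOutputCol("scaledFeatures")
  .setMin(0.0)
  .setMax(1.0)

val scalerModel = scaler.fit(dataset)
val scaled = scalerModel.transform(dataset)
```
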
### Categorical Feature Processing

Methods for handling categorical variables including indexing and encoding.

```scala { .api }
/**
 * StringIndexer - maps string values to numeric indices
 * Converts categorical string features to numeric indices for ML algorithms
 */
class StringIndexer extends Estimator[StringIndexerModel] with StringIndexerParams {
  def setInputCol(value: String): this.type
  def setOutputCol(value: String): this.type
  def setHandleInvalid(value: String): this.type
  def setStringOrderType(value: String): this.type
}

class StringIndexerModel extends Model[StringIndexerModel] with StringIndexerParams {
  def labels: Array[String]
  def labelsArray: Array[Array[String]]
}

/**
 * IndexToString - maps numeric indices back to string values
 * Reverse operation of StringIndexer for interpreting model outputs
 */
class IndexToString extends Transformer with HasInputCol with HasOutputCol {
  def setInputCol(value: String): this.type
  def setOutputCol(value: String): this.type
  def setLabels(value: Array[String]): this.type
}

/**
 * OneHotEncoder - converts categorical indices to binary vectors
 * Creates binary columns for each category level
 */
class OneHotEncoder extends Estimator[OneHotEncoderModel] with OneHotEncoderParams {
  def setInputCols(value: Array[String]): this.type
  def setOutputCols(value: Array[String]): this.type
  def setDropLast(value: Boolean): this.type
  def setHandleInvalid(value: String): this.type
}

class OneHotEncoderModel extends Model[OneHotEncoderModel] with OneHotEncoderParams {
  def categorySizes: Array[Int]
}

/**
 * VectorIndexer - automatically identifies categorical features in vectors
 * Treats features with <= maxCategories unique values as categorical
 */
class VectorIndexer extends Estimator[VectorIndexerModel] with VectorIndexerParams {
  def setInputCol(value: String): this.type
  def setOutputCol(value: String): this.type
  def setMaxCategories(value: Int): this.type
  def setHandleInvalid(value: String): this.type
}

class VectorIndexerModel extends Model[VectorIndexerModel] with VectorIndexerParams {
  def numFeatures: Int
  def categoryMaps: Map[Int, Map[Double, Int]]
}
```
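
**Usage Example** (a minimal sketch, assuming `dataset` has a string column named `"category"`):

```scala
import org.apache.spark.ml.feature.{OneHotEncoder, StringIndexer}

// Map category strings to indices, keeping unseen categories in a special bucket
val indexer = new StringIndexer()
  .setInputCol("category")
  .setOutputCol("categoryIndex")
  .setHandleInvalid("keep")

// Expand each index into a sparse binary vector
val encoder = new OneHotEncoder()
  .setInputCols(Array("categoryIndex"))
  .setOutputCols(Array("categoryVec"))

val indexed = indexer.fit(dataset).transform(dataset)
val encoded = encoder.fit(indexed).transform(indexed)
```
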
### Text Processing and Feature Extraction

Comprehensive text processing capabilities for natural language data.

```scala { .api }
/**
 * Tokenizer - splits text into individual words
 * Lowercases the input and splits it on whitespace
 */
class Tokenizer extends Transformer with HasInputCol with HasOutputCol {
  def setInputCol(value: String): this.type
  def setOutputCol(value: String): this.type
}

/**
 * RegexTokenizer - tokenizes text using regular expressions
 * More flexible tokenization with configurable patterns
 */
class RegexTokenizer extends Transformer with HasInputCol with HasOutputCol {
  def setInputCol(value: String): this.type
  def setOutputCol(value: String): this.type
  def setPattern(value: String): this.type
  def setGaps(value: Boolean): this.type
  def setToLowercase(value: Boolean): this.type
  def setMinTokenLength(value: Int): this.type
}

/**
 * StopWordsRemover - removes stop words from tokenized text
 * Filters common words that don't contribute to text meaning
 */
class StopWordsRemover extends Transformer with HasInputCol with HasOutputCol {
  def setInputCol(value: String): this.type
  def setOutputCol(value: String): this.type
  def setStopWords(value: Array[String]): this.type
  def setCaseSensitive(value: Boolean): this.type
  def setLocale(value: String): this.type
}

/**
 * NGram - generates n-grams from token sequences
 * Creates sequences of n consecutive tokens for text analysis
 */
class NGram extends Transformer with HasInputCol with HasOutputCol {
  def setInputCol(value: String): this.type
  def setOutputCol(value: String): this.type
  def setN(value: Int): this.type
}

/**
 * HashingTF - maps terms to term frequency vectors using hashing
 * Fast text vectorization using hash functions
 */
class HashingTF extends Transformer with HasInputCol with HasOutputCol {
  def setInputCol(value: String): this.type
  def setOutputCol(value: String): this.type
  def setNumFeatures(value: Int): this.type
  def setBinary(value: Boolean): this.type
}

/**
 * CountVectorizer - converts text to vectors based on term counts
 * Creates vocabulary and maps documents to term frequency vectors
 */
class CountVectorizer extends Estimator[CountVectorizerModel] with CountVectorizerParams {
  def setInputCol(value: String): this.type
  def setOutputCol(value: String): this.type
  def setVocabSize(value: Int): this.type
  def setMinDF(value: Double): this.type
  def setMaxDF(value: Double): this.type
  def setMinTF(value: Double): this.type
  def setBinary(value: Boolean): this.type
}

class CountVectorizerModel extends Model[CountVectorizerModel] with CountVectorizerParams {
  def vocabulary: Array[String]
}

/**
 * IDF - computes inverse document frequency for TF-IDF
 * Weights term frequencies by their inverse document frequency
 */
class IDF extends Estimator[IDFModel] with IDFParams {
  def setInputCol(value: String): this.type
  def setOutputCol(value: String): this.type
  def setMinDocFreq(value: Int): this.type
}

class IDFModel extends Model[IDFModel] with IDFParams {
  def idf: Vector
  def docFreq: Array[Long]
  def numDocs: Long
}

/**
 * Word2Vec - learns vector representations of words
 * Trains word embeddings using the Word2Vec algorithm
 */
class Word2Vec extends Estimator[Word2VecModel] with Word2VecParams {
  def setInputCol(value: String): this.type
  def setOutputCol(value: String): this.type
  def setVectorSize(value: Int): this.type
  def setMinCount(value: Int): this.type
  def setNumPartitions(value: Int): this.type
  def setStepSize(value: Double): this.type
  def setMaxIter(value: Int): this.type
  def setSeed(value: Long): this.type
  def setWindowSize(value: Int): this.type
}

class Word2VecModel extends Model[Word2VecModel] with Word2VecParams {
  def getVectors: DataFrame
  def findSynonyms(word: String, num: Int): DataFrame
  def findSynonymsArray(word: String, num: Int): Array[(String, Double)]
}
```
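
**Usage Example** (a minimal sketch, assuming `docs` has a string column named `"text"`):

```scala
import org.apache.spark.ml.feature.{CountVectorizer, RegexTokenizer}

// Split on non-word characters (tokens are lowercased by default)
val tokenizer = new RegexTokenizer()
  .setInputCol("text")
  .setOutputCol("tokens")
  .setPattern("\\W+")

// Build a bounded vocabulary and map each document to a term-count vector
val vectorizer = new CountVectorizer()
  .setInputCol("tokens")
  .setOutputCol("termCounts")
  .setVocabSize(10000)
  .setMinDF(2)

val tokenized = tokenizer.transform(docs)
val cvModel = vectorizer.fit(tokenized)
val vectors = cvModel.transform(tokenized)

println(cvModel.vocabulary.take(10).mkString(", "))
```
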
### Feature Selection

Methods for selecting the most relevant features for modeling.

```scala { .api }
/**
 * ChiSqSelector - selects features based on the Chi-squared test
 * Statistical feature selection for categorical features
 */
class ChiSqSelector extends Estimator[ChiSqSelectorModel] with ChiSqSelectorParams {
  def setFeaturesCol(value: String): this.type
  def setOutputCol(value: String): this.type
  def setLabelCol(value: String): this.type
  def setSelectorType(value: String): this.type
  def setNumTopFeatures(value: Int): this.type
  def setPercentile(value: Double): this.type
  def setFpr(value: Double): this.type
  def setFdr(value: Double): this.type
  def setFwe(value: Double): this.type
}

class ChiSqSelectorModel extends Model[ChiSqSelectorModel] with ChiSqSelectorParams {
  def selectedFeatures: Array[Int]
}

/**
 * UnivariateFeatureSelector - selects features using statistical tests
 * Supports various statistical tests for feature selection
 */
class UnivariateFeatureSelector extends Estimator[UnivariateFeatureSelectorModel] with UnivariateFeatureSelectorParams {
  def setFeaturesCol(value: String): this.type
  def setOutputCol(value: String): this.type
  def setLabelCol(value: String): this.type
  def setFeatureType(value: String): this.type
  def setLabelType(value: String): this.type
  def setSelectionMode(value: String): this.type
  def setSelectionThreshold(value: Double): this.type
}

class UnivariateFeatureSelectorModel extends Model[UnivariateFeatureSelectorModel] with UnivariateFeatureSelectorParams {
  def selectedFeatures: Array[Int]
}

/**
 * VarianceThresholdSelector - removes low-variance features
 * Filters features with variance below the specified threshold
 */
class VarianceThresholdSelector extends Estimator[VarianceThresholdSelectorModel] with VarianceThresholdSelectorParams {
  def setFeaturesCol(value: String): this.type
  def setOutputCol(value: String): this.type
  def setVarianceThreshold(value: Double): this.type
}

class VarianceThresholdSelectorModel extends Model[VarianceThresholdSelectorModel] with VarianceThresholdSelectorParams {
  def selectedFeatures: Array[Int]
}
```
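
**Usage Example** (a minimal sketch, assuming `data` has a continuous `"features"` vector and a categorical `"label"` column):

```scala
import org.apache.spark.ml.feature.UnivariateFeatureSelector

// Keep the 10 features most strongly associated with the label
// (continuous features + categorical label selects an ANOVA F-test)
val selector = new UnivariateFeatureSelector()
  .setFeaturesCol("features")
  .setLabelCol("label")
  .setOutputCol("selectedFeatures")
  .setFeatureType("continuous")
  .setLabelType("categorical")
  .setSelectionMode("numTopFeatures")
  .setSelectionThreshold(10)

val selected = selector.fit(data).transform(data)
```
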
### Dimensionality Reduction

Methods for reducing the number of features while preserving information.

```scala { .api }
/**
 * PCA - Principal Component Analysis for dimensionality reduction
 * Projects data onto a lower dimensional space preserving maximum variance
 */
class PCA extends Estimator[PCAModel] with PCAParams {
  def setInputCol(value: String): this.type
  def setOutputCol(value: String): this.type
  def setK(value: Int): this.type
}

class PCAModel extends Model[PCAModel] with PCAParams {
  def pc: Matrix
  def explainedVariance: Vector
}

/**
 * DCT - Discrete Cosine Transform
 * Applies DCT transformation for frequency domain analysis
 */
class DCT extends Transformer with HasInputCol with HasOutputCol {
  def setInputCol(value: String): this.type
  def setOutputCol(value: String): this.type
  def setInverse(value: Boolean): this.type
}
```
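
**Usage Example** (a minimal sketch, assuming `dataset` has a vector column named `"features"`; see also the PCA example under Usage Examples below):

```scala
import org.apache.spark.ml.feature.DCT

// Apply a forward discrete cosine transform to each feature vector
val dct = new DCT()
  .setInputCol("features")
  .setOutputCol("featuresDCT")
  .setInverse(false)

val transformed = dct.transform(dataset)
```
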
### Bucketing and Discretization

Methods for converting continuous variables into discrete bins.

```scala { .api }
/**
 * Bucketizer - maps continuous features to buckets
 * Converts continuous values to discrete bins using thresholds
 */
class Bucketizer extends Transformer with HasInputCol with HasOutputCol with HasHandleInvalid {
  def setInputCol(value: String): this.type
  def setOutputCol(value: String): this.type
  def setSplits(value: Array[Double]): this.type
  def setInputCols(value: Array[String]): this.type
  def setOutputCols(value: Array[String]): this.type
  def setSplitsArray(value: Array[Array[Double]]): this.type
  def setHandleInvalid(value: String): this.type
}

/**
 * QuantileDiscretizer - discretizes continuous features using quantiles
 * Automatically determines bucket boundaries based on the data distribution
 * Fitting produces a Bucketizer whose getSplits / getSplitsArray expose the learned boundaries
 */
class QuantileDiscretizer extends Estimator[Bucketizer] with QuantileDiscretizerParams {
  def setInputCol(value: String): this.type
  def setOutputCol(value: String): this.type
  def setInputCols(value: Array[String]): this.type
  def setOutputCols(value: Array[String]): this.type
  def setNumBuckets(value: Int): this.type
  def setNumBucketsArray(value: Array[Int]): this.type
  def setRelativeError(value: Double): this.type
  def setHandleInvalid(value: String): this.type
}
```
### Feature Engineering

Advanced feature engineering techniques for creating new features.

```scala { .api }
/**
 * PolynomialExpansion - generates polynomial features
 * Creates interaction features by expanding polynomial terms
 */
class PolynomialExpansion extends Transformer with HasInputCol with HasOutputCol {
  def setInputCol(value: String): this.type
  def setOutputCol(value: String): this.type
  def setDegree(value: Int): this.type
}

/**
 * Interaction - creates interaction features between selected columns
 * Generates cross-product features for modeling feature interactions
 */
class Interaction extends Transformer with HasInputCols with HasOutputCol {
  def setInputCols(value: Array[String]): this.type
  def setOutputCol(value: String): this.type
}

/**
 * Binarizer - thresholds numerical features to binary values
 * Converts continuous values to binary based on a threshold
 */
class Binarizer extends Transformer with HasInputCol with HasOutputCol with HasThreshold {
  def setInputCol(value: String): this.type
  def setOutputCol(value: String): this.type
  def setInputCols(value: Array[String]): this.type
  def setOutputCols(value: Array[String]): this.type
  def setThreshold(value: Double): this.type
  def setThresholds(value: Array[Double]): this.type
}

/**
 * FeatureHasher - hashes categorical features to sparse vectors
 * Maps arbitrary features to fixed-length vectors using hashing
 */
class FeatureHasher extends Transformer with HasInputCols with HasOutputCol {
  def setInputCols(value: Array[String]): this.type
  def setOutputCol(value: String): this.type
  def setNumFeatures(value: Int): this.type
  def setCategoricalCols(value: Array[String]): this.type
}

/**
 * SQLTransformer - applies SQL transformations to DataFrames
 * Enables complex feature engineering using SQL expressions
 */
class SQLTransformer extends Transformer {
  def setStatement(value: String): this.type
  def getStatement: String
}

/**
 * RFormula - creates features using R-style formulas
 * Provides R-like syntax for feature specification and encoding
 */
class RFormula extends Estimator[RFormulaModel] with RFormulaParams {
  def setFormula(value: String): this.type
  def setFeaturesCol(value: String): this.type
  def setLabelCol(value: String): this.type
  def setForceIndexLabel(value: Boolean): this.type
  def setStringIndexerOrderType(value: String): this.type
  def setHandleInvalid(value: String): this.type
}

class RFormulaModel extends Model[RFormulaModel] with RFormulaParams {
  def pipelineModel: PipelineModel
}
```
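
**Usage Example** (a minimal sketch, assuming `dataset` has columns `"clicked"`, `"country"`, and `"hour"`):

```scala
import org.apache.spark.ml.feature.RFormula

// R-style formula: predict "clicked" from "country" and "hour";
// string columns are indexed and one-hot encoded automatically
val formula = new RFormula()
  .setFormula("clicked ~ country + hour")
  .setFeaturesCol("features")
  .setLabelCol("label")

val prepared = formula.fit(dataset).transform(dataset)
```
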
## Usage Examples

### Complete Feature Processing Pipeline

```scala
import org.apache.spark.ml.feature._
import org.apache.spark.ml.Pipeline

// Text processing pipeline
val tokenizer = new Tokenizer()
  .setInputCol("text")
  .setOutputCol("words")

val stopWordsRemover = new StopWordsRemover()
  .setInputCol("words")
  .setOutputCol("filtered")

val hashingTF = new HashingTF()
  .setInputCol("filtered")
  .setOutputCol("rawFeatures")
  .setNumFeatures(1000)

val idf = new IDF()
  .setInputCol("rawFeatures")
  .setOutputCol("textFeatures")

// Categorical feature processing
val stringIndexer = new StringIndexer()
  .setInputCol("category")
  .setOutputCol("categoryIndex")

val oneHotEncoder = new OneHotEncoder()
  .setInputCols(Array("categoryIndex"))
  .setOutputCols(Array("categoryVec"))

// Numerical feature processing
val scaler = new StandardScaler()
  .setInputCol("numericFeatures")
  .setOutputCol("scaledNumeric")
  .setWithMean(true)
  .setWithStd(true)

// Feature assembly
val assembler = new VectorAssembler()
  .setInputCols(Array("textFeatures", "categoryVec", "scaledNumeric"))
  .setOutputCol("features")

// Create pipeline
val pipeline = new Pipeline()
  .setStages(Array(
    tokenizer, stopWordsRemover, hashingTF, idf,
    stringIndexer, oneHotEncoder,
    scaler, assembler
  ))

val model = pipeline.fit(trainingData)
val processedData = model.transform(rawData)
```

### Feature Selection Example

```scala
import org.apache.spark.ml.feature.ChiSqSelector

val selector = new ChiSqSelector()
  .setNumTopFeatures(50)
  .setFeaturesCol("features")
  .setLabelCol("label")
  .setOutputCol("selectedFeatures")

val result = selector.fit(data).transform(data)
```

### Dimensionality Reduction

```scala
import org.apache.spark.ml.feature.PCA

val pca = new PCA()
  .setInputCol("features")
  .setOutputCol("pcaFeatures")
  .setK(3)

val model = pca.fit(data)
val result = model.transform(data)

// Access explained variance
println(s"Explained variance: ${model.explainedVariance}")
```