# Feature Engineering

Comprehensive data preprocessing and feature transformation utilities for preparing raw data for machine learning algorithms, including text processing, categorical encoding, and numerical scaling.

## Capabilities

### Vector Assembly and Manipulation

Core utilities for assembling feature vectors and manipulating vector data structures.

```scala { .api }
/**
 * Combines multiple columns into a single vector column
 */
class VectorAssembler extends Transformer {
  def setInputCols(value: Array[String]): this.type
  def setOutputCol(value: String): this.type
  def setHandleInvalid(value: String): this.type
}

/**
 * Extracts elements from vectors by indices or names
 */
class VectorSlicer extends Transformer {
  def setInputCol(value: String): this.type
  def setOutputCol(value: String): this.type
  def setIndices(value: Array[Int]): this.type
  def setNames(value: Array[String]): this.type
}

/**
 * Indexes categorical features in vector columns
 */
class VectorIndexer extends Estimator[VectorIndexerModel] {
  def setInputCol(value: String): this.type
  def setOutputCol(value: String): this.type
  def setMaxCategories(value: Int): this.type
  def setHandleInvalid(value: String): this.type
}

class VectorIndexerModel extends Model[VectorIndexerModel] with VectorIndexerParams {
  def numFeatures: Int
  def categoryMaps: Map[Int, Map[Double, Int]]
}
```

**Usage Example:**

```scala
import org.apache.spark.ml.feature.VectorAssembler

val assembler = new VectorAssembler()
  .setInputCols(Array("hour", "mobile", "userFeatures"))
  .setOutputCol("features")

val output = assembler.transform(dataset)
output.select("features").show()
```

### Scaling and Normalization

Feature scaling transformations for normalizing numerical data distributions.

```scala { .api }
/**
 * Standardizes features by removing mean and scaling to unit variance
 */
class StandardScaler extends Estimator[StandardScalerModel] {
  def setInputCol(value: String): this.type
  def setOutputCol(value: String): this.type
  def setWithMean(value: Boolean): this.type
  def setWithStd(value: Boolean): this.type
}

class StandardScalerModel extends Model[StandardScalerModel] with StandardScalerParams {
  def mean: Vector
  def std: Vector
}

/**
 * Rescales features to a common range [min, max]
 */
class MinMaxScaler extends Estimator[MinMaxScalerModel] {
  def setInputCol(value: String): this.type
  def setOutputCol(value: String): this.type
  def setMin(value: Double): this.type
  def setMax(value: Double): this.type
}

class MinMaxScalerModel extends Model[MinMaxScalerModel] with MinMaxScalerParams {
  def originalMin: Vector
  def originalMax: Vector
}

/**
 * Scales features by the maximum absolute value
 */
class MaxAbsScaler extends Estimator[MaxAbsScalerModel] {
  def setInputCol(value: String): this.type
  def setOutputCol(value: String): this.type
}

class MaxAbsScalerModel extends Model[MaxAbsScalerModel] with MaxAbsScalerParams {
  def maxAbs: Vector
}

/**
 * Normalizes vectors to have unit norm
 */
class Normalizer extends Transformer {
  def setInputCol(value: String): this.type
  def setOutputCol(value: String): this.type
  def setP(value: Double): this.type
}

/**
 * Robust scaling using quantiles, less sensitive to outliers
 */
class RobustScaler extends Estimator[RobustScalerModel] {
  def setInputCol(value: String): this.type
  def setOutputCol(value: String): this.type
  def setLower(value: Double): this.type
  def setUpper(value: Double): this.type
  def setWithCentering(value: Boolean): this.type
  def setWithScaling(value: Boolean): this.type
  def setRelativeError(value: Double): this.type
}

class RobustScalerModel extends Model[RobustScalerModel] with RobustScalerParams {
  def median: Vector
  def range: Vector
}
```

### Categorical Feature Processing

Transformations for handling categorical data including encoding and indexing.

```scala { .api }
/**
 * Maps string labels to numeric indices
 */
class StringIndexer extends Estimator[StringIndexerModel] {
  def setInputCol(value: String): this.type
  def setOutputCol(value: String): this.type
  def setHandleInvalid(value: String): this.type
  def setStringOrderType(value: String): this.type
}

class StringIndexerModel extends Model[StringIndexerModel] with StringIndexerParams {
  def labels: Array[String]
  def labelsArray: Array[Array[String]]
}

/**
 * Maps numeric indices back to string labels
 */
class IndexToString extends Transformer {
  def setInputCol(value: String): this.type
  def setOutputCol(value: String): this.type
  def setLabels(value: Array[String]): this.type
}

/**
 * One-hot encoding for categorical features
 */
class OneHotEncoder extends Transformer {
  def setInputCols(value: Array[String]): this.type
  def setOutputCols(value: Array[String]): this.type
  def setDropLast(value: Boolean): this.type
  def setHandleInvalid(value: String): this.type
}
```

**Usage Example:**

```scala
import org.apache.spark.ml.feature.{StringIndexer, OneHotEncoder}
import org.apache.spark.ml.Pipeline

val indexer = new StringIndexer()
  .setInputCol("category")
  .setOutputCol("categoryIndex")

val encoder = new OneHotEncoder()
  .setInputCols(Array("categoryIndex"))
  .setOutputCols(Array("categoryVec"))

val pipeline = new Pipeline()
  .setStages(Array(indexer, encoder))

val model = pipeline.fit(df)
model.transform(df).show()
```

### Text Processing

Comprehensive text processing utilities for natural language processing tasks.

```scala { .api }
/**
 * Tokenizes text into individual words
 */
class Tokenizer extends Transformer {
  def setInputCol(value: String): this.type
  def setOutputCol(value: String): this.type
}

/**
 * Advanced tokenization using regular expressions
 */
class RegexTokenizer extends Transformer {
  def setInputCol(value: String): this.type
  def setOutputCol(value: String): this.type
  def setPattern(value: String): this.type
  def setGaps(value: Boolean): this.type
  def setToLowercase(value: Boolean): this.type
  def setMinTokenLength(value: Int): this.type
}

/**
 * Removes common stop words from text
 */
class StopWordsRemover extends Transformer {
  def setInputCol(value: String): this.type
  def setOutputCol(value: String): this.type
  def setStopWords(value: Array[String]): this.type
  def setCaseSensitive(value: Boolean): this.type
  def setLocale(value: String): this.type
}

/**
 * Generates n-grams from sequences of tokens
 */
class NGram extends Transformer {
  def setInputCol(value: String): this.type
  def setOutputCol(value: String): this.type
  def setN(value: Int): this.type
}

/**
 * Term frequency using hashing trick
 */
class HashingTF extends Transformer {
  def setInputCol(value: String): this.type
  def setOutputCol(value: String): this.type
  def setNumFeatures(value: Int): this.type
  def setBinary(value: Boolean): this.type
}

/**
 * Inverse document frequency weighting
 */
class IDF extends Estimator[IDFModel] {
  def setInputCol(value: String): this.type
  def setOutputCol(value: String): this.type
  def setMinDocFreq(value: Int): this.type
}

class IDFModel extends Model[IDFModel] with IDFParams {
  def idf: Vector
}

/**
 * Count-based vectorization of text documents
 */
class CountVectorizer extends Estimator[CountVectorizerModel] {
  def setInputCol(value: String): this.type
  def setOutputCol(value: String): this.type
  def setVocabSize(value: Int): this.type
  def setMinDF(value: Double): this.type
  def setMaxDF(value: Double): this.type
  def setMinTF(value: Double): this.type
  def setBinary(value: Boolean): this.type
}

class CountVectorizerModel extends Model[CountVectorizerModel] with CountVectorizerParams {
  def vocabulary: Array[String]
}

/**
 * Word2Vec for learning vector representations of words
 */
class Word2Vec extends Estimator[Word2VecModel] {
  def setInputCol(value: String): this.type
  def setOutputCol(value: String): this.type
  def setVectorSize(value: Int): this.type
  def setMinCount(value: Int): this.type
  def setNumPartitions(value: Int): this.type
  def setStepSize(value: Double): this.type
  def setMaxIter(value: Int): this.type
  def setSeed(value: Long): this.type
  def setWindowSize(value: Int): this.type
  def setMaxSentenceLength(value: Int): this.type
}

class Word2VecModel extends Model[Word2VecModel] with Word2VecParams {
  def getVectors: DataFrame
  def findSynonyms(word: String, num: Int): DataFrame
  def findSynonymsArray(word: String, num: Int): Array[(String, Double)]
  def transform(word: String): Vector
}
```

### Feature Selection

Statistical methods for selecting most relevant features for machine learning models.

```scala { .api }
/**
 * Univariate feature selection using statistical tests
 */
class UnivariateFeatureSelector extends Estimator[UnivariateFeatureSelectorModel] {
  def setFeatureType(value: String): this.type
  def setLabelType(value: String): this.type
  def setSelectionMode(value: String): this.type
  def setSelectionThreshold(value: Double): this.type
  def setInputCol(value: String): this.type
  def setOutputCol(value: String): this.type
}

class UnivariateFeatureSelectorModel extends Model[UnivariateFeatureSelectorModel] with UnivariateFeatureSelectorParams {
  def selectedFeatures: Array[Int]
}

/**
 * Feature selection based on variance threshold
 */
class VarianceThresholdSelector extends Estimator[VarianceThresholdSelectorModel] {
  def setVarianceThreshold(value: Double): this.type
  def setInputCol(value: String): this.type
  def setOutputCol(value: String): this.type
}

class VarianceThresholdSelectorModel extends Model[VarianceThresholdSelectorModel] with VarianceThresholdSelectorParams {
  def selectedFeatures: Array[Int]
}

/**
 * Chi-square feature selection for categorical data
 */
class ChiSqSelector extends Estimator[ChiSqSelectorModel] {
  def setNumTopFeatures(value: Int): this.type
  def setPercentile(value: Double): this.type
  def setFpr(value: Double): this.type
  def setFdr(value: Double): this.type
  def setFwe(value: Double): this.type
  def setSelectorType(value: String): this.type
  def setInputCol(value: String): this.type
  def setOutputCol(value: String): this.type
}

class ChiSqSelectorModel extends Model[ChiSqSelectorModel] with ChiSqSelectorParams {
  def selectedFeatures: Array[Int]
}
```

### Dimensionality Reduction

Algorithms for reducing the number of features while preserving important information.

```scala { .api }
/**
 * Principal Component Analysis for dimensionality reduction
 */
class PCA extends Estimator[PCAModel] {
  def setInputCol(value: String): this.type
  def setOutputCol(value: String): this.type
  def setK(value: Int): this.type
}

class PCAModel extends Model[PCAModel] with PCAParams {
  def pc: Matrix
  def explainedVariance: Vector
}
```

### Missing Value Handling

Utilities for handling missing or invalid data values.

```scala { .api }
/**
 * Imputes missing values using mean, median, or mode
 */
class Imputer extends Estimator[ImputerModel] {
  def setInputCols(value: Array[String]): this.type
  def setOutputCols(value: Array[String]): this.type
  def setStrategy(value: String): this.type
  def setMissingValue(value: Double): this.type
  def setRelativeError(value: Double): this.type
}

class ImputerModel extends Model[ImputerModel] with ImputerParams {
  def surrogateDF: DataFrame
}
```

### Advanced Transformations

Specialized transformations for complex feature engineering tasks.

```scala { .api }
/**
 * Applies SQL transformation to DataFrames
 */
class SQLTransformer extends Transformer {
  def setStatement(value: String): this.type
}

/**
 * Quantile-based discretization of continuous features
 */
class QuantileDiscretizer extends Estimator[QuantileDiscretizerModel] {
  def setInputCol(value: String): this.type
  def setOutputCol(value: String): this.type
  def setNumBuckets(value: Int): this.type
  def setRelativeError(value: Double): this.type
  def setHandleInvalid(value: String): this.type
}

class QuantileDiscretizerModel extends Model[QuantileDiscretizerModel] with QuantileDiscretizerParams {
  def getSplits: Array[Double]
}

/**
 * Maps continuous features to categorical buckets
 */
class Bucketizer extends Transformer {
  def setInputCol(value: String): this.type
  def setOutputCol(value: String): this.type
  def setSplits(value: Array[Double]): this.type
  def setHandleInvalid(value: String): this.type
}

/**
 * Feature hashing for high-dimensional categorical data
 */
class FeatureHasher extends Transformer {
  def setInputCols(value: Array[String]): this.type
  def setOutputCol(value: String): this.type
  def setNumFeatures(value: Int): this.type
  def setCategoricalCols(value: Array[String]): this.type
}

/**
 * Creates interaction features between input columns
 */
class Interaction extends Transformer {
  def setInputCols(value: Array[String]): this.type
  def setOutputCol(value: String): this.type
}

/**
 * Expands features to polynomial space
 */
class PolynomialExpansion extends Transformer {
  def setInputCol(value: String): this.type
  def setOutputCol(value: String): this.type
  def setDegree(value: Int): this.type
}
```

## Types

```scala { .api }
// Feature engineering imports
import org.apache.spark.ml.feature._
import org.apache.spark.ml.linalg.{Vector, Matrix}
import org.apache.spark.sql.{DataFrame, Dataset}

// Parameter traits
import org.apache.spark.ml.param.shared._

// Text processing utilities
import org.apache.spark.ml.feature.{Tokenizer, RegexTokenizer, StopWordsRemover}

// Vector utilities
import org.apache.spark.ml.feature.{VectorAssembler, VectorSlicer, VectorIndexer}

// Scaling utilities
import org.apache.spark.ml.feature.{StandardScaler, MinMaxScaler, MaxAbsScaler, Normalizer, RobustScaler}
```