# Feature Engineering

Comprehensive data preprocessing and feature transformation utilities for preparing raw data for machine learning algorithms, including text processing, categorical encoding, and numerical scaling.

## Capabilities

### Vector Assembly and Manipulation

Core utilities for assembling feature vectors and manipulating vector data structures.

```scala { .api }
/**
 * Combines multiple columns into a single vector column
 */
class VectorAssembler extends Transformer {
  def setInputCols(value: Array[String]): this.type
  def setOutputCol(value: String): this.type
  def setHandleInvalid(value: String): this.type
}

/**
 * Extracts elements from vectors by indices or names
 */
class VectorSlicer extends Transformer {
  def setInputCol(value: String): this.type
  def setOutputCol(value: String): this.type
  def setIndices(value: Array[Int]): this.type
  def setNames(value: Array[String]): this.type
}

/**
 * Indexes categorical features in vector columns
 */
class VectorIndexer extends Estimator[VectorIndexerModel] {
  def setInputCol(value: String): this.type
  def setOutputCol(value: String): this.type
  def setMaxCategories(value: Int): this.type
  def setHandleInvalid(value: String): this.type
}

class VectorIndexerModel extends Model[VectorIndexerModel] with VectorIndexerParams {
  def numFeatures: Int
  def categoryMaps: Map[Int, Map[Double, Int]]
}
```

**Usage Example:**

```scala
import org.apache.spark.ml.feature.VectorAssembler

val assembler = new VectorAssembler()
  .setInputCols(Array("hour", "mobile", "userFeatures"))
  .setOutputCol("features")

val output = assembler.transform(dataset)
output.select("features").show()
```

### Scaling and Normalization

Feature scaling transformations for normalizing numerical data distributions.

```scala { .api }
/**
 * Standardizes features by removing mean and scaling to unit variance
 */
class StandardScaler extends Estimator[StandardScalerModel] {
  def setInputCol(value: String): this.type
  def setOutputCol(value: String): this.type
  def setWithMean(value: Boolean): this.type
  def setWithStd(value: Boolean): this.type
}

class StandardScalerModel extends Model[StandardScalerModel] with StandardScalerParams {
  def mean: Vector
  def std: Vector
}

/**
 * Rescales features to a common range [min, max]
 */
class MinMaxScaler extends Estimator[MinMaxScalerModel] {
  def setInputCol(value: String): this.type
  def setOutputCol(value: String): this.type
  def setMin(value: Double): this.type
  def setMax(value: Double): this.type
}

class MinMaxScalerModel extends Model[MinMaxScalerModel] with MinMaxScalerParams {
  def originalMin: Vector
  def originalMax: Vector
}

/**
 * Scales features by the maximum absolute value
 */
class MaxAbsScaler extends Estimator[MaxAbsScalerModel] {
  def setInputCol(value: String): this.type
  def setOutputCol(value: String): this.type
}

class MaxAbsScalerModel extends Model[MaxAbsScalerModel] with MaxAbsScalerParams {
  def maxAbs: Vector
}

/**
 * Normalizes vectors to have unit norm
 */
class Normalizer extends Transformer {
  def setInputCol(value: String): this.type
  def setOutputCol(value: String): this.type
  def setP(value: Double): this.type
}

/**
 * Robust scaling using quantiles, less sensitive to outliers
 */
class RobustScaler extends Estimator[RobustScalerModel] {
  def setInputCol(value: String): this.type
  def setOutputCol(value: String): this.type
  def setLower(value: Double): this.type
  def setUpper(value: Double): this.type
  def setWithCentering(value: Boolean): this.type
  def setWithScaling(value: Boolean): this.type
  def setRelativeError(value: Double): this.type
}

class RobustScalerModel extends Model[RobustScalerModel] with RobustScalerParams {
  def median: Vector
  def range: Vector
}
```

### Categorical Feature Processing

Transformations for handling categorical data including encoding and indexing.

```scala { .api }
/**
 * Maps string labels to numeric indices
 */
class StringIndexer extends Estimator[StringIndexerModel] {
  def setInputCol(value: String): this.type
  def setOutputCol(value: String): this.type
  def setHandleInvalid(value: String): this.type
  def setStringOrderType(value: String): this.type
}

class StringIndexerModel extends Model[StringIndexerModel] with StringIndexerParams {
  def labels: Array[String]
  def labelsArray: Array[Array[String]]
}

/**
 * Maps numeric indices back to string labels
 */
class IndexToString extends Transformer {
  def setInputCol(value: String): this.type
  def setOutputCol(value: String): this.type
  def setLabels(value: Array[String]): this.type
}

/**
 * One-hot encoding for categorical features
 */
class OneHotEncoder extends Transformer {
  def setInputCols(value: Array[String]): this.type
  def setOutputCols(value: Array[String]): this.type
  def setDropLast(value: Boolean): this.type
  def setHandleInvalid(value: String): this.type
}
```

**Usage Example:**

```scala
import org.apache.spark.ml.feature.{StringIndexer, OneHotEncoder}
import org.apache.spark.ml.Pipeline

val indexer = new StringIndexer()
  .setInputCol("category")
  .setOutputCol("categoryIndex")

val encoder = new OneHotEncoder()
  .setInputCols(Array("categoryIndex"))
  .setOutputCols(Array("categoryVec"))

val pipeline = new Pipeline()
  .setStages(Array(indexer, encoder))

val model = pipeline.fit(df)
model.transform(df).show()
```

### Text Processing

Comprehensive text processing utilities for natural language processing tasks.

```scala { .api }
/**
 * Tokenizes text into individual words
 */
class Tokenizer extends Transformer {
  def setInputCol(value: String): this.type
  def setOutputCol(value: String): this.type
}

/**
 * Advanced tokenization using regular expressions
 */
class RegexTokenizer extends Transformer {
  def setInputCol(value: String): this.type
  def setOutputCol(value: String): this.type
  def setPattern(value: String): this.type
  def setGaps(value: Boolean): this.type
  def setToLowercase(value: Boolean): this.type
  def setMinTokenLength(value: Int): this.type
}

/**
 * Removes common stop words from text
 */
class StopWordsRemover extends Transformer {
  def setInputCol(value: String): this.type
  def setOutputCol(value: String): this.type
  def setStopWords(value: Array[String]): this.type
  def setCaseSensitive(value: Boolean): this.type
  def setLocale(value: String): this.type
}

/**
 * Generates n-grams from sequences of tokens
 */
class NGram extends Transformer {
  def setInputCol(value: String): this.type
  def setOutputCol(value: String): this.type
  def setN(value: Int): this.type
}

/**
 * Term frequency using hashing trick
 */
class HashingTF extends Transformer {
  def setInputCol(value: String): this.type
  def setOutputCol(value: String): this.type
  def setNumFeatures(value: Int): this.type
  def setBinary(value: Boolean): this.type
}

/**
 * Inverse document frequency weighting
 */
class IDF extends Estimator[IDFModel] {
  def setInputCol(value: String): this.type
  def setOutputCol(value: String): this.type
  def setMinDocFreq(value: Int): this.type
}

class IDFModel extends Model[IDFModel] with IDFParams {
  def idf: Vector
}

/**
 * Count-based vectorization of text documents
 */
class CountVectorizer extends Estimator[CountVectorizerModel] {
  def setInputCol(value: String): this.type
  def setOutputCol(value: String): this.type
  def setVocabSize(value: Int): this.type
  def setMinDF(value: Double): this.type
  def setMaxDF(value: Double): this.type
  def setMinTF(value: Double): this.type
  def setBinary(value: Boolean): this.type
}

class CountVectorizerModel extends Model[CountVectorizerModel] with CountVectorizerParams {
  def vocabulary: Array[String]
}

/**
 * Word2Vec for learning vector representations of words
 */
class Word2Vec extends Estimator[Word2VecModel] {
  def setInputCol(value: String): this.type
  def setOutputCol(value: String): this.type
  def setVectorSize(value: Int): this.type
  def setMinCount(value: Int): this.type
  def setNumPartitions(value: Int): this.type
  def setStepSize(value: Double): this.type
  def setMaxIter(value: Int): this.type
  def setSeed(value: Long): this.type
  def setWindowSize(value: Int): this.type
  def setMaxSentenceLength(value: Int): this.type
}

class Word2VecModel extends Model[Word2VecModel] with Word2VecParams {
  def getVectors: DataFrame
  def findSynonyms(word: String, num: Int): DataFrame
  def findSynonymsArray(word: String, num: Int): Array[(String, Double)]
  def transform(word: String): Vector
}
```

### Feature Selection

Statistical methods for selecting most relevant features for machine learning models.

```scala { .api }
/**
 * Univariate feature selection using statistical tests
 */
class UnivariateFeatureSelector extends Estimator[UnivariateFeatureSelectorModel] {
  def setFeatureType(value: String): this.type
  def setLabelType(value: String): this.type
  def setSelectionMode(value: String): this.type
  def setSelectionThreshold(value: Double): this.type
  def setInputCol(value: String): this.type
  def setOutputCol(value: String): this.type
}

class UnivariateFeatureSelectorModel extends Model[UnivariateFeatureSelectorModel] with UnivariateFeatureSelectorParams {
  def selectedFeatures: Array[Int]
}

/**
 * Feature selection based on variance threshold
 */
class VarianceThresholdSelector extends Estimator[VarianceThresholdSelectorModel] {
  def setVarianceThreshold(value: Double): this.type
  def setInputCol(value: String): this.type
  def setOutputCol(value: String): this.type
}

class VarianceThresholdSelectorModel extends Model[VarianceThresholdSelectorModel] with VarianceThresholdSelectorParams {
  def selectedFeatures: Array[Int]
}

/**
 * Chi-square feature selection for categorical data
 */
class ChiSqSelector extends Estimator[ChiSqSelectorModel] {
  def setNumTopFeatures(value: Int): this.type
  def setPercentile(value: Double): this.type
  def setFpr(value: Double): this.type
  def setFdr(value: Double): this.type
  def setFwe(value: Double): this.type
  def setSelectorType(value: String): this.type
  def setInputCol(value: String): this.type
  def setOutputCol(value: String): this.type
}

class ChiSqSelectorModel extends Model[ChiSqSelectorModel] with ChiSqSelectorParams {
  def selectedFeatures: Array[Int]
}
```

### Dimensionality Reduction

Algorithms for reducing the number of features while preserving important information.

```scala { .api }
/**
 * Principal Component Analysis for dimensionality reduction
 */
class PCA extends Estimator[PCAModel] {
  def setInputCol(value: String): this.type
  def setOutputCol(value: String): this.type
  def setK(value: Int): this.type
}

class PCAModel extends Model[PCAModel] with PCAParams {
  def pc: Matrix
  def explainedVariance: Vector
}
```

### Missing Value Handling

Utilities for handling missing or invalid data values.

```scala { .api }
/**
 * Imputes missing values using mean, median, or mode
 */
class Imputer extends Estimator[ImputerModel] {
  def setInputCols(value: Array[String]): this.type
  def setOutputCols(value: Array[String]): this.type
  def setStrategy(value: String): this.type
  def setMissingValue(value: Double): this.type
  def setRelativeError(value: Double): this.type
}

class ImputerModel extends Model[ImputerModel] with ImputerParams {
  def surrogateDF: DataFrame
}
```

### Advanced Transformations

Specialized transformations for complex feature engineering tasks.

```scala { .api }
/**
 * Applies SQL transformation to DataFrames
 */
class SQLTransformer extends Transformer {
  def setStatement(value: String): this.type
}

/**
 * Quantile-based discretization of continuous features
 */
class QuantileDiscretizer extends Estimator[QuantileDiscretizerModel] {
  def setInputCol(value: String): this.type
  def setOutputCol(value: String): this.type
  def setNumBuckets(value: Int): this.type
  def setRelativeError(value: Double): this.type
  def setHandleInvalid(value: String): this.type
}

class QuantileDiscretizerModel extends Model[QuantileDiscretizerModel] with QuantileDiscretizerParams {
  def getSplits: Array[Double]
}

/**
 * Maps continuous features to categorical buckets
 */
class Bucketizer extends Transformer {
  def setInputCol(value: String): this.type
  def setOutputCol(value: String): this.type
  def setSplits(value: Array[Double]): this.type
  def setHandleInvalid(value: String): this.type
}

/**
 * Feature hashing for high-dimensional categorical data
 */
class FeatureHasher extends Transformer {
  def setInputCols(value: Array[String]): this.type
  def setOutputCol(value: String): this.type
  def setNumFeatures(value: Int): this.type
  def setCategoricalCols(value: Array[String]): this.type
}

/**
 * Creates interaction features between input columns
 */
class Interaction extends Transformer {
  def setInputCols(value: Array[String]): this.type
  def setOutputCol(value: String): this.type
}

/**
 * Expands features to polynomial space
 */
class PolynomialExpansion extends Transformer {
  def setInputCol(value: String): this.type
  def setOutputCol(value: String): this.type
  def setDegree(value: Int): this.type
}
```

## Types

```scala { .api }
// Feature engineering imports
import org.apache.spark.ml.feature._
import org.apache.spark.ml.linalg.{Vector, Matrix}
import org.apache.spark.sql.{DataFrame, Dataset}

// Parameter traits
import org.apache.spark.ml.param.shared._

// Text processing utilities
import org.apache.spark.ml.feature.{Tokenizer, RegexTokenizer, StopWordsRemover}

// Vector utilities
import org.apache.spark.ml.feature.{VectorAssembler, VectorSlicer, VectorIndexer}

// Scaling utilities
import org.apache.spark.ml.feature.{StandardScaler, MinMaxScaler, MaxAbsScaler, Normalizer, RobustScaler}
```