# Feature Processing

Comprehensive feature extraction, transformation, selection, and engineering capabilities. MLlib provides over 60 feature processing methods, including text processing, scaling, dimensionality reduction, categorical encoding, and feature selection.

## Capabilities

### Vector Assembly and Manipulation

Core utilities for combining and manipulating feature vectors.

```scala { .api }
/**
 * VectorAssembler - combines multiple columns into a single vector column
 * Essential for preparing features for ML algorithms
 */
class VectorAssembler extends Transformer with HasInputCols with HasOutputCol with HasHandleInvalid {
  def setInputCols(value: Array[String]): this.type
  def setOutputCol(value: String): this.type
  def setHandleInvalid(value: String): this.type
}

/**
 * VectorSlicer - selects a subset of features from a vector
 * Useful for feature selection and dimensionality reduction
 */
class VectorSlicer extends Transformer with HasInputCol with HasOutputCol {
  def setInputCol(value: String): this.type
  def setOutputCol(value: String): this.type
  def setIndices(value: Array[Int]): this.type
  def setNames(value: Array[String]): this.type
}

/**
 * VectorSizeHint - adds size metadata to an existing vector column
 * Needed when downstream stages (e.g. VectorAssembler on streaming data) require a known vector size
 */
class VectorSizeHint extends Transformer with HasInputCol with HasHandleInvalid {
  def setInputCol(value: String): this.type
  def setSize(value: Int): this.type
  def setHandleInvalid(value: String): this.type
}
```

**Usage Example:**

```scala
import org.apache.spark.ml.feature.VectorAssembler

val assembler = new VectorAssembler()
  .setInputCols(Array("hour", "mobile", "userFeatures"))
  .setOutputCol("features")

val output = assembler.transform(dataset)
```

### Feature Scaling and Normalization

Methods for scaling and normalizing feature values for improved algorithm performance.

```scala { .api }
/**
 * StandardScaler - standardizes features by removing mean and scaling to unit variance
 * Transforms features to have zero mean and unit standard deviation
 */
class StandardScaler extends Estimator[StandardScalerModel] with StandardScalerParams {
  def setInputCol(value: String): this.type
  def setOutputCol(value: String): this.type
  def setWithMean(value: Boolean): this.type
  def setWithStd(value: Boolean): this.type
}

class StandardScalerModel extends Model[StandardScalerModel] with StandardScalerParams {
  def mean: Vector
  def std: Vector
}

/**
 * MinMaxScaler - rescales features to a specified range [min, max]
 * Transforms features to fit within the specified minimum and maximum values
 */
class MinMaxScaler extends Estimator[MinMaxScalerModel] with MinMaxScalerParams {
  def setInputCol(value: String): this.type
  def setOutputCol(value: String): this.type
  def setMin(value: Double): this.type
  def setMax(value: Double): this.type
}

class MinMaxScalerModel extends Model[MinMaxScalerModel] with MinMaxScalerParams {
  def originalMin: Vector
  def originalMax: Vector
}

/**
 * MaxAbsScaler - scales features by the maximum absolute value
 * Divides each feature by its maximum absolute value to scale to [-1, 1]
 */
class MaxAbsScaler extends Estimator[MaxAbsScalerModel] with MaxAbsScalerParams {
  def setInputCol(value: String): this.type
  def setOutputCol(value: String): this.type
}

class MaxAbsScalerModel extends Model[MaxAbsScalerModel] with MaxAbsScalerParams {
  def maxAbs: Vector
}

/**
 * RobustScaler - scales features using median and interquartile range
 * More robust to outliers than StandardScaler
 */
class RobustScaler extends Estimator[RobustScalerModel] with RobustScalerParams {
  def setInputCol(value: String): this.type
  def setOutputCol(value: String): this.type
  def setWithCentering(value: Boolean): this.type
  def setWithScaling(value: Boolean): this.type
  def setLower(value: Double): this.type
  def setUpper(value: Double): this.type
}

class RobustScalerModel extends Model[RobustScalerModel] with RobustScalerParams {
  def median: Vector
  def range: Vector
}

/**
 * Normalizer - normalizes vectors to have unit norm
 * Scales individual samples to have unit norm (L1, L2, or Inf norm)
 */
class Normalizer extends Transformer with HasInputCol with HasOutputCol {
  def setInputCol(value: String): this.type
  def setOutputCol(value: String): this.type
  def setP(value: Double): this.type
}
```
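
**Usage Example** (a minimal sketch, assuming `dataset` already contains a vector column named `"features"`):

```scala
import org.apache.spark.ml.feature.MinMaxScaler

// Learn per-feature min/max from the data, then rescale every feature to [0, 1]
val scaler = new MinMaxScaler()
  .setInputCol("features")
  .setOutputCol("scaledFeatures")
  .setMin(0.0)
  .setMax(1.0)

val scalerModel = scaler.fit(dataset)
val scaled = scalerModel.transform(dataset)
```
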
### Categorical Feature Processing

Methods for handling categorical variables including indexing and encoding.

```scala { .api }
/**
 * StringIndexer - maps string values to numeric indices
 * Converts categorical string features to numeric indices for ML algorithms
 */
class StringIndexer extends Estimator[StringIndexerModel] with StringIndexerParams {
  def setInputCol(value: String): this.type
  def setOutputCol(value: String): this.type
  def setHandleInvalid(value: String): this.type
  def setStringOrderType(value: String): this.type
}

class StringIndexerModel extends Model[StringIndexerModel] with StringIndexerParams {
  def labels: Array[String]
  def labelsArray: Array[Array[String]]
}

/**
 * IndexToString - maps numeric indices back to string values
 * Reverse operation of StringIndexer for interpreting model outputs
 */
class IndexToString extends Transformer with HasInputCol with HasOutputCol {
  def setInputCol(value: String): this.type
  def setOutputCol(value: String): this.type
  def setLabels(value: Array[String]): this.type
}

/**
 * OneHotEncoder - converts categorical indices to binary vectors
 * Creates binary columns for each category level
 */
class OneHotEncoder extends Estimator[OneHotEncoderModel] with OneHotEncoderParams {
  def setInputCols(value: Array[String]): this.type
  def setOutputCols(value: Array[String]): this.type
  def setDropLast(value: Boolean): this.type
  def setHandleInvalid(value: String): this.type
}

class OneHotEncoderModel extends Model[OneHotEncoderModel] with OneHotEncoderParams {
  def categorySizes: Array[Int]
}

/**
 * VectorIndexer - automatically identifies categorical features in vectors
 * Treats features with <= maxCategories unique values as categorical
 */
class VectorIndexer extends Estimator[VectorIndexerModel] with VectorIndexerParams {
  def setInputCol(value: String): this.type
  def setOutputCol(value: String): this.type
  def setMaxCategories(value: Int): this.type
  def setHandleInvalid(value: String): this.type
}

class VectorIndexerModel extends Model[VectorIndexerModel] with VectorIndexerParams {
  def numFeatures: Int
  def categoryMaps: Map[Int, Map[Double, Int]]
}
```
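
**Usage Example** (a minimal sketch, assuming `dataset` has a string column named `"category"`):

```scala
import org.apache.spark.ml.feature.{OneHotEncoder, StringIndexer}

// Map category strings to indices, keeping unseen categories in a special bucket
val indexer = new StringIndexer()
  .setInputCol("category")
  .setOutputCol("categoryIndex")
  .setHandleInvalid("keep")

// Expand each index into a sparse binary vector
val encoder = new OneHotEncoder()
  .setInputCols(Array("categoryIndex"))
  .setOutputCols(Array("categoryVec"))

val indexed = indexer.fit(dataset).transform(dataset)
val encoded = encoder.fit(indexed).transform(indexed)
```
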
### Text Processing and Feature Extraction

Comprehensive text processing capabilities for natural language data.

```scala { .api }
/**
 * Tokenizer - splits text into individual words
 * Lowercases the input and splits it on whitespace
 */
class Tokenizer extends Transformer with HasInputCol with HasOutputCol {
  def setInputCol(value: String): this.type
  def setOutputCol(value: String): this.type
}

/**
 * RegexTokenizer - tokenizes text using regular expressions
 * More flexible tokenization with configurable patterns
 */
class RegexTokenizer extends Transformer with HasInputCol with HasOutputCol {
  def setInputCol(value: String): this.type
  def setOutputCol(value: String): this.type
  def setPattern(value: String): this.type
  def setGaps(value: Boolean): this.type
  def setToLowercase(value: Boolean): this.type
  def setMinTokenLength(value: Int): this.type
}

/**
 * StopWordsRemover - removes stop words from tokenized text
 * Filters common words that don't contribute to text meaning
 */
class StopWordsRemover extends Transformer with HasInputCol with HasOutputCol {
  def setInputCol(value: String): this.type
  def setOutputCol(value: String): this.type
  def setStopWords(value: Array[String]): this.type
  def setCaseSensitive(value: Boolean): this.type
  def setLocale(value: String): this.type
}

/**
 * NGram - generates n-grams from token sequences
 * Creates sequences of n consecutive tokens for text analysis
 */
class NGram extends Transformer with HasInputCol with HasOutputCol {
  def setInputCol(value: String): this.type
  def setOutputCol(value: String): this.type
  def setN(value: Int): this.type
}

/**
 * HashingTF - maps terms to term frequency vectors using hashing
 * Fast text vectorization using hash functions
 */
class HashingTF extends Transformer with HasInputCol with HasOutputCol {
  def setInputCol(value: String): this.type
  def setOutputCol(value: String): this.type
  def setNumFeatures(value: Int): this.type
  def setBinary(value: Boolean): this.type
}

/**
 * CountVectorizer - converts text to vectors based on term counts
 * Creates vocabulary and maps documents to term frequency vectors
 */
class CountVectorizer extends Estimator[CountVectorizerModel] with CountVectorizerParams {
  def setInputCol(value: String): this.type
  def setOutputCol(value: String): this.type
  def setVocabSize(value: Int): this.type
  def setMinDF(value: Double): this.type
  def setMaxDF(value: Double): this.type
  def setMinTF(value: Double): this.type
  def setBinary(value: Boolean): this.type
}

class CountVectorizerModel extends Model[CountVectorizerModel] with CountVectorizerParams {
  def vocabulary: Array[String]
}

/**
 * IDF - computes inverse document frequency for TF-IDF
 * Weights term frequencies by their inverse document frequency
 */
class IDF extends Estimator[IDFModel] with IDFParams {
  def setInputCol(value: String): this.type
  def setOutputCol(value: String): this.type
  def setMinDocFreq(value: Int): this.type
}

class IDFModel extends Model[IDFModel] with IDFParams {
  def idf: Vector
  def docFreq: Array[Long]
  def numDocs: Long
}

/**
 * Word2Vec - learns vector representations of words
 * Trains word embeddings using the Word2Vec algorithm
 */
class Word2Vec extends Estimator[Word2VecModel] with Word2VecParams {
  def setInputCol(value: String): this.type
  def setOutputCol(value: String): this.type
  def setVectorSize(value: Int): this.type
  def setMinCount(value: Int): this.type
  def setNumPartitions(value: Int): this.type
  def setStepSize(value: Double): this.type
  def setMaxIter(value: Int): this.type
  def setSeed(value: Long): this.type
  def setWindowSize(value: Int): this.type
}

class Word2VecModel extends Model[Word2VecModel] with Word2VecParams {
  def getVectors: DataFrame
  def findSynonyms(word: String, num: Int): DataFrame
  def findSynonymsArray(word: String, num: Int): Array[(String, Double)]
}
```
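
**Usage Example** (a minimal sketch, assuming `docs` has a string column named `"text"`):

```scala
import org.apache.spark.ml.feature.{CountVectorizer, RegexTokenizer}

// Split on non-word characters (tokens are lowercased by default)
val tokenizer = new RegexTokenizer()
  .setInputCol("text")
  .setOutputCol("tokens")
  .setPattern("\\W+")

// Build a bounded vocabulary and map each document to a term-count vector
val vectorizer = new CountVectorizer()
  .setInputCol("tokens")
  .setOutputCol("termCounts")
  .setVocabSize(10000)
  .setMinDF(2)

val tokenized = tokenizer.transform(docs)
val cvModel = vectorizer.fit(tokenized)
val vectors = cvModel.transform(tokenized)

println(cvModel.vocabulary.take(10).mkString(", "))
```
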
### Feature Selection

Methods for selecting the most relevant features for modeling.

```scala { .api }
/**
 * ChiSqSelector - selects features based on the Chi-squared test
 * Statistical feature selection for categorical features
 */
class ChiSqSelector extends Estimator[ChiSqSelectorModel] with ChiSqSelectorParams {
  def setFeaturesCol(value: String): this.type
  def setOutputCol(value: String): this.type
  def setLabelCol(value: String): this.type
  def setSelectorType(value: String): this.type
  def setNumTopFeatures(value: Int): this.type
  def setPercentile(value: Double): this.type
  def setFpr(value: Double): this.type
  def setFdr(value: Double): this.type
  def setFwe(value: Double): this.type
}

class ChiSqSelectorModel extends Model[ChiSqSelectorModel] with ChiSqSelectorParams {
  def selectedFeatures: Array[Int]
}

/**
 * UnivariateFeatureSelector - selects features using statistical tests
 * Supports various statistical tests for feature selection
 */
class UnivariateFeatureSelector extends Estimator[UnivariateFeatureSelectorModel] with UnivariateFeatureSelectorParams {
  def setFeaturesCol(value: String): this.type
  def setOutputCol(value: String): this.type
  def setLabelCol(value: String): this.type
  def setFeatureType(value: String): this.type
  def setLabelType(value: String): this.type
  def setSelectionMode(value: String): this.type
  def setSelectionThreshold(value: Double): this.type
}

class UnivariateFeatureSelectorModel extends Model[UnivariateFeatureSelectorModel] with UnivariateFeatureSelectorParams {
  def selectedFeatures: Array[Int]
}

/**
 * VarianceThresholdSelector - removes low-variance features
 * Filters features with variance below the specified threshold
 */
class VarianceThresholdSelector extends Estimator[VarianceThresholdSelectorModel] with VarianceThresholdSelectorParams {
  def setFeaturesCol(value: String): this.type
  def setOutputCol(value: String): this.type
  def setVarianceThreshold(value: Double): this.type
}

class VarianceThresholdSelectorModel extends Model[VarianceThresholdSelectorModel] with VarianceThresholdSelectorParams {
  def selectedFeatures: Array[Int]
}
```
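
**Usage Example** (a minimal sketch, assuming `data` has a continuous `"features"` vector and a categorical `"label"` column):

```scala
import org.apache.spark.ml.feature.UnivariateFeatureSelector

// Keep the 10 features most strongly associated with the label
// (continuous features + categorical label selects an ANOVA F-test)
val selector = new UnivariateFeatureSelector()
  .setFeaturesCol("features")
  .setLabelCol("label")
  .setOutputCol("selectedFeatures")
  .setFeatureType("continuous")
  .setLabelType("categorical")
  .setSelectionMode("numTopFeatures")
  .setSelectionThreshold(10)

val selected = selector.fit(data).transform(data)
```
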
### Dimensionality Reduction

Methods for reducing the number of features while preserving information.

```scala { .api }
/**
 * PCA - Principal Component Analysis for dimensionality reduction
 * Projects data onto a lower dimensional space preserving maximum variance
 */
class PCA extends Estimator[PCAModel] with PCAParams {
  def setInputCol(value: String): this.type
  def setOutputCol(value: String): this.type
  def setK(value: Int): this.type
}

class PCAModel extends Model[PCAModel] with PCAParams {
  def pc: Matrix
  def explainedVariance: Vector
}

/**
 * DCT - Discrete Cosine Transform
 * Applies DCT transformation for frequency domain analysis
 */
class DCT extends Transformer with HasInputCol with HasOutputCol {
  def setInputCol(value: String): this.type
  def setOutputCol(value: String): this.type
  def setInverse(value: Boolean): this.type
}
```
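
**Usage Example** (a minimal sketch, assuming `dataset` has a vector column named `"features"`; see also the PCA example under Usage Examples below):

```scala
import org.apache.spark.ml.feature.DCT

// Apply a forward discrete cosine transform to each feature vector
val dct = new DCT()
  .setInputCol("features")
  .setOutputCol("featuresDCT")
  .setInverse(false)

val transformed = dct.transform(dataset)
```
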
### Bucketing and Discretization

Methods for converting continuous variables into discrete bins.

```scala { .api }
/**
 * Bucketizer - maps continuous features to buckets
 * Converts continuous values to discrete bins using thresholds
 */
class Bucketizer extends Transformer with HasInputCol with HasOutputCol with HasHandleInvalid {
  def setInputCol(value: String): this.type
  def setOutputCol(value: String): this.type
  def setSplits(value: Array[Double]): this.type
  def setInputCols(value: Array[String]): this.type
  def setOutputCols(value: Array[String]): this.type
  def setSplitsArray(value: Array[Array[Double]]): this.type
  def setHandleInvalid(value: String): this.type
}

/**
 * QuantileDiscretizer - discretizes continuous features using quantiles
 * Automatically determines bucket boundaries based on the data distribution
 * Fitting produces a Bucketizer whose getSplits / getSplitsArray expose the learned boundaries
 */
class QuantileDiscretizer extends Estimator[Bucketizer] with QuantileDiscretizerParams {
  def setInputCol(value: String): this.type
  def setOutputCol(value: String): this.type
  def setInputCols(value: Array[String]): this.type
  def setOutputCols(value: Array[String]): this.type
  def setNumBuckets(value: Int): this.type
  def setNumBucketsArray(value: Array[Int]): this.type
  def setRelativeError(value: Double): this.type
  def setHandleInvalid(value: String): this.type
}
```
### Feature Engineering

Advanced feature engineering techniques for creating new features.

```scala { .api }
/**
 * PolynomialExpansion - generates polynomial features
 * Creates interaction features by expanding polynomial terms
 */
class PolynomialExpansion extends Transformer with HasInputCol with HasOutputCol {
  def setInputCol(value: String): this.type
  def setOutputCol(value: String): this.type
  def setDegree(value: Int): this.type
}

/**
 * Interaction - creates interaction features between selected columns
 * Generates cross-product features for modeling feature interactions
 */
class Interaction extends Transformer with HasInputCols with HasOutputCol {
  def setInputCols(value: Array[String]): this.type
  def setOutputCol(value: String): this.type
}

/**
 * Binarizer - thresholds numerical features to binary values
 * Converts continuous values to binary based on a threshold
 */
class Binarizer extends Transformer with HasInputCol with HasOutputCol with HasThreshold {
  def setInputCol(value: String): this.type
  def setOutputCol(value: String): this.type
  def setInputCols(value: Array[String]): this.type
  def setOutputCols(value: Array[String]): this.type
  def setThreshold(value: Double): this.type
  def setThresholds(value: Array[Double]): this.type
}

/**
 * FeatureHasher - hashes categorical features to sparse vectors
 * Maps arbitrary features to fixed-length vectors using hashing
 */
class FeatureHasher extends Transformer with HasInputCols with HasOutputCol {
  def setInputCols(value: Array[String]): this.type
  def setOutputCol(value: String): this.type
  def setNumFeatures(value: Int): this.type
  def setCategoricalCols(value: Array[String]): this.type
}

/**
 * SQLTransformer - applies SQL transformations to DataFrames
 * Enables complex feature engineering using SQL expressions
 */
class SQLTransformer extends Transformer {
  def setStatement(value: String): this.type
  def getStatement: String
}

/**
 * RFormula - creates features using R-style formulas
 * Provides R-like syntax for feature specification and encoding
 */
class RFormula extends Estimator[RFormulaModel] with RFormulaParams {
  def setFormula(value: String): this.type
  def setFeaturesCol(value: String): this.type
  def setLabelCol(value: String): this.type
  def setForceIndexLabel(value: Boolean): this.type
  def setStringIndexerOrderType(value: String): this.type
  def setHandleInvalid(value: String): this.type
}

class RFormulaModel extends Model[RFormulaModel] with RFormulaParams {
  def pipelineModel: PipelineModel
}
```
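
**Usage Example** (a minimal sketch, assuming `dataset` has columns `"clicked"`, `"country"`, and `"hour"`):

```scala
import org.apache.spark.ml.feature.RFormula

// R-style formula: predict "clicked" from "country" and "hour";
// string columns are indexed and one-hot encoded automatically
val formula = new RFormula()
  .setFormula("clicked ~ country + hour")
  .setFeaturesCol("features")
  .setLabelCol("label")

val prepared = formula.fit(dataset).transform(dataset)
```
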
## Usage Examples

### Complete Feature Processing Pipeline

```scala
import org.apache.spark.ml.feature._
import org.apache.spark.ml.Pipeline

// Text processing pipeline
val tokenizer = new Tokenizer()
  .setInputCol("text")
  .setOutputCol("words")

val stopWordsRemover = new StopWordsRemover()
  .setInputCol("words")
  .setOutputCol("filtered")

val hashingTF = new HashingTF()
  .setInputCol("filtered")
  .setOutputCol("rawFeatures")
  .setNumFeatures(1000)

val idf = new IDF()
  .setInputCol("rawFeatures")
  .setOutputCol("textFeatures")

// Categorical feature processing
val stringIndexer = new StringIndexer()
  .setInputCol("category")
  .setOutputCol("categoryIndex")

val oneHotEncoder = new OneHotEncoder()
  .setInputCols(Array("categoryIndex"))
  .setOutputCols(Array("categoryVec"))

// Numerical feature processing
val scaler = new StandardScaler()
  .setInputCol("numericFeatures")
  .setOutputCol("scaledNumeric")
  .setWithMean(true)
  .setWithStd(true)

// Feature assembly
val assembler = new VectorAssembler()
  .setInputCols(Array("textFeatures", "categoryVec", "scaledNumeric"))
  .setOutputCol("features")

// Create pipeline
val pipeline = new Pipeline()
  .setStages(Array(
    tokenizer, stopWordsRemover, hashingTF, idf,
    stringIndexer, oneHotEncoder,
    scaler, assembler
  ))

val model = pipeline.fit(trainingData)
val processedData = model.transform(rawData)
```

### Feature Selection Example

```scala
import org.apache.spark.ml.feature.ChiSqSelector

val selector = new ChiSqSelector()
  .setNumTopFeatures(50)
  .setFeaturesCol("features")
  .setLabelCol("label")
  .setOutputCol("selectedFeatures")

val result = selector.fit(data).transform(data)
```

### Dimensionality Reduction

```scala
import org.apache.spark.ml.feature.PCA

val pca = new PCA()
  .setInputCol("features")
  .setOutputCol("pcaFeatures")
  .setK(3)

val model = pca.fit(data)
val result = model.transform(data)

// Access explained variance
println(s"Explained variance: ${model.explainedVariance}")
```