Comprehensive data preprocessing and feature transformation utilities for preparing raw data for machine learning algorithms, including text processing, categorical encoding, and numerical scaling.
Core utilities for assembling feature vectors and manipulating vector data structures.
/**
* Combines multiple columns into a single vector column
*/
class VectorAssembler extends Transformer {
def setInputCols(value: Array[String]): this.type
def setOutputCol(value: String): this.type
def setHandleInvalid(value: String): this.type
}
/**
* Extracts elements from vectors by indices or names
*/
class VectorSlicer extends Transformer {
def setInputCol(value: String): this.type
def setOutputCol(value: String): this.type
def setIndices(value: Array[Int]): this.type
def setNames(value: Array[String]): this.type
}
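Usage sketch for VectorSlicer (hypothetical: assumes a DataFrame dataset with a vector column named userFeatures):
import org.apache.spark.ml.feature.VectorSlicer
val slicer = new VectorSlicer()
  .setInputCol("userFeatures")
  .setOutputCol("slicedFeatures")
  .setIndices(Array(0, 2))  // keep the first and third elements
val sliced = slicer.transform(dataset)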
/**
* Indexes categorical features in vector columns
*/
class VectorIndexer extends Estimator[VectorIndexerModel] {
def setInputCol(value: String): this.type
def setOutputCol(value: String): this.type
def setMaxCategories(value: Int): this.type
def setHandleInvalid(value: String): this.type
}
class VectorIndexerModel extends Model[VectorIndexerModel] with VectorIndexerParams {
def numFeatures: Int
def categoryMaps: Map[Int, Map[Double, Int]]
}

Usage Example:
import org.apache.spark.ml.feature.VectorAssembler
val assembler = new VectorAssembler()
.setInputCols(Array("hour", "mobile", "userFeatures"))
.setOutputCol("features")
val output = assembler.transform(dataset)
output.select("features").show()

Feature scaling transformations for normalizing numerical data distributions.
/**
* Standardizes features by removing mean and scaling to unit variance
*/
class StandardScaler extends Estimator[StandardScalerModel] {
def setInputCol(value: String): this.type
def setOutputCol(value: String): this.type
def setWithMean(value: Boolean): this.type
def setWithStd(value: Boolean): this.type
}
class StandardScalerModel extends Model[StandardScalerModel] with StandardScalerParams {
def mean: Vector
def std: Vector
}
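Usage sketch for StandardScaler (assumes a DataFrame dataFrame with a vector column named features):
import org.apache.spark.ml.feature.StandardScaler
val scaler = new StandardScaler()
  .setInputCol("features")
  .setOutputCol("scaledFeatures")
  .setWithMean(false)  // true subtracts the per-feature mean but densifies sparse input
  .setWithStd(true)
val scalerModel = scaler.fit(dataFrame)  // learns per-feature mean and standard deviation
val scaledData = scalerModel.transform(dataFrame)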
/**
* Rescales features to a common range [min, max]
*/
class MinMaxScaler extends Estimator[MinMaxScalerModel] {
def setInputCol(value: String): this.type
def setOutputCol(value: String): this.type
def setMin(value: Double): this.type
def setMax(value: Double): this.type
}
class MinMaxScalerModel extends Model[MinMaxScalerModel] with MinMaxScalerParams {
def originalMin: Vector
def originalMax: Vector
}
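Usage sketch for MinMaxScaler (same assumed dataFrame; the default target range is [0.0, 1.0]):
import org.apache.spark.ml.feature.MinMaxScaler
val minMaxModel = new MinMaxScaler()
  .setInputCol("features")
  .setOutputCol("scaledFeatures")
  .fit(dataFrame)  // learns the per-feature minimum and maximum
minMaxModel.transform(dataFrame).show()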
/**
* Scales features by the maximum absolute value
*/
class MaxAbsScaler extends Estimator[MaxAbsScalerModel] {
def setInputCol(value: String): this.type
def setOutputCol(value: String): this.type
}
class MaxAbsScalerModel extends Model[MaxAbsScalerModel] with MaxAbsScalerParams {
def maxAbs: Vector
}
/**
* Normalizes vectors to have unit norm
*/
class Normalizer extends Transformer {
def setInputCol(value: String): this.type
def setOutputCol(value: String): this.type
def setP(value: Double): this.type
}
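Normalizer is a pure Transformer, so no fitting step is required. A minimal sketch normalizing each row to unit L1 norm (same assumed dataFrame):
import org.apache.spark.ml.feature.Normalizer
val normalizer = new Normalizer()
  .setInputCol("features")
  .setOutputCol("normFeatures")
  .setP(1.0)  // L1 norm; use 2.0 for L2 or Double.PositiveInfinity for L-infinity
val l1NormData = normalizer.transform(dataFrame)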
/**
* Robust scaling using quantiles, less sensitive to outliers
*/
class RobustScaler extends Estimator[RobustScalerModel] {
def setInputCol(value: String): this.type
def setOutputCol(value: String): this.type
def setLower(value: Double): this.type
def setUpper(value: Double): this.type
def setWithCentering(value: Boolean): this.type
def setWithScaling(value: Boolean): this.type
def setRelativeError(value: Double): this.type
}
class RobustScalerModel extends Model[RobustScalerModel] with RobustScalerParams {
def median: Vector
def range: Vector
}

Transformations for handling categorical data, including encoding and indexing.
/**
* Maps string labels to numeric indices
*/
class StringIndexer extends Estimator[StringIndexerModel] {
def setInputCol(value: String): this.type
def setOutputCol(value: String): this.type
def setHandleInvalid(value: String): this.type
def setStringOrderType(value: String): this.type
}
class StringIndexerModel extends Model[StringIndexerModel] with StringIndexerParams {
def labels: Array[String]
def labelsArray: Array[Array[String]]
}
/**
* Maps numeric indices back to string labels
*/
class IndexToString extends Transformer {
def setInputCol(value: String): this.type
def setOutputCol(value: String): this.type
def setLabels(value: Array[String]): this.type
}
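Usage sketch for the usual round trip, indexing strings and mapping indices back (assumes a DataFrame df with a string column named category). By default IndexToString reads the labels from the input column's metadata, so setLabels is only needed when that metadata is missing:
import org.apache.spark.ml.feature.{StringIndexer, IndexToString}
val indexerModel = new StringIndexer()
  .setInputCol("category")
  .setOutputCol("categoryIndex")
  .fit(df)
val indexed = indexerModel.transform(df)
val converter = new IndexToString()
  .setInputCol("categoryIndex")
  .setOutputCol("originalCategory")  // labels recovered from column metadata
converter.transform(indexed).show()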
/**
* One-hot encoding for categorical features
*/
class OneHotEncoder extends Estimator[OneHotEncoderModel] {
def setInputCols(value: Array[String]): this.type
def setOutputCols(value: Array[String]): this.type
def setDropLast(value: Boolean): this.type
def setHandleInvalid(value: String): this.type
}
class OneHotEncoderModel extends Model[OneHotEncoderModel] with OneHotEncoderParams {
def categorySizes: Array[Int]
}

Usage Example:
import org.apache.spark.ml.feature.{StringIndexer, OneHotEncoder}
import org.apache.spark.ml.Pipeline
val indexer = new StringIndexer()
.setInputCol("category")
.setOutputCol("categoryIndex")
val encoder = new OneHotEncoder()
.setInputCols(Array("categoryIndex"))
.setOutputCols(Array("categoryVec"))
val pipeline = new Pipeline()
.setStages(Array(indexer, encoder))
val model = pipeline.fit(df)
model.transform(df).show()

Comprehensive text processing utilities for natural language processing tasks.
/**
* Tokenizes text into individual words
*/
class Tokenizer extends Transformer {
def setInputCol(value: String): this.type
def setOutputCol(value: String): this.type
}
/**
* Advanced tokenization using regular expressions
*/
class RegexTokenizer extends Transformer {
def setInputCol(value: String): this.type
def setOutputCol(value: String): this.type
def setPattern(value: String): this.type
def setGaps(value: Boolean): this.type
def setToLowercase(value: Boolean): this.type
def setMinTokenLength(value: Int): this.type
}
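Usage sketch contrasting the two tokenizers (assumes a DataFrame sentenceData with a string column named sentence):
import org.apache.spark.ml.feature.{Tokenizer, RegexTokenizer}
val tokenizer = new Tokenizer()  // splits on whitespace and lowercases
  .setInputCol("sentence")
  .setOutputCol("words")
val regexTokenizer = new RegexTokenizer()
  .setInputCol("sentence")
  .setOutputCol("words")
  .setPattern("\\W")  // gaps = true by default, so the pattern matches separators
val tokenized = regexTokenizer.transform(sentenceData)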
/**
* Removes common stop words from text
*/
class StopWordsRemover extends Transformer {
def setInputCol(value: String): this.type
def setOutputCol(value: String): this.type
def setStopWords(value: Array[String]): this.type
def setCaseSensitive(value: Boolean): this.type
def setLocale(value: String): this.type
}
/**
* Generates n-grams from sequences of tokens
*/
class NGram extends Transformer {
def setInputCol(value: String): this.type
def setOutputCol(value: String): this.type
def setN(value: Int): this.type
}
/**
* Term frequency using hashing trick
*/
class HashingTF extends Transformer {
def setInputCol(value: String): this.type
def setOutputCol(value: String): this.type
def setNumFeatures(value: Int): this.type
def setBinary(value: Boolean): this.type
}
/**
* Inverse document frequency weighting
*/
class IDF extends Estimator[IDFModel] {
def setInputCol(value: String): this.type
def setOutputCol(value: String): this.type
def setMinDocFreq(value: Int): this.type
}
class IDFModel extends Model[IDFModel] with IDFParams {
def idf: Vector
}
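HashingTF and IDF are typically chained into a TF-IDF pipeline. A minimal sketch over the same assumed sentenceData:
import org.apache.spark.ml.feature.{Tokenizer, HashingTF, IDF}
val words = new Tokenizer()
  .setInputCol("sentence")
  .setOutputCol("words")
  .transform(sentenceData)
val tf = new HashingTF()
  .setInputCol("words")
  .setOutputCol("rawFeatures")
  .setNumFeatures(1 << 18)  // a power of two is advisable, since a simple modulo maps hashes to indices
  .transform(words)
val tfidf = new IDF()
  .setInputCol("rawFeatures")
  .setOutputCol("features")
  .fit(tf)       // learns document frequencies
  .transform(tf)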
/**
* Count-based vectorization of text documents
*/
class CountVectorizer extends Estimator[CountVectorizerModel] {
def setInputCol(value: String): this.type
def setOutputCol(value: String): this.type
def setVocabSize(value: Int): this.type
def setMinDF(value: Double): this.type
def setMaxDF(value: Double): this.type
def setMinTF(value: Double): this.type
def setBinary(value: Boolean): this.type
}
class CountVectorizerModel extends Model[CountVectorizerModel] with CountVectorizerParams {
def vocabulary: Array[String]
}
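CountVectorizer learns an explicit vocabulary instead of hashing, keeping features interpretable. A sketch (assumes a DataFrame docs with an array-of-strings column named words):
import org.apache.spark.ml.feature.CountVectorizer
val cvModel = new CountVectorizer()
  .setInputCol("words")
  .setOutputCol("features")
  .setVocabSize(10000)  // keep at most the 10,000 most frequent terms
  .setMinDF(2)          // drop terms appearing in fewer than 2 documents
  .fit(docs)
println(cvModel.vocabulary.take(10).mkString(", "))  // inspect the learned vocabulary
cvModel.transform(docs).show()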
/**
* Word2Vec for learning vector representations of words
*/
class Word2Vec extends Estimator[Word2VecModel] {
def setInputCol(value: String): this.type
def setOutputCol(value: String): this.type
def setVectorSize(value: Int): this.type
def setMinCount(value: Int): this.type
def setNumPartitions(value: Int): this.type
def setStepSize(value: Double): this.type
def setMaxIter(value: Int): this.type
def setSeed(value: Long): this.type
def setWindowSize(value: Int): this.type
def setMaxSentenceLength(value: Int): this.type
}
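Usage sketch for Word2Vec over the same assumed docs (the model's query methods appear in the class below):
import org.apache.spark.ml.feature.Word2Vec
val w2vModel = new Word2Vec()
  .setInputCol("words")
  .setOutputCol("result")
  .setVectorSize(100)  // dimensionality of the learned embeddings
  .setMinCount(5)      // ignore words seen fewer than 5 times
  .fit(docs)
w2vModel.findSynonyms("spark", 5).show()  // nearest words by cosine similarity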
class Word2VecModel extends Model[Word2VecModel] with Word2VecParams {
def getVectors: DataFrame
def findSynonyms(word: String, num: Int): DataFrame
def findSynonymsArray(word: String, num: Int): Array[(String, Double)]
def transform(dataset: Dataset[_]): DataFrame
}

Statistical methods for selecting the most relevant features for machine learning models.
/**
* Univariate feature selection using statistical tests
*/
class UnivariateFeatureSelector extends Estimator[UnivariateFeatureSelectorModel] {
def setFeatureType(value: String): this.type
def setLabelType(value: String): this.type
def setSelectionMode(value: String): this.type
def setSelectionThreshold(value: Double): this.type
def setFeaturesCol(value: String): this.type
def setLabelCol(value: String): this.type
def setOutputCol(value: String): this.type
}
class UnivariateFeatureSelectorModel extends Model[UnivariateFeatureSelectorModel] with UnivariateFeatureSelectorParams {
def selectedFeatures: Array[Int]
}
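A sketch selecting the ten highest-scoring features (a DataFrame df with features and label columns is assumed):
import org.apache.spark.ml.feature.UnivariateFeatureSelector
val selectorModel = new UnivariateFeatureSelector()
  .setFeatureType("continuous")
  .setLabelType("categorical")  // continuous features + categorical label => ANOVA F-test
  .setSelectionMode("numTopFeatures")
  .setSelectionThreshold(10)
  .setFeaturesCol("features")
  .setLabelCol("label")
  .setOutputCol("selectedFeatures")
  .fit(df)
println(selectorModel.selectedFeatures.mkString(", "))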
/**
* Feature selection based on variance threshold
*/
class VarianceThresholdSelector extends Estimator[VarianceThresholdSelectorModel] {
def setVarianceThreshold(value: Double): this.type
def setFeaturesCol(value: String): this.type
def setOutputCol(value: String): this.type
}
class VarianceThresholdSelectorModel extends Model[VarianceThresholdSelectorModel] with VarianceThresholdSelectorParams {
def selectedFeatures: Array[Int]
}
/**
 * Chi-square feature selection for categorical data (deprecated since Spark 3.1 in favor of UnivariateFeatureSelector)
*/
class ChiSqSelector extends Estimator[ChiSqSelectorModel] {
def setNumTopFeatures(value: Int): this.type
def setPercentile(value: Double): this.type
def setFpr(value: Double): this.type
def setFdr(value: Double): this.type
def setFwe(value: Double): this.type
def setSelectorType(value: String): this.type
def setFeaturesCol(value: String): this.type
def setLabelCol(value: String): this.type
def setOutputCol(value: String): this.type
}
class ChiSqSelectorModel extends Model[ChiSqSelectorModel] with ChiSqSelectorParams {
def selectedFeatures: Array[Int]
}

Algorithms for reducing the number of features while preserving important information.
/**
* Principal Component Analysis for dimensionality reduction
*/
class PCA extends Estimator[PCAModel] {
def setInputCol(value: String): this.type
def setOutputCol(value: String): this.type
def setK(value: Int): this.type
}
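Usage sketch projecting onto the top three principal components (same assumed df):
import org.apache.spark.ml.feature.PCA
val pcaModel = new PCA()
  .setInputCol("features")
  .setOutputCol("pcaFeatures")
  .setK(3)  // number of principal components to keep
  .fit(df)
println(pcaModel.explainedVariance)  // variance captured by each component
pcaModel.transform(df).select("pcaFeatures").show()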
class PCAModel extends Model[PCAModel] with PCAParams {
def pc: Matrix
def explainedVariance: Vector
}

Utilities for handling missing or invalid data values.
/**
* Imputes missing values using mean, median, or mode
*/
class Imputer extends Estimator[ImputerModel] {
def setInputCols(value: Array[String]): this.type
def setOutputCols(value: Array[String]): this.type
def setStrategy(value: String): this.type
def setMissingValue(value: Double): this.type
def setRelativeError(value: Double): this.type
}
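Usage sketch filling NaN values with column medians (hypothetical numeric columns a and b in df are assumed):
import org.apache.spark.ml.feature.Imputer
val imputerModel = new Imputer()
  .setInputCols(Array("a", "b"))
  .setOutputCols(Array("a_imputed", "b_imputed"))
  .setStrategy("median")  // alternatives: "mean", "mode"
  .fit(df)                // computes one surrogate value per input column
imputerModel.transform(df).show()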
class ImputerModel extends Model[ImputerModel] with ImputerParams {
def surrogateDF: DataFrame
}

Specialized transformations for complex feature engineering tasks.
/**
* Applies SQL transformation to DataFrames
*/
class SQLTransformer extends Transformer {
def setStatement(value: String): this.type
}
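The SQL statement references the input DataFrame through the __THIS__ placeholder. A minimal sketch (hypothetical numeric columns v1 and v2 in df are assumed):
import org.apache.spark.ml.feature.SQLTransformer
val sqlTrans = new SQLTransformer()
  .setStatement("SELECT *, (v1 + v2) AS v3, (v1 * v2) AS v4 FROM __THIS__")
val result = sqlTrans.transform(df)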
/**
* Quantile-based discretization of continuous features
*/
class QuantileDiscretizer extends Estimator[Bucketizer] {
def setInputCol(value: String): this.type
def setOutputCol(value: String): this.type
def setNumBuckets(value: Int): this.type
def setRelativeError(value: Double): this.type
def setHandleInvalid(value: String): this.type
}
// Fitting a QuantileDiscretizer returns a Bucketizer; call getSplits on the
// resulting Bucketizer to inspect the learned split points
/**
* Maps continuous features to categorical buckets
*/
class Bucketizer extends Transformer {
def setInputCol(value: String): this.type
def setOutputCol(value: String): this.type
def setSplits(value: Array[Double]): this.type
def setHandleInvalid(value: String): this.type
}
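Usage sketch with explicit split points; n+1 splits define n buckets, and infinite endpoints catch out-of-range values (a numeric hour column in df is assumed):
import org.apache.spark.ml.feature.Bucketizer
val bucketizer = new Bucketizer()
  .setInputCol("hour")
  .setOutputCol("hourBucket")
  .setSplits(Array(Double.NegativeInfinity, 6.0, 12.0, 18.0, Double.PositiveInfinity))
val bucketed = bucketizer.transform(df)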
/**
* Feature hashing for high-dimensional categorical data
*/
class FeatureHasher extends Transformer {
def setInputCols(value: Array[String]): this.type
def setOutputCol(value: String): this.type
def setNumFeatures(value: Int): this.type
def setCategoricalCols(value: Array[String]): this.type
}
/**
* Creates interaction features between input columns
*/
class Interaction extends Transformer {
def setInputCols(value: Array[String]): this.type
def setOutputCol(value: String): this.type
}
/**
* Expands features to polynomial space
*/
class PolynomialExpansion extends Transformer {
def setInputCol(value: String): this.type
def setOutputCol(value: String): this.type
def setDegree(value: Int): this.type
}

// Feature engineering imports
import org.apache.spark.ml.feature._
import org.apache.spark.ml.linalg.{Vector, Matrix}
import org.apache.spark.sql.{DataFrame, Dataset}
// Parameter traits
import org.apache.spark.ml.param.shared._
// Text processing utilities
import org.apache.spark.ml.feature.{Tokenizer, RegexTokenizer, StopWordsRemover}
// Vector utilities
import org.apache.spark.ml.feature.{VectorAssembler, VectorSlicer, VectorIndexer}
// Scaling utilities
import org.apache.spark.ml.feature.{StandardScaler, MinMaxScaler, MaxAbsScaler, Normalizer, RobustScaler}