Feature Engineering

Comprehensive data preprocessing and feature transformation utilities for preparing raw data for machine learning algorithms, including text processing, categorical encoding, and numerical scaling.

Capabilities

Vector Assembly and Manipulation

Core utilities for assembling feature vectors and manipulating vector data structures.

/**
 * Combines multiple columns into a single vector column
 */
class VectorAssembler extends Transformer {
  def setInputCols(value: Array[String]): this.type
  def setOutputCol(value: String): this.type
  def setHandleInvalid(value: String): this.type
}

/**
 * Extracts elements from vectors by indices or names
 */
class VectorSlicer extends Transformer {
  def setInputCol(value: String): this.type
  def setOutputCol(value: String): this.type
  def setIndices(value: Array[Int]): this.type
  def setNames(value: Array[String]): this.type
}

/**
 * Indexes categorical features in vector columns
 */
class VectorIndexer extends Estimator[VectorIndexerModel] {
  def setInputCol(value: String): this.type
  def setOutputCol(value: String): this.type
  def setMaxCategories(value: Int): this.type
  def setHandleInvalid(value: String): this.type
}

class VectorIndexerModel extends Model[VectorIndexerModel] with VectorIndexerParams {
  def numFeatures: Int
  def categoryMaps: Map[Int, Map[Double, Int]]
}

Usage Example:

import org.apache.spark.ml.feature.VectorAssembler

val assembler = new VectorAssembler()
  .setInputCols(Array("hour", "mobile", "userFeatures"))
  .setOutputCol("features")

val output = assembler.transform(dataset)
output.select("features").show()
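Unlike VectorAssembler, VectorIndexer is an Estimator and must be fitted first. A minimal sketch (assuming `dataset` has a vector column named "features"; the column names and threshold are illustrative):

import org.apache.spark.ml.feature.VectorIndexer

val indexer = new VectorIndexer()
  .setInputCol("features")
  .setOutputCol("indexedFeatures")
  .setMaxCategories(10) // columns with <= 10 distinct values are treated as categorical

val indexerModel = indexer.fit(dataset)
println(s"Chose ${indexerModel.categoryMaps.size} categorical features")

val indexedData = indexerModel.transform(dataset)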

Scaling and Normalization

Feature scaling transformations for normalizing numerical data distributions.

/**
 * Standardizes features by removing mean and scaling to unit variance
 */
class StandardScaler extends Estimator[StandardScalerModel] {
  def setInputCol(value: String): this.type
  def setOutputCol(value: String): this.type
  def setWithMean(value: Boolean): this.type
  def setWithStd(value: Boolean): this.type
}

class StandardScalerModel extends Model[StandardScalerModel] with StandardScalerParams {
  def mean: Vector
  def std: Vector
}

/**
 * Rescales features to a common range [min, max]
 */
class MinMaxScaler extends Estimator[MinMaxScalerModel] {
  def setInputCol(value: String): this.type
  def setOutputCol(value: String): this.type
  def setMin(value: Double): this.type
  def setMax(value: Double): this.type
}

class MinMaxScalerModel extends Model[MinMaxScalerModel] with MinMaxScalerParams {
  def originalMin: Vector
  def originalMax: Vector
}

/**
 * Scales features by the maximum absolute value
 */
class MaxAbsScaler extends Estimator[MaxAbsScalerModel] {
  def setInputCol(value: String): this.type
  def setOutputCol(value: String): this.type
}

class MaxAbsScalerModel extends Model[MaxAbsScalerModel] with MaxAbsScalerParams {
  def maxAbs: Vector
}

/**
 * Normalizes vectors to have unit norm
 */
class Normalizer extends Transformer {
  def setInputCol(value: String): this.type
  def setOutputCol(value: String): this.type
  def setP(value: Double): this.type
}

/**
 * Robust scaling using quantiles, less sensitive to outliers
 */
class RobustScaler extends Estimator[RobustScalerModel] {
  def setInputCol(value: String): this.type
  def setOutputCol(value: String): this.type
  def setLower(value: Double): this.type
  def setUpper(value: Double): this.type
  def setWithCentering(value: Boolean): this.type
  def setWithScaling(value: Boolean): this.type
  def setRelativeError(value: Double): this.type
}

class RobustScalerModel extends Model[RobustScalerModel] with RobustScalerParams {
  def median: Vector
  def range: Vector
}
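Usage Example (a minimal sketch, assuming a DataFrame `dataset` with a vector column named "features"):

import org.apache.spark.ml.feature.StandardScaler

val scaler = new StandardScaler()
  .setInputCol("features")
  .setOutputCol("scaledFeatures")
  .setWithStd(true)   // scale to unit standard deviation
  .setWithMean(false) // centering would densify sparse vectors, so it is off by default

// fit computes per-feature summary statistics; transform applies them
val scalerModel = scaler.fit(dataset)
val scaledData = scalerModel.transform(dataset)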

Categorical Feature Processing

Transformations for handling categorical data, including encoding and indexing.

/**
 * Maps string labels to numeric indices
 */
class StringIndexer extends Estimator[StringIndexerModel] {
  def setInputCol(value: String): this.type
  def setOutputCol(value: String): this.type
  def setHandleInvalid(value: String): this.type
  def setStringOrderType(value: String): this.type
}

class StringIndexerModel extends Model[StringIndexerModel] with StringIndexerParams {
  def labels: Array[String]
  def labelsArray: Array[Array[String]]
}

/**
 * Maps numeric indices back to string labels
 */
class IndexToString extends Transformer {
  def setInputCol(value: String): this.type
  def setOutputCol(value: String): this.type
  def setLabels(value: Array[String]): this.type
}

/**
 * One-hot encoding for categorical features
 */
class OneHotEncoder extends Estimator[OneHotEncoderModel] {
  def setInputCols(value: Array[String]): this.type
  def setOutputCols(value: Array[String]): this.type
  def setDropLast(value: Boolean): this.type
  def setHandleInvalid(value: String): this.type
}

class OneHotEncoderModel extends Model[OneHotEncoderModel] with OneHotEncoderParams {
  def categorySizes: Array[Int]
}

Usage Example:

import org.apache.spark.ml.feature.{StringIndexer, OneHotEncoder}
import org.apache.spark.ml.Pipeline

val indexer = new StringIndexer()
  .setInputCol("category")
  .setOutputCol("categoryIndex")

val encoder = new OneHotEncoder()
  .setInputCols(Array("categoryIndex"))
  .setOutputCols(Array("categoryVec"))

val pipeline = new Pipeline()
  .setStages(Array(indexer, encoder))

val model = pipeline.fit(df)
model.transform(df).show()

Text Processing

Text processing utilities for natural language processing tasks, covering tokenization, stop-word removal, term weighting, and word embeddings.

/**
 * Tokenizes text into individual words
 */
class Tokenizer extends Transformer {
  def setInputCol(value: String): this.type
  def setOutputCol(value: String): this.type
}

/**
 * Advanced tokenization using regular expressions
 */
class RegexTokenizer extends Transformer {
  def setInputCol(value: String): this.type
  def setOutputCol(value: String): this.type
  def setPattern(value: String): this.type
  def setGaps(value: Boolean): this.type
  def setToLowercase(value: Boolean): this.type
  def setMinTokenLength(value: Int): this.type
}

/**
 * Removes common stop words from text
 */
class StopWordsRemover extends Transformer {
  def setInputCol(value: String): this.type
  def setOutputCol(value: String): this.type
  def setStopWords(value: Array[String]): this.type
  def setCaseSensitive(value: Boolean): this.type
  def setLocale(value: String): this.type
}

/**
 * Generates n-grams from sequences of tokens
 */
class NGram extends Transformer {
  def setInputCol(value: String): this.type
  def setOutputCol(value: String): this.type
  def setN(value: Int): this.type
}

/**
 * Term frequency using hashing trick
 */
class HashingTF extends Transformer {
  def setInputCol(value: String): this.type
  def setOutputCol(value: String): this.type
  def setNumFeatures(value: Int): this.type
  def setBinary(value: Boolean): this.type
}

/**
 * Inverse document frequency weighting
 */
class IDF extends Estimator[IDFModel] {
  def setInputCol(value: String): this.type
  def setOutputCol(value: String): this.type
  def setMinDocFreq(value: Int): this.type
}

class IDFModel extends Model[IDFModel] with IDFParams {
  def idf: Vector
}
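Usage Example (a typical TF-IDF sketch, assuming an active SparkSession `spark`; the sample data is illustrative):

import org.apache.spark.ml.feature.{Tokenizer, HashingTF, IDF}

val sentenceData = spark.createDataFrame(Seq(
  (0.0, "Hi I heard about Spark"),
  (1.0, "Logistic regression models are neat")
)).toDF("label", "sentence")

val tokenizer = new Tokenizer().setInputCol("sentence").setOutputCol("words")
val wordsData = tokenizer.transform(sentenceData)

val hashingTF = new HashingTF()
  .setInputCol("words")
  .setOutputCol("rawFeatures")
  .setNumFeatures(1024)
val featurizedData = hashingTF.transform(wordsData)

// IDF is an Estimator: fit on the corpus, then reweight the raw term frequencies
val idf = new IDF().setInputCol("rawFeatures").setOutputCol("features")
val idfModel = idf.fit(featurizedData)
idfModel.transform(featurizedData).select("label", "features").show()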

/**
 * Count-based vectorization of text documents
 */
class CountVectorizer extends Estimator[CountVectorizerModel] {
  def setInputCol(value: String): this.type
  def setOutputCol(value: String): this.type
  def setVocabSize(value: Int): this.type
  def setMinDF(value: Double): this.type
  def setMaxDF(value: Double): this.type
  def setMinTF(value: Double): this.type
  def setBinary(value: Boolean): this.type
}

class CountVectorizerModel extends Model[CountVectorizerModel] with CountVectorizerParams {
  def vocabulary: Array[String]
}

/**
 * Word2Vec for learning vector representations of words
 */
class Word2Vec extends Estimator[Word2VecModel] {
  def setInputCol(value: String): this.type
  def setOutputCol(value: String): this.type
  def setVectorSize(value: Int): this.type
  def setMinCount(value: Int): this.type
  def setNumPartitions(value: Int): this.type
  def setStepSize(value: Double): this.type
  def setMaxIter(value: Int): this.type
  def setSeed(value: Long): this.type
  def setWindowSize(value: Int): this.type
  def setMaxSentenceLength(value: Int): this.type
}

class Word2VecModel extends Model[Word2VecModel] with Word2VecParams {
  def getVectors: DataFrame
  def findSynonyms(word: String, num: Int): DataFrame
  def findSynonymsArray(word: String, num: Int): Array[(String, Double)]
  def findSynonyms(vec: Vector, num: Int): DataFrame
}
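Usage Example (a minimal Word2Vec sketch; `documentDF` is an assumed DataFrame whose "text" column holds arrays of tokens):

import org.apache.spark.ml.feature.Word2Vec

val word2Vec = new Word2Vec()
  .setInputCol("text")
  .setOutputCol("result")
  .setVectorSize(100)
  .setMinCount(5)

val model = word2Vec.fit(documentDF)

// each document is mapped to the average of its word vectors
val result = model.transform(documentDF)

// nearest neighbors of a word in the learned embedding space
model.findSynonyms("spark", 5).show()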

Feature Selection

Statistical methods for selecting the most relevant features for machine learning models.

/**
 * Univariate feature selection using statistical tests
 */
class UnivariateFeatureSelector extends Estimator[UnivariateFeatureSelectorModel] {
  def setFeatureType(value: String): this.type
  def setLabelType(value: String): this.type
  def setSelectionMode(value: String): this.type
  def setSelectionThreshold(value: Double): this.type
  def setFeaturesCol(value: String): this.type
  def setLabelCol(value: String): this.type
  def setOutputCol(value: String): this.type
}

class UnivariateFeatureSelectorModel extends Model[UnivariateFeatureSelectorModel] with UnivariateFeatureSelectorParams {
  def selectedFeatures: Array[Int]
}

/**
 * Feature selection based on variance threshold
 */
class VarianceThresholdSelector extends Estimator[VarianceThresholdSelectorModel] {
  def setVarianceThreshold(value: Double): this.type
  def setFeaturesCol(value: String): this.type
  def setOutputCol(value: String): this.type
}

class VarianceThresholdSelectorModel extends Model[VarianceThresholdSelectorModel] with VarianceThresholdSelectorParams {
  def selectedFeatures: Array[Int]
}

/**
 * Chi-square feature selection for categorical data
 */
class ChiSqSelector extends Estimator[ChiSqSelectorModel] {
  def setNumTopFeatures(value: Int): this.type
  def setPercentile(value: Double): this.type
  def setFpr(value: Double): this.type
  def setFdr(value: Double): this.type
  def setFwe(value: Double): this.type
  def setSelectorType(value: String): this.type
  def setFeaturesCol(value: String): this.type
  def setLabelCol(value: String): this.type
  def setOutputCol(value: String): this.type
}

class ChiSqSelectorModel extends Model[ChiSqSelectorModel] with ChiSqSelectorParams {
  def selectedFeatures: Array[Int]
}
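Usage Example (a minimal sketch using UnivariateFeatureSelector, assuming a DataFrame `df` with "features" and "label" columns):

import org.apache.spark.ml.feature.UnivariateFeatureSelector

val selector = new UnivariateFeatureSelector()
  .setFeatureType("continuous")
  .setLabelType("categorical") // continuous features + categorical label selects the ANOVA F-test
  .setSelectionMode("numTopFeatures")
  .setSelectionThreshold(10)
  .setFeaturesCol("features")
  .setLabelCol("label")
  .setOutputCol("selectedFeatures")

val selectorModel = selector.fit(df)
println(s"Selected feature indices: ${selectorModel.selectedFeatures.mkString(", ")}")
val reduced = selectorModel.transform(df)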

Dimensionality Reduction

Algorithms for reducing the number of features while preserving important information.

/**
 * Principal Component Analysis for dimensionality reduction
 */
class PCA extends Estimator[PCAModel] {
  def setInputCol(value: String): this.type
  def setOutputCol(value: String): this.type
  def setK(value: Int): this.type
}

class PCAModel extends Model[PCAModel] with PCAParams {
  def pc: Matrix
  def explainedVariance: Vector
}
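Usage Example (a minimal sketch, assuming a DataFrame `df` with a vector column "features"):

import org.apache.spark.ml.feature.PCA

val pca = new PCA()
  .setInputCol("features")
  .setOutputCol("pcaFeatures")
  .setK(3) // project onto the top 3 principal components

val pcaModel = pca.fit(df)
println(s"Explained variance: ${pcaModel.explainedVariance}")
val projected = pcaModel.transform(df)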

Missing Value Handling

Utilities for handling missing or invalid data values.

/**
 * Imputes missing values using mean, median, or mode
 */
class Imputer extends Estimator[ImputerModel] {
  def setInputCols(value: Array[String]): this.type
  def setOutputCols(value: Array[String]): this.type
  def setStrategy(value: String): this.type
  def setMissingValue(value: Double): this.type
  def setRelativeError(value: Double): this.type
}

class ImputerModel extends Model[ImputerModel] with ImputerParams {
  def surrogateDF: DataFrame
}
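Usage Example (a minimal sketch, assuming a DataFrame `df` with numeric columns "a" and "b"; by default Double.NaN is treated as the missing value):

import org.apache.spark.ml.feature.Imputer

val imputer = new Imputer()
  .setInputCols(Array("a", "b"))
  .setOutputCols(Array("a_imputed", "b_imputed"))
  .setStrategy("median") // "mean" (default), "median", or "mode"

val imputerModel = imputer.fit(df)
imputerModel.transform(df).show()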

Advanced Transformations

Specialized transformations for complex feature engineering tasks.

/**
 * Applies SQL transformation to DataFrames
 */
class SQLTransformer extends Transformer {
  def setStatement(value: String): this.type
}
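The statement references its input DataFrame through the placeholder __THIS__. A minimal sketch (the DataFrame `df` and columns v1 and v2 are illustrative):

import org.apache.spark.ml.feature.SQLTransformer

val sqlTrans = new SQLTransformer()
  .setStatement("SELECT *, (v1 + v2) AS v3, (v1 * v2) AS v4 FROM __THIS__")

sqlTrans.transform(df).show()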

/**
 * Quantile-based discretization of continuous features
 */
class QuantileDiscretizer extends Estimator[Bucketizer] {
  def setInputCol(value: String): this.type
  def setOutputCol(value: String): this.type
  def setNumBuckets(value: Int): this.type
  def setRelativeError(value: Double): this.type
  def setHandleInvalid(value: String): this.type
}

Note: QuantileDiscretizer has no dedicated model class; fit returns a Bucketizer (below) whose learned splits are readable via getSplits.

/**
 * Maps continuous features to categorical buckets
 */
class Bucketizer extends Model[Bucketizer] {
  def setInputCol(value: String): this.type
  def setOutputCol(value: String): this.type
  def setSplits(value: Array[Double]): this.type
  def setHandleInvalid(value: String): this.type
}
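Usage Example (a minimal sketch contrasting the two approaches, assuming a DataFrame `df` with a numeric "hour" column):

import org.apache.spark.ml.feature.{QuantileDiscretizer, Bucketizer}

// learned boundaries: fit returns a Bucketizer with data-driven splits
val discretizer = new QuantileDiscretizer()
  .setInputCol("hour")
  .setOutputCol("bucket")
  .setNumBuckets(3)
val fittedBucketizer = discretizer.fit(df)
println(s"Learned splits: ${fittedBucketizer.getSplits.mkString(", ")}")

// fixed boundaries: a Bucketizer applies user-supplied splits directly
val bucketizer = new Bucketizer()
  .setInputCol("hour")
  .setOutputCol("bucket")
  .setSplits(Array(Double.NegativeInfinity, 8.0, 16.0, Double.PositiveInfinity))
bucketizer.transform(df).show()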

/**
 * Feature hashing for high-dimensional categorical data
 */
class FeatureHasher extends Transformer {
  def setInputCols(value: Array[String]): this.type
  def setOutputCol(value: String): this.type
  def setNumFeatures(value: Int): this.type
  def setCategoricalCols(value: Array[String]): this.type
}

/**
 * Creates interaction features between input columns
 */
class Interaction extends Transformer {
  def setInputCols(value: Array[String]): this.type
  def setOutputCol(value: String): this.type
}

/**
 * Expands features to polynomial space
 */
class PolynomialExpansion extends Transformer {
  def setInputCol(value: String): this.type
  def setOutputCol(value: String): this.type
  def setDegree(value: Int): this.type
}
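Usage Example (a minimal PolynomialExpansion sketch, assuming a DataFrame `df` with a vector column "features"):

import org.apache.spark.ml.feature.PolynomialExpansion

val polyExpansion = new PolynomialExpansion()
  .setInputCol("features")
  .setOutputCol("polyFeatures")
  .setDegree(2) // e.g. (x, y) expands to (x, x^2, y, x*y, y^2)

val polyDF = polyExpansion.transform(df)
polyDF.select("polyFeatures").show(truncate = false)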

Types

// Feature engineering imports
import org.apache.spark.ml.feature._
import org.apache.spark.ml.linalg.{Vector, Matrix}
import org.apache.spark.sql.{DataFrame, Dataset}

// Parameter traits
import org.apache.spark.ml.param.shared._

// Text processing utilities
import org.apache.spark.ml.feature.{Tokenizer, RegexTokenizer, StopWordsRemover}

// Vector utilities
import org.apache.spark.ml.feature.{VectorAssembler, VectorSlicer, VectorIndexer}

// Scaling utilities
import org.apache.spark.ml.feature.{StandardScaler, MinMaxScaler, MaxAbsScaler, Normalizer, RobustScaler}