or run

npx @tessl/cli init
Log in

Version

Tile

Overview

Evals

Files

docs

classification.md · clustering.md · evaluation-tuning.md · feature-engineering.md · index.md · linear-algebra.md · pipeline-components.md · recommendation.md · regression.md
tile.json

docs/classification.md

Classification

Supervised learning algorithms for predicting categorical outcomes, including binary and multiclass classification with probabilistic predictions and comprehensive model evaluation.

Capabilities

Logistic Regression

Linear classification algorithm using the logistic function for binary and multiclass problems, with L1/L2 regularization support.

/**
 * Logistic regression classifier with regularization support.
 *
 * Declared as a `ProbabilisticClassifier` (a subtype of `Classifier`), which
 * agrees with the probability-related members listed for it on this page
 * (`setThresholds`, `LogisticRegressionSummary.probabilityCol`).
 * Every setter returns `this.type`, so calls can be chained builder-style.
 */
class LogisticRegression extends ProbabilisticClassifier[Vector, LogisticRegression, LogisticRegressionModel] {
  def setMaxIter(value: Int): this.type
  def setRegParam(value: Double): this.type
  def setElasticNetParam(value: Double): this.type
  def setTol(value: Double): this.type
  def setFitIntercept(value: Boolean): this.type
  def setStandardization(value: Boolean): this.type
  // A single threshold (Double) or an array of thresholds can be supplied.
  def setThreshold(value: Double): this.type
  def setThresholds(value: Array[Double]): this.type
  def setWeightCol(value: String): this.type
  def setAggregationDepth(value: Int): this.type
  // The usage example on this page passes "binomial" here.
  def setFamily(value: String): this.type
  // Bounds on coefficients are given as a Matrix, bounds on intercepts as a Vector.
  def setLowerBoundsOnCoefficients(value: Matrix): this.type
  def setUpperBoundsOnCoefficients(value: Matrix): this.type
  def setLowerBoundsOnIntercepts(value: Vector): this.type
  def setUpperBoundsOnIntercepts(value: Vector): this.type
  def setMaxBlockSizeInMB(value: Double): this.type
  // NOTE(review): confirm this setter is publicly accessible in the targeted Spark version.
  def setInitialModel(model: LogisticRegressionModel): this.type
}

/**
 * Model returned by `LogisticRegression.fit`.
 *
 * Exposes the fitted parameters both as `coefficients` / `intercept`
 * (Vector / Double) and as `coefficientMatrix` / `interceptVector`
 * (Matrix / Vector). The usage example on this page reads the former from a
 * model trained with the "binomial" family.
 * NOTE(review): confirm which accessors are valid for multinomial models.
 */
class LogisticRegressionModel extends ProbabilisticClassificationModel[Vector, LogisticRegressionModel] with LogisticRegressionParams {
  def coefficients: Vector
  def intercept: Double
  def coefficientMatrix: Matrix
  def interceptVector: Vector
  // Training summary; `hasSummary` reports whether one is available.
  def summary: LogisticRegressionTrainingSummary
  def hasSummary: Boolean
  // Produces a summary for the supplied dataset.
  def evaluate(dataset: Dataset[_]): LogisticRegressionSummary
}

/**
 * Summary type returned by `LogisticRegressionModel.evaluate`.
 *
 * Carries a `predictions` DataFrame plus four String members that, judging by
 * their names, identify the probability, label, features and prediction columns.
 * NOTE(review): upstream Spark appears to declare this as a trait rather than
 * a concrete class — confirm before documenting it as instantiable.
 */
class LogisticRegressionSummary {
  def predictions: DataFrame
  def probabilityCol: String
  def labelCol: String
  def featuresCol: String
  def predictionCol: String
}

/**
 * Extension of `LogisticRegressionSummary` that adds binary-classification
 * metrics: two scalar areas (`areaUnderROC`, `areaUnderPR`) as Doubles, and
 * the ROC / PR curves and per-threshold metrics as DataFrames.
 * NOTE(review): the column layout of the returned DataFrames is not shown on
 * this page — consult the Spark API documentation.
 */
class BinaryLogisticRegressionSummary extends LogisticRegressionSummary {
  def areaUnderROC: Double
  def roc: DataFrame
  def areaUnderPR: Double
  def pr: DataFrame
  def fMeasureByThreshold: DataFrame
  def precisionByThreshold: DataFrame
  def recallByThreshold: DataFrame
}

Usage Example:

import org.apache.spark.ml.classification.LogisticRegression

// Configure the estimator; each setter returns the estimator itself, so the calls chain.
val lr = new LogisticRegression()
  .setMaxIter(20)
  .setRegParam(0.3)
  .setElasticNetParam(0.8)
  .setFamily("binomial")

// `trainingData` and `testData` are not defined in this snippet;
// they are assumed to be datasets prepared by the caller.
val lrModel = lr.fit(trainingData)
val predictions = lrModel.transform(testData)

// Access model coefficients
println(s"Coefficients: ${lrModel.coefficients}")
println(s"Intercept: ${lrModel.intercept}")

// Get training summary
val trainingSummary = lrModel.summary
println(s"Number of iterations: ${trainingSummary.totalIterations}")

Decision Tree Classifier

Tree-based classifier using recursive binary splits with support for categorical and continuous features.

/**
 * Decision tree classifier with configurable splitting criteria.
 *
 * Declared as a `ProbabilisticClassifier` (a subtype of `Classifier`) to match
 * the upstream Spark declaration. Every setter returns `this.type`, so calls
 * can be chained builder-style.
 */
class DecisionTreeClassifier extends ProbabilisticClassifier[Vector, DecisionTreeClassifier, DecisionTreeClassificationModel] {
  def setMaxDepth(value: Int): this.type
  def setMaxBins(value: Int): this.type
  def setMinInstancesPerNode(value: Int): this.type
  def setMinInfoGain(value: Double): this.type
  def setMaxMemoryInMB(value: Int): this.type
  def setCacheNodeIds(value: Boolean): this.type
  def setCheckpointInterval(value: Int): this.type
  // The usage example on this page passes "gini" here.
  def setImpurity(value: String): this.type
  def setSeed(value: Long): this.type
}

/**
 * Model returned by `DecisionTreeClassifier.fit`.
 *
 * Gives access to the learned tree through `rootNode`, its `depth` and
 * `numNodes`, a printable dump (`toDebugString`) and a `featureImportances`
 * vector.
 */
class DecisionTreeClassificationModel extends ProbabilisticClassificationModel[Vector, DecisionTreeClassificationModel] with DecisionTreeClassifierParams {
  def rootNode: Node
  def depth: Int
  def numNodes: Int
  def toDebugString: String
  def featureImportances: Vector
}

/**
 * Abstract, serializable node of a learned decision tree, as returned by
 * `DecisionTreeClassificationModel.rootNode`.
 *
 * NOTE(review): `impurityStats` and `deepCopy()` may be internal (non-public)
 * members in upstream Spark — confirm before relying on them from user code.
 */
abstract class Node extends Serializable {
  def prediction: Double
  def impurity: Double
  def impurityStats: ImpurityStats
  // Presumably distinguishes leaf nodes from internal (split) nodes.
  def isLeaf: Boolean
  def deepCopy(): Node
}

Usage Example:

import org.apache.spark.ml.classification.DecisionTreeClassifier

// "indexedLabel" / "indexedFeatures" are column names expected to exist in the
// input data; this snippet does not show how they are produced.
val dt = new DecisionTreeClassifier()
  .setLabelCol("indexedLabel")
  .setFeaturesCol("indexedFeatures")
  .setMaxDepth(5)
  .setMaxBins(32)
  .setMinInstancesPerNode(1)
  .setMinInfoGain(0.0)
  .setImpurity("gini")

// `trainingData` and `testData` are assumed to be defined by the caller.
val dtModel = dt.fit(trainingData)
val predictions = dtModel.transform(testData)

// Print the learned classification tree model
println(s"Learned classification tree model:\n ${dtModel.toDebugString}")

// Get feature importances
println(s"Feature importances: ${dtModel.featureImportances}")

Random Forest Classifier

Ensemble method combining multiple decision trees with bootstrap aggregating and random feature selection.

/**
 * Random Forest classifier using an ensemble of decision trees.
 *
 * Declared as a `ProbabilisticClassifier` (a subtype of `Classifier`) to match
 * the upstream Spark declaration. Every setter returns `this.type`, so calls
 * can be chained builder-style.
 */
class RandomForestClassifier extends ProbabilisticClassifier[Vector, RandomForestClassifier, RandomForestClassificationModel] {
  def setNumTrees(value: Int): this.type
  def setMaxDepth(value: Int): this.type
  def setMaxBins(value: Int): this.type
  def setMinInstancesPerNode(value: Int): this.type
  def setMinInfoGain(value: Double): this.type
  def setMaxMemoryInMB(value: Int): this.type
  def setCacheNodeIds(value: Boolean): this.type
  def setCheckpointInterval(value: Int): this.type
  def setImpurity(value: String): this.type
  def setSubsamplingRate(value: Double): this.type
  def setSeed(value: Long): this.type
  // The usage example on this page passes "auto" here.
  def setFeatureSubsetStrategy(value: String): this.type
}

/**
 * Model returned by `RandomForestClassifier.fit`.
 *
 * `trees` holds the individual `DecisionTreeClassificationModel`s and
 * `treeWeights` a Double per tree; the remaining members report size and
 * diagnostic information.
 */
class RandomForestClassificationModel extends ProbabilisticClassificationModel[Vector, RandomForestClassificationModel] with RandomForestClassifierParams {
  def trees: Array[DecisionTreeClassificationModel]
  def treeWeights: Array[Double]
  def numFeatures: Int
  def totalNumNodes: Int
  def toDebugString: String
  def featureImportances: Vector
}

Usage Example:

import org.apache.spark.ml.classification.RandomForestClassifier

// "indexedLabel" / "indexedFeatures" are column names expected to exist in the
// input data; this snippet does not show how they are produced.
val rf = new RandomForestClassifier()
  .setLabelCol("indexedLabel")
  .setFeaturesCol("indexedFeatures")
  .setNumTrees(20)
  .setMaxDepth(5)
  .setMaxBins(32)
  .setFeatureSubsetStrategy("auto")

// `trainingData` and `testData` are assumed to be defined by the caller.
val rfModel = rf.fit(trainingData)
val predictions = rfModel.transform(testData)

// Print feature importances
println(s"Feature importances: ${rfModel.featureImportances}")

// Access individual trees
println(s"Number of trees: ${rfModel.trees.length}")

Gradient Boosted Tree Classifier

Ensemble method that builds models sequentially where each new model corrects errors from previous models.

/**
 * Gradient-boosted tree classifier for binary classification.
 *
 * Declared as a `ProbabilisticClassifier` (a subtype of `Classifier`) to match
 * the upstream Spark declaration. Every setter returns `this.type`, so calls
 * can be chained builder-style.
 */
class GBTClassifier extends ProbabilisticClassifier[Vector, GBTClassifier, GBTClassificationModel] {
  def setLossType(value: String): this.type
  def setMaxIter(value: Int): this.type
  def setStepSize(value: Double): this.type
  def setMaxDepth(value: Int): this.type
  def setMaxBins(value: Int): this.type
  def setMinInstancesPerNode(value: Int): this.type
  def setMinInfoGain(value: Double): this.type
  def setMaxMemoryInMB(value: Int): this.type
  def setCacheNodeIds(value: Boolean): this.type
  def setCheckpointInterval(value: Int): this.type
  def setImpurity(value: String): this.type
  def setSubsamplingRate(value: Double): this.type
  def setSeed(value: Long): this.type
  def setFeatureSubsetStrategy(value: String): this.type
  // Validation controls: a tolerance and the name of an indicator column.
  def setValidationTol(value: Double): this.type
  def setValidationIndicatorCol(value: String): this.type
}

/**
 * Model returned by `GBTClassifier.fit`.
 *
 * Note that the member trees are typed as `DecisionTreeRegressionModel`,
 * not as classification models; `treeWeights` holds a Double per tree.
 */
class GBTClassificationModel extends ProbabilisticClassificationModel[Vector, GBTClassificationModel] with GBTClassifierParams {
  def trees: Array[DecisionTreeRegressionModel]
  def treeWeights: Array[Double]
  def numFeatures: Int
  def totalNumNodes: Int
  def toDebugString: String
  def featureImportances: Vector
}

Support Vector Machine

Linear support vector classifier with L2 regularization for binary classification problems.

/**
 * Linear Support Vector Machine classifier.
 *
 * Extends the plain `Classifier` base and exposes a single Double threshold
 * setter (`setThreshold`). Every setter returns `this.type`, so calls can be
 * chained builder-style.
 */
class LinearSVC extends Classifier[Vector, LinearSVC, LinearSVCModel] {
  def setRegParam(value: Double): this.type
  def setMaxIter(value: Int): this.type
  def setTol(value: Double): this.type
  def setFitIntercept(value: Boolean): this.type
  def setStandardization(value: Boolean): this.type
  def setThreshold(value: Double): this.type
  def setWeightCol(value: String): this.type
  def setAggregationDepth(value: Int): this.type
}

/**
 * Model type paired with `LinearSVC`; exposes the fitted linear parameters
 * as a coefficient Vector and a scalar intercept.
 */
class LinearSVCModel extends ClassificationModel[Vector, LinearSVCModel] with LinearSVCParams {
  def coefficients: Vector
  def intercept: Double
}

Naive Bayes

Probabilistic classifier based on Bayes' theorem with a naive independence assumption between features.

/**
 * Naive Bayes classifier with multiple model types.
 *
 * Declared as a `ProbabilisticClassifier` (a subtype of `Classifier`), which
 * agrees with this page describing it as a probabilistic classifier and with
 * the `setThresholds` member. Every setter returns `this.type`, so calls can
 * be chained builder-style.
 */
class NaiveBayes extends ProbabilisticClassifier[Vector, NaiveBayes, NaiveBayesModel] {
  // Selects the model type by name; the accepted names are not listed on this page.
  def setModelType(value: String): this.type
  def setSmoothing(value: Double): this.type
  def setThresholds(value: Array[Double]): this.type
  def setWeightCol(value: String): this.type
}

/**
 * Model returned by `NaiveBayes.fit`.
 *
 * Exposes the fitted parameters as `pi` (Vector), `theta` (Matrix) and
 * `sigma` (Matrix), along with the feature and class counts.
 * NOTE(review): the precise meaning of `pi` / `theta` / `sigma` is not stated
 * on this page — consult the Spark API documentation.
 */
class NaiveBayesModel extends ProbabilisticClassificationModel[Vector, NaiveBayesModel] with NaiveBayesParams {
  def pi: Vector
  def theta: Matrix
  def sigma: Matrix
  def numFeatures: Int
  def numClasses: Int
}

Neural Network Classifier

Multilayer perceptron classifier using backpropagation for training feed-forward neural networks.

/**
 * Multilayer perceptron classifier.
 *
 * Declared as a `ProbabilisticClassifier` (a subtype of `Classifier`) to match
 * the upstream Spark declaration. Every setter returns `this.type`, so calls
 * can be chained builder-style.
 */
class MultilayerPerceptronClassifier extends ProbabilisticClassifier[Vector, MultilayerPerceptronClassifier, MultilayerPerceptronClassificationModel] {
  // Network layout as an array of Ints (presumably one size per layer).
  def setLayers(value: Array[Int]): this.type
  def setBlockSize(value: Int): this.type
  def setSolver(value: String): this.type
  def setMaxIter(value: Int): this.type
  def setTol(value: Double): this.type
  def setSeed(value: Long): this.type
  def setInitialWeights(value: Vector): this.type
  def setStepSize(value: Double): this.type
}

/**
 * Model returned by `MultilayerPerceptronClassifier.fit`; exposes the layer
 * array and the learned weights as a single Vector.
 * NOTE(review): confirm the name of the params trait mixed in below against
 * the targeted Spark version.
 */
class MultilayerPerceptronClassificationModel extends ProbabilisticClassificationModel[Vector, MultilayerPerceptronClassificationModel] with MultilayerPerceptronClassifierParams {
  def layers: Array[Int]
  def weights: Vector
}

One-vs-Rest Strategy

Meta-algorithm that enables binary classifiers to handle multiclass problems by training one classifier per class.

/**
 * One-vs-Rest multiclass classification strategy.
 *
 * An `Estimator` (not a `Classifier`) that wraps the base classifier supplied
 * through `setClassifier`. Because it does not inherit from `Classifier`, it
 * declares its own column setters. Every setter returns `this.type`, so calls
 * can be chained builder-style.
 */
class OneVsRest extends Estimator[OneVsRestModel] with OneVsRestParams {
  // Accepts any Classifier, regardless of its type parameters.
  def setClassifier(value: Classifier[_, _, _]): this.type
  def setLabelCol(value: String): this.type
  def setFeaturesCol(value: String): this.type
  def setPredictionCol(value: String): this.type
  def setRawPredictionCol(value: String): this.type
  def setParallelism(value: Int): this.type
  def setWeightCol(value: String): this.type
}

/**
 * Model type paired with `OneVsRest`.
 *
 * `models` holds the underlying classification models (presumably one per
 * class, per the one-classifier-per-class description on this page);
 * `labelMetadata` carries metadata about the label column.
 */
class OneVsRestModel extends Model[OneVsRestModel] with OneVsRestParams {
  def models: Array[_ <: ClassificationModel[_, _]]
  def labelMetadata: Metadata
}

Factorization Machine Classifier

Factorization machine for classification tasks modeling feature interactions efficiently.

/**
 * Factorization Machine classifier for binary classification.
 *
 * Declared as a `ProbabilisticClassifier` (a subtype of `Classifier`) to match
 * the upstream Spark declaration. Every setter returns `this.type`, so calls
 * can be chained builder-style.
 */
class FMClassifier extends ProbabilisticClassifier[Vector, FMClassifier, FMClassificationModel] {
  def setFactorSize(value: Int): this.type
  // Boolean switches for fitting the intercept and the linear term.
  def setFitIntercept(value: Boolean): this.type
  def setFitLinear(value: Boolean): this.type
  def setRegParam(value: Double): this.type
  def setMiniBatchFraction(value: Double): this.type
  def setInitStd(value: Double): this.type
  def setMaxIter(value: Int): this.type
  def setStepSize(value: Double): this.type
  def setTol(value: Double): this.type
  def setSolver(value: String): this.type
  def setThreshold(value: Double): this.type
  def setSeed(value: Long): this.type
}

/**
 * Model returned by `FMClassifier.fit`; exposes the fitted parameters as a
 * scalar `intercept`, a `linear` Vector and a `factors` Matrix.
 */
class FMClassificationModel extends ProbabilisticClassificationModel[Vector, FMClassificationModel] with FMClassifierParams {
  def intercept: Double
  def linear: Vector
  def factors: Matrix
}

Shared Classification Components

Base Classes and Traits

/**
 * Base classifier abstraction.
 *
 * @tparam FeaturesType type of the feature values the classifier consumes
 * @tparam E the concrete classifier type itself (self-referential bound)
 * @tparam M the model type produced by `fit`; must be a `ClassificationModel`
 *           over the same `FeaturesType`
 */
abstract class Classifier[
  FeaturesType,
  E <: Classifier[FeaturesType, E, M],
  M <: ClassificationModel[FeaturesType, M]
] extends Estimator[M] with ClassifierParams {
  // Trains on the given dataset and returns the fitted model of type M.
  def fit(dataset: Dataset[_]): M
}

/**
 * Base classification model.
 *
 * Mixes in `ClassifierParams`, the same params trait used by the `Classifier`
 * estimator it is paired with.
 *
 * @tparam FeaturesType type of the feature values the model consumes
 * @tparam M the concrete model type itself (self-referential bound)
 */
abstract class ClassificationModel[FeaturesType, M <: ClassificationModel[FeaturesType, M]]
  extends Model[M] with ClassifierParams {
  def numClasses: Int
  // Raw prediction for one feature value, returned as a Vector.
  def predictRaw(features: FeaturesType): Vector
  def rawPredictionCol: String
}

/**
 * Probabilistic classifier with probability predictions.
 *
 * A `Classifier` whose model type is additionally required to be a
 * `ProbabilisticClassificationModel`.
 *
 * @tparam FeaturesType type of the feature values the classifier consumes
 * @tparam E the concrete classifier type itself (self-referential bound)
 * @tparam M the probabilistic model type produced by fitting
 */
abstract class ProbabilisticClassifier[
  FeaturesType,
  E <: ProbabilisticClassifier[FeaturesType, E, M],
  M <: ProbabilisticClassificationModel[FeaturesType, M]
] extends Classifier[FeaturesType, E, M] with ProbabilisticClassifierParams

/**
 * Probabilistic classification model.
 *
 * A `ClassificationModel` that additionally exposes `predictProbability`
 * and the `probabilityCol` member.
 *
 * @tparam FeaturesType type of the feature values the model consumes
 * @tparam M the concrete model type itself (self-referential bound)
 */
abstract class ProbabilisticClassificationModel[FeaturesType, M <: ProbabilisticClassificationModel[FeaturesType, M]]
  extends ClassificationModel[FeaturesType, M] with ProbabilisticClassifierParams {
  // Probability prediction for one feature value, returned as a Vector.
  def predictProbability(features: FeaturesType): Vector
  def probabilityCol: String
}

Types

// Classification-specific imports
import org.apache.spark.ml.classification._
import org.apache.spark.ml.linalg.{Vector, Matrix}
import org.apache.spark.sql.{DataFrame, Dataset}

// Parameter traits
import org.apache.spark.ml.param.shared._

// Model summary types
// NOTE(review): MulticlassLogisticRegressionSummary is not declared anywhere on
// this page — confirm that the type exists in the targeted Spark version.
import org.apache.spark.ml.classification.{
  LogisticRegressionSummary,
  BinaryLogisticRegressionSummary,
  MulticlassLogisticRegressionSummary
}

// Tree model components
// NOTE(review): upstream Spark appears to keep the impurity classes under
// org.apache.spark.mllib.tree.impurity — confirm the package path below.
import org.apache.spark.ml.tree.{Node, InternalNode, LeafNode}
import org.apache.spark.ml.tree.impurity.{Gini, Entropy, ImpurityStats}