Supervised learning algorithms for predicting categorical outcomes, including binary and multiclass classification with probabilistic predictions and comprehensive model evaluation.
Linear classification algorithm that applies the logistic function to binary and multiclass problems, with support for L1, L2, and elastic-net regularization.
/**
* Logistic regression classifier with regularization support
*/
class LogisticRegression extends ProbabilisticClassifier[Vector, LogisticRegression, LogisticRegressionModel] {
def setMaxIter(value: Int): this.type
def setRegParam(value: Double): this.type
def setElasticNetParam(value: Double): this.type
def setTol(value: Double): this.type
def setFitIntercept(value: Boolean): this.type
def setStandardization(value: Boolean): this.type
def setThreshold(value: Double): this.type
def setThresholds(value: Array[Double]): this.type
def setWeightCol(value: String): this.type
def setAggregationDepth(value: Int): this.type
def setFamily(value: String): this.type
def setLowerBoundsOnCoefficients(value: Matrix): this.type
def setUpperBoundsOnCoefficients(value: Matrix): this.type
def setLowerBoundsOnIntercepts(value: Vector): this.type
def setUpperBoundsOnIntercepts(value: Vector): this.type
def setMaxBlockSizeInMB(value: Double): this.type
def setInitialModel(model: LogisticRegressionModel): this.type
}
class LogisticRegressionModel extends ProbabilisticClassificationModel[Vector, LogisticRegressionModel] with LogisticRegressionParams {
def coefficients: Vector
def intercept: Double
def coefficientMatrix: Matrix
def interceptVector: Vector
def summary: LogisticRegressionTrainingSummary
def hasSummary: Boolean
def evaluate(dataset: Dataset[_]): LogisticRegressionSummary
}
trait LogisticRegressionSummary {
def predictions: DataFrame
def probabilityCol: String
def labelCol: String
def featuresCol: String
def predictionCol: String
}
trait BinaryLogisticRegressionSummary extends LogisticRegressionSummary {
def areaUnderROC: Double
def roc: DataFrame
def areaUnderPR: Double
def pr: DataFrame
def fMeasureByThreshold: DataFrame
def precisionByThreshold: DataFrame
def recallByThreshold: DataFrame
}
Usage Example:
import org.apache.spark.ml.classification.LogisticRegression
val lr = new LogisticRegression()
.setMaxIter(20)
.setRegParam(0.3)
.setElasticNetParam(0.8)
.setFamily("binomial")
val lrModel = lr.fit(trainingData)
val predictions = lrModel.transform(testData)
// Access model coefficients
println(s"Coefficients: ${lrModel.coefficients}")
println(s"Intercept: ${lrModel.intercept}")
// Get training summary
val trainingSummary = lrModel.summary
println(s"Number of iterations: ${trainingSummary.totalIterations}")Tree-based classifier using recursive binary splits with support for categorical and continuous features.
/**
* Decision tree classifier with configurable splitting criteria
*/
class DecisionTreeClassifier extends ProbabilisticClassifier[Vector, DecisionTreeClassifier, DecisionTreeClassificationModel] {
def setMaxDepth(value: Int): this.type
def setMaxBins(value: Int): this.type
def setMinInstancesPerNode(value: Int): this.type
def setMinInfoGain(value: Double): this.type
def setMaxMemoryInMB(value: Int): this.type
def setCacheNodeIds(value: Boolean): this.type
def setCheckpointInterval(value: Int): this.type
def setImpurity(value: String): this.type
def setSeed(value: Long): this.type
}
class DecisionTreeClassificationModel extends ProbabilisticClassificationModel[Vector, DecisionTreeClassificationModel] with DecisionTreeClassifierParams {
def rootNode: Node
def depth: Int
def numNodes: Int
def toDebugString: String
def featureImportances: Vector
}
abstract class Node extends Serializable {
def prediction: Double
def impurity: Double
def impurityStats: ImpurityStats
def isLeaf: Boolean
def deepCopy(): Node
}
Usage Example:
import org.apache.spark.ml.classification.DecisionTreeClassifier
val dt = new DecisionTreeClassifier()
.setLabelCol("indexedLabel")
.setFeaturesCol("indexedFeatures")
.setMaxDepth(5)
.setMaxBins(32)
.setMinInstancesPerNode(1)
.setMinInfoGain(0.0)
.setImpurity("gini")
val dtModel = dt.fit(trainingData)
val predictions = dtModel.transform(testData)
// Print the learned classification tree model
println(s"Learned classification tree model:\n ${dtModel.toDebugString}")
// Get feature importances
println(s"Feature importances: ${dtModel.featureImportances}")Ensemble method combining multiple decision trees with bootstrap aggregating and random feature selection.
/**
* Random Forest classifier using ensemble of decision trees
*/
class RandomForestClassifier extends ProbabilisticClassifier[Vector, RandomForestClassifier, RandomForestClassificationModel] {
def setNumTrees(value: Int): this.type
def setMaxDepth(value: Int): this.type
def setMaxBins(value: Int): this.type
def setMinInstancesPerNode(value: Int): this.type
def setMinInfoGain(value: Double): this.type
def setMaxMemoryInMB(value: Int): this.type
def setCacheNodeIds(value: Boolean): this.type
def setCheckpointInterval(value: Int): this.type
def setImpurity(value: String): this.type
def setSubsamplingRate(value: Double): this.type
def setSeed(value: Long): this.type
def setFeatureSubsetStrategy(value: String): this.type
}
class RandomForestClassificationModel extends ProbabilisticClassificationModel[Vector, RandomForestClassificationModel] with RandomForestClassifierParams {
def trees: Array[DecisionTreeClassificationModel]
def treeWeights: Array[Double]
def numFeatures: Int
def totalNumNodes: Int
def toDebugString: String
def featureImportances: Vector
}
Usage Example:
import org.apache.spark.ml.classification.RandomForestClassifier
val rf = new RandomForestClassifier()
.setLabelCol("indexedLabel")
.setFeaturesCol("indexedFeatures")
.setNumTrees(20)
.setMaxDepth(5)
.setMaxBins(32)
.setFeatureSubsetStrategy("auto")
val rfModel = rf.fit(trainingData)
val predictions = rfModel.transform(testData)
// Print feature importances
println(s"Feature importances: ${rfModel.featureImportances}")
// Access individual trees
println(s"Number of trees: ${rfModel.trees.length}")Ensemble method that builds models sequentially where each new model corrects errors from previous models.
/**
* Gradient-boosted tree classifier for binary classification
*/
class GBTClassifier extends ProbabilisticClassifier[Vector, GBTClassifier, GBTClassificationModel] {
def setLossType(value: String): this.type
def setMaxIter(value: Int): this.type
def setStepSize(value: Double): this.type
def setMaxDepth(value: Int): this.type
def setMaxBins(value: Int): this.type
def setMinInstancesPerNode(value: Int): this.type
def setMinInfoGain(value: Double): this.type
def setMaxMemoryInMB(value: Int): this.type
def setCacheNodeIds(value: Boolean): this.type
def setCheckpointInterval(value: Int): this.type
def setImpurity(value: String): this.type
def setSubsamplingRate(value: Double): this.type
def setSeed(value: Long): this.type
def setFeatureSubsetStrategy(value: String): this.type
def setValidationTol(value: Double): this.type
def setValidationIndicatorCol(value: String): this.type
}
class GBTClassificationModel extends ProbabilisticClassificationModel[Vector, GBTClassificationModel] with GBTClassifierParams {
def trees: Array[DecisionTreeRegressionModel]
def treeWeights: Array[Double]
def numFeatures: Int
def totalNumNodes: Int
def toDebugString: String
def featureImportances: Vector
}
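Usage Example (a minimal sketch, assuming the same trainingData and testData DataFrames as in the earlier examples):
import org.apache.spark.ml.classification.GBTClassifier
val gbt = new GBTClassifier()
  .setLabelCol("indexedLabel")
  .setFeaturesCol("indexedFeatures")
  .setLossType("logistic")
  .setMaxIter(10)
  .setStepSize(0.1)
val gbtModel = gbt.fit(trainingData)
val predictions = gbtModel.transform(testData)
// GBT builds its ensemble from regression trees, hence trees: Array[DecisionTreeRegressionModel]
println(s"Feature importances: ${gbtModel.featureImportances}")
Linear support vector classifier with L2 regularization for binary classification problems.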
/**
* Linear Support Vector Machine classifier
*/
class LinearSVC extends Classifier[Vector, LinearSVC, LinearSVCModel] {
def setRegParam(value: Double): this.type
def setMaxIter(value: Int): this.type
def setTol(value: Double): this.type
def setFitIntercept(value: Boolean): this.type
def setStandardization(value: Boolean): this.type
def setThreshold(value: Double): this.type
def setWeightCol(value: String): this.type
def setAggregationDepth(value: Int): this.type
}
class LinearSVCModel extends ClassificationModel[Vector, LinearSVCModel] with LinearSVCParams {
def coefficients: Vector
def intercept: Double
}
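Usage Example (a minimal sketch, assuming the same trainingData DataFrame as in the earlier examples):
import org.apache.spark.ml.classification.LinearSVC
val lsvc = new LinearSVC()
  .setMaxIter(10)
  .setRegParam(0.1)
val lsvcModel = lsvc.fit(trainingData)
println(s"Coefficients: ${lsvcModel.coefficients} Intercept: ${lsvcModel.intercept}")
Probabilistic classifier based on Bayes' theorem with a naive independence assumption between features.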
/**
* Naive Bayes classifier with multiple model types
*/
class NaiveBayes extends ProbabilisticClassifier[Vector, NaiveBayes, NaiveBayesModel] {
def setModelType(value: String): this.type
def setSmoothing(value: Double): this.type
def setThresholds(value: Array[Double]): this.type
def setWeightCol(value: String): this.type
}
class NaiveBayesModel extends ProbabilisticClassificationModel[Vector, NaiveBayesModel] with NaiveBayesParams {
def pi: Vector
def theta: Matrix
def sigma: Matrix
def numFeatures: Int
def numClasses: Int
}
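Usage Example (a minimal sketch; the "multinomial" model type assumes nonnegative feature values, such as term counts):
import org.apache.spark.ml.classification.NaiveBayes
val nb = new NaiveBayes()
  .setModelType("multinomial")
  .setSmoothing(1.0)
val nbModel = nb.fit(trainingData)
val predictions = nbModel.transform(testData)
Multilayer perceptron classifier using backpropagation for training feed-forward neural networks.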
/**
* Multilayer perceptron classifier
*/
class MultilayerPerceptronClassifier extends ProbabilisticClassifier[Vector, MultilayerPerceptronClassifier, MultilayerPerceptronClassificationModel] {
def setLayers(value: Array[Int]): this.type
def setBlockSize(value: Int): this.type
def setSolver(value: String): this.type
def setMaxIter(value: Int): this.type
def setTol(value: Double): this.type
def setSeed(value: Long): this.type
def setInitialWeights(value: Vector): this.type
def setStepSize(value: Double): this.type
}
class MultilayerPerceptronClassificationModel extends ProbabilisticClassificationModel[Vector, MultilayerPerceptronClassificationModel] with MultilayerPerceptronClassifierParams {
def layers: Array[Int]
def weights: Vector
}
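Usage Example (a minimal sketch; the layer sizes below are assumptions, where the input size must match the feature vector length and the output size the number of classes):
import org.apache.spark.ml.classification.MultilayerPerceptronClassifier
// 4 input features, two hidden layers of sizes 5 and 4, 3 output classes
val layers = Array[Int](4, 5, 4, 3)
val mlp = new MultilayerPerceptronClassifier()
  .setLayers(layers)
  .setBlockSize(128)
  .setSeed(1234L)
  .setMaxIter(100)
val mlpModel = mlp.fit(trainingData)
val predictions = mlpModel.transform(testData)
Meta-algorithm that enables binary classifiers to handle multiclass problems by training one classifier per class.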
/**
* One-vs-Rest multiclass classification strategy
*/
class OneVsRest extends Estimator[OneVsRestModel] with OneVsRestParams {
def setClassifier(value: Classifier[_, _, _]): this.type
def setLabelCol(value: String): this.type
def setFeaturesCol(value: String): this.type
def setPredictionCol(value: String): this.type
def setRawPredictionCol(value: String): this.type
def setParallelism(value: Int): this.type
def setWeightCol(value: String): this.type
}
class OneVsRestModel extends Model[OneVsRestModel] with OneVsRestParams {
def models: Array[_ <: ClassificationModel[_, _]]
def labelMetadata: Metadata
}
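Usage Example (a minimal sketch, using the binary LogisticRegression classifier from earlier as the base learner):
import org.apache.spark.ml.classification.{LogisticRegression, OneVsRest}
val base = new LogisticRegression()
  .setMaxIter(10)
  .setTol(1e-6)
val ovr = new OneVsRest().setClassifier(base)
val ovrModel = ovr.fit(trainingData)
val predictions = ovrModel.transform(testData)
Factorization machine for classification tasks, modeling pairwise feature interactions efficiently.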
/**
* Factorization Machine classifier for binary classification
*/
class FMClassifier extends ProbabilisticClassifier[Vector, FMClassifier, FMClassificationModel] {
def setFactorSize(value: Int): this.type
def setFitIntercept(value: Boolean): this.type
def setFitLinear(value: Boolean): this.type
def setRegParam(value: Double): this.type
def setMiniBatchFraction(value: Double): this.type
def setInitStd(value: Double): this.type
def setMaxIter(value: Int): this.type
def setStepSize(value: Double): this.type
def setTol(value: Double): this.type
def setSolver(value: String): this.type
def setThreshold(value: Double): this.type
def setSeed(value: Long): this.type
}
class FMClassificationModel extends ProbabilisticClassificationModel[Vector, FMClassificationModel] with FMClassifierParams {
def intercept: Double
def linear: Vector
def factors: Matrix
}
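Usage Example (a minimal sketch; factorization machines are sensitive to feature scale, so inputs are often normalized first, e.g. with MinMaxScaler):
import org.apache.spark.ml.classification.FMClassifier
val fm = new FMClassifier()
  .setFactorSize(8)
  .setStepSize(0.01)
  .setMaxIter(100)
val fmModel = fm.fit(trainingData)
println(s"Intercept: ${fmModel.intercept} Linear: ${fmModel.linear}")
/**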
* Base classifier abstraction
*/
abstract class Classifier[
FeaturesType,
E <: Classifier[FeaturesType, E, M],
M <: ClassificationModel[FeaturesType, M]
] extends Estimator[M] with ClassifierParams {
def fit(dataset: Dataset[_]): M
}
/**
* Base classification model
*/
abstract class ClassificationModel[FeaturesType, M <: ClassificationModel[FeaturesType, M]]
extends Model[M] with ClassifierParams {
def numClasses: Int
def predictRaw(features: FeaturesType): Vector
def rawPredictionCol: String
}
/**
* Probabilistic classifier with probability predictions
*/
abstract class ProbabilisticClassifier[
FeaturesType,
E <: ProbabilisticClassifier[FeaturesType, E, M],
M <: ProbabilisticClassificationModel[FeaturesType, M]
] extends Classifier[FeaturesType, E, M] with ProbabilisticClassifierParams
/**
* Probabilistic classification model
*/
abstract class ProbabilisticClassificationModel[FeaturesType, M <: ProbabilisticClassificationModel[FeaturesType, M]]
extends ClassificationModel[FeaturesType, M] with ProbabilisticClassifierParams {
def predictProbability(features: FeaturesType): Vector
def probabilityCol: String
}
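These abstractions also allow scoring single feature vectors directly. A minimal sketch, assuming the fitted lrModel from the first example and a hypothetical three-feature input (predictRaw and predictProbability are public in recent Spark versions):
import org.apache.spark.ml.linalg.Vectors
val features = Vectors.dense(0.5, 1.2, -0.3) // hypothetical input; length must equal the model's numFeatures
val raw = lrModel.predictRaw(features)           // per-class raw scores (margins)
val probs = lrModel.predictProbability(features) // per-class probabilities
// Classification-specific imports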
import org.apache.spark.ml.classification._
import org.apache.spark.ml.linalg.{Vector, Matrix}
import org.apache.spark.sql.{DataFrame, Dataset}
// Parameter traits
import org.apache.spark.ml.param.shared._
// Model summary types
import org.apache.spark.ml.classification.{
LogisticRegressionSummary,
BinaryLogisticRegressionSummary,
LogisticRegressionTrainingSummary
}
// Tree model components
import org.apache.spark.ml.tree.{Node, InternalNode, LeafNode}
// Impurity is selected by name via setImpurity("gini" | "entropy"); the measure objects live in the legacy mllib package
import org.apache.spark.mllib.tree.impurity.{Gini, Entropy}