Supervised learning algorithms for predicting continuous numerical targets, covering linear models, tree-based methods, and survival analysis, together with model summaries for residual and error analysis.
Linear regression with elastic-net (L1/L2) regularization and detailed statistical training summaries.
/**
* Linear regression with regularization support
*/
class LinearRegression extends Regressor[Vector, LinearRegression, LinearRegressionModel] {
def setMaxIter(value: Int): this.type
def setRegParam(value: Double): this.type
def setElasticNetParam(value: Double): this.type
def setTol(value: Double): this.type
def setFitIntercept(value: Boolean): this.type
def setStandardization(value: Boolean): this.type
def setWeightCol(value: String): this.type
def setSolver(value: String): this.type
def setAggregationDepth(value: Int): this.type
def setLoss(value: String): this.type
def setEpsilon(value: Double): this.type
}
class LinearRegressionModel extends RegressionModel[Vector, LinearRegressionModel] with LinearRegressionParams {
def coefficients: Vector
def intercept: Double
def scale: Double
def summary: LinearRegressionTrainingSummary
def hasSummary: Boolean
def evaluate(dataset: Dataset[_]): LinearRegressionSummary
}
class LinearRegressionSummary {
def predictions: DataFrame
def predictionCol: String
def labelCol: String
def featuresCol: String
def explainedVariance: Double
def meanAbsoluteError: Double
def meanSquaredError: Double
def rootMeanSquaredError: Double
def r2: Double
def residuals: DataFrame
}
class LinearRegressionTrainingSummary extends LinearRegressionSummary {
def totalIterations: Int
def objectiveHistory: Array[Double]
def devianceResiduals: Array[Double]
def coefficientStandardErrors: Array[Double]
def tValues: Array[Double]
def pValues: Array[Double]
}
Usage Example:
import org.apache.spark.ml.regression.LinearRegression
val lr = new LinearRegression()
.setMaxIter(20)
.setRegParam(0.3)
.setElasticNetParam(0.8)
val lrModel = lr.fit(trainingData)
val predictions = lrModel.transform(testData)
// Print coefficients and intercept
println(s"Coefficients: ${lrModel.coefficients}")
println(s"Intercept: ${lrModel.intercept}")
// Summarize the model over the training set
val trainingSummary = lrModel.summary
println(s"RMSE: ${trainingSummary.rootMeanSquaredError}")
println(s"R2: ${trainingSummary.r2}")Generalized linear models supporting various exponential family distributions and link functions.
/**
* Generalized Linear Regression with multiple family distributions
*/
class GeneralizedLinearRegression extends Regressor[Vector, GeneralizedLinearRegression, GeneralizedLinearRegressionModel] {
def setFamily(value: String): this.type
def setLink(value: String): this.type
def setFitIntercept(value: Boolean): this.type
def setMaxIter(value: Int): this.type
def setTol(value: Double): this.type
def setRegParam(value: Double): this.type
def setWeightCol(value: String): this.type
def setSolver(value: String): this.type
def setLinkPredictionCol(value: String): this.type
def setVariancePower(value: Double): this.type
def setLinkPower(value: Double): this.type
def setOffsetCol(value: String): this.type
}
class GeneralizedLinearRegressionModel extends RegressionModel[Vector, GeneralizedLinearRegressionModel] with GeneralizedLinearRegressionParams {
def coefficients: Vector
def intercept: Double
def summary: GeneralizedLinearRegressionTrainingSummary
def hasSummary: Boolean
def evaluate(dataset: Dataset[_]): GeneralizedLinearRegressionSummary
}
class GeneralizedLinearRegressionSummary {
def predictions: DataFrame
def predictionCol: String
def labelCol: String
def featuresCol: String
def rank: Long
def degreesOfFreedom: Long
def residualDegreeOfFreedom: Long
def residualDegreeOfFreedomNull: Long
def aic: Double
def deviance: Double
def nullDeviance: Double
def dispersion: Double
}
Tree-based regression algorithm using recursive binary splits for continuous target variables.
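A minimal usage sketch for the decision tree API listed below, assuming the same `trainingData`/`testData` DataFrames; depth and node-count settings are illustrative:
import org.apache.spark.ml.regression.DecisionTreeRegressor
val dt = new DecisionTreeRegressor()
  .setMaxDepth(5)
  .setMinInstancesPerNode(10)
val dtModel = dt.fit(trainingData)
val dtPredictions = dtModel.transform(testData)
println(s"Tree depth: ${dtModel.depth}, nodes: ${dtModel.numNodes}")
println(s"Feature importances: ${dtModel.featureImportances}")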
/**
* Decision tree regressor with configurable tree parameters
*/
class DecisionTreeRegressor extends Regressor[Vector, DecisionTreeRegressor, DecisionTreeRegressionModel] {
def setMaxDepth(value: Int): this.type
def setMaxBins(value: Int): this.type
def setMinInstancesPerNode(value: Int): this.type
def setMinInfoGain(value: Double): this.type
def setMaxMemoryInMB(value: Int): this.type
def setCacheNodeIds(value: Boolean): this.type
def setCheckpointInterval(value: Int): this.type
def setImpurity(value: String): this.type
def setSeed(value: Long): this.type
def setVarianceCol(value: String): this.type
}
class DecisionTreeRegressionModel extends RegressionModel[Vector, DecisionTreeRegressionModel] with DecisionTreeRegressorParams {
def rootNode: Node
def depth: Int
def numNodes: Int
def toDebugString: String
def featureImportances: Vector
}
Ensemble regression method combining multiple decision trees with bootstrap aggregating.
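A minimal usage sketch for the random forest API listed below; tree count, depth, and subsampling values are illustrative, and the input DataFrames are assumed as before:
import org.apache.spark.ml.regression.RandomForestRegressor
val rf = new RandomForestRegressor()
  .setNumTrees(50)
  .setMaxDepth(8)
  .setSubsamplingRate(0.8)
  .setSeed(42L)
val rfModel = rf.fit(trainingData)
val rfPredictions = rfModel.transform(testData)
println(s"Trees: ${rfModel.trees.length}, total nodes: ${rfModel.totalNumNodes}")
println(s"Feature importances: ${rfModel.featureImportances}")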
/**
* Random Forest regressor using ensemble of decision trees
*/
class RandomForestRegressor extends Regressor[Vector, RandomForestRegressor, RandomForestRegressionModel] {
def setNumTrees(value: Int): this.type
def setMaxDepth(value: Int): this.type
def setMaxBins(value: Int): this.type
def setMinInstancesPerNode(value: Int): this.type
def setMinInfoGain(value: Double): this.type
def setMaxMemoryInMB(value: Int): this.type
def setCacheNodeIds(value: Boolean): this.type
def setCheckpointInterval(value: Int): this.type
def setImpurity(value: String): this.type
def setSubsamplingRate(value: Double): this.type
def setSeed(value: Long): this.type
def setFeatureSubsetStrategy(value: String): this.type
}
class RandomForestRegressionModel extends RegressionModel[Vector, RandomForestRegressionModel] with RandomForestRegressorParams {
def trees: Array[DecisionTreeRegressionModel]
def treeWeights: Array[Double]
def numFeatures: Int
def totalNumNodes: Int
def toDebugString: String
def featureImportances: Vector
}
Sequential ensemble method where each tree corrects errors from previous trees.
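A minimal usage sketch for the gradient-boosted tree API listed below; loss type, step size, and iteration count are illustrative choices, with the same assumed input DataFrames:
import org.apache.spark.ml.regression.GBTRegressor
val gbt = new GBTRegressor()
  .setLossType("squared")   // or "absolute"
  .setMaxIter(100)
  .setStepSize(0.1)         // shrinkage; smaller values typically need more iterations
  .setMaxDepth(4)
val gbtModel = gbt.fit(trainingData)
val gbtPredictions = gbtModel.transform(testData)
println(s"Ensemble size: ${gbtModel.trees.length}")
println(s"Feature importances: ${gbtModel.featureImportances}")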
/**
* Gradient-boosted tree regressor
*/
class GBTRegressor extends Regressor[Vector, GBTRegressor, GBTRegressionModel] {
def setLossType(value: String): this.type
def setMaxIter(value: Int): this.type
def setStepSize(value: Double): this.type
def setMaxDepth(value: Int): this.type
def setMaxBins(value: Int): this.type
def setMinInstancesPerNode(value: Int): this.type
def setMinInfoGain(value: Double): this.type
def setMaxMemoryInMB(value: Int): this.type
def setCacheNodeIds(value: Boolean): this.type
def setCheckpointInterval(value: Int): this.type
def setImpurity(value: String): this.type
def setSubsamplingRate(value: Double): this.type
def setSeed(value: Long): this.type
def setFeatureSubsetStrategy(value: String): this.type
def setValidationTol(value: Double): this.type
def setValidationIndicatorCol(value: String): this.type
}
class GBTRegressionModel extends RegressionModel[Vector, GBTRegressionModel] with GBTRegressorParams {
def trees: Array[DecisionTreeRegressionModel]
def treeWeights: Array[Double]
def numFeatures: Int
def totalNumNodes: Int
def toDebugString: String
def featureImportances: Vector
}
Non-parametric regression that fits a monotonic function to the data.
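A minimal usage sketch for the isotonic regression API listed below, assuming `trainingData` with a single numeric feature (use setFeatureIndex when the features column is a vector):
import org.apache.spark.ml.regression.IsotonicRegression
val ir = new IsotonicRegression()
  .setIsotonic(true)   // fit a non-decreasing function; false fits a non-increasing one
val irModel = ir.fit(trainingData)
println(s"Boundaries in increasing order: ${irModel.boundaries}")
println(s"Predictions associated with the boundaries: ${irModel.predictions}")
irModel.transform(testData).show()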
/**
* Isotonic regression for monotonic relationships
*/
class IsotonicRegression extends Regressor[Double, IsotonicRegression, IsotonicRegressionModel] {
def setIsotonic(value: Boolean): this.type
def setFeatureIndex(value: Int): this.type
def setWeightCol(value: String): this.type
}
class IsotonicRegressionModel extends RegressionModel[Double, IsotonicRegressionModel] with IsotonicRegressionParams {
def boundaries: Vector
def predictions: Vector
def numFeatures: Int
}
Accelerated failure time model for survival analysis with censored data.
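A minimal usage sketch for the AFT API listed below, assuming `trainingData` additionally has a `censor` column (1.0 = event observed, 0.0 = censored); the quantile probabilities are illustrative:
import org.apache.spark.ml.regression.AFTSurvivalRegression
val aft = new AFTSurvivalRegression()
  .setCensorCol("censor")
  .setQuantileProbabilities(Array(0.3, 0.6))
  .setQuantilesCol("quantiles")
val aftModel = aft.fit(trainingData)
println(s"Coefficients: ${aftModel.coefficients}")
println(s"Intercept: ${aftModel.intercept}  Scale: ${aftModel.scale}")
aftModel.transform(testData).show(false)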
/**
* Accelerated Failure Time survival regression
*/
class AFTSurvivalRegression extends Regressor[Vector, AFTSurvivalRegression, AFTSurvivalRegressionModel] {
def setCensorCol(value: String): this.type
def setQuantileProbabilities(value: Array[Double]): this.type
def setQuantilesCol(value: String): this.type
def setMaxIter(value: Int): this.type
def setTol(value: Double): this.type
def setFitIntercept(value: Boolean): this.type
def setAggregationDepth(value: Int): this.type
}
class AFTSurvivalRegressionModel extends RegressionModel[Vector, AFTSurvivalRegressionModel] with AFTSurvivalRegressionParams {
def coefficients: Vector
def intercept: Double
def scale: Double
def predictQuantiles(features: Vector): Vector
}
Factorization machine for regression tasks modeling feature interactions.
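A minimal usage sketch for the factorization machine API listed below; factor size, step size, and regularization are illustrative, and FM training is sensitive to feature scale, so scaling features (for example with MinMaxScaler) beforehand is generally advisable:
import org.apache.spark.ml.regression.FMRegressor
val fm = new FMRegressor()
  .setFactorSize(8)
  .setStepSize(0.01)
  .setMaxIter(100)
  .setRegParam(0.01)
val fmModel = fm.fit(trainingData)
println(s"Intercept: ${fmModel.intercept}")
println(s"Linear term: ${fmModel.linear}")
println(s"Factor matrix: ${fmModel.factors.numRows} x ${fmModel.factors.numCols}")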
/**
* Factorization Machine regressor
*/
class FMRegressor extends Regressor[Vector, FMRegressor, FMRegressionModel] {
def setFactorSize(value: Int): this.type
def setFitIntercept(value: Boolean): this.type
def setFitLinear(value: Boolean): this.type
def setRegParam(value: Double): this.type
def setMiniBatchFraction(value: Double): this.type
def setInitStd(value: Double): this.type
def setMaxIter(value: Int): this.type
def setStepSize(value: Double): this.type
def setTol(value: Double): this.type
def setSolver(value: String): this.type
def setSeed(value: Long): this.type
}
class FMRegressionModel extends RegressionModel[Vector, FMRegressionModel] with FMRegressorParams {
def intercept: Double
def linear: Vector
def factors: Matrix
}
/**
* Base regressor abstraction
*/
abstract class Regressor[
FeaturesType,
E <: Regressor[FeaturesType, E, M],
M <: RegressionModel[FeaturesType, M]
] extends Estimator[M] with RegressorParams {
def fit(dataset: Dataset[_]): M
}
/**
* Base regression model
*/
abstract class RegressionModel[FeaturesType, M <: RegressionModel[FeaturesType, M]]
extends Model[M] with RegressionParams {
def predict(features: FeaturesType): Double
}
// Regression-specific imports
import org.apache.spark.ml.regression._
import org.apache.spark.ml.linalg.{Vector, Matrix}
import org.apache.spark.sql.{DataFrame, Dataset}
// Parameter traits
import org.apache.spark.ml.param.shared._
// Model summary types
import org.apache.spark.ml.regression.{
LinearRegressionSummary,
LinearRegressionTrainingSummary,
GeneralizedLinearRegressionSummary,
GeneralizedLinearRegressionTrainingSummary
}
// Tree model components (shared with classification)
import org.apache.spark.ml.tree.{Node, InternalNode, LeafNode}
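The base abstractions follow Spark ML's Estimator/Model pattern: fit() on a Regressor returns an immutable RegressionModel, transform() scores a whole DataFrame, and predict() scores a single feature vector. A minimal sketch, assuming a fitted linear model on three features and the same `trainingData`/`testData` DataFrames as above:
import org.apache.spark.ml.linalg.Vectors
import org.apache.spark.ml.regression.{LinearRegression, LinearRegressionModel}
// fit() on any Regressor returns an immutable RegressionModel
val model: LinearRegressionModel = new LinearRegression().fit(trainingData)
// Batch scoring: transform() appends the prediction column
val scored = model.transform(testData)
// Single-instance scoring via RegressionModel.predict
// (vector length must match model.numFeatures; the values here are illustrative)
val yHat: Double = model.predict(Vectors.dense(1.0, 2.0, 3.0))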