0
# Regression
1
2
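Supervised learning algorithms for predicting continuous numerical values. Covers linear and generalized linear models, tree-based methods (decision trees, random forests, gradient-boosted trees), isotonic regression, survival analysis, and factorization machines, along with model summaries for residual analysis.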
3
4
## Capabilities
5
6
### Linear Regression
7
8
Linear regression algorithm with L1/L2 regularization and comprehensive statistical summaries.
9
10
```scala { .api }
11
/**
12
* Linear regression with regularization support
13
*/
14
class LinearRegression extends Regressor[Vector, LinearRegression, LinearRegressionModel] {
15
def setMaxIter(value: Int): this.type
16
def setRegParam(value: Double): this.type
17
def setElasticNetParam(value: Double): this.type
18
def setTol(value: Double): this.type
19
def setFitIntercept(value: Boolean): this.type
20
def setStandardization(value: Boolean): this.type
21
def setWeightCol(value: String): this.type
22
def setSolver(value: String): this.type
23
def setAggregationDepth(value: Int): this.type
24
def setLoss(value: String): this.type
25
def setEpsilon(value: Double): this.type
26
}
27
28
class LinearRegressionModel extends RegressionModel[Vector, LinearRegressionModel] with LinearRegressionParams {
29
def coefficients: Vector
30
def intercept: Double
31
def scale: Double
32
def summary: LinearRegressionTrainingSummary
33
def hasSummary: Boolean
34
def evaluate(dataset: Dataset[_]): LinearRegressionSummary
35
}
36
37
class LinearRegressionSummary {
38
def predictions: DataFrame
39
def predictionCol: String
40
def labelCol: String
41
def featuresCol: String
42
def explainedVariance: Double
43
def meanAbsoluteError: Double
44
def meanSquaredError: Double
45
def rootMeanSquaredError: Double
46
def r2: Double
47
def residuals: DataFrame
48
}
49
50
class LinearRegressionTrainingSummary extends LinearRegressionSummary {
51
def totalIterations: Int
52
def objectiveHistory: Array[Double]
53
def devianceResiduals: Array[Double]
54
def coefficientStandardErrors: Array[Double]
55
def tValues: Array[Double]
56
def pValues: Array[Double]
57
}
58
```
59
60
**Usage Example:**
61
62
```scala
63
import org.apache.spark.ml.regression.LinearRegression
64
65
val lr = new LinearRegression()
66
.setMaxIter(20)
67
.setRegParam(0.3)
68
.setElasticNetParam(0.8)
69
70
val lrModel = lr.fit(trainingData)
71
val predictions = lrModel.transform(testData)
72
73
// Print coefficients and intercept
74
println(s"Coefficients: ${lrModel.coefficients}")
75
println(s"Intercept: ${lrModel.intercept}")
76
77
// Summarize the model over the training set
78
val trainingSummary = lrModel.summary
79
println(s"RMSE: ${trainingSummary.rootMeanSquaredError}")
80
println(s"R2: ${trainingSummary.r2}")
81
```
82
83
### Generalized Linear Regression
84
85
Generalized linear models supporting various exponential family distributions and link functions.
86
87
```scala { .api }
88
/**
89
* Generalized Linear Regression with multiple family distributions
90
*/
91
class GeneralizedLinearRegression extends Regressor[Vector, GeneralizedLinearRegression, GeneralizedLinearRegressionModel] {
92
def setFamily(value: String): this.type
93
def setLink(value: String): this.type
94
def setFitIntercept(value: Boolean): this.type
95
def setMaxIter(value: Int): this.type
96
def setTol(value: Double): this.type
97
def setRegParam(value: Double): this.type
98
def setWeightCol(value: String): this.type
99
def setSolver(value: String): this.type
100
def setLinkPredictionCol(value: String): this.type
101
def setVariancePower(value: Double): this.type
102
def setLinkPower(value: Double): this.type
103
def setOffsetCol(value: String): this.type
104
}
105
106
class GeneralizedLinearRegressionModel extends RegressionModel[Vector, GeneralizedLinearRegressionModel] with GeneralizedLinearRegressionBase {
107
def coefficients: Vector
108
def intercept: Double
109
def summary: GeneralizedLinearRegressionTrainingSummary
110
def hasSummary: Boolean
111
def evaluate(dataset: Dataset[_]): GeneralizedLinearRegressionSummary
112
}
113
114
class GeneralizedLinearRegressionSummary {
115
def predictions: DataFrame
116
def predictionCol: String
117
def labelCol: String
118
def featuresCol: String
119
def rank: Long
120
def degreesOfFreedom: Long
121
def residualDegreeOfFreedom: Long
122
def residualDegreeOfFreedomNull: Long
123
def aic: Double
124
def deviance: Double
125
def nullDeviance: Double
126
def dispersion: Double
127
}
128
```
129
130
### Decision Tree Regressor
131
132
Tree-based regression algorithm using recursive binary splits for continuous target variables.
133
134
```scala { .api }
135
/**
136
* Decision tree regressor with configurable tree parameters
137
*/
138
class DecisionTreeRegressor extends Regressor[Vector, DecisionTreeRegressor, DecisionTreeRegressionModel] {
139
def setMaxDepth(value: Int): this.type
140
def setMaxBins(value: Int): this.type
141
def setMinInstancesPerNode(value: Int): this.type
142
def setMinInfoGain(value: Double): this.type
143
def setMaxMemoryInMB(value: Int): this.type
144
def setCacheNodeIds(value: Boolean): this.type
145
def setCheckpointInterval(value: Int): this.type
146
def setImpurity(value: String): this.type
147
def setSeed(value: Long): this.type
148
def setVarianceCol(value: String): this.type
149
}
150
151
class DecisionTreeRegressionModel extends RegressionModel[Vector, DecisionTreeRegressionModel] with DecisionTreeRegressorParams {
152
def rootNode: Node
153
def depth: Int
154
def numNodes: Int
155
def toDebugString: String
156
def featureImportances: Vector
157
}
158
```
159
160
### Random Forest Regressor
161
162
Ensemble regression method combining multiple decision trees with bootstrap aggregating.
163
164
```scala { .api }
165
/**
166
* Random Forest regressor using ensemble of decision trees
167
*/
168
class RandomForestRegressor extends Regressor[Vector, RandomForestRegressor, RandomForestRegressionModel] {
169
def setNumTrees(value: Int): this.type
170
def setMaxDepth(value: Int): this.type
171
def setMaxBins(value: Int): this.type
172
def setMinInstancesPerNode(value: Int): this.type
173
def setMinInfoGain(value: Double): this.type
174
def setMaxMemoryInMB(value: Int): this.type
175
def setCacheNodeIds(value: Boolean): this.type
176
def setCheckpointInterval(value: Int): this.type
177
def setImpurity(value: String): this.type
178
def setSubsamplingRate(value: Double): this.type
179
def setSeed(value: Long): this.type
180
def setFeatureSubsetStrategy(value: String): this.type
181
}
182
183
class RandomForestRegressionModel extends RegressionModel[Vector, RandomForestRegressionModel] with RandomForestRegressorParams {
184
def trees: Array[DecisionTreeRegressionModel]
185
def treeWeights: Array[Double]
186
def numFeatures: Int
187
def totalNumNodes: Int
188
def toDebugString: String
189
def featureImportances: Vector
190
}
191
```
192
193
### Gradient Boosted Tree Regressor
194
195
Sequential ensemble method where each tree corrects errors from previous trees.
196
197
```scala { .api }
198
/**
199
* Gradient-boosted tree regressor
200
*/
201
class GBTRegressor extends Regressor[Vector, GBTRegressor, GBTRegressionModel] {
202
def setLossType(value: String): this.type
203
def setMaxIter(value: Int): this.type
204
def setStepSize(value: Double): this.type
205
def setMaxDepth(value: Int): this.type
206
def setMaxBins(value: Int): this.type
207
def setMinInstancesPerNode(value: Int): this.type
208
def setMinInfoGain(value: Double): this.type
209
def setMaxMemoryInMB(value: Int): this.type
210
def setCacheNodeIds(value: Boolean): this.type
211
def setCheckpointInterval(value: Int): this.type
212
def setImpurity(value: String): this.type
213
def setSubsamplingRate(value: Double): this.type
214
def setSeed(value: Long): this.type
215
def setFeatureSubsetStrategy(value: String): this.type
216
def setValidationTol(value: Double): this.type
217
def setValidationIndicatorCol(value: String): this.type
218
}
219
220
class GBTRegressionModel extends RegressionModel[Vector, GBTRegressionModel] with GBTRegressorParams {
221
def trees: Array[DecisionTreeRegressionModel]
222
def treeWeights: Array[Double]
223
def numFeatures: Int
224
def totalNumNodes: Int
225
def toDebugString: String
226
def featureImportances: Vector
227
}
228
```
229
230
### Isotonic Regression
231
232
Non-parametric regression that fits a monotonic function to the data.
233
234
```scala { .api }
235
/**
236
* Isotonic regression for monotonic relationships
237
*/
238
class IsotonicRegression extends Estimator[IsotonicRegressionModel] with IsotonicRegressionBase {
239
def setIsotonic(value: Boolean): this.type
240
def setFeatureIndex(value: Int): this.type
241
def setWeightCol(value: String): this.type
242
}
243
244
class IsotonicRegressionModel extends Model[IsotonicRegressionModel] with IsotonicRegressionBase {
245
def boundaries: Vector
246
def predictions: Vector
247
def numFeatures: Int
248
}
249
```
250
251
### Survival Regression
252
253
Accelerated failure time model for survival analysis with censored data.
254
255
```scala { .api }
256
/**
257
* Accelerated Failure Time survival regression
258
*/
259
class AFTSurvivalRegression extends Regressor[Vector, AFTSurvivalRegression, AFTSurvivalRegressionModel] {
260
def setCensorCol(value: String): this.type
261
def setQuantileProbabilities(value: Array[Double]): this.type
262
def setQuantilesCol(value: String): this.type
263
def setMaxIter(value: Int): this.type
264
def setTol(value: Double): this.type
265
def setFitIntercept(value: Boolean): this.type
266
def setAggregationDepth(value: Int): this.type
267
}
268
269
class AFTSurvivalRegressionModel extends RegressionModel[Vector, AFTSurvivalRegressionModel] with AFTSurvivalRegressionParams {
270
def coefficients: Vector
271
def intercept: Double
272
def scale: Double
273
def predictQuantiles(features: Vector): Vector
274
}
275
```
276
277
### Factorization Machine Regressor
278
279
Factorization machine for regression tasks modeling feature interactions.
280
281
```scala { .api }
282
/**
283
* Factorization Machine regressor
284
*/
285
class FMRegressor extends Regressor[Vector, FMRegressor, FMRegressionModel] {
286
def setFactorSize(value: Int): this.type
287
def setFitIntercept(value: Boolean): this.type
288
def setFitLinear(value: Boolean): this.type
289
def setRegParam(value: Double): this.type
290
def setMiniBatchFraction(value: Double): this.type
291
def setInitStd(value: Double): this.type
292
def setMaxIter(value: Int): this.type
293
def setStepSize(value: Double): this.type
294
def setTol(value: Double): this.type
295
def setSolver(value: String): this.type
296
def setSeed(value: Long): this.type
297
}
298
299
class FMRegressionModel extends RegressionModel[Vector, FMRegressionModel] with FMRegressorParams {
300
def intercept: Double
301
def linear: Vector
302
def factors: Matrix
303
}
304
```
305
306
## Shared Regression Components
307
308
### Base Classes and Traits
309
310
```scala { .api }
311
/**
312
* Base regressor abstraction
313
*/
314
abstract class Regressor[
315
FeaturesType,
316
E <: Regressor[FeaturesType, E, M],
317
M <: RegressionModel[FeaturesType, M]
318
] extends Predictor[FeaturesType, E, M] with PredictorParams {
319
def fit(dataset: Dataset[_]): M
320
}
321
322
/**
323
* Base regression model
324
*/
325
abstract class RegressionModel[FeaturesType, M <: RegressionModel[FeaturesType, M]]
326
extends PredictionModel[FeaturesType, M] with PredictorParams {
327
def predict(features: FeaturesType): Double
328
}
329
```
330
331
## Types
332
333
```scala { .api }
334
// Regression-specific imports
335
import org.apache.spark.ml.regression._
336
import org.apache.spark.ml.linalg.{Vector, Matrix}
337
import org.apache.spark.sql.{DataFrame, Dataset}
338
339
// Parameter traits
340
import org.apache.spark.ml.param.shared._
341
342
// Model summary types
343
import org.apache.spark.ml.regression.{
344
LinearRegressionSummary,
345
LinearRegressionTrainingSummary,
346
GeneralizedLinearRegressionSummary,
347
GeneralizedLinearRegressionTrainingSummary
348
}
349
350
// Tree model components (shared with classification)
351
import org.apache.spark.ml.tree.{Node, InternalNode, LeafNode}
352
```