# Clustering

Unsupervised learning algorithms for discovering hidden patterns and grouping similar data points, including partitioning, hierarchical, and probabilistic clustering methods.

## Capabilities

### K-Means Clustering

Partitioning algorithm that groups data into k clusters by minimizing the within-cluster sum of squared distances.

```scala { .api }
/**
 * K-Means clustering algorithm
 */
class KMeans extends Estimator[KMeansModel] with KMeansParams {
  def setK(value: Int): this.type
  def setInitMode(value: String): this.type
  def setInitSteps(value: Int): this.type
  def setMaxIter(value: Int): this.type
  def setTol(value: Double): this.type
  def setSeed(value: Long): this.type
  def setDistanceMeasure(value: String): this.type
  def setWeightCol(value: String): this.type
}

class KMeansModel extends Model[KMeansModel] with KMeansParams with MLWritable {
  def clusterCenters: Array[Vector]
  def computeCost(dataset: Dataset[_]): Double
  def summary: KMeansSummary
  def hasSummary: Boolean
}

class KMeansSummary {
  def predictions: DataFrame
  def predictionCol: String
  def featuresCol: String
  def k: Int
  def clusterSizes: Array[Long]
  def trainingCost: Double
  def numIter: Int
}
```

**Usage Example:**

```scala
import org.apache.spark.ml.clustering.KMeans

val kmeans = new KMeans()
  .setK(3)
  .setSeed(1L)
  .setMaxIter(20)

val model = kmeans.fit(dataset)

// Make predictions
val predictions = model.transform(dataset)

// Evaluate clustering by computing the Within Set Sum of Squared Errors
val WSSSE = model.computeCost(dataset)
println(s"Within Set Sum of Squared Errors = $WSSSE")

// Show cluster centers
println("Cluster Centers: ")
model.clusterCenters.foreach(println)
```

### Gaussian Mixture Model

Probabilistic clustering using the expectation-maximization algorithm to model data as a mixture of Gaussian distributions.

```scala { .api }
/**
 * Gaussian Mixture Model clustering
 */
class GaussianMixture extends Estimator[GaussianMixtureModel] with GaussianMixtureParams {
  def setK(value: Int): this.type
  def setMaxIter(value: Int): this.type
  def setTol(value: Double): this.type
  def setSeed(value: Long): this.type
  def setAggregationDepth(value: Int): this.type
  def setWeightCol(value: String): this.type
}

class GaussianMixtureModel extends Model[GaussianMixtureModel] with GaussianMixtureParams with MLWritable {
  def weights: Array[Double]
  def gaussians: Array[MultivariateGaussian]
  def summary: GaussianMixtureSummary
  def hasSummary: Boolean
}

class GaussianMixtureSummary {
  def predictions: DataFrame
  def predictionCol: String
  def probabilityCol: String
  def featuresCol: String
  def k: Int
  def clusterSizes: Array[Long]
  def logLikelihood: Double
  def numIter: Int
}

case class MultivariateGaussian(mean: Vector, cov: Matrix) {
  def pdf(x: Vector): Double
  def logpdf(x: Vector): Double
}
```

**Usage Example:**

```scala
import org.apache.spark.ml.clustering.GaussianMixture

val gmm = new GaussianMixture()
  .setK(3)
  .setSeed(538009335L)

val model = gmm.fit(dataset)

// Output the parameters of the mixture model
for (i <- 0 until model.getK) {
  println(s"Gaussian $i:\nweight=${model.weights(i)}\n" +
    s"mu=${model.gaussians(i).mean}\nsigma=\n${model.gaussians(i).cov}\n")
}
```

### Latent Dirichlet Allocation

Topic modeling algorithm for discovering abstract topics in document collections.

```scala { .api }
/**
 * Latent Dirichlet Allocation for topic modeling
 */
class LDA extends Estimator[LDAModel] with LDAParams {
  def setK(value: Int): this.type
  def setMaxIter(value: Int): this.type
  def setDocConcentration(value: Array[Double]): this.type
  def setTopicConcentration(value: Double): this.type
  def setOptimizer(value: String): this.type
  def setLearningOffset(value: Double): this.type
  def setLearningDecay(value: Double): this.type
  def setSubsamplingRate(value: Double): this.type
  def setOptimizeDocConcentration(value: Boolean): this.type
  def setSeed(value: Long): this.type
  def setCheckpointInterval(value: Int): this.type
  def setKeepLastCheckpoint(value: Boolean): this.type
}

abstract class LDAModel extends Model[LDAModel] with LDAParams {
  def vocabSize: Int
  def topicsMatrix: Matrix
  def isDistributed: Boolean
  def logLikelihood(dataset: Dataset[_]): Double
  def logPerplexity(dataset: Dataset[_]): Double
  def describeTopics(maxTermsPerTopic: Int): DataFrame
  def describeTopics(): DataFrame
}

class LocalLDAModel extends LDAModel {
  def getDocConcentration: Vector
  def topicDistributions(dataset: Dataset[_]): DataFrame
}

class DistributedLDAModel extends LDAModel {
  def getDocConcentration: Vector
  def toLocal: LocalLDAModel
  def logPrior: Double
}
```

### Bisecting K-Means

Hierarchical clustering algorithm that recursively applies k-means to split clusters into two subclusters.

```scala { .api }
/**
 * Bisecting K-Means clustering
 */
class BisectingKMeans extends Estimator[BisectingKMeansModel] with BisectingKMeansParams {
  def setK(value: Int): this.type
  def setMaxIter(value: Int): this.type
  def setSeed(value: Long): this.type
  def setMinDivisibleClusterSize(value: Double): this.type
  def setDistanceMeasure(value: String): this.type
}

class BisectingKMeansModel extends Model[BisectingKMeansModel] with BisectingKMeansParams with MLWritable {
  def clusterCenters: Array[Vector]
  def computeCost(dataset: Dataset[_]): Double
  def summary: BisectingKMeansSummary
  def hasSummary: Boolean
}

class BisectingKMeansSummary {
  def predictions: DataFrame
  def predictionCol: String
  def featuresCol: String
  def k: Int
  def clusterSizes: Array[Long]
  def trainingCost: Double
  def numIter: Int
}
```

### Power Iteration Clustering

Graph-based clustering algorithm using power iteration to find a low-dimensional embedding of the affinity matrix.

```scala { .api }
/**
 * Power Iteration Clustering for graph-based clustering
 */
class PowerIterationClustering extends Estimator[PowerIterationClusteringModel] with PowerIterationClusteringParams {
  def setK(value: Int): this.type
  def setMaxIter(value: Int): this.type
  def setInitMode(value: String): this.type
  def setSrcCol(value: String): this.type
  def setDstCol(value: String): this.type
  def setWeightCol(value: String): this.type
}

class PowerIterationClusteringModel extends Model[PowerIterationClusteringModel] with PowerIterationClusteringParams {
  def assignments: DataFrame
}
```

**Usage Example:**

```scala
import org.apache.spark.ml.clustering.PowerIterationClustering

val pic = new PowerIterationClustering()
  .setK(3)
  .setMaxIter(20)
  .setSrcCol("src")
  .setDstCol("dst")
  .setWeightCol("weight")

val model = pic.fit(edgeDataset)
val result = model.assignments
result.show()
```

## Types

```scala { .api }
// Clustering-specific imports
import org.apache.spark.ml.clustering._
import org.apache.spark.ml.linalg.{Vector, Matrix}
import org.apache.spark.sql.{DataFrame, Dataset}

// Gaussian mixture components
import org.apache.spark.ml.stat.distribution.MultivariateGaussian

// Parameter traits
import org.apache.spark.ml.param.shared._

// Summary types
import org.apache.spark.ml.clustering.{
  KMeansSummary,
  GaussianMixtureSummary,
  BisectingKMeansSummary
}
```