# Clustering Algorithms

Unsupervised learning algorithms for discovering patterns and groupings in data. MLlib provides clustering algorithms including K-means, Gaussian mixture models, hierarchical clustering, and topic modeling.

## Capabilities

### K-Means Clustering

Classic clustering algorithm that partitions data into k clusters using a centroid-based approach.
```scala { .api }
/**
 * K-means clustering algorithm using Lloyd's algorithm with configurable
 * initialization strategies and convergence criteria
 */
class KMeans extends Estimator[KMeansModel] with KMeansParams with DefaultParamsWritable {
  def setK(value: Int): this.type
  def setInitMode(value: String): this.type
  def setInitSteps(value: Int): this.type
  def setMaxIter(value: Int): this.type
  def setTol(value: Double): this.type
  def setDistanceMeasure(value: String): this.type
  def setSeed(value: Long): this.type
  def setWeightCol(value: String): this.type
}

class KMeansModel extends Model[KMeansModel] with KMeansParams with MLWritable {
  def clusterCenters: Array[Vector]
  def numFeatures: Int
  def k: Int
  def summary: KMeansSummary
  def hasSummary: Boolean
  def computeCost(dataset: Dataset[_]): Double
}

class KMeansSummary extends Serializable {
  def predictions: DataFrame
  def predictionCol: String
  def featuresCol: String
  def k: Int
  def clusterSizes: Array[Long]
  def trainingCost: Double
  def numIter: Int
}
```
### Gaussian Mixture Model

Probabilistic clustering using Gaussian mixture models with the EM algorithm.

```scala { .api }
/**
 * Gaussian Mixture Model clustering using Expectation-Maximization algorithm
 * Models data as mixture of Gaussian distributions
 */
class GaussianMixture extends Estimator[GaussianMixtureModel] with GaussianMixtureParams {
  def setK(value: Int): this.type
  def setMaxIter(value: Int): this.type
  def setTol(value: Double): this.type
  def setSeed(value: Long): this.type
  def setWeightCol(value: String): this.type
}

class GaussianMixtureModel extends Model[GaussianMixtureModel] with GaussianMixtureParams with MLWritable {
  def weights: Array[Double]
  def gaussians: Array[MultivariateGaussian]
  def numFeatures: Int
  def k: Int
  def summary: GaussianMixtureSummary
  def hasSummary: Boolean
  def predict(features: Vector): Int
  def predictProbability(features: Vector): Vector
}

class GaussianMixtureSummary extends Serializable {
  def predictions: DataFrame
  def predictionCol: String
  def probabilityCol: String
  def featuresCol: String
  def k: Int
  def clusterSizes: Array[Long]
  def logLikelihood: Double
  def numIter: Int
}
```
### Bisecting K-Means

Hierarchical clustering using a divisive approach with K-means splitting.

```scala { .api }
/**
 * Bisecting K-means clustering - hierarchical variant of K-means
 * Uses divisive approach by recursively splitting clusters
 */
class BisectingKMeans extends Estimator[BisectingKMeansModel] with BisectingKMeansParams {
  def setK(value: Int): this.type
  def setMaxIter(value: Int): this.type
  def setSeed(value: Long): this.type
  def setMinDivisibleClusterSize(value: Double): this.type
  def setDistanceMeasure(value: String): this.type
}

class BisectingKMeansModel extends Model[BisectingKMeansModel] with BisectingKMeansParams with MLWritable {
  def clusterCenters: Array[Vector]
  def numFeatures: Int
  def k: Int
  def summary: BisectingKMeansSummary
  def hasSummary: Boolean
  def computeCost(dataset: Dataset[_]): Double
}

class BisectingKMeansSummary extends Serializable {
  def predictions: DataFrame
  def predictionCol: String
  def featuresCol: String
  def k: Int
  def clusterSizes: Array[Long]
  def trainingCost: Double
  def numIter: Int
}
```
### Latent Dirichlet Allocation

Topic modeling algorithm for discovering latent topics in document collections.

```scala { .api }
/**
 * Latent Dirichlet Allocation for topic modeling
 * Discovers latent topics in document collections using Dirichlet distributions
 */
class LDA extends Estimator[LDAModel] with LDAParams {
  def setK(value: Int): this.type
  def setMaxIter(value: Int): this.type
  def setDocConcentration(value: Array[Double]): this.type
  def setTopicConcentration(value: Double): this.type
  def setOptimizer(value: String): this.type
  def setLearningOffset(value: Double): this.type
  def setLearningDecay(value: Double): this.type
  def setSubsamplingRate(value: Double): this.type
  def setSeed(value: Long): this.type
  def setCheckpointInterval(value: Int): this.type
  def setKeepLastCheckpoint(value: Boolean): this.type
}

// Abstract base class
abstract class LDAModel extends Model[LDAModel] with LDAParams {
  def vocabSize: Int
  def numTopics: Int
  def estimatedDocConcentration: Vector
  def topicsMatrix: Matrix
  def isDistributed: Boolean
  def logLikelihood(dataset: Dataset[_]): Double
  def logPerplexity(dataset: Dataset[_]): Double
  def describeTopics(): DataFrame
  def describeTopics(maxTermsPerTopic: Int): DataFrame
}

class LocalLDAModel extends LDAModel with MLWritable {
  def getTopicDistribution(document: Vector): Vector
}

class DistributedLDAModel extends LDAModel {
  def toLocal: LocalLDAModel
  def logPrior: Double
  def trainingLogLikelihood: Double
}
```
### Power Iteration Clustering

Clustering algorithm based on graph theory and the power iteration method.

```scala { .api }
/**
 * Power Iteration Clustering using normalized graph cuts
 * Clusters data points based on pairwise similarities using power iteration
 */
class PowerIterationClustering extends Params with Logging {
  def setK(value: Int): this.type
  def setMaxIter(value: Int): this.type
  def setInitMode(value: String): this.type
  def setSrcCol(value: String): this.type
  def setDstCol(value: String): this.type
  def setWeightCol(value: String): this.type
  def assignClusters(dataset: Dataset[_]): Dataset[_]
}
```
## Base Classes and Traits

```scala { .api }
// Base clustering summary
abstract class ClusteringSummary extends Serializable {
  def predictionCol: String
  def featuresCol: String
  def k: Int
  def clusterSizes: Array[Long]
}

// Parameter traits
trait HasK extends Params {
  final val k: IntParam
  def getK: Int
  def setK(value: Int): this.type
}

trait HasMaxIter extends Params {
  final val maxIter: IntParam
  def getMaxIter: Int
  def setMaxIter(value: Int): this.type
}

trait HasTol extends Params {
  final val tol: DoubleParam
  def getTol: Double
  def setTol(value: Double): this.type
}

trait HasSeed extends Params {
  final val seed: LongParam
  def getSeed: Long
  def setSeed(value: Long): this.type
}
```
## Usage Examples

### K-Means Clustering

```scala
import org.apache.spark.ml.clustering.KMeans

val kmeans = new KMeans()
  .setK(3)
  .setMaxIter(20)
  .setSeed(1L)

val model = kmeans.fit(dataset)

// Make predictions
val predictions = model.transform(dataset)

// Evaluate clustering
val cost = model.computeCost(dataset)
println(s"Within Set Sum of Squared Errors = $cost")

// Show cluster centers
model.clusterCenters.foreach(println)
```
### Gaussian Mixture Model

```scala
import org.apache.spark.ml.clustering.GaussianMixture

val gmm = new GaussianMixture()
  .setK(3)
  .setMaxIter(100)
  .setSeed(538009335L)

val model = gmm.fit(dataset)

// Show model information
println(s"Gaussians shown as (weight, mu, sigma):")
model.gaussians.zip(model.weights).foreach {
  case (g, w) => println(s"weight=$w, mu=${g.mean}, sigma=\n${g.cov}")
}

// Make predictions with probabilities
val predictions = model.transform(dataset)
predictions.select("features", "prediction", "probability").show()
```
### Latent Dirichlet Allocation

```scala
import org.apache.spark.ml.clustering.LDA

val lda = new LDA()
  .setK(10)
  .setMaxIter(10)
  .setFeaturesCol("features")

val model = lda.fit(dataset)

val ll = model.logLikelihood(dataset)
val lp = model.logPerplexity(dataset)
println(s"The lower bound on the log likelihood of the entire corpus: $ll")
println(s"The upper bound on perplexity: $lp")

// Describe topics
val topics = model.describeTopics(3)
println("The topics described by their top-weighted terms:")
topics.show(false)

// Transform documents
val transformed = model.transform(dataset)
transformed.select("features", "topicDistribution").show(false)
```
### Bisecting K-Means

```scala
import org.apache.spark.ml.clustering.BisectingKMeans

val bkm = new BisectingKMeans()
  .setK(3)
  .setMaxIter(20)
  .setSeed(1)

val model = bkm.fit(dataset)

// Make predictions
val predictions = model.transform(dataset)

// Evaluate clustering
val cost = model.computeCost(dataset)
println(s"Within Set Sum of Squared Errors = $cost")

// Show cluster centers
println("Cluster Centers: ")
model.clusterCenters.foreach(println)
```
### Power Iteration Clustering

```scala
import org.apache.spark.ml.clustering.PowerIterationClustering

// Create similarity data with columns: src, dst, weight
val pic = new PowerIterationClustering()
  .setK(3)
  .setMaxIter(20)
  .setSrcCol("src")
  .setDstCol("dst")
  .setWeightCol("weight")

val assignments = pic.assignClusters(similarityData)
assignments.select("id", "cluster").show()
```