# Clustering Algorithms

Unsupervised learning algorithms for discovering patterns and groupings in data. MLlib provides clustering algorithms including K-means, Gaussian mixture models, hierarchical clustering, and topic modeling.

## Capabilities

### K-Means Clustering

Classic clustering algorithm that partitions data into k clusters using a centroid-based approach.
```scala { .api }
/**
 * K-means clustering algorithm using Lloyd's algorithm with configurable
 * initialization strategies and convergence criteria
 */
class KMeans extends Estimator[KMeansModel] with KMeansParams with DefaultParamsWritable {
  def setK(value: Int): this.type
  def setInitMode(value: String): this.type
  def setInitSteps(value: Int): this.type
  def setMaxIter(value: Int): this.type
  def setTol(value: Double): this.type
  def setDistanceMeasure(value: String): this.type
  def setSeed(value: Long): this.type
  def setWeightCol(value: String): this.type
}

class KMeansModel extends Model[KMeansModel] with KMeansParams with MLWritable {
  def clusterCenters: Array[Vector]
  def numFeatures: Int
  def k: Int
  def summary: KMeansSummary
  def hasSummary: Boolean
  def computeCost(dataset: Dataset[_]): Double
}

class KMeansSummary extends Serializable {
  def predictions: DataFrame
  def predictionCol: String
  def featuresCol: String
  def k: Int
  def clusterSizes: Array[Long]
  def trainingCost: Double
  def numIter: Int
}
```
### Gaussian Mixture Model

Probabilistic clustering using Gaussian mixture models with the EM algorithm.

```scala { .api }
/**
 * Gaussian Mixture Model clustering using Expectation-Maximization algorithm
 * Models data as mixture of Gaussian distributions
 */
class GaussianMixture extends Estimator[GaussianMixtureModel] with GaussianMixtureParams {
  def setK(value: Int): this.type
  def setMaxIter(value: Int): this.type
  def setTol(value: Double): this.type
  def setSeed(value: Long): this.type
  def setWeightCol(value: String): this.type
}

class GaussianMixtureModel extends Model[GaussianMixtureModel] with GaussianMixtureParams with MLWritable {
  def weights: Array[Double]
  def gaussians: Array[MultivariateGaussian]
  def numFeatures: Int
  def k: Int
  def summary: GaussianMixtureSummary
  def hasSummary: Boolean
  def predict(features: Vector): Int
  def predictProbability(features: Vector): Vector
}

class GaussianMixtureSummary extends Serializable {
  def predictions: DataFrame
  def predictionCol: String
  def probabilityCol: String
  def featuresCol: String
  def k: Int
  def clusterSizes: Array[Long]
  def logLikelihood: Double
  def numIter: Int
}
```
### Bisecting K-Means

Hierarchical clustering using a divisive approach with K-means splitting.

```scala { .api }
/**
 * Bisecting K-means clustering - hierarchical variant of K-means
 * Uses divisive approach by recursively splitting clusters
 */
class BisectingKMeans extends Estimator[BisectingKMeansModel] with BisectingKMeansParams {
  def setK(value: Int): this.type
  def setMaxIter(value: Int): this.type
  def setSeed(value: Long): this.type
  def setMinDivisibleClusterSize(value: Double): this.type
  def setDistanceMeasure(value: String): this.type
}

class BisectingKMeansModel extends Model[BisectingKMeansModel] with BisectingKMeansParams with MLWritable {
  def clusterCenters: Array[Vector]
  def numFeatures: Int
  def k: Int
  def summary: BisectingKMeansSummary
  def hasSummary: Boolean
  def computeCost(dataset: Dataset[_]): Double
}

class BisectingKMeansSummary extends Serializable {
  def predictions: DataFrame
  def predictionCol: String
  def featuresCol: String
  def k: Int
  def clusterSizes: Array[Long]
  def trainingCost: Double
  def numIter: Int
}
```
### Latent Dirichlet Allocation

Topic modeling algorithm for discovering latent topics in document collections.

```scala { .api }
/**
 * Latent Dirichlet Allocation for topic modeling
 * Discovers latent topics in document collections using Dirichlet distributions
 */
class LDA extends Estimator[LDAModel] with LDAParams {
  def setK(value: Int): this.type
  def setMaxIter(value: Int): this.type
  def setDocConcentration(value: Array[Double]): this.type
  def setTopicConcentration(value: Double): this.type
  def setOptimizer(value: String): this.type
  def setLearningOffset(value: Double): this.type
  def setLearningDecay(value: Double): this.type
  def setSubsamplingRate(value: Double): this.type
  def setSeed(value: Long): this.type
  def setCheckpointInterval(value: Int): this.type
  def setKeepLastCheckpoint(value: Boolean): this.type
}

// Abstract base class
abstract class LDAModel extends Model[LDAModel] with LDAParams {
  def vocabSize: Int
  def numTopics: Int
  def estimatedDocConcentration: Vector
  def topicsMatrix: Matrix
  def isDistributed: Boolean
  def logLikelihood(dataset: Dataset[_]): Double
  def logPerplexity(dataset: Dataset[_]): Double
  def describeTopics(): DataFrame
  def describeTopics(maxTermsPerTopic: Int): DataFrame
}

class LocalLDAModel extends LDAModel with MLWritable {
  def getTopicDistribution(document: Vector): Vector
}

class DistributedLDAModel extends LDAModel {
  def toLocal: LocalLDAModel
  def logPrior: Double
  def trainingLogLikelihood: Double
}
```
### Power Iteration Clustering

Clustering algorithm based on graph theory and the power iteration method.

```scala { .api }
/**
 * Power Iteration Clustering using normalized graph cuts
 * Clusters data points based on pairwise similarities using power iteration
 */
class PowerIterationClustering extends Params with Logging {
  def setK(value: Int): this.type
  def setMaxIter(value: Int): this.type
  def setInitMode(value: String): this.type
  def setSrcCol(value: String): this.type
  def setDstCol(value: String): this.type
  def setWeightCol(value: String): this.type
  def assignClusters(dataset: Dataset[_]): Dataset[_]
}
```
## Base Classes and Traits

```scala { .api }
// Base clustering summary
abstract class ClusteringSummary extends Serializable {
  def predictionCol: String
  def featuresCol: String
  def k: Int
  def clusterSizes: Array[Long]
}

// Parameter traits
trait HasK extends Params {
  final val k: IntParam
  def getK: Int
  def setK(value: Int): this.type
}

trait HasMaxIter extends Params {
  final val maxIter: IntParam
  def getMaxIter: Int
  def setMaxIter(value: Int): this.type
}

trait HasTol extends Params {
  final val tol: DoubleParam
  def getTol: Double
  def setTol(value: Double): this.type
}

trait HasSeed extends Params {
  final val seed: LongParam
  def getSeed: Long
  def setSeed(value: Long): this.type
}
```
## Usage Examples

### K-Means Clustering

```scala
import org.apache.spark.ml.clustering.KMeans

val kmeans = new KMeans()
  .setK(3)
  .setMaxIter(20)
  .setSeed(1L)

val model = kmeans.fit(dataset)

// Make predictions
val predictions = model.transform(dataset)

// Evaluate clustering
val cost = model.computeCost(dataset)
println(s"Within Set Sum of Squared Errors = $cost")

// Show cluster centers
model.clusterCenters.foreach(println)
```
### Gaussian Mixture Model

```scala
import org.apache.spark.ml.clustering.GaussianMixture

val gmm = new GaussianMixture()
  .setK(3)
  .setMaxIter(100)
  .setSeed(538009335L)

val model = gmm.fit(dataset)

// Show model information
println(s"Gaussians shown as (weight, mu, sigma):")
model.gaussians.zip(model.weights).foreach {
  case (g, w) => println(s"weight=$w, mu=${g.mean}, sigma=\n${g.cov}")
}

// Make predictions with probabilities
val predictions = model.transform(dataset)
predictions.select("features", "prediction", "probability").show()
```
### Latent Dirichlet Allocation

```scala
import org.apache.spark.ml.clustering.LDA

val lda = new LDA()
  .setK(10)
  .setMaxIter(10)
  .setFeaturesCol("features")

val model = lda.fit(dataset)

val ll = model.logLikelihood(dataset)
val lp = model.logPerplexity(dataset)
println(s"The lower bound on the log likelihood of the entire corpus: $ll")
println(s"The upper bound on perplexity: $lp")

// Describe topics
val topics = model.describeTopics(3)
println("The topics described by their top-weighted terms:")
topics.show(false)

// Transform documents
val transformed = model.transform(dataset)
transformed.select("features", "topicDistribution").show(false)
```
### Bisecting K-Means

```scala
import org.apache.spark.ml.clustering.BisectingKMeans

val bkm = new BisectingKMeans()
  .setK(3)
  .setMaxIter(20)
  .setSeed(1)

val model = bkm.fit(dataset)

// Make predictions
val predictions = model.transform(dataset)

// Evaluate clustering
val cost = model.computeCost(dataset)
println(s"Within Set Sum of Squared Errors = $cost")

// Show cluster centers
println("Cluster Centers: ")
model.clusterCenters.foreach(println)
```
### Power Iteration Clustering

```scala
import org.apache.spark.ml.clustering.PowerIterationClustering

// Create similarity data with columns: src, dst, weight
val pic = new PowerIterationClustering()
  .setK(3)
  .setMaxIter(20)
  .setSrcCol("src")
  .setDstCol("dst")
  .setWeightCol("weight")

val assignments = pic.assignClusters(similarityData)
assignments.select("id", "cluster").show()
```