or run

npx @tessl/cli init
Log in

Version

Tile

Overview

Evals

Files

Files

docs

classification.mdclustering.mdevaluation.mdfeature-processing.mdfrequent-pattern-mining.mdindex.mdlinear-algebra.mdpipeline.mdrecommendation.mdregression.mdstatistics.md

docs/clustering.md

# Clustering Algorithms

Unsupervised learning algorithms for discovering patterns and groupings in data. MLlib provides clustering algorithms including K-means, Gaussian mixture models, hierarchical clustering, and topic modeling.

## Capabilities

### K-Means Clustering

Classic clustering algorithm that partitions data into k clusters using centroid-based approach.

```scala { .api }
/**
 * K-means clustering algorithm using Lloyd's algorithm with configurable
 * initialization strategies and convergence criteria
 */
class KMeans extends Estimator[KMeansModel] with KMeansParams with DefaultParamsWritable {
  def setK(value: Int): this.type
  def setInitMode(value: String): this.type
  def setInitSteps(value: Int): this.type
  def setMaxIter(value: Int): this.type
  def setTol(value: Double): this.type
  def setDistanceMeasure(value: String): this.type
  def setSeed(value: Long): this.type
  def setWeightCol(value: String): this.type
}

class KMeansModel extends Model[KMeansModel] with KMeansParams with MLWritable {
  def clusterCenters: Array[Vector]
  def numFeatures: Int
  def k: Int
  def summary: KMeansSummary
  def hasSummary: Boolean
  def computeCost(dataset: Dataset[_]): Double
}

class KMeansSummary extends Serializable {
  def predictions: DataFrame
  def predictionCol: String
  def featuresCol: String
  def k: Int
  def clusterSizes: Array[Long]
  def trainingCost: Double
  def numIter: Int
}
```

### Gaussian Mixture Model

Probabilistic clustering using Gaussian mixture models with EM algorithm.

```scala { .api }
/**
 * Gaussian Mixture Model clustering using Expectation-Maximization algorithm
 * Models data as mixture of Gaussian distributions
 */
class GaussianMixture extends Estimator[GaussianMixtureModel] with GaussianMixtureParams {
  def setK(value: Int): this.type
  def setMaxIter(value: Int): this.type
  def setTol(value: Double): this.type
  def setSeed(value: Long): this.type
  def setWeightCol(value: String): this.type
}

class GaussianMixtureModel extends Model[GaussianMixtureModel] with GaussianMixtureParams with MLWritable {
  def weights: Array[Double]
  def gaussians: Array[MultivariateGaussian]
  def numFeatures: Int
  def k: Int
  def summary: GaussianMixtureSummary
  def hasSummary: Boolean
  def predict(features: Vector): Int
  def predictProbability(features: Vector): Vector
}

class GaussianMixtureSummary extends Serializable {
  def predictions: DataFrame
  def predictionCol: String
  def probabilityCol: String
  def featuresCol: String
  def k: Int
  def clusterSizes: Array[Long]
  def logLikelihood: Double
  def numIter: Int
}
```

### Bisecting K-Means

Hierarchical clustering using divisive approach with K-means splitting.

```scala { .api }
/**
 * Bisecting K-means clustering - hierarchical variant of K-means
 * Uses divisive approach by recursively splitting clusters
 */
class BisectingKMeans extends Estimator[BisectingKMeansModel] with BisectingKMeansParams {
  def setK(value: Int): this.type
  def setMaxIter(value: Int): this.type
  def setSeed(value: Long): this.type
  def setMinDivisibleClusterSize(value: Double): this.type
  def setDistanceMeasure(value: String): this.type
}

class BisectingKMeansModel extends Model[BisectingKMeansModel] with BisectingKMeansParams with MLWritable {
  def clusterCenters: Array[Vector]
  def numFeatures: Int
  def k: Int
  def summary: BisectingKMeansSummary
  def hasSummary: Boolean
  def computeCost(dataset: Dataset[_]): Double
}

class BisectingKMeansSummary extends Serializable {
  def predictions: DataFrame
  def predictionCol: String
  def featuresCol: String
  def k: Int
  def clusterSizes: Array[Long]
  def trainingCost: Double
  def numIter: Int
}
```

### Latent Dirichlet Allocation

Topic modeling algorithm for discovering latent topics in document collections.

```scala { .api }
/**
 * Latent Dirichlet Allocation for topic modeling
 * Discovers latent topics in document collections using Dirichlet distributions
 */
class LDA extends Estimator[LDAModel] with LDAParams {
  def setK(value: Int): this.type
  def setMaxIter(value: Int): this.type
  def setDocConcentration(value: Array[Double]): this.type
  def setTopicConcentration(value: Double): this.type
  def setOptimizer(value: String): this.type
  def setLearningOffset(value: Double): this.type
  def setLearningDecay(value: Double): this.type
  def setSubsamplingRate(value: Double): this.type
  def setSeed(value: Long): this.type
  def setCheckpointInterval(value: Int): this.type
  def setKeepLastCheckpoint(value: Boolean): this.type
}

// Abstract base class
abstract class LDAModel extends Model[LDAModel] with LDAParams {
  def vocabSize: Int
  def numTopics: Int
  def estimatedDocConcentration: Vector
  def topicsMatrix: Matrix
  def isDistributed: Boolean
  def logLikelihood(dataset: Dataset[_]): Double
  def logPerplexity(dataset: Dataset[_]): Double
  def describeTopics(): DataFrame
  def describeTopics(maxTermsPerTopic: Int): DataFrame
}

class LocalLDAModel extends LDAModel with MLWritable {
  def getTopicDistribution(document: Vector): Vector
}

class DistributedLDAModel extends LDAModel {
  def toLocal: LocalLDAModel
  def logPrior: Double
  def trainingLogLikelihood: Double
}
```

### Power Iteration Clustering

Clustering algorithm based on graph theory and power iteration method.

```scala { .api }
/**
 * Power Iteration Clustering using normalized graph cuts
 * Clusters data points based on pairwise similarities using power iteration
 */
class PowerIterationClustering extends Params with Logging {
  def setK(value: Int): this.type
  def setMaxIter(value: Int): this.type
  def setInitMode(value: String): this.type
  def setSrcCol(value: String): this.type
  def setDstCol(value: String): this.type
  def setWeightCol(value: String): this.type
  def assignClusters(dataset: Dataset[_]): Dataset[_]
}
```

## Base Classes and Traits

```scala { .api }
// Base clustering summary
abstract class ClusteringSummary extends Serializable {
  def predictionCol: String
  def featuresCol: String
  def k: Int
  def clusterSizes: Array[Long]
}

// Parameter traits
trait HasK extends Params {
  final val k: IntParam
  def getK: Int
  def setK(value: Int): this.type
}

trait HasMaxIter extends Params {
  final val maxIter: IntParam
  def getMaxIter: Int
  def setMaxIter(value: Int): this.type
}

trait HasTol extends Params {
  final val tol: DoubleParam
  def getTol: Double
  def setTol(value: Double): this.type
}

trait HasSeed extends Params {
  final val seed: LongParam
  def getSeed: Long
  def setSeed(value: Long): this.type
}
```

## Usage Examples

### K-Means Clustering

```scala
import org.apache.spark.ml.clustering.KMeans

val kmeans = new KMeans()
  .setK(3)
  .setMaxIter(20)
  .setSeed(1L)

val model = kmeans.fit(dataset)

// Make predictions
val predictions = model.transform(dataset)

// Evaluate clustering
val cost = model.computeCost(dataset)
println(s"Within Set Sum of Squared Errors = $cost")

// Show cluster centers
model.clusterCenters.foreach(println)
```

### Gaussian Mixture Model

```scala
import org.apache.spark.ml.clustering.GaussianMixture

val gmm = new GaussianMixture()
  .setK(3)
  .setMaxIter(100)
  .setSeed(538009335L)

val model = gmm.fit(dataset)

// Show model information
println(s"Gaussians shown as (weight, mu, sigma):")
model.gaussians.zip(model.weights).foreach {
  case (g, w) => println(s"weight=$w, mu=${g.mean}, sigma=\n${g.cov}")
}

// Make predictions with probabilities
val predictions = model.transform(dataset)
predictions.select("features", "prediction", "probability").show()
```

### Latent Dirichlet Allocation

```scala
import org.apache.spark.ml.clustering.LDA

val lda = new LDA()
  .setK(10)
  .setMaxIter(10)
  .setFeaturesCol("features")

val model = lda.fit(dataset)

val ll = model.logLikelihood(dataset)
val lp = model.logPerplexity(dataset)
println(s"The lower bound on the log likelihood of the entire corpus: $ll")
println(s"The upper bound on perplexity: $lp")

// Describe topics
val topics = model.describeTopics(3)
println("The topics described by their top-weighted terms:")
topics.show(false)

// Transform documents
val transformed = model.transform(dataset)
transformed.select("features", "topicDistribution").show(false)
```

### Bisecting K-Means

```scala
import org.apache.spark.ml.clustering.BisectingKMeans

val bkm = new BisectingKMeans()
  .setK(3)
  .setMaxIter(20)
  .setSeed(1)

val model = bkm.fit(dataset)

// Make predictions
val predictions = model.transform(dataset)

// Evaluate clustering
val cost = model.computeCost(dataset)
println(s"Within Set Sum of Squared Errors = $cost")

// Show cluster centers
println("Cluster Centers: ")
model.clusterCenters.foreach(println)
```

### Power Iteration Clustering

```scala
import org.apache.spark.ml.clustering.PowerIterationClustering

// Create similarity data with columns: src, dst, weight
val pic = new PowerIterationClustering()
  .setK(3)
  .setMaxIter(20)
  .setSrcCol("src")
  .setDstCol("dst")
  .setWeightCol("weight")

val assignments = pic.assignClusters(similarityData)
assignments.select("id", "cluster").show()
```