or run

npx @tessl/cli init
Log in

Version

Tile

Overview

Evals

Files

Files

docs

classification.md · clustering.md · evaluation-tuning.md · feature-engineering.md · index.md · linear-algebra.md · pipeline-components.md · recommendation.md · regression.md

docs/clustering.md

# Clustering

Unsupervised learning algorithms for discovering hidden patterns and grouping similar data points, including partitioning, hierarchical, and probabilistic clustering methods.

## Capabilities

### K-Means Clustering

Partitioning algorithm that groups data into k clusters by minimizing within-cluster sum of squared distances.

```scala { .api }
/**
 * K-Means clustering algorithm
 */
class KMeans extends Estimator[KMeansModel] with KMeansParams {
  def setK(value: Int): this.type
  def setInitMode(value: String): this.type
  def setInitSteps(value: Int): this.type
  def setMaxIter(value: Int): this.type
  def setTol(value: Double): this.type
  def setSeed(value: Long): this.type
  def setDistanceMeasure(value: String): this.type
  def setWeightCol(value: String): this.type
}

class KMeansModel extends Model[KMeansModel] with KMeansParams with MLWritable {
  def clusterCenters: Array[Vector]
  def computeCost(dataset: Dataset[_]): Double
  def summary: KMeansSummary
  def hasSummary: Boolean
}

class KMeansSummary {
  def predictions: DataFrame
  def predictionCol: String
  def featuresCol: String
  def k: Int
  def clusterSizes: Array[Long]
  def trainingCost: Double
  def numIter: Int
}
```

**Usage Example:**

```scala
import org.apache.spark.ml.clustering.KMeans

val kmeans = new KMeans()
  .setK(3)
  .setSeed(1L)
  .setMaxIter(20)

val model = kmeans.fit(dataset)

// Make predictions
val predictions = model.transform(dataset)

// Evaluate clustering by computing Within Set Sum of Squared Errors
val WSSSE = model.computeCost(dataset)
println(s"Within Set Sum of Squared Errors = $WSSSE")

// Show cluster centers
println("Cluster Centers: ")
model.clusterCenters.foreach(println)
```

66

67

### Gaussian Mixture Model

Probabilistic clustering using expectation-maximization algorithm to model data as mixture of Gaussian distributions.

```scala { .api }
/**
 * Gaussian Mixture Model clustering
 */
class GaussianMixture extends Estimator[GaussianMixtureModel] with GaussianMixtureParams {
  def setK(value: Int): this.type
  def setMaxIter(value: Int): this.type
  def setTol(value: Double): this.type
  def setSeed(value: Long): this.type
  def setAggregationDepth(value: Int): this.type
  def setWeightCol(value: String): this.type
}

class GaussianMixtureModel extends Model[GaussianMixtureModel] with GaussianMixtureParams with MLWritable {
  def weights: Array[Double]
  def gaussians: Array[MultivariateGaussian]
  def summary: GaussianMixtureSummary
  def hasSummary: Boolean
}

class GaussianMixtureSummary {
  def predictions: DataFrame
  def predictionCol: String
  def probabilityCol: String
  def featuresCol: String
  def k: Int
  def clusterSizes: Array[Long]
  def logLikelihood: Double
  def numIter: Int
}

case class MultivariateGaussian(mean: Vector, cov: Matrix) {
  def pdf(x: Vector): Double
  def logpdf(x: Vector): Double
}
```

**Usage Example:**

```scala
import org.apache.spark.ml.clustering.GaussianMixture

val gmm = new GaussianMixture()
  .setK(3)
  .setSeed(538009335L)

val model = gmm.fit(dataset)

// Output the parameters of the mixture model
for (i <- 0 until model.getK) {
  println(s"Gaussian $i:\nweight=${model.weights(i)}\n" +
    s"mu=${model.gaussians(i).mean}\nsigma=\n${model.gaussians(i).cov}\n")
}
```

125

126

### Latent Dirichlet Allocation

Topic modeling algorithm for discovering abstract topics in document collections.

```scala { .api }
/**
 * Latent Dirichlet Allocation for topic modeling
 */
class LDA extends Estimator[LDAModel] with LDAParams {
  def setK(value: Int): this.type
  def setMaxIter(value: Int): this.type
  def setDocConcentration(value: Array[Double]): this.type
  def setTopicConcentration(value: Double): this.type
  def setOptimizer(value: String): this.type
  def setLearningOffset(value: Double): this.type
  def setLearningDecay(value: Double): this.type
  def setSubsamplingRate(value: Double): this.type
  def setOptimizeDocConcentration(value: Boolean): this.type
  def setSeed(value: Long): this.type
  def setCheckpointInterval(value: Int): this.type
  def setKeepLastCheckpoint(value: Boolean): this.type
}

abstract class LDAModel extends Model[LDAModel] with LDAParams {
  def vocabSize: Int
  def topicsMatrix: Matrix
  def isDistributed: Boolean
  def logLikelihood(dataset: Dataset[_]): Double
  def logPerplexity(dataset: Dataset[_]): Double
  def describeTopics(maxTermsPerTopic: Int): DataFrame
  def describeTopics(): DataFrame
}

class LocalLDAModel extends LDAModel {
  def getDocConcentration: Vector
  def topicDistributions(dataset: Dataset[_]): DataFrame
}

class DistributedLDAModel extends LDAModel {
  def getDocConcentration: Vector
  def toLocal: LocalLDAModel
  def logPrior: Double
}
```

170

171

### Bisecting K-Means

Hierarchical clustering algorithm that recursively applies k-means to split clusters into two subclusters.

```scala { .api }
/**
 * Bisecting K-Means clustering
 */
class BisectingKMeans extends Estimator[BisectingKMeansModel] with BisectingKMeansParams {
  def setK(value: Int): this.type
  def setMaxIter(value: Int): this.type
  def setSeed(value: Long): this.type
  def setMinDivisibleClusterSize(value: Double): this.type
  def setDistanceMeasure(value: String): this.type
}

class BisectingKMeansModel extends Model[BisectingKMeansModel] with BisectingKMeansParams with MLWritable {
  def clusterCenters: Array[Vector]
  def computeCost(dataset: Dataset[_]): Double
  def summary: BisectingKMeansSummary
  def hasSummary: Boolean
}

class BisectingKMeansSummary {
  def predictions: DataFrame
  def predictionCol: String
  def featuresCol: String
  def k: Int
  def clusterSizes: Array[Long]
  def trainingCost: Double
  def numIter: Int
}
```

204

205

### Power Iteration Clustering

Graph-based clustering algorithm using power iteration to find a low-dimensional embedding of the affinity matrix.

```scala { .api }
/**
 * Power Iteration Clustering for graph-based clustering
 */
class PowerIterationClustering extends Estimator[PowerIterationClusteringModel] with PowerIterationClusteringParams {
  def setK(value: Int): this.type
  def setMaxIter(value: Int): this.type
  def setInitMode(value: String): this.type
  def setSrcCol(value: String): this.type
  def setDstCol(value: String): this.type
  def setWeightCol(value: String): this.type
}

class PowerIterationClusteringModel extends Model[PowerIterationClusteringModel] with PowerIterationClusteringParams {
  def assignments: DataFrame
}
```

**Usage Example:**

```scala
import org.apache.spark.ml.clustering.PowerIterationClustering

val pic = new PowerIterationClustering()
  .setK(3)
  .setMaxIter(20)
  .setSrcCol("src")
  .setDstCol("dst")
  .setWeightCol("weight")

val model = pic.fit(edgeDataset)
val result = model.assignments
result.show()
```

243

244

## Types

```scala { .api }
// Clustering-specific imports
import org.apache.spark.ml.clustering._
import org.apache.spark.ml.linalg.{Vector, Matrix}
import org.apache.spark.sql.{DataFrame, Dataset}

// Gaussian mixture components
import org.apache.spark.ml.stat.distribution.MultivariateGaussian

// Parameter traits
import org.apache.spark.ml.param.shared._

// Summary types
import org.apache.spark.ml.clustering.{
  KMeansSummary,
  GaussianMixtureSummary,
  BisectingKMeansSummary
}
```