# Clustering

Unsupervised learning algorithms for discovering hidden patterns and grouping similar data points, including partitioning, hierarchical, and probabilistic clustering methods.

## Capabilities

### K-Means Clustering

Partitioning algorithm that groups data into k clusters by minimizing the within-cluster sum of squared distances.

```scala { .api }
/**
 * K-Means clustering algorithm
 */
class KMeans extends Estimator[KMeansModel] with KMeansParams {
  def setK(value: Int): this.type
  def setInitMode(value: String): this.type
  def setInitSteps(value: Int): this.type
  def setMaxIter(value: Int): this.type
  def setTol(value: Double): this.type
  def setSeed(value: Long): this.type
  def setDistanceMeasure(value: String): this.type
  def setWeightCol(value: String): this.type
}

class KMeansModel extends Model[KMeansModel] with KMeansParams with MLWritable {
  def clusterCenters: Array[Vector]
  def computeCost(dataset: Dataset[_]): Double
  def summary: KMeansSummary
  def hasSummary: Boolean
}

class KMeansSummary {
  def predictions: DataFrame
  def predictionCol: String
  def featuresCol: String
  def k: Int
  def clusterSizes: Array[Long]
  def trainingCost: Double
  def numIter: Int
}
```

**Usage Example:**

```scala
import org.apache.spark.ml.clustering.KMeans

val kmeans = new KMeans()
  .setK(3)
  .setSeed(1L)
  .setMaxIter(20)

val model = kmeans.fit(dataset)

// Make predictions
val predictions = model.transform(dataset)

// Evaluate clustering by computing the Within Set Sum of Squared Errors
val WSSSE = model.computeCost(dataset)
println(s"Within Set Sum of Squared Errors = $WSSSE")

// Show cluster centers
println("Cluster Centers: ")
model.clusterCenters.foreach(println)
```

### Gaussian Mixture Model

Probabilistic clustering using the expectation-maximization algorithm to model data as a mixture of Gaussian distributions.

```scala { .api }
/**
 * Gaussian Mixture Model clustering
 */
class GaussianMixture extends Estimator[GaussianMixtureModel] with GaussianMixtureParams {
  def setK(value: Int): this.type
  def setMaxIter(value: Int): this.type
  def setTol(value: Double): this.type
  def setSeed(value: Long): this.type
  def setAggregationDepth(value: Int): this.type
  def setWeightCol(value: String): this.type
}

class GaussianMixtureModel extends Model[GaussianMixtureModel] with GaussianMixtureParams with MLWritable {
  def weights: Array[Double]
  def gaussians: Array[MultivariateGaussian]
  def summary: GaussianMixtureSummary
  def hasSummary: Boolean
}

class GaussianMixtureSummary {
  def predictions: DataFrame
  def predictionCol: String
  def probabilityCol: String
  def featuresCol: String
  def k: Int
  def clusterSizes: Array[Long]
  def logLikelihood: Double
  def numIter: Int
}

case class MultivariateGaussian(mean: Vector, cov: Matrix) {
  def pdf(x: Vector): Double
  def logpdf(x: Vector): Double
}
```

**Usage Example:**

```scala
import org.apache.spark.ml.clustering.GaussianMixture

val gmm = new GaussianMixture()
  .setK(3)
  .setSeed(538009335L)

val model = gmm.fit(dataset)

// Output the parameters of the mixture model
for (i <- 0 until model.getK) {
  println(s"Gaussian $i:\nweight=${model.weights(i)}\n" +
    s"mu=${model.gaussians(i).mean}\nsigma=\n${model.gaussians(i).cov}\n")
}
```

### Latent Dirichlet Allocation

Topic modeling algorithm for discovering abstract topics in document collections.

```scala { .api }
/**
 * Latent Dirichlet Allocation for topic modeling
 */
class LDA extends Estimator[LDAModel] with LDAParams {
  def setK(value: Int): this.type
  def setMaxIter(value: Int): this.type
  def setDocConcentration(value: Array[Double]): this.type
  def setTopicConcentration(value: Double): this.type
  def setOptimizer(value: String): this.type
  def setLearningOffset(value: Double): this.type
  def setLearningDecay(value: Double): this.type
  def setSubsamplingRate(value: Double): this.type
  def setOptimizeDocConcentration(value: Boolean): this.type
  def setSeed(value: Long): this.type
  def setCheckpointInterval(value: Int): this.type
  def setKeepLastCheckpoint(value: Boolean): this.type
}

abstract class LDAModel extends Model[LDAModel] with LDAParams {
  def vocabSize: Int
  def topicsMatrix: Matrix
  def isDistributed: Boolean
  def logLikelihood(dataset: Dataset[_]): Double
  def logPerplexity(dataset: Dataset[_]): Double
  def describeTopics(maxTermsPerTopic: Int): DataFrame
  def describeTopics(): DataFrame
}

class LocalLDAModel extends LDAModel {
  def getDocConcentration: Vector
  def topicDistributions(dataset: Dataset[_]): DataFrame
}

class DistributedLDAModel extends LDAModel {
  def getDocConcentration: Vector
  def toLocal: LocalLDAModel
  def logPrior: Double
}
```

### Bisecting K-Means

Hierarchical clustering algorithm that recursively applies k-means to split clusters into two subclusters.

```scala { .api }
/**
 * Bisecting K-Means clustering
 */
class BisectingKMeans extends Estimator[BisectingKMeansModel] with BisectingKMeansParams {
  def setK(value: Int): this.type
  def setMaxIter(value: Int): this.type
  def setSeed(value: Long): this.type
  def setMinDivisibleClusterSize(value: Double): this.type
  def setDistanceMeasure(value: String): this.type
}

class BisectingKMeansModel extends Model[BisectingKMeansModel] with BisectingKMeansParams with MLWritable {
  def clusterCenters: Array[Vector]
  def computeCost(dataset: Dataset[_]): Double
  def summary: BisectingKMeansSummary
  def hasSummary: Boolean
}

class BisectingKMeansSummary {
  def predictions: DataFrame
  def predictionCol: String
  def featuresCol: String
  def k: Int
  def clusterSizes: Array[Long]
  def trainingCost: Double
  def numIter: Int
}
```

### Power Iteration Clustering

Graph-based clustering algorithm using power iteration to find a low-dimensional embedding of the affinity matrix.

```scala { .api }
/**
 * Power Iteration Clustering for graph-based clustering
 */
class PowerIterationClustering extends Estimator[PowerIterationClusteringModel] with PowerIterationClusteringParams {
  def setK(value: Int): this.type
  def setMaxIter(value: Int): this.type
  def setInitMode(value: String): this.type
  def setSrcCol(value: String): this.type
  def setDstCol(value: String): this.type
  def setWeightCol(value: String): this.type
}

class PowerIterationClusteringModel extends Model[PowerIterationClusteringModel] with PowerIterationClusteringParams {
  def assignments: DataFrame
}
```

**Usage Example:**

```scala
import org.apache.spark.ml.clustering.PowerIterationClustering

val pic = new PowerIterationClustering()
  .setK(3)
  .setMaxIter(20)
  .setSrcCol("src")
  .setDstCol("dst")
  .setWeightCol("weight")

val model = pic.fit(edgeDataset)
val result = model.assignments
result.show()
```

## Types

```scala { .api }
// Clustering-specific imports
import org.apache.spark.ml.clustering._
import org.apache.spark.ml.linalg.{Vector, Matrix}
import org.apache.spark.sql.{DataFrame, Dataset}

// Gaussian mixture components
import org.apache.spark.ml.stat.distribution.MultivariateGaussian

// Parameter traits
import org.apache.spark.ml.param.shared._

// Summary types
import org.apache.spark.ml.clustering.{
  KMeansSummary,
  GaussianMixtureSummary,
  BisectingKMeansSummary
}
```