Unsupervised learning algorithms for discovering hidden patterns and grouping similar data points, including partitioning, hierarchical, and probabilistic clustering methods.
Partitioning algorithm that groups data into k clusters by minimizing within-cluster sum of squared distances.
/**
 * K-Means clustering estimator.
 *
 * A partitioning algorithm that groups data into `k` clusters by minimizing the
 * within-cluster sum of squared distances. Fitting yields a [[KMeansModel]]. Every
 * setter returns `this.type`, so configuration calls can be chained.
 */
class KMeans extends Estimator[KMeansModel] with KMeansParams {
/** Sets `k`, the number of clusters to form. */
def setK(value: Int): this.type
/** Sets the initialization mode by name (accepted values are not listed in this reference). */
def setInitMode(value: String): this.type
/** Sets the number of initialization steps (presumably only used by some init modes; confirm). */
def setInitSteps(value: Int): this.type
/** Sets the maximum number of iterations. */
def setMaxIter(value: Int): this.type
/** Sets the tolerance `tol` (presumably the convergence threshold; confirm). */
def setTol(value: Double): this.type
/** Sets the random seed. */
def setSeed(value: Long): this.type
/** Sets the distance measure by name (accepted values are not listed in this reference). */
def setDistanceMeasure(value: String): this.type
/** Sets the weight column name (presumably per-row instance weights; confirm). */
def setWeightCol(value: String): this.type
}
/**
 * Model produced by fitting a `KMeans` estimator. Exposes the learned cluster centers and,
 * when one is attached, a summary of the training run.
 */
class KMeansModel extends Model[KMeansModel] with KMeansParams with MLWritable {
/** The learned cluster centers as an array of vectors. */
def clusterCenters: Array[Vector]
/**
 * Returns a clustering cost for `dataset` as a Double (presumably the within-set sum of
 * squared errors; confirm).
 *
 * NOTE(review): upstream Spark deprecated this method in 2.4 and no longer provides it on
 * `KMeansModel` from 3.0 onward. Confirm against the Spark version this reference targets.
 */
def computeCost(dataset: Dataset[_]): Double
/** Training summary; check `hasSummary` first (presumably absent on some instances; confirm). */
def summary: KMeansSummary
/** Whether a training summary is available on this model instance. */
def hasSummary: Boolean
}
/** Summary of a K-Means training run, exposed through `KMeansModel.summary`. */
class KMeansSummary {
  /** DataFrame of predictions (presumably the model's output on the training data; confirm). */
  def predictions: DataFrame

  /** Name of the prediction column in `predictions`. */
  def predictionCol: String

  /** Name of the features column in `predictions`. */
  def featuresCol: String

  /** Number of clusters. */
  def k: Int

  /** Size of each cluster (presumably indexed by cluster id; confirm). */
  def clusterSizes: Array[Long]

  /** Cost recorded for the training run (presumably the sum of squared distances; confirm). */
  def trainingCost: Double

  /** Number of iterations the training run took. */
  def numIter: Int
}

Usage Example:
import org.apache.spark.ml.clustering.KMeans

// Configure the estimator and train a 3-cluster model.
val kmeans = new KMeans()
  .setK(3)
  .setSeed(1L)
  .setMaxIter(20)
val model = kmeans.fit(dataset)

// Make predictions
val predictions = model.transform(dataset)

// Evaluate clustering by reading the Within Set Sum of Squared Errors from the training
// summary. KMeansModel.computeCost is deprecated since Spark 2.4 and absent from 3.0+;
// `dataset` is the data the model was fitted on, so the summary's cost applies here.
val WSSSE = model.summary.trainingCost
println(s"Within Set Sum of Squared Errors = $WSSSE")

// Show cluster centers
println("Cluster Centers: ")
model.clusterCenters.foreach(println)

Probabilistic clustering using expectation-maximization algorithm to model data as mixture of Gaussian distributions.
/**
 * Gaussian Mixture Model (GMM) clustering estimator.
 *
 * Probabilistic clustering that uses the expectation-maximization algorithm to model the
 * data as a mixture of Gaussian distributions. Fitting yields a [[GaussianMixtureModel]].
 * Every setter returns `this.type`, so configuration calls can be chained.
 */
class GaussianMixture extends Estimator[GaussianMixtureModel] with GaussianMixtureParams {
/** Sets `k`, the number of Gaussian components in the mixture. */
def setK(value: Int): this.type
/** Sets the maximum number of iterations. */
def setMaxIter(value: Int): this.type
/** Sets the tolerance `tol` (presumably the convergence threshold; confirm). */
def setTol(value: Double): this.type
/** Sets the random seed. */
def setSeed(value: Long): this.type
/** Sets the aggregation depth (presumably for tree aggregation during training; confirm). */
def setAggregationDepth(value: Int): this.type
/** Sets the weight column name (presumably per-row instance weights; confirm). */
def setWeightCol(value: String): this.type
}
/**
 * Model produced by fitting a `GaussianMixture` estimator. Exposes the mixture weights, the
 * component distributions and, when one is attached, a summary of the training run.
 */
class GaussianMixtureModel extends Model[GaussianMixtureModel] with GaussianMixtureParams with MLWritable {
/** Weight of each mixture component; index `i` pairs with `gaussians(i)`. */
def weights: Array[Double]
/** The component distributions; each exposes its `mean` and `cov`. */
def gaussians: Array[MultivariateGaussian]
/** Training summary; check `hasSummary` first (presumably absent on some instances; confirm). */
def summary: GaussianMixtureSummary
/** Whether a training summary is available on this model instance. */
def hasSummary: Boolean
}
/** Summary of a Gaussian mixture training run, exposed through `GaussianMixtureModel.summary`. */
class GaussianMixtureSummary {
/** DataFrame of predictions (presumably the model's output on the training data; confirm). */
def predictions: DataFrame
/** Name of the prediction column in `predictions`. */
def predictionCol: String
/** Name of the probability column in `predictions`. */
def probabilityCol: String
/** Name of the features column in `predictions`. */
def featuresCol: String
/** Number of mixture components. */
def k: Int
/** Size of each cluster (presumably indexed by cluster id; confirm). */
def clusterSizes: Array[Long]
/** Log-likelihood recorded for the training run (presumably over all training data; confirm). */
def logLikelihood: Double
/** Number of iterations the training run took. */
def numIter: Int
}
/**
 * A multivariate Gaussian distribution described by its mean vector and covariance matrix.
 *
 * @param mean mean vector of the distribution
 * @param cov  covariance matrix of the distribution
 */
case class MultivariateGaussian(mean: Vector, cov: Matrix) {
  /** Probability density of this distribution at point `x`. */
  def pdf(x: Vector): Double

  /** Logarithm of the probability density of this distribution at point `x`. */
  def logpdf(x: Vector): Double
}

Usage Example:
import org.apache.spark.ml.clustering.GaussianMixture

// Configure the estimator and train a 3-component mixture.
val gmm = new GaussianMixture()
  .setK(3)
  .setSeed(538009335L)
val model = gmm.fit(dataset)

// Output the parameters of the mixture model: weights(i) and gaussians(i) describe the
// same component, so both are indexed with the same i.
for (i <- 0 until model.getK) {
  println(s"Gaussian $i:\nweight=${model.weights(i)}\n" +
    s"mu=${model.gaussians(i).mean}\nsigma=\n${model.gaussians(i).cov}\n")
}

Topic modeling algorithm for discovering abstract topics in document collections.
/**
 * Latent Dirichlet Allocation (LDA) estimator for topic modeling.
 *
 * Discovers abstract topics in document collections. Fitting yields an [[LDAModel]]. Every
 * setter returns `this.type`, so configuration calls can be chained.
 */
class LDA extends Estimator[LDAModel] with LDAParams {
/** Sets `k`, the number of topics to infer. */
def setK(value: Int): this.type
/** Sets the maximum number of iterations. */
def setMaxIter(value: Int): this.type
/** Sets the document concentration values (presumably the prior on per-document topics; confirm). */
def setDocConcentration(value: Array[Double]): this.type
/** Sets the topic concentration (presumably the prior on per-topic term distributions; confirm). */
def setTopicConcentration(value: Double): this.type
/** Sets the optimizer by name (accepted values are not listed in this reference). */
def setOptimizer(value: String): this.type
/** Sets the learning offset (presumably specific to one optimizer; confirm). */
def setLearningOffset(value: Double): this.type
/** Sets the learning decay (presumably specific to one optimizer; confirm). */
def setLearningDecay(value: Double): this.type
/** Sets the subsampling rate (presumably the fraction of documents used per iteration; confirm). */
def setSubsamplingRate(value: Double): this.type
/** Sets whether the document concentration is itself optimized during training. */
def setOptimizeDocConcentration(value: Boolean): this.type
/** Sets the random seed. */
def setSeed(value: Long): this.type
/** Sets the checkpoint interval (presumably measured in iterations; confirm). */
def setCheckpointInterval(value: Int): this.type
/** Sets whether the last checkpoint is kept after training. */
def setKeepLastCheckpoint(value: Boolean): this.type
}
/**
 * Base class for models produced by fitting an `LDA` estimator. The concrete variants visible
 * in this reference are `LocalLDAModel` and `DistributedLDAModel`.
 */
abstract class LDAModel extends Model[LDAModel] with LDAParams {
/** Size of the vocabulary (number of distinct terms). */
def vocabSize: Int
/** The inferred topics as a matrix (presumably `vocabSize` rows by `k` columns; confirm). */
def topicsMatrix: Matrix
/** Whether this instance is the distributed variant of the model. */
def isDistributed: Boolean
/** Log-likelihood of `dataset` under this model (presumably a lower bound; confirm). */
def logLikelihood(dataset: Dataset[_]): Double
/** Log-perplexity of `dataset` under this model (presumably an upper bound; confirm). */
def logPerplexity(dataset: Dataset[_]): Double
/** Describes the topics as a DataFrame, limited to `maxTermsPerTopic` terms for each topic. */
def describeTopics(maxTermsPerTopic: Int): DataFrame
/** Describes the topics as a DataFrame using a default term limit (value not shown here). */
def describeTopics(): DataFrame
}
/** Local variant of [[LDAModel]] (presumably holds the fitted topics on a single node; confirm). */
class LocalLDAModel extends LDAModel {
/**
 * The document concentration as a vector.
 * NOTE(review): upstream spark.ml appears to expose `getDocConcentration: Array[Double]` and
 * `estimatedDocConcentration: Vector`; confirm which one this entry is meant to describe.
 */
def getDocConcentration: Vector
/**
 * Topic distributions for the documents in `dataset`, as a DataFrame.
 * NOTE(review): this looks like the spark.mllib API; in spark.ml the per-document topic
 * distribution is presumably obtained via `transform`. Confirm against the targeted version.
 */
def topicDistributions(dataset: Dataset[_]): DataFrame
}
/** Distributed variant of [[LDAModel]]; can be converted to a [[LocalLDAModel]] via `toLocal`. */
class DistributedLDAModel extends LDAModel {
  /**
   * The document concentration as a vector.
   * NOTE(review): upstream spark.ml appears to expose `getDocConcentration: Array[Double]`;
   * confirm the return type against the targeted Spark version.
   */
  def getDocConcentration: Vector

  /** Converts this distributed model into a local one. */
  def toLocal: LocalLDAModel

  /** Log-probability of the fitted parameters under the prior (presumably; confirm). */
  def logPrior: Double
}

Hierarchical clustering algorithm that recursively applies k-means to split clusters into two subclusters.
/**
 * Bisecting K-Means clustering estimator.
 *
 * A hierarchical clustering algorithm that recursively applies k-means to split clusters
 * into two subclusters. Fitting yields a [[BisectingKMeansModel]]. Every setter returns
 * `this.type`, so configuration calls can be chained.
 */
class BisectingKMeans extends Estimator[BisectingKMeansModel] with BisectingKMeansParams {
/** Sets `k`, the number of clusters (presumably the desired number of leaf clusters; confirm). */
def setK(value: Int): this.type
/** Sets the maximum number of iterations (presumably per k-means split; confirm). */
def setMaxIter(value: Int): this.type
/** Sets the random seed. */
def setSeed(value: Long): this.type
/** Sets the minimum size a cluster needs in order to be split (count or fraction; confirm). */
def setMinDivisibleClusterSize(value: Double): this.type
/** Sets the distance measure by name (accepted values are not listed in this reference). */
def setDistanceMeasure(value: String): this.type
}
/**
 * Model produced by fitting a `BisectingKMeans` estimator. Exposes the learned cluster centers
 * and, when one is attached, a summary of the training run.
 */
class BisectingKMeansModel extends Model[BisectingKMeansModel] with BisectingKMeansParams with MLWritable {
/** The learned cluster centers as an array of vectors. */
def clusterCenters: Array[Vector]
/**
 * Returns a clustering cost for `dataset` as a Double (presumably the sum of squared
 * distances to the nearest center; confirm).
 *
 * NOTE(review): upstream Spark appears to mark this method deprecated from 3.0 onward.
 * Confirm against the Spark version this reference targets.
 */
def computeCost(dataset: Dataset[_]): Double
/** Training summary; check `hasSummary` first (presumably absent on some instances; confirm). */
def summary: BisectingKMeansSummary
/** Whether a training summary is available on this model instance. */
def hasSummary: Boolean
}
/** Summary of a Bisecting K-Means training run, exposed through `BisectingKMeansModel.summary`. */
class BisectingKMeansSummary {
  /** DataFrame of predictions (presumably the model's output on the training data; confirm). */
  def predictions: DataFrame

  /** Name of the prediction column in `predictions`. */
  def predictionCol: String

  /** Name of the features column in `predictions`. */
  def featuresCol: String

  /** Number of clusters. */
  def k: Int

  /** Size of each cluster (presumably indexed by cluster id; confirm). */
  def clusterSizes: Array[Long]

  /** Cost recorded for the training run (presumably the sum of squared distances; confirm). */
  def trainingCost: Double

  /** Number of iterations the training run took. */
  def numIter: Int
}

Graph-based clustering algorithm using power iteration to find a low-dimensional embedding of the affinity matrix.
/**
 * Power Iteration Clustering (PIC) for graph-based clustering.
 *
 * Uses power iteration to find a low-dimensional embedding of the affinity matrix. The input
 * is an edge list: the source, destination and weight of each edge are read from the columns
 * configured through `setSrcCol`, `setDstCol` and `setWeightCol`. Every setter returns
 * `this.type`, so configuration calls can be chained.
 *
 * NOTE(review): in upstream `org.apache.spark.ml.clustering` this class is not an `Estimator`
 * and has no fitted model; clustering is run directly through `assignClusters`. The class
 * header below is kept as originally listed. Confirm against the targeted Spark version.
 */
class PowerIterationClustering extends Estimator[PowerIterationClusteringModel] with PowerIterationClusteringParams {
  /** Sets `k`, the number of clusters to create. */
  def setK(value: Int): this.type

  /** Sets the maximum number of power iterations. */
  def setMaxIter(value: Int): this.type

  /** Sets the initialization mode by name (accepted values are not listed in this reference). */
  def setInitMode(value: String): this.type

  /** Sets the name of the column holding each edge's source vertex id. */
  def setSrcCol(value: String): this.type

  /** Sets the name of the column holding each edge's destination vertex id. */
  def setDstCol(value: String): this.type

  /** Sets the name of the column holding each edge's weight (presumably a similarity; confirm). */
  def setWeightCol(value: String): this.type

  /**
   * Runs PIC on the edge list in `dataset` and returns the cluster assignment of each vertex
   * as a DataFrame (upstream documents the output columns as `id` and `cluster`).
   */
  def assignClusters(dataset: Dataset[_]): DataFrame
}
/**
 * Result holder for Power Iteration Clustering, as listed in this reference.
 *
 * NOTE(review): upstream spark.ml does not appear to define this class (a class with this
 * name exists in spark.mllib). Confirm whether this entry should remain.
 */
class PowerIterationClusteringModel extends Model[PowerIterationClusteringModel] with PowerIterationClusteringParams {
  /** Cluster assignments as a DataFrame (presumably one row per vertex; confirm). */
  def assignments: DataFrame
}

Usage Example:
import org.apache.spark.ml.clustering.PowerIterationClustering

// Configure PIC and name the edge-list columns it should read.
val pic = new PowerIterationClustering()
  .setK(3)
  .setMaxIter(20)
  .setSrcCol("src")
  .setDstCol("dst")
  .setWeightCol("weight")

// spark.ml's PowerIterationClustering has no fit()/model step: assignClusters runs the
// algorithm on the edge list and returns the per-vertex cluster assignments directly.
val result = pic.assignClusters(edgeDataset)
result.show()

// Clustering-specific imports
import org.apache.spark.ml.clustering._
import org.apache.spark.ml.linalg.{Vector, Matrix}
import org.apache.spark.sql.{DataFrame, Dataset}
// Gaussian mixture components
import org.apache.spark.ml.stat.distribution.MultivariateGaussian
// Parameter traits
import org.apache.spark.ml.param.shared._
// Summary types
import org.apache.spark.ml.clustering.{
KMeansSummary,
GaussianMixtureSummary,
BisectingKMeansSummary
}