or run

npx @tessl/cli init
Log in

Version

Tile

Overview

Evals

Files

docs

classification.md clustering.md evaluation-tuning.md feature-engineering.md index.md linear-algebra.md pipeline-components.md recommendation.md regression.md
tile.json

docs/clustering.md

Clustering

Unsupervised learning algorithms for discovering hidden patterns and grouping similar data points, including partitioning, hierarchical, and probabilistic clustering methods.

Capabilities

K-Means Clustering

Partitioning algorithm that groups data into k clusters by minimizing the within-cluster sum of squared distances.

/**
 * K-Means clustering estimator; fitting it produces a [[KMeansModel]].
 *
 * This is a signature listing only (method bodies are omitted). Every setter
 * returns `this.type`, so calls can be chained fluently.
 */
class KMeans extends Estimator[KMeansModel] with KMeansParams {
  // Number of clusters k to partition the data into.
  def setK(value: Int): this.type
  // Initialization configuration (accepted mode strings are not listed here).
  def setInitMode(value: String): this.type
  def setInitSteps(value: Int): this.type
  // Iteration limit and convergence tolerance.
  def setMaxIter(value: Int): this.type
  def setTol(value: Double): this.type
  // Random seed.
  def setSeed(value: Long): this.type
  // Name of the distance measure (accepted strings are not listed here).
  def setDistanceMeasure(value: String): this.type
  // Name of the column holding per-row weights.
  def setWeightCol(value: String): this.type
}

/**
 * Fitted K-Means model, as produced by `KMeans.fit`.
 * Signature listing only; method bodies are omitted.
 */
class KMeansModel extends Model[KMeansModel] with KMeansParams with MLWritable {
  // One center vector per cluster.
  def clusterCenters: Array[Vector]
  // NOTE(review): computeCost on KMeansModel appears to have been deprecated in Spark 2.4
  // and removed in Spark 3.0, while setWeightCol on KMeans is a 3.0+ API — confirm which
  // Spark version this listing targets.
  def computeCost(dataset: Dataset[_]): Double
  // Training summary; presumably only valid when hasSummary is true — confirm.
  def summary: KMeansSummary
  def hasSummary: Boolean
}

/**
 * Summary type returned by `KMeansModel.summary`.
 * Signature listing only; method bodies are omitted.
 */
class KMeansSummary {
  // DataFrame of predictions, plus the names of the prediction and features columns.
  def predictions: DataFrame
  def predictionCol: String
  def featuresCol: String
  // Number of clusters.
  def k: Int
  // One size entry per cluster.
  def clusterSizes: Array[Long]
  // Cost reported for the training run.
  def trainingCost: Double
  // Number of iterations.
  def numIter: Int
}

Usage Example:

import org.apache.spark.ml.clustering.KMeans

// Configure K-Means: 3 clusters, a fixed seed, and at most 20 iterations.
val kmeans = new KMeans()
  .setK(3)
  .setSeed(1L)
  .setMaxIter(20)

val model = kmeans.fit(dataset)

// Make predictions
val predictions = model.transform(dataset)

// Evaluate clustering by reading the Within Set Sum of Squared Errors of the training run.
// KMeansModel.computeCost was removed in Spark 3.0; the training summary exposes the
// same cost for the dataset the model was fitted on.
val WSSSE = model.summary.trainingCost
println(s"Within Set Sum of Squared Errors = $WSSSE")

// Show cluster centers
println("Cluster Centers: ")
model.clusterCenters.foreach(println)

Gaussian Mixture Model

Probabilistic clustering using the expectation-maximization algorithm to model data as a mixture of Gaussian distributions.

/**
 * Gaussian Mixture Model estimator; fitting it produces a [[GaussianMixtureModel]].
 *
 * This is a signature listing only (method bodies are omitted). Every setter
 * returns `this.type`, so calls can be chained fluently.
 */
class GaussianMixture extends Estimator[GaussianMixtureModel] with GaussianMixtureParams {
  // Number of mixture components k.
  def setK(value: Int): this.type
  // Iteration limit and convergence tolerance.
  def setMaxIter(value: Int): this.type
  def setTol(value: Double): this.type
  // Random seed.
  def setSeed(value: Long): this.type
  def setAggregationDepth(value: Int): this.type
  // Name of the column holding per-row weights.
  def setWeightCol(value: String): this.type
}

/**
 * Fitted Gaussian mixture, as produced by `GaussianMixture.fit`.
 * Signature listing only; method bodies are omitted.
 */
class GaussianMixtureModel extends Model[GaussianMixtureModel] with GaussianMixtureParams with MLWritable {
  // Mixture weights and the Gaussian components; the usage example indexes both with
  // the same component index i.
  def weights: Array[Double]
  def gaussians: Array[MultivariateGaussian]
  // Training summary; presumably only valid when hasSummary is true — confirm.
  def summary: GaussianMixtureSummary
  def hasSummary: Boolean
}

/**
 * Summary type returned by `GaussianMixtureModel.summary`.
 * Signature listing only; method bodies are omitted.
 */
class GaussianMixtureSummary {
  // DataFrame of predictions, plus the names of the prediction, probability and
  // features columns.
  def predictions: DataFrame
  def predictionCol: String
  def probabilityCol: String
  def featuresCol: String
  // Number of mixture components.
  def k: Int
  // One size entry per cluster.
  def clusterSizes: Array[Long]
  // Log-likelihood reported for the training run.
  def logLikelihood: Double
  // Number of iterations.
  def numIter: Int
}

/**
 * A single multivariate Gaussian component described by its mean vector and covariance matrix.
 * Signature listing only; method bodies are omitted.
 *
 * NOTE(review): Spark appears to declare this as a regular class with `val mean` / `val cov`
 * rather than a case class — confirm before relying on pattern matching or `copy`.
 */
case class MultivariateGaussian(mean: Vector, cov: Matrix) {
  // Density and log-density evaluated at point x.
  def pdf(x: Vector): Double
  def logpdf(x: Vector): Double
}

Usage Example:

import org.apache.spark.ml.clustering.GaussianMixture

// Three mixture components with a fixed seed.
val gmm = new GaussianMixture().setK(3).setSeed(538009335L)

val model = gmm.fit(dataset)

// Print the weight, mean and covariance of every fitted component.
(0 until model.getK).foreach { i =>
  val weight = model.weights(i)
  val component = model.gaussians(i)
  println(s"Gaussian $i:\nweight=$weight\n" +
    s"mu=${component.mean}\nsigma=\n${component.cov}\n")
}

Latent Dirichlet Allocation

Topic modeling algorithm for discovering abstract topics in document collections.

/**
 * Latent Dirichlet Allocation estimator for topic modeling; fitting it produces an [[LDAModel]].
 *
 * This is a signature listing only (method bodies are omitted). Every setter
 * returns `this.type`, so calls can be chained fluently.
 */
class LDA extends Estimator[LDAModel] with LDAParams {
  // Number of topics k and the iteration limit.
  def setK(value: Int): this.type
  def setMaxIter(value: Int): this.type
  // Concentration parameters for the document and topic distributions.
  def setDocConcentration(value: Array[Double]): this.type
  def setTopicConcentration(value: Double): this.type
  // Optimizer selected by name (accepted strings are not listed here).
  def setOptimizer(value: String): this.type
  // Learning-rate and subsampling settings; presumably specific to one optimizer — confirm.
  def setLearningOffset(value: Double): this.type
  def setLearningDecay(value: Double): this.type
  def setSubsamplingRate(value: Double): this.type
  def setOptimizeDocConcentration(value: Boolean): this.type
  // Random seed.
  def setSeed(value: Long): this.type
  // Checkpointing settings.
  def setCheckpointInterval(value: Int): this.type
  def setKeepLastCheckpoint(value: Boolean): this.type
}

/**
 * Abstract base for fitted LDA models; the listing shows two subclasses,
 * LocalLDAModel and DistributedLDAModel.
 * Signature listing only; method bodies are omitted.
 */
abstract class LDAModel extends Model[LDAModel] with LDAParams {
  // Vocabulary size and the topics matrix.
  def vocabSize: Int
  def topicsMatrix: Matrix
  // Whether this model is the distributed variant.
  def isDistributed: Boolean
  // Log-likelihood and log-perplexity computed for the given dataset.
  def logLikelihood(dataset: Dataset[_]): Double
  def logPerplexity(dataset: Dataset[_]): Double
  // Topic descriptions as a DataFrame; the one-argument overload limits the terms per topic.
  def describeTopics(maxTermsPerTopic: Int): DataFrame
  def describeTopics(): DataFrame
}

/**
 * Local (non-distributed) LDA model.
 * Signature listing only; method bodies are omitted.
 */
class LocalLDAModel extends LDAModel {
  // NOTE(review): in spark.ml, getDocConcentration appears to come from LDAParams and return
  // Array[Double]; the Vector-typed value is LDAModel.estimatedDocConcentration — confirm.
  def getDocConcentration: Vector
  // NOTE(review): topicDistributions looks like the spark.mllib API; in spark.ml the
  // per-document topic distribution is presumably obtained via transform() — confirm.
  def topicDistributions(dataset: Dataset[_]): DataFrame
}

/**
 * Distributed LDA model; can be converted to a [[LocalLDAModel]] with `toLocal`.
 * Signature listing only; method bodies are omitted.
 */
class DistributedLDAModel extends LDAModel {
  // NOTE(review): in spark.ml, getDocConcentration appears to come from LDAParams and return
  // Array[Double] rather than Vector — confirm.
  def getDocConcentration: Vector
  // Converts this model to its local counterpart.
  def toLocal: LocalLDAModel
  def logPrior: Double
}

Bisecting K-Means

Hierarchical clustering algorithm that recursively applies k-means to split clusters into two subclusters.

/**
 * Bisecting K-Means estimator; fitting it produces a [[BisectingKMeansModel]].
 *
 * This is a signature listing only (method bodies are omitted). Every setter
 * returns `this.type`, so calls can be chained fluently.
 */
class BisectingKMeans extends Estimator[BisectingKMeansModel] with BisectingKMeansParams {
  // Number of clusters k and the iteration limit.
  def setK(value: Int): this.type
  def setMaxIter(value: Int): this.type
  // Random seed.
  def setSeed(value: Long): this.type
  // Minimum size a cluster needs in order to be split further; takes a Double
  // (units are not stated in this listing).
  def setMinDivisibleClusterSize(value: Double): this.type
  // Name of the distance measure (accepted strings are not listed here).
  def setDistanceMeasure(value: String): this.type
}

/**
 * Fitted Bisecting K-Means model, as produced by `BisectingKMeans.fit`.
 * Signature listing only; method bodies are omitted.
 */
class BisectingKMeansModel extends Model[BisectingKMeansModel] with BisectingKMeansParams with MLWritable {
  // One center vector per cluster.
  def clusterCenters: Array[Vector]
  // NOTE(review): computeCost appears to be deprecated as of Spark 3.0.0 in favor of
  // ClusteringEvaluator or summary.trainingCost — confirm against the targeted version.
  def computeCost(dataset: Dataset[_]): Double
  // Training summary; presumably only valid when hasSummary is true — confirm.
  def summary: BisectingKMeansSummary
  def hasSummary: Boolean
}

/**
 * Summary type returned by `BisectingKMeansModel.summary`.
 * Signature listing only; method bodies are omitted.
 */
class BisectingKMeansSummary {
  // DataFrame of predictions, plus the names of the prediction and features columns.
  def predictions: DataFrame
  def predictionCol: String
  def featuresCol: String
  // Number of clusters.
  def k: Int
  // One size entry per cluster.
  def clusterSizes: Array[Long]
  // Cost reported for the training run.
  def trainingCost: Double
  // Number of iterations.
  def numIter: Int
}

Power Iteration Clustering

Graph-based clustering algorithm using power iteration to find a low-dimensional embedding of the affinity matrix.

/**
 * Power Iteration Clustering (PIC) for graph-based clustering.
 *
 * In spark.ml this class is neither an Estimator nor a Transformer: there is no `fit` step
 * and no `PowerIterationClusteringModel` class (that type only exists in spark.mllib).
 * Configure the parameters, then call `assignClusters` on a dataset of weighted edges.
 *
 * This is a signature listing only (method bodies are omitted). Every setter
 * returns `this.type`, so calls can be chained fluently.
 */
class PowerIterationClustering extends PowerIterationClusteringParams with DefaultParamsWritable {
  // Number of clusters k and the iteration limit.
  def setK(value: Int): this.type
  def setMaxIter(value: Int): this.type
  // Initialization mode selected by name (accepted strings are not listed here).
  def setInitMode(value: String): this.type
  // Names of the input columns holding the edge source id, destination id and weight.
  def setSrcCol(value: String): this.type
  def setDstCol(value: String): this.type
  def setWeightCol(value: String): this.type

  /**
   * Runs PIC on the edge dataset and returns the cluster assignment of every vertex.
   *
   * @param dataset edges described by the configured source, destination and weight columns
   * @return a DataFrame with one row per vertex: `id: Long`, `cluster: Int`
   */
  def assignClusters(dataset: Dataset[_]): DataFrame
}

Usage Example:

import org.apache.spark.ml.clustering.PowerIterationClustering

// Configure PIC: 3 clusters, at most 20 iterations, and the names of the edge columns.
val pic = new PowerIterationClustering()
  .setK(3)
  .setMaxIter(20)
  .setSrcCol("src")
  .setDstCol("dst")
  .setWeightCol("weight")

// spark.ml's PowerIterationClustering is not an Estimator, so there is no fit()/model step:
// assignClusters runs the algorithm directly and returns one (id, cluster) row per vertex.
val result = pic.assignClusters(edgeDataset)
result.show()

Types

// Clustering-specific imports
import org.apache.spark.ml.clustering._
import org.apache.spark.ml.linalg.{Vector, Matrix}
import org.apache.spark.sql.{DataFrame, Dataset}

// Gaussian mixture components
import org.apache.spark.ml.stat.distribution.MultivariateGaussian

// Parameter traits
import org.apache.spark.ml.param.shared._

// Summary types
import org.apache.spark.ml.clustering.{
  KMeansSummary,
  GaussianMixtureSummary,
  BisectingKMeansSummary
}