or run

npx @tessl/cli init
Log in

Version

Tile

Overview

Evals

Files

Files

docs

advanced-analytics.mdclassification.mdclustering.mddeep-learning.mdfeature-engineering.mdindex.mdregression.mdvalidation-metrics.md

clustering.mddocs/

0

# Clustering

1

2

Unsupervised learning algorithms for discovering patterns and groupings in data. Smile Core provides comprehensive clustering capabilities including partitioning methods, hierarchical clustering, density-based algorithms, and spectral clustering.

3

4

## Capabilities

5

6

### Core Clustering Interface

7

8

Clustering algorithms extend the base `PartitionClustering` class or implement specific clustering interfaces.

9

10

```java { .api }

11

/**

12

* Base class for partition clustering algorithms

13

*/

14

abstract class PartitionClustering implements Serializable {

15

/** Number of clusters */

16

public final int k;

17

18

/** Cluster assignments for each data point */

19

public final int[] y;

20

21

/** Size of each cluster */

22

public final int[] size;

23

24

/** Constant for outlier points */

25

public static final int OUTLIER = Integer.MAX_VALUE;

26

27

/** K-means++ initialization for centroids */

28

public static double[][] seed(double[][] data, int k);

29

30

/** Run clustering algorithm multiple times and return best result */

31

public static <T extends PartitionClustering> T run(Supplier<T> clustering, int runs);

32

}

33

```

34

35

### K-Means Clustering

36

37

Partitioning algorithm that groups data into k clusters by minimizing within-cluster sum of squares.

38

39

```java { .api }

40

/**

41

* K-means clustering algorithm

42

*/

43

class KMeans extends CentroidClustering<double[], double[]> {

44

/** Train K-means with specified number of clusters */

45

public static KMeans fit(double[][] data, int k);

46

47

/** Train with custom parameters */

48

public static KMeans fit(double[][] data, int k, int maxIter, double tolerance);

49

50

/** Train with initial centroids */

51

public static KMeans fit(double[][] data, double[][] centroids, int maxIter, double tolerance);

52

53

/** Predict cluster for new data point */

54

public int predict(double[] x);

55

56

/** Get cluster centroids */

57

public double[][] centroids;

58

59

/** Get within-cluster sum of squares */

60

public double distortion;

61

62

/** Get silhouette coefficient */

63

public double[] silhouette();

64

}

65

```

66

67

**Usage Example:**

68

69

```java

70

import smile.clustering.KMeans;

71

72

// Basic K-means clustering

73

KMeans kmeans = KMeans.fit(data, 3);

74

int[] clusters = kmeans.y;

75

double[][] centroids = kmeans.centroids;

76

77

// Predict cluster for new point

78

int cluster = kmeans.predict(newPoint);

79

80

// Evaluate clustering quality

81

double[] silhouette = kmeans.silhouette();

82

```

83

84

### Hierarchical Clustering

85

86

Agglomerative clustering that builds a hierarchy of clusters using various linkage criteria.

87

88

```java { .api }

89

/**

90

* Hierarchical clustering with various linkage methods

91

*/

92

class HierarchicalClustering extends PartitionClustering {

93

/** Perform hierarchical clustering with complete linkage */

94

public static HierarchicalClustering fit(double[][] data);

95

96

/** Cluster with specified linkage method */

97

public static HierarchicalClustering fit(double[][] data, Linkage linkage);

98

99

/** Cut dendrogram at specified height to get k clusters */

100

public int[] partition(int k);

101

102

/** Cut dendrogram at specified height */

103

public int[] partition(double height);

104

105

/** Get dendrogram tree structure */

106

public Node[] tree;

107

108

/** Get merge heights */

109

public double[] height;

110

}

111

```

112

113

#### Linkage Methods

114

115

Various linkage criteria for hierarchical clustering.

116

117

```java { .api }

118

/**

119

* Base linkage interface

120

*/

121

interface Linkage {

122

/** Calculate distance between clusters */

123

double distance(int[] cluster1, int[] cluster2);

124

}

125

126

/**

127

* Single linkage (nearest neighbor)

128

*/

129

class SingleLinkage implements Linkage {

130

public static SingleLinkage of(double[][] proximity);

131

}

132

133

/**

134

* Complete linkage (farthest neighbor)

135

*/

136

class CompleteLinkage implements Linkage {

137

public static CompleteLinkage of(double[][] proximity);

138

}

139

140

/**

141

* Ward linkage (minimum variance)

142

*/

143

class WardLinkage implements Linkage {

144

public static WardLinkage of(double[][] data);

145

}

146

147

/**

148

* UPGMA linkage (unweighted pair group method)

149

*/

150

class UPGMALinkage implements Linkage {

151

public static UPGMALinkage of(double[][] proximity);

152

}

153

```

154

155

### Density-Based Clustering

156

157

Algorithms that find clusters based on density of data points, capable of finding arbitrary-shaped clusters and identifying outliers.

158

159

```java { .api }

160

/**

161

* DBSCAN density-based clustering

162

* @param <T> the type of input objects

163

*/

164

class DBSCAN<T> implements Serializable {

165

/** Perform DBSCAN clustering */

166

public static <T> DBSCAN<T> fit(T[] data, Distance<T> distance, int minPts, double radius);

167

168

/** Cluster assignments */

169

public final int[] y;

170

171

/** Number of clusters found */

172

public final int clusters;

173

174

/** Classify new point as core, border, or outlier */

175

public int predict(T x);

176

}

177

178

/**

179

* DENCLUE density-based clustering

180

*/

181

class DENCLUE implements Serializable {

182

/** Perform DENCLUE clustering */

183

public static DENCLUE fit(double[][] data, double sigma, int minPts);

184

185

/** Cluster assignments */

186

public final int[] y;

187

188

/** Number of clusters */

189

public final int k;

190

191

/** Density attractors (cluster centers) */

192

public final double[][] attractors;

193

}

194

```

195

196

### Spectral Clustering

197

198

Graph-based clustering using eigendecomposition of similarity matrices.

199

200

```java { .api }

201

/**

202

* Spectral clustering algorithm

203

*/

204

class SpectralClustering extends PartitionClustering {

205

/** Perform spectral clustering with RBF similarity */

206

public static SpectralClustering fit(double[][] data, int k, double sigma);

207

208

/** Spectral clustering with custom similarity matrix */

209

public static SpectralClustering fit(double[][] similarity, int k);

210

211

/** Get embedding coordinates */

212

public double[][] coordinates();

213

214

/** Get eigenvalues */

215

public double[] eigenvalues();

216

}

217

```

218

219

### K-Modes and Mixed-Type Clustering

220

221

Clustering algorithms for categorical data and mixed-type datasets.

222

223

```java { .api }

224

/**

225

* K-modes clustering for categorical data

226

*/

227

class KModes extends CentroidClustering<int[], int[]> {

228

/** Train K-modes clustering */

229

public static KModes fit(int[][] data, int k);

230

231

/** Train with custom parameters */

232

public static KModes fit(int[][] data, int k, int maxIter, int runs);

233

234

/** Predict cluster for categorical data */

235

public int predict(int[] x);

236

237

/** Get cluster modes (most frequent values) */

238

public int[][] centroids;

239

}

240

```

241

242

### Advanced Clustering Algorithms

243

244

Sophisticated clustering methods for specific use cases.

245

246

```java { .api }

247

/**

248

* X-means clustering with automatic k selection

249

*/

250

class XMeans extends CentroidClustering<double[], double[]> {

251

/** Perform X-means clustering with automatic k selection */

252

public static XMeans fit(double[][] data, int kmax);

253

254

/** Get final number of clusters */

255

public int k();

256

257

/** Get cluster centroids */

258

public double[][] centroids;

259

}

260

261

/**

262

* G-means clustering using Gaussian assumption test

263

*/

264

class GMeans extends CentroidClustering<double[], double[]> {

265

/** Perform G-means clustering */

266

public static GMeans fit(double[][] data, int kmax);

267

268

/** Get final number of clusters */

269

public int k();

270

}

271

272

/**

273

* CLARANS clustering for large datasets

274

*/

275

class CLARANS extends PartitionClustering {

276

/** Perform CLARANS clustering */

277

public static CLARANS fit(double[][] data, int k, int maxNeighbor, int numLocal);

278

279

/** Get medoid indices */

280

public int[] medoids();

281

}

282

283

/**

284

* Deterministic Annealing clustering

285

*/

286

class DeterministicAnnealing extends CentroidClustering<double[], double[]> {

287

/** Perform deterministic annealing clustering */

288

public static DeterministicAnnealing fit(double[][] data, int kmax);

289

290

/** Get final temperature */

291

public double temperature();

292

}

293

```

294

295

### Centroid-Based Clustering

296

297

Base class for algorithms that represent clusters by centroids.

298

299

```java { .api }

300

/**

301

* Base class for centroid-based clustering algorithms

302

* @param <T> the type of input objects

303

* @param <C> the type of cluster centroids

304

*/

305

abstract class CentroidClustering<T, C> extends PartitionClustering {

306

/** Cluster centroids */

307

public final C[] centroids;

308

309

/** Predict cluster assignment for new data point */

310

public abstract int predict(T x);

311

312

/** Calculate quantization error */

313

public abstract double quantizationError(T[] data);

314

}

315

```

316

317

### Evaluation Metrics

318

319

Metrics for evaluating clustering quality and comparing different clustering results.

320

321

```java { .api }

322

/**

323

* Silhouette analysis for cluster validation

324

*/

325

class Silhouette {

326

/** Calculate silhouette coefficient for each point */

327

public static double[] of(double[][] data, int[] clusters);

328

329

/** Calculate mean silhouette coefficient */

330

public static double mean(double[][] data, int[] clusters);

331

}

332

333

/**

334

* Davies-Bouldin Index for cluster validation

335

*/

336

class DaviesBouldin {

337

/** Calculate Davies-Bouldin index */

338

public static double of(double[][] data, int[] clusters);

339

}

340

341

/**

342

* Calinski-Harabasz Index (Variance Ratio Criterion)

343

*/

344

class CalinskiHarabasz {

345

/** Calculate Calinski-Harabasz index */

346

public static double of(double[][] data, int[] clusters);

347

}

348

```

349

350

### Cluster Initialization

351

352

Methods for initializing cluster centers and parameters.

353

354

```java { .api }

355

/**

356

* K-means++ initialization

357

*/

358

class KMeansPlusPlus {

359

/** Initialize centroids using K-means++ algorithm */

360

public static double[][] init(double[][] data, int k);

361

362

/** Initialize with custom distance metric */

363

public static double[][] init(double[][] data, int k, Distance<double[]> distance);

364

}

365

366

/**

367

* Random initialization strategies

368

*/

369

class RandomInit {

370

/** Random initialization from data points */

371

public static double[][] fromData(double[][] data, int k);

372

373

/** Random initialization within data bounds */

374

public static double[][] uniform(double[][] data, int k);

375

}

376

```

377

378

**Usage Examples:**

379

380

```java

381

// Hierarchical clustering with different linkages

382

HierarchicalClustering hc1 = HierarchicalClustering.fit(data, new WardLinkage());

383

HierarchicalClustering hc2 = HierarchicalClustering.fit(data, new CompleteLinkage());

384

385

// Cut dendrogram to get 5 clusters

386

int[] clusters = hc1.partition(5);

387

388

// DBSCAN for arbitrary-shaped clusters

389

DBSCAN<double[]> dbscan = DBSCAN.fit(data, new EuclideanDistance(), 5, 0.5);

390

int[] clusters = dbscan.y;

391

System.out.println("Found " + dbscan.clusters + " clusters");

392

393

// Spectral clustering for non-convex clusters

394

SpectralClustering sc = SpectralClustering.fit(data, 3, 1.0);

395

double[][] embedding = sc.coordinates();

396

397

// Evaluate clustering quality

398

double[] silhouette = Silhouette.of(data, clusters);

399

double meanSilhouette = Silhouette.mean(data, clusters);

400

double dbIndex = DaviesBouldin.of(data, clusters);

401

```

402

403

### Common Clustering Parameters

404

405

Most clustering algorithms support these configuration options:

406

407

- **k**: Number of clusters (for partitioning methods)

408

- **maxIter**: Maximum iterations for convergence

409

- **tolerance**: Convergence tolerance

410

- **runs**: Number of random restarts

411

- **minPts**: Minimum points for density-based clustering

412

- **radius/epsilon**: Neighborhood radius for density-based clustering

413

- **sigma**: Bandwidth parameter for kernel-based methods

414

- **linkage**: Linkage criterion for hierarchical clustering

415

- **seed**: Random seed for reproducible results

416

417

### Distance Metrics

418

419

Clustering algorithms support various distance metrics:

420

421

- **EuclideanDistance**: Standard L2 distance

422

- **ManhattanDistance**: L1 distance

423

- **ChebyshevDistance**: L∞ distance

424

- **CorrelationDistance**: 1 - correlation coefficient

425

- **CosineDistance**: 1 - cosine similarity

426

- **HammingDistance**: For binary/categorical data

427

- **JaccardDistance**: For set-based data