or run

npx @tessl/cli init
Log in

Version

Tile

Overview

Evals

Files

Files

docs

classification.md, clustering.md, data-utilities.md, features.md, index.md, model-selection.md, regression.md, text.md

docs/clustering.md

0

# Clustering Analysis

1

2

Visualizers for unsupervised clustering evaluation and analysis, providing tools to determine optimal cluster numbers, assess clustering quality, and understand cluster relationships. These tools support various clustering algorithms and distance metrics.

3

4

## Capabilities

5

6

### Elbow Method Analysis

7

8

K-Elbow visualizer for determining the optimal number of clusters using the elbow method. Supports multiple scoring metrics including distortion, silhouette score, and calinski-harabasz index.

9

10

```python { .api }

11

class KElbow(ClusteringScoreVisualizer):

12

"""

13

K-Elbow visualizer for optimal cluster number selection.

14

15

Parameters:

16

- estimator: scikit-learn clustering estimator (KMeans, etc.)

17

- k: int or tuple, range of K values to test (default: 10)

18

- metric: str, scoring metric ('distortion', 'silhouette', 'calinski_harabasz')

19

- timings: bool, whether to show fitting time for each K

20

- locate_elbow: bool, whether to automatically locate elbow point

21

"""

22

def __init__(self, estimator, k=10, metric='distortion', timings=True, locate_elbow=True, **kwargs): ...

23

def fit(self, X, y=None, **kwargs): ...

24

def show(self, **kwargs): ...

25

26

# Alias for compatibility

27

KElbowVisualizer = KElbow

28

29

def kelbow_visualizer(estimator, X, k=10, metric='distortion', **kwargs):

30

"""

31

Functional API for K-elbow visualization.

32

33

Parameters:

34

- estimator: scikit-learn clustering estimator

35

- X: feature matrix

36

- k: int or tuple, range of K values to test

37

- metric: str, scoring metric

38

39

Returns:

40

KElbow visualizer instance

41

"""

42

43

def distortion_score(estimator, X):

44

"""

45

Compute distortion score (sum of squared distances to centroids).

46

47

Parameters:

48

- estimator: fitted clustering estimator with cluster_centers_ attribute

49

- X: feature matrix

50

51

Returns:

52

float: distortion score

53

"""

54

```

55

56

**Usage Example:**

57

58

```python

59

from yellowbrick.cluster import KElbow, kelbow_visualizer

60

from sklearn.cluster import KMeans

61

from sklearn.datasets import make_blobs

62

63

# Generate sample data

64

X, _ = make_blobs(n_samples=1000, centers=4, n_features=12, random_state=42)

65

66

# Class-based API

67

model = KMeans()

68

visualizer = KElbow(model, k=(2, 12), metric='distortion', timings=False)

69

visualizer.fit(X)

70

visualizer.show()

71

72

# Get optimal K

73

optimal_k = visualizer.elbow_value_

74

75

# Functional API

76

kelbow_visualizer(KMeans(), X, k=(2, 12), metric='silhouette')

77

```

78

79

### Silhouette Analysis

80

81

Silhouette analysis for evaluating clustering quality and cluster cohesion. Provides detailed view of how well each sample fits within its assigned cluster.

82

83

```python { .api }

84

class SilhouetteVisualizer(ClusteringScoreVisualizer):

85

"""

86

Silhouette analysis visualizer for clustering evaluation.

87

88

Parameters:

89

- estimator: scikit-learn clustering estimator

90

- colors: str or list, colors for different clusters

91

- is_fitted: bool, whether estimator is already fitted

92

"""

93

def __init__(self, estimator, colors='yellowbrick', is_fitted=False, **kwargs): ...

94

def fit(self, X, y=None, **kwargs): ...

95

def show(self, **kwargs): ...

96

97

def silhouette_visualizer(estimator, X, colors='yellowbrick', **kwargs):

98

"""

99

Functional API for silhouette analysis visualization.

100

101

Parameters:

102

- estimator: scikit-learn clustering estimator

103

- X: feature matrix

104

- colors: str or list, colors for clusters

105

106

Returns:

107

SilhouetteVisualizer instance

108

"""

109

```

110

111

**Usage Example:**

112

113

```python

114

from yellowbrick.cluster import SilhouetteVisualizer, silhouette_visualizer

115

from sklearn.cluster import KMeans

116

117

# Class-based API

118

model = KMeans(n_clusters=4, random_state=42)

119

visualizer = SilhouetteVisualizer(model, colors='yellowbrick')

120

visualizer.fit(X)

121

visualizer.show()

122

123

# Access silhouette scores

124

silhouette_scores = visualizer.silhouette_samples_

125

avg_silhouette = visualizer.silhouette_score_

126

127

# Functional API

128

silhouette_visualizer(KMeans(n_clusters=4), X)

129

```

130

131

### Intercluster Distance Maps

132

133

Intercluster distance visualization showing relationships between cluster centers using dimensionality reduction techniques like MDS or t-SNE.

134

135

```python { .api }

136

class InterclusterDistance(ClusteringScoreVisualizer):

137

"""

138

Intercluster distance map visualizer.

139

140

Parameters:

141

- estimator: scikit-learn clustering estimator

142

- embedding: str, embedding method ('mds', 'tsne')

143

- random_state: int, random state for reproducibility

144

"""

145

def __init__(self, estimator, embedding='mds', random_state=None, **kwargs): ...

146

def fit(self, X, y=None, **kwargs): ...

147

def show(self, **kwargs): ...

148

149

def intercluster_distance(estimator, X, embedding='mds', **kwargs):

150

"""

151

Functional API for intercluster distance visualization.

152

153

Parameters:

154

- estimator: scikit-learn clustering estimator

155

- X: feature matrix

156

- embedding: str, embedding method

157

158

Returns:

159

InterclusterDistance visualizer instance

160

"""

161

162

# Valid embedding methods

163

VALID_EMBEDDING = ['mds', 'tsne']

164

```

165

166

**Usage Example:**

167

168

```python

169

from yellowbrick.cluster import InterclusterDistance, intercluster_distance

170

from sklearn.cluster import KMeans

171

172

# Class-based API with MDS embedding

173

model = KMeans(n_clusters=6, random_state=42)

174

visualizer = InterclusterDistance(model, embedding='mds')

175

visualizer.fit(X)

176

visualizer.show()

177

178

# Class-based API with t-SNE embedding

179

tsne_visualizer = InterclusterDistance(model, embedding='tsne', random_state=42)

180

tsne_visualizer.fit(X)

181

tsne_visualizer.show()

182

183

# Functional API

184

intercluster_distance(KMeans(n_clusters=6), X, embedding='mds')

185

```

186

187

## Base Classes

188

189

```python { .api }

190

class ClusteringScoreVisualizer(ScoreVisualizer):

191

"""

192

Base class for clustering scoring visualizers.

193

Provides common functionality for clustering model evaluation.

194

"""

195

def __init__(self, estimator, **kwargs): ...

196

def fit(self, X, y=None, **kwargs): ...

197

```

198

199

## Usage Patterns

200

201

### Complete Clustering Analysis Workflow

202

203

```python

204

from yellowbrick.cluster import KElbow, SilhouetteVisualizer, InterclusterDistance

205

from sklearn.cluster import KMeans

206

from sklearn.datasets import make_blobs

207

import matplotlib.pyplot as plt

208

209

# Generate sample data

210

X, _ = make_blobs(n_samples=1000, centers=4, n_features=12, random_state=42)

211

212

# Step 1: Determine optimal number of clusters

213

print("Step 1: Finding optimal K using elbow method")

214

elbow_viz = KElbow(KMeans(), k=(2, 12), metric='distortion')

215

elbow_viz.fit(X)

216

elbow_viz.show()

217

optimal_k = elbow_viz.elbow_value_

218

print(f"Optimal K: {optimal_k}")

219

220

# Step 2: Evaluate clustering quality with silhouette analysis

221

print(f"Step 2: Silhouette analysis with K={optimal_k}")

222

model = KMeans(n_clusters=optimal_k, random_state=42)

223

silhouette_viz = SilhouetteVisualizer(model)

224

silhouette_viz.fit(X)

225

silhouette_viz.show()

226

print(f"Average silhouette score: {silhouette_viz.silhouette_score_:.3f}")

227

228

# Step 3: Visualize cluster relationships

229

print("Step 3: Intercluster distance analysis")

230

distance_viz = InterclusterDistance(model, embedding='mds')

231

distance_viz.fit(X)

232

distance_viz.show()

233

```

234

235

### Comparing Multiple Clustering Algorithms

236

237

```python

238

from yellowbrick.cluster import SilhouetteVisualizer

239

from sklearn.cluster import KMeans, AgglomerativeClustering

240

from sklearn.mixture import GaussianMixture

241

import matplotlib.pyplot as plt

242

243

# Define clustering algorithms

244

algorithms = {

245

'K-Means': KMeans(n_clusters=4, random_state=42),

246

'Agglomerative': AgglomerativeClustering(n_clusters=4),

247

'Gaussian Mixture': GaussianMixture(n_components=4, random_state=42)

248

}

249

250

# Compare silhouette analysis across algorithms

251

fig, axes = plt.subplots(1, 3, figsize=(15, 5))

252

253

for idx, (name, algorithm) in enumerate(algorithms.items()):

254

viz = SilhouetteVisualizer(algorithm, ax=axes[idx])

255

viz.fit(X)

256

viz.finalize()

257

axes[idx].set_title(f'{name}\nAvg Score: {viz.silhouette_score_:.3f}')

258

259

plt.tight_layout()

260

plt.show()

261

```

262

263

### Parameter Tuning with Multiple Metrics

264

265

```python

266

from yellowbrick.cluster import KElbow

267

from sklearn.cluster import KMeans

268

import matplotlib.pyplot as plt

269

270

# Compare different scoring metrics

271

metrics = ['distortion', 'silhouette', 'calinski_harabasz']

272

fig, axes = plt.subplots(1, 3, figsize=(15, 5))

273

274

for idx, metric in enumerate(metrics):

275

viz = KElbow(KMeans(), k=(2, 12), metric=metric, ax=axes[idx])

276

viz.fit(X)

277

viz.finalize()

278

axes[idx].set_title(f'Elbow Method - {metric.title()}')

279

280

plt.tight_layout()

281

plt.show()

282

```

283

284

### Advanced Clustering Evaluation

285

286

```python

287

from yellowbrick.cluster import KElbow, SilhouetteVisualizer

288

from sklearn.cluster import KMeans

289

from sklearn.preprocessing import StandardScaler

290

from sklearn.pipeline import Pipeline

291

292

# Clustering with preprocessing pipeline

293

pipeline = Pipeline([

294

('scaler', StandardScaler()),

295

('kmeans', KMeans())

296

])

297

298

# Elbow analysis with pipeline

299

elbow_viz = KElbow(pipeline, k=(2, 12), metric='silhouette')

300

elbow_viz.fit(X)

301

elbow_viz.show()

302

303

# Silhouette analysis with optimal K

304

optimal_k = elbow_viz.elbow_value_

305

pipeline.set_params(kmeans__n_clusters=optimal_k)

306

silhouette_viz = SilhouetteVisualizer(pipeline)

307

silhouette_viz.fit(X)

308

silhouette_viz.show()

309

```

310

311

### Custom Distance Metrics

312

313

```python

314

from yellowbrick.cluster import KElbow

315

from sklearn.cluster import KMeans

316

from sklearn.metrics import pairwise_distances

317

318

# Custom scoring function example

319

def custom_score(estimator, X):

320

"""Custom scoring function using average intra-cluster distance"""

321

labels = estimator.labels_

322

centers = estimator.cluster_centers_

323

324

score = 0

325

for i in range(len(centers)):

326

cluster_points = X[labels == i]

327

if len(cluster_points) > 0:

328

distances = pairwise_distances(cluster_points, [centers[i]])

329

score += distances.mean()

330

331

return score

332

333

# Use custom scoring with manual evaluation

334

k_values = range(2, 12)

335

scores = []

336

337

for k in k_values:

338

kmeans = KMeans(n_clusters=k, random_state=42)

339

kmeans.fit(X)

340

score = custom_score(kmeans, X)

341

scores.append(score)

342

343

# Plot custom scores

344

import matplotlib.pyplot as plt

345

plt.figure(figsize=(10, 6))

346

plt.plot(k_values, scores, 'bo-')

347

plt.xlabel('Number of Clusters (K)')

348

plt.ylabel('Custom Score')

349

plt.title('Custom Clustering Evaluation')

350

plt.grid(True)

351

plt.show()

352

```