# Clustering and Unsupervised Learning

Orange3 provides unsupervised learning algorithms for discovering patterns and structure in data without labeled examples.

## Capabilities

### K-Means Clustering

Partition data into k clusters based on feature similarity.

```python { .api }
class KMeans:
    """
    K-means clustering algorithm.

    Args:
        n_clusters: Number of clusters
        init: Initialization method ('k-means++', 'random')
        n_init: Number of random initializations
        max_iter: Maximum iterations
        tol: Tolerance for convergence
        random_state: Random seed
    """
    def __init__(self, n_clusters=8, init='k-means++', n_init=10,
                 max_iter=300, tol=1e-4, random_state=None): ...

    def fit(self, X):
        """
        Fit k-means clustering to data.

        Args:
            X: Data array or Orange Table

        Returns:
            Fitted k-means model
        """

    def predict(self, X):
        """Predict cluster labels for new data."""

    @property
    def cluster_centers_(self):
        """Cluster center coordinates."""
```
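
A minimal sketch against the API above (the `"iris"` dataset ships with Orange3; the fluent `fit(...).predict(...)` chaining is an assumption based on `fit` returning the fitted model):

```python
from Orange.data import Table
from Orange.clustering import KMeans

data = Table("iris")                        # 150 samples, 4 numeric features
model = KMeans(n_clusters=3, random_state=42).fit(data.X)

labels = model.predict(data.X)              # cluster index per sample
print(model.cluster_centers_.shape)         # (3, 4): one center per cluster
```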

### Density-Based Clustering

DBSCAN algorithm for finding clusters of varying shapes and sizes; points in low-density regions are labeled as noise (-1).

```python { .api }
class DBSCAN:
    """
    DBSCAN (Density-Based Spatial Clustering) algorithm.

    Args:
        eps: Maximum distance between samples in the same neighborhood
        min_samples: Minimum samples in a neighborhood for a core point
        metric: Distance metric
        algorithm: Algorithm for computing nearest neighbors
    """
    def __init__(self, eps=0.5, min_samples=5, metric='euclidean', algorithm='auto'): ...

    def fit(self, X):
        """
        Fit DBSCAN clustering to data.

        Args:
            X: Data array or Orange Table

        Returns:
            Fitted DBSCAN model
        """

    def predict(self, X):
        """Predict cluster labels (not supported by standard DBSCAN)."""

    @property
    def labels_(self):
        """Cluster labels for training data (-1 marks noise)."""

    @property
    def core_sample_indices_(self):
        """Indices of core samples."""
```
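
Because DBSCAN has no `n_clusters` knob, `eps` is usually tuned by inspection. A small sweep like the sketch below can help; it relies only on the API above and the `-1` noise convention:

```python
import numpy as np
from Orange.data import Table
from Orange.clustering import DBSCAN

X = Table("iris").X

# Scan eps and report how many clusters and noise points each setting yields
for eps in (0.3, 0.5, 0.8):
    labels = DBSCAN(eps=eps, min_samples=5).fit(X).labels_
    n_clusters = len(set(labels)) - (1 if -1 in labels else 0)
    print(f"eps={eps}: {n_clusters} clusters, {np.sum(labels == -1)} noise points")
```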

### Hierarchical Clustering

Build a tree of clusters using the agglomerative (bottom-up) approach.

```python { .api }
class HierarchicalClustering:
    """
    Agglomerative hierarchical clustering.

    Args:
        n_clusters: Number of clusters (if None, returns the full tree)
        linkage: Linkage criterion ('ward', 'complete', 'average', 'single')
        metric: Distance metric
        compute_full_tree: Compute the full dendrogram
    """
    def __init__(self, n_clusters=None, linkage='ward', metric='euclidean',
                 compute_full_tree='auto'): ...

    def fit(self, X):
        """
        Fit hierarchical clustering to data.

        Args:
            X: Data array or Orange Table

        Returns:
            Fitted hierarchical clustering model
        """

    @property
    def labels_(self):
        """Cluster labels."""

    @property
    def children_(self):
        """Tree structure of the clustering."""

    @property
    def distances_(self):
        """Distances between merged clusters."""
```
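
A sketch of inspecting the merge tree, assuming `children_` follows the usual agglomerative encoding (row i holds the two clusters merged at step i) and that `distances_` is populated when the full tree is computed. A large jump between successive merge distances hints at a natural place to cut the dendrogram:

```python
from Orange.data import Table
from Orange.clustering import HierarchicalClustering

X = Table("iris").X
model = HierarchicalClustering(n_clusters=None, linkage='ward').fit(X)

# Look at the last few (largest) merges; a big distance jump between
# consecutive merges suggests the cluster count just before the jump.
for merged, dist in list(zip(model.children_, model.distances_))[-3:]:
    print(f"merged clusters {merged} at distance {dist:.3f}")
```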

### Community Detection

Graph-based clustering using the Louvain algorithm.

```python { .api }
class Louvain:
    """
    Louvain community detection algorithm.

    Args:
        resolution: Resolution parameter for modularity
        random_state: Random seed
    """
    def __init__(self, resolution=1.0, random_state=None): ...

    def fit(self, graph):
        """
        Fit Louvain clustering to graph data.

        Args:
            graph: Network graph or adjacency matrix

        Returns:
            Fitted Louvain model
        """

    @property
    def labels_(self):
        """Community labels."""
```
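
A hedged sketch on a toy graph; passing a SciPy sparse adjacency matrix is an assumption based on the `fit(graph)` signature above. Higher `resolution` values generally produce more, smaller communities:

```python
import numpy as np
from scipy.sparse import csr_matrix
from Orange.clustering import Louvain

# Two dense triangles {0,1,2} and {3,4,5} joined by a single bridge edge
adj = np.zeros((6, 6))
adj[0, 1] = adj[1, 2] = adj[0, 2] = 1   # community A
adj[3, 4] = adj[4, 5] = adj[3, 5] = 1   # community B
adj[2, 3] = 1                           # bridge between the communities
adj = adj + adj.T                       # symmetric adjacency

model = Louvain(resolution=1.0).fit(csr_matrix(adj))
print(model.labels_)                    # expected: two communities, e.g. [0 0 0 1 1 1]
```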

### Clustering Utilities

Helper functions for clustering analysis.

```python { .api }
def matrix_to_knn_graph(distances, k, include_self=False):
    """
    Convert a distance matrix to a k-nearest-neighbor graph.

    Args:
        distances: Distance matrix
        k: Number of nearest neighbors
        include_self: Include self-connections

    Returns:
        Sparse adjacency matrix representing the kNN graph
    """
```
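
A sketch of the graph-construction step that feeds Louvain. Using `Orange.distance.Euclidean` as the source of pairwise distances is an assumption; any square distance matrix should work:

```python
from Orange.data import Table
from Orange.distance import Euclidean
from Orange.clustering import matrix_to_knn_graph

data = Table("iris")
distances = Euclidean(data)                      # pairwise distance matrix
knn_graph = matrix_to_knn_graph(distances, k=5)  # sparse adjacency, 5 neighbors each
print(knn_graph.shape)                           # (150, 150)
```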

### Clustering Evaluation

Metrics for assessing clustering quality.

```python { .api }
def silhouette_score(X, labels):
    """
    Calculate the mean silhouette coefficient for a clustering.

    Args:
        X: Data samples
        labels: Cluster labels

    Returns:
        float: Mean silhouette coefficient
    """

def adjusted_rand_score(labels_true, labels_pred):
    """
    Calculate the adjusted Rand index between two clusterings.

    Args:
        labels_true: True cluster labels
        labels_pred: Predicted cluster labels

    Returns:
        float: Adjusted Rand index
    """

def calinski_harabasz_score(X, labels):
    """
    Calculate the Calinski-Harabasz index (variance ratio criterion).

    Args:
        X: Data samples
        labels: Cluster labels

    Returns:
        float: Calinski-Harabasz index
    """
```
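
Unlike the other two metrics, `adjusted_rand_score` needs a reference labeling. On iris the class column can serve as ground truth; a sketch against the API above:

```python
from Orange.data import Table
from Orange.clustering import KMeans, adjusted_rand_score

data = Table("iris")
pred = KMeans(n_clusters=3, random_state=42).fit(data.X).predict(data.X)

# Compare against the true species labels: 1.0 means identical partitions,
# values near 0.0 mean chance-level agreement.
ari = adjusted_rand_score(data.Y, pred)
print(f"ARI vs. iris species: {ari:.3f}")
```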

### Usage Examples

```python
# Basic clustering workflow
from Orange.data import Table
from Orange.clustering import KMeans, DBSCAN, HierarchicalClustering
import numpy as np

# Load or create data
data = Table("iris")
X = data.X  # Feature matrix

# K-means clustering
kmeans = KMeans(n_clusters=3, random_state=42)
kmeans_model = kmeans.fit(X)
kmeans_labels = kmeans_model.predict(X)

print(f"K-means cluster centers shape: {kmeans_model.cluster_centers_.shape}")
print(f"K-means labels: {np.unique(kmeans_labels)}")

# DBSCAN clustering
dbscan = DBSCAN(eps=0.5, min_samples=5)
dbscan_model = dbscan.fit(X)
dbscan_labels = dbscan_model.labels_

print(f"DBSCAN found {len(np.unique(dbscan_labels[dbscan_labels != -1]))} clusters")
print(f"DBSCAN noise points: {np.sum(dbscan_labels == -1)}")

# Hierarchical clustering
hierarchical = HierarchicalClustering(n_clusters=3, linkage='ward')
hierarchical_model = hierarchical.fit(X)
hierarchical_labels = hierarchical_model.labels_

print(f"Hierarchical clustering labels: {np.unique(hierarchical_labels)}")

# Evaluate clustering quality
from Orange.clustering import silhouette_score, calinski_harabasz_score

kmeans_silhouette = silhouette_score(X, kmeans_labels)
hierarchical_silhouette = silhouette_score(X, hierarchical_labels)

kmeans_ch_score = calinski_harabasz_score(X, kmeans_labels)
hierarchical_ch_score = calinski_harabasz_score(X, hierarchical_labels)

print(f"K-means silhouette score: {kmeans_silhouette:.3f}")
print(f"Hierarchical silhouette score: {hierarchical_silhouette:.3f}")
print(f"K-means Calinski-Harabasz score: {kmeans_ch_score:.3f}")
print(f"Hierarchical Calinski-Harabasz score: {hierarchical_ch_score:.3f}")

# Choose the number of clusters by comparing silhouette scores across k
# (for the classic elbow method, plot model_k.inertia_ instead, if available)
silhouette_scores = []
k_range = range(2, 11)

for k in k_range:
    kmeans_k = KMeans(n_clusters=k, random_state=42)
    model_k = kmeans_k.fit(X)
    labels_k = model_k.predict(X)

    silhouette_scores.append(silhouette_score(X, labels_k))

print(f"Silhouette scores for k=2 to 10: {silhouette_scores}")

# Graph-based clustering example (requires network data)
# from Orange.clustering import Louvain, matrix_to_knn_graph
#
# # Create kNN graph from distance matrix
# knn_graph = matrix_to_knn_graph(distance_matrix, k=5)
#
# # Apply Louvain community detection
# louvain = Louvain(resolution=1.0)
# louvain_model = louvain.fit(knn_graph)
# community_labels = louvain_model.labels_
```