0
# Clustering
1
2
High-performance implementations of clustering algorithms with Intel hardware acceleration. These algorithms provide significant speedups for density-based and centroid-based clustering on large datasets.
3
4
## Capabilities
5
6
### K-Means Clustering
7
8
Intel-accelerated K-means clustering with optimized centroid computation and distance calculations.
9
10
```python { .api }
11
class KMeans:
12
"""
13
K-means clustering with Intel optimization.
14
15
Provides up to a 10-100x speedup over the standard scikit-learn implementation
16
through vectorized operations and Intel hardware acceleration.
17
"""
18
19
def __init__(
20
self,
21
n_clusters=8,
22
init='k-means++',
23
n_init=10,
24
max_iter=300,
25
tol=1e-4,
26
random_state=None,
27
copy_x=True,
28
algorithm='auto'
29
):
30
"""
31
Initialize K-means clustering.
32
33
Parameters:
34
n_clusters (int): Number of clusters to form
35
init (str or array): Initialization method ('k-means++', 'random', or an array of initial centers of shape (n_clusters, n_features))
36
n_init (int): Number of initializations to perform
37
max_iter (int): Maximum number of iterations
38
tol (float): Tolerance for convergence
39
random_state (int): Random state for reproducibility
40
copy_x (bool): Whether to copy input data
41
algorithm (str): Algorithm to use ('auto', 'full', 'elkan'); note that scikit-learn 1.1+ deprecates 'auto' and 'full' in favor of 'lloyd'
42
"""
43
44
def fit(self, X, y=None, sample_weight=None):
45
"""
46
Compute k-means clustering.
47
48
Parameters:
49
X (array-like): Training data of shape (n_samples, n_features)
50
y: Ignored, present for API consistency
51
sample_weight (array-like): Sample weights
52
53
Returns:
54
self: Fitted estimator
55
"""
56
57
def predict(self, X, sample_weight=None):
58
"""
59
Predict cluster labels for samples.
60
61
Parameters:
62
X (array-like): New data to predict
63
sample_weight (array-like): Sample weights
64
65
Returns:
66
array: Cluster labels for each sample
67
"""
68
69
def fit_predict(self, X, y=None, sample_weight=None):
70
"""
71
Compute clustering and return cluster labels.
72
73
Parameters:
74
X (array-like): Training data
75
y: Ignored
76
sample_weight (array-like): Sample weights
77
78
Returns:
79
array: Cluster labels
80
"""
81
82
def transform(self, X):
83
"""
84
Transform X to cluster-distance space.
85
86
Parameters:
87
X (array-like): Data to transform
88
89
Returns:
90
array: Distances to cluster centers
91
"""
92
93
def fit_transform(self, X, y=None, sample_weight=None):
94
"""
95
Compute clustering and transform to cluster-distance space.
96
97
Parameters:
98
X (array-like): Training data
99
y: Ignored
100
sample_weight (array-like): Sample weights
101
102
Returns:
103
array: Distances to cluster centers
104
"""
105
106
def score(self, X, y=None, sample_weight=None):
107
"""
108
Return the negative sum of squared distances to centroids.
109
110
Parameters:
111
X (array-like): Data to score
112
y: Ignored
113
sample_weight (array-like): Sample weights
114
115
Returns:
116
float: Negative inertia score
117
"""
118
119
# Attributes available after fitting
120
cluster_centers_: ... # Cluster centers
121
labels_: ... # Labels of training data
122
inertia_: ... # Sum of squared distances to centroids
123
n_iter_: ... # Number of iterations run
124
```
125
126
### DBSCAN Clustering
127
128
Density-Based Spatial Clustering of Applications with Noise, optimized for Intel hardware.
129
130
```python { .api }
131
class DBSCAN:
132
"""
133
DBSCAN clustering with Intel optimization.
134
135
Efficient density-based clustering that finds clusters of varying shapes
136
and identifies outliers as noise points.
137
"""
138
139
def __init__(
140
self,
141
eps=0.5,
142
min_samples=5,
143
metric='euclidean',
144
metric_params=None,
145
algorithm='auto',
146
leaf_size=30,
147
p=None,
148
n_jobs=None
149
):
150
"""
151
Initialize DBSCAN clustering.
152
153
Parameters:
154
eps (float): Maximum distance between samples in same neighborhood
155
min_samples (int): Minimum samples in neighborhood for core point
156
metric (str): Distance metric to use
157
metric_params (dict): Additional parameters for distance metric
158
algorithm (str): Algorithm for nearest neighbors computation
159
leaf_size (int): Leaf size for tree algorithms
160
p (float): Power parameter for Minkowski metric
161
n_jobs (int): Number of parallel jobs
162
"""
163
164
def fit(self, X, y=None, sample_weight=None):
165
"""
166
Perform DBSCAN clustering.
167
168
Parameters:
169
X (array-like): Training data of shape (n_samples, n_features)
170
y: Ignored, present for API consistency
171
sample_weight (array-like): Sample weights
172
173
Returns:
174
self: Fitted estimator
175
"""
176
177
def fit_predict(self, X, y=None, sample_weight=None):
178
"""
179
Compute clustering and return cluster labels.
180
181
Parameters:
182
X (array-like): Training data
183
y: Ignored
184
sample_weight (array-like): Sample weights
185
186
Returns:
187
array: Cluster labels (-1 for noise points)
188
"""
189
190
# Attributes available after fitting
191
labels_: ... # Cluster labels (-1 for noise)
192
core_sample_indices_: ... # Indices of core samples
193
components_: ... # Core samples
194
```
195
196
## Usage Examples
197
198
### Basic K-Means Clustering
199
200
```python
201
import numpy as np
202
from sklearnex.cluster import KMeans
203
from sklearn.datasets import make_blobs
204
205
# Generate sample data
206
X, _ = make_blobs(n_samples=1000, centers=4, n_features=2,
207
cluster_std=1.0, random_state=42)
208
209
# Create and fit K-means model
210
kmeans = KMeans(n_clusters=4, random_state=42, n_init=10)
211
kmeans.fit(X)
212
213
# Get cluster labels, centers, and inertia
214
labels = kmeans.labels_
215
centers = kmeans.cluster_centers_
216
inertia = kmeans.inertia_
217
218
print(f"Inertia: {inertia:.2f}")
219
print(f"Centers shape: {centers.shape}")
220
221
# Predict clusters for new data
222
new_points = np.array([[1, 2], [3, 4]])
223
new_labels = kmeans.predict(new_points)
224
distances = kmeans.transform(new_points)
225
226
print(f"New point labels: {new_labels}")
227
print(f"Distances to centers: {distances}")
228
```
229
230
### DBSCAN Clustering with Noise Detection
231
232
```python
233
import numpy as np
234
from sklearnex.cluster import DBSCAN
235
from sklearn.datasets import make_blobs
236
237
# Generate data with noise
238
X, _ = make_blobs(n_samples=300, centers=4, n_features=2,
239
random_state=42, cluster_std=0.60)
240
241
# Add noise points
242
noise = np.random.uniform(-6, 6, (50, 2))
243
X = np.vstack([X, noise])
244
245
# Create and fit DBSCAN model
246
dbscan = DBSCAN(eps=0.3, min_samples=10)
247
cluster_labels = dbscan.fit_predict(X)
248
249
# Analyze results
250
n_clusters = len(set(cluster_labels)) - (1 if -1 in cluster_labels else 0)
251
n_noise = list(cluster_labels).count(-1)
252
253
print(f"Estimated number of clusters: {n_clusters}")
254
print(f"Estimated number of noise points: {n_noise}")
255
print(f"Core samples: {len(dbscan.core_sample_indices_)}")
256
257
# Get core samples
258
core_samples = dbscan.components_
259
print(f"Core samples shape: {core_samples.shape}")
260
```
261
262
### Comparison with Standard Scikit-learn
263
264
```python
265
import time
266
import numpy as np
267
from sklearn.datasets import make_blobs
268
269
# Generate large dataset
270
X, _ = make_blobs(n_samples=100000, centers=10, n_features=50, random_state=42)
271
272
# Intel-optimized version
273
from sklearnex.cluster import KMeans as IntelKMeans
274
275
start_time = time.time()
276
intel_kmeans = IntelKMeans(n_clusters=10, random_state=42)
277
intel_kmeans.fit(X)
278
intel_time = time.time() - start_time
279
280
print(f"Intel K-means time: {intel_time:.2f} seconds")
281
print(f"Intel inertia: {intel_kmeans.inertia_:.2f}")
282
283
# Standard scikit-learn version (for comparison)
284
from sklearn.cluster import KMeans as StandardKMeans
285
286
start_time = time.time()
287
standard_kmeans = StandardKMeans(n_clusters=10, random_state=42)
288
standard_kmeans.fit(X)
289
standard_time = time.time() - start_time
290
291
print(f"Standard K-means time: {standard_time:.2f} seconds")
292
print(f"Standard inertia: {standard_kmeans.inertia_:.2f}")
293
print(f"Speedup: {standard_time / intel_time:.1f}x")
294
```
295
296
## Performance Notes
297
298
- K-means shows significant speedups on datasets with >1000 samples
299
- DBSCAN benefits most from Intel optimization on high-dimensional data
300
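- Both algorithms are API-compatible with the scikit-learn implementations and are intended to produce equivalent results; small numerical differences (e.g. from floating-point ordering or random initialization) are possible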
301
- Memory usage is comparable to standard scikit-learn versions