or run

npx @tessl/cli init
Log in

Version

Tile

Overview

Evals

Files

Files

docs

advanced.md clustering.md daal4py-mb.md decomposition.md ensemble.md index.md linear-models.md metrics-model-selection.md neighbors.md patching-config.md stats-manifold.md svm.md

docs/clustering.md

0

# Clustering

1

2

High-performance implementations of clustering algorithms with Intel hardware acceleration. These algorithms provide significant speedups for density-based and centroid-based clustering on large datasets.

3

4

## Capabilities

5

6

### K-Means Clustering

7

8

Intel-accelerated K-means clustering with optimized centroid computation and distance calculations.

9

10

```python { .api }

11

class KMeans:

12

"""

13

K-means clustering with Intel optimization.

14

15

Provides 10-100x speedup over standard scikit-learn implementation

16

through vectorized operations and Intel hardware acceleration.

17

"""

18

19

def __init__(

20

self,

21

n_clusters=8,

22

init='k-means++',

23

n_init=10,

24

max_iter=300,

25

tol=1e-4,

26

random_state=None,

27

copy_x=True,

28

algorithm='auto'

29

):

30

"""

31

Initialize K-means clustering.

32

33

Parameters:

34

n_clusters (int): Number of clusters to form

35

init (str or array): Initialization method ('k-means++', 'random')

36

n_init (int): Number of initializations to perform

37

max_iter (int): Maximum number of iterations

38

tol (float): Tolerance for convergence

39

random_state (int): Random state for reproducibility

40

copy_x (bool): Whether to copy input data

41

algorithm (str): Algorithm to use ('auto', 'full', 'elkan')

42

"""

43

44

def fit(self, X, y=None, sample_weight=None):

45

"""

46

Compute k-means clustering.

47

48

Parameters:

49

X (array-like): Training data of shape (n_samples, n_features)

50

y: Ignored, present for API consistency

51

sample_weight (array-like): Sample weights

52

53

Returns:

54

self: Fitted estimator

55

"""

56

57

def predict(self, X, sample_weight=None):

58

"""

59

Predict cluster labels for samples.

60

61

Parameters:

62

X (array-like): New data to predict

63

sample_weight (array-like): Sample weights

64

65

Returns:

66

array: Cluster labels for each sample

67

"""

68

69

def fit_predict(self, X, y=None, sample_weight=None):

70

"""

71

Compute clustering and return cluster labels.

72

73

Parameters:

74

X (array-like): Training data

75

y: Ignored

76

sample_weight (array-like): Sample weights

77

78

Returns:

79

array: Cluster labels

80

"""

81

82

def transform(self, X):

83

"""

84

Transform X to cluster-distance space.

85

86

Parameters:

87

X (array-like): Data to transform

88

89

Returns:

90

array: Distances to cluster centers

91

"""

92

93

def fit_transform(self, X, y=None, sample_weight=None):

94

"""

95

Compute clustering and transform to cluster-distance space.

96

97

Parameters:

98

X (array-like): Training data

99

y: Ignored

100

sample_weight (array-like): Sample weights

101

102

Returns:

103

array: Distances to cluster centers

104

"""

105

106

def score(self, X, y=None, sample_weight=None):

107

"""

108

Return the negative sum of squared distances to centroids.

109

110

Parameters:

111

X (array-like): Data to score

112

y: Ignored

113

sample_weight (array-like): Sample weights

114

115

Returns:

116

float: Negative inertia score

117

"""

118

119

# Attributes available after fitting

120

cluster_centers_: ... # Cluster centers

121

labels_: ... # Labels of training data

122

inertia_: ... # Sum of squared distances to centroids

123

n_iter_: ... # Number of iterations run

124

```

125

126

### DBSCAN Clustering

127

128

Density-Based Spatial Clustering of Applications with Noise, optimized for Intel hardware.

129

130

```python { .api }

131

class DBSCAN:

132

"""

133

DBSCAN clustering with Intel optimization.

134

135

Efficient density-based clustering that finds clusters of varying shapes

136

and identifies outliers as noise points.

137

"""

138

139

def __init__(

140

self,

141

eps=0.5,

142

min_samples=5,

143

metric='euclidean',

144

metric_params=None,

145

algorithm='auto',

146

leaf_size=30,

147

p=None,

148

n_jobs=None

149

):

150

"""

151

Initialize DBSCAN clustering.

152

153

Parameters:

154

eps (float): Maximum distance between samples in same neighborhood

155

min_samples (int): Minimum samples in neighborhood for core point

156

metric (str): Distance metric to use

157

metric_params (dict): Additional parameters for distance metric

158

algorithm (str): Algorithm for nearest neighbors computation

159

leaf_size (int): Leaf size for tree algorithms

160

p (float): Power parameter for Minkowski metric

161

n_jobs (int): Number of parallel jobs

162

"""

163

164

def fit(self, X, y=None, sample_weight=None):

165

"""

166

Perform DBSCAN clustering.

167

168

Parameters:

169

X (array-like): Training data of shape (n_samples, n_features)

170

y: Ignored, present for API consistency

171

sample_weight (array-like): Sample weights

172

173

Returns:

174

self: Fitted estimator

175

"""

176

177

def fit_predict(self, X, y=None, sample_weight=None):

178

"""

179

Compute clustering and return cluster labels.

180

181

Parameters:

182

X (array-like): Training data

183

y: Ignored

184

sample_weight (array-like): Sample weights

185

186

Returns:

187

array: Cluster labels (-1 for noise points)

188

"""

189

190

# Attributes available after fitting

191

labels_: ... # Cluster labels (-1 for noise)

192

core_sample_indices_: ... # Indices of core samples

193

components_: ... # Core samples

194

```

195

196

## Usage Examples

197

198

### Basic K-Means Clustering

199

200

```python

201

import numpy as np

202

from sklearnex.cluster import KMeans

203

from sklearn.datasets import make_blobs

204

205

# Generate sample data

206

X, _ = make_blobs(n_samples=1000, centers=4, n_features=2,

207

cluster_std=1.0, random_state=42)

208

209

# Create and fit K-means model

210

kmeans = KMeans(n_clusters=4, random_state=42, n_init=10)

211

kmeans.fit(X)

212

213

# Get cluster labels and centers

214

labels = kmeans.labels_

215

centers = kmeans.cluster_centers_

216

inertia = kmeans.inertia_

217

218

print(f"Inertia: {inertia:.2f}")

219

print(f"Centers shape: {centers.shape}")

220

221

# Predict clusters for new data

222

new_points = np.array([[1, 2], [3, 4]])

223

new_labels = kmeans.predict(new_points)

224

distances = kmeans.transform(new_points)

225

226

print(f"New point labels: {new_labels}")

227

print(f"Distances to centers: {distances}")

228

```

229

230

### DBSCAN Clustering with Noise Detection

231

232

```python

233

import numpy as np

234

from sklearnex.cluster import DBSCAN

235

from sklearn.datasets import make_blobs

236

237

# Generate data with noise

238

X, _ = make_blobs(n_samples=300, centers=4, n_features=2,

239

random_state=42, cluster_std=0.60)

240

241

# Add noise points

242

noise = np.random.uniform(-6, 6, (50, 2))

243

X = np.vstack([X, noise])

244

245

# Create and fit DBSCAN model

246

dbscan = DBSCAN(eps=0.3, min_samples=10)

247

cluster_labels = dbscan.fit_predict(X)

248

249

# Analyze results

250

n_clusters = len(set(cluster_labels)) - (1 if -1 in cluster_labels else 0)

251

n_noise = list(cluster_labels).count(-1)

252

253

print(f"Estimated number of clusters: {n_clusters}")

254

print(f"Estimated number of noise points: {n_noise}")

255

print(f"Core samples: {len(dbscan.core_sample_indices_)}")

256

257

# Get core samples

258

core_samples = dbscan.components_

259

print(f"Core samples shape: {core_samples.shape}")

260

```

261

262

### Comparison with Standard Scikit-learn

263

264

```python

265

import time

266

import numpy as np

267

from sklearn.datasets import make_blobs

268

269

# Generate large dataset

270

X, _ = make_blobs(n_samples=100000, centers=10, n_features=50, random_state=42)

271

272

# Intel-optimized version

273

from sklearnex.cluster import KMeans as IntelKMeans

274

275

start_time = time.time()

276

intel_kmeans = IntelKMeans(n_clusters=10, random_state=42)

277

intel_kmeans.fit(X)

278

intel_time = time.time() - start_time

279

280

print(f"Intel K-means time: {intel_time:.2f} seconds")

281

print(f"Intel inertia: {intel_kmeans.inertia_:.2f}")

282

283

# Standard scikit-learn version (for comparison)

284

from sklearn.cluster import KMeans as StandardKMeans

285

286

start_time = time.time()

287

standard_kmeans = StandardKMeans(n_clusters=10, random_state=42)

288

standard_kmeans.fit(X)

289

standard_time = time.time() - start_time

290

291

print(f"Standard K-means time: {standard_time:.2f} seconds")

292

print(f"Standard inertia: {standard_kmeans.inertia_:.2f}")

293

print(f"Speedup: {standard_time / intel_time:.1f}x")

294

```

295

296

## Performance Notes

297

298

- K-means shows significant speedups on datasets with more than 1,000 samples

299

- DBSCAN benefits most from Intel optimization on high-dimensional data

300

- Both algorithms produce results identical to those of the scikit-learn implementations

301

- Memory usage is comparable to standard scikit-learn versions