tessl/pypi-scikit-learn-intelex

Intel Extension for Scikit-learn providing hardware-accelerated implementations of scikit-learn algorithms optimized for Intel CPUs and GPUs.

docs/clustering.md

Clustering

High-performance implementations of clustering algorithms with Intel hardware acceleration. These algorithms provide significant speedups for density-based and centroid-based clustering on large datasets.

Capabilities

K-Means Clustering

Intel-accelerated K-means clustering with optimized centroid computation and distance calculations.

class KMeans:
    """
    K-means clustering with Intel optimization.
    
    Provides 10-100x speedup over standard scikit-learn implementation
    through vectorized operations and Intel hardware acceleration.
    """
    
    def __init__(
        self,
        n_clusters=8,
        init='k-means++',
        n_init=10,
        max_iter=300,
        tol=1e-4,
        random_state=None,
        copy_x=True,
        algorithm='auto'
    ):
        """
        Initialize K-means clustering.
        
        Parameters:
            n_clusters (int): Number of clusters to form
            init (str or array): Initialization method ('k-means++', 'random')
            n_init (int): Number of initializations to perform
            max_iter (int): Maximum number of iterations
            tol (float): Tolerance for convergence
            random_state (int): Random state for reproducibility
            copy_x (bool): Whether to copy input data
            algorithm (str): Algorithm to use ('auto', 'full', 'elkan')
        """
    
    def fit(self, X, y=None, sample_weight=None):
        """
        Compute k-means clustering.
        
        Parameters:
            X (array-like): Training data of shape (n_samples, n_features)
            y: Ignored, present for API consistency
            sample_weight (array-like): Sample weights
            
        Returns:
            self: Fitted estimator
        """
    
    def predict(self, X, sample_weight=None):
        """
        Predict cluster labels for samples.
        
        Parameters:
            X (array-like): New data to predict
            sample_weight (array-like): Sample weights
            
        Returns:
            array: Cluster labels for each sample
        """
    
    def fit_predict(self, X, y=None, sample_weight=None):
        """
        Compute clustering and return cluster labels.
        
        Parameters:
            X (array-like): Training data
            y: Ignored
            sample_weight (array-like): Sample weights
            
        Returns:
            array: Cluster labels
        """
    
    def transform(self, X):
        """
        Transform X to cluster-distance space.
        
        Parameters:
            X (array-like): Data to transform
            
        Returns:
            array: Distances to cluster centers
        """
    
    def fit_transform(self, X, y=None, sample_weight=None):
        """
        Compute clustering and transform to cluster-distance space.
        
        Parameters:
            X (array-like): Training data
            y: Ignored
            sample_weight (array-like): Sample weights
            
        Returns:
            array: Distances to cluster centers
        """
    
    def score(self, X, y=None, sample_weight=None):
        """
        Return the negative sum of squared distances to centroids.
        
        Parameters:
            X (array-like): Data to score
            y: Ignored
            sample_weight (array-like): Sample weights
            
        Returns:
            float: Negative inertia score
        """
    
    # Attributes available after fitting
    cluster_centers_: ...  # Cluster centers
    labels_: ...          # Labels of training data
    inertia_: ...         # Sum of squared distances to centroids
    n_iter_: ...          # Number of iterations run
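The `inertia_` attribute is useful beyond scoring a single fit: it supports the elbow heuristic for choosing `n_clusters`. A minimal sketch (shown with stock scikit-learn's `KMeans`, which shares the interface of the drop-in `sklearnex.cluster.KMeans`):

```python
import numpy as np
from sklearn.cluster import KMeans  # sklearnex.cluster.KMeans is a drop-in replacement
from sklearn.datasets import make_blobs

X, _ = make_blobs(n_samples=500, centers=4, n_features=2, random_state=42)

# Fit models across a range of k and record the inertia
# (sum of squared distances of samples to their closest centroid)
ks = range(1, 8)
inertias = []
for k in ks:
    km = KMeans(n_clusters=k, n_init=10, random_state=42)
    km.fit(X)
    inertias.append(km.inertia_)

# Inertia shrinks as k grows; look for the "elbow" where the curve flattens
for k, inertia in zip(ks, inertias):
    print(f"k={k}: inertia={inertia:.1f}")
```

For this four-blob dataset the curve flattens sharply at k=4, the true number of centers.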

DBSCAN Clustering

Density-Based Spatial Clustering of Applications with Noise, optimized for Intel hardware.

class DBSCAN:
    """
    DBSCAN clustering with Intel optimization.
    
    Efficient density-based clustering that finds clusters of varying shapes
    and identifies outliers as noise points.
    """
    
    def __init__(
        self,
        eps=0.5,
        min_samples=5,
        metric='euclidean',
        metric_params=None,
        algorithm='auto',
        leaf_size=30,
        p=None,
        n_jobs=None
    ):
        """
        Initialize DBSCAN clustering.
        
        Parameters:
            eps (float): Maximum distance between samples in same neighborhood
            min_samples (int): Minimum samples in neighborhood for core point
            metric (str): Distance metric to use
            metric_params (dict): Additional parameters for distance metric
            algorithm (str): Algorithm for nearest neighbors computation
            leaf_size (int): Leaf size for tree algorithms
            p (float): Power parameter for Minkowski metric
            n_jobs (int): Number of parallel jobs
        """
    
    def fit(self, X, y=None, sample_weight=None):
        """
        Perform DBSCAN clustering.
        
        Parameters:
            X (array-like): Training data of shape (n_samples, n_features)
            y: Ignored, present for API consistency
            sample_weight (array-like): Sample weights
            
        Returns:
            self: Fitted estimator
        """
    
    def fit_predict(self, X, y=None, sample_weight=None):
        """
        Compute clustering and return cluster labels.
        
        Parameters:
            X (array-like): Training data
            y: Ignored
            sample_weight (array-like): Sample weights
            
        Returns:
            array: Cluster labels (-1 for noise points)
        """
    
    # Attributes available after fitting
    labels_: ...              # Cluster labels (-1 for noise)
    core_sample_indices_: ... # Indices of core samples
    components_: ...          # Core samples
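A common way to pick `eps` is the k-distance heuristic: compute each point's distance to its `min_samples`-th nearest neighbor, sort those distances, and look for a knee in the curve. A rough sketch using scikit-learn's `NearestNeighbors` (the percentile cutoff below is only a crude stand-in for visually locating the knee):

```python
import numpy as np
from sklearn.neighbors import NearestNeighbors
from sklearn.datasets import make_blobs

X, _ = make_blobs(n_samples=300, centers=4, random_state=42, cluster_std=0.60)

min_samples = 10
# kneighbors includes each point itself (distance 0) as its first neighbor,
# so the last column approximates the distance to the min_samples-th neighbor
nn = NearestNeighbors(n_neighbors=min_samples).fit(X)
distances, _ = nn.kneighbors(X)
k_distances = np.sort(distances[:, -1])

# The knee of the sorted curve suggests eps; a high percentile is a crude proxy
eps_guess = float(np.percentile(k_distances, 90))
print(f"Suggested eps ~= {eps_guess:.2f}")
```

Points whose k-distance is well above the suggested `eps` are the ones DBSCAN tends to label as noise (`-1`).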

Usage Examples

Basic K-Means Clustering

import numpy as np
from sklearnex.cluster import KMeans
from sklearn.datasets import make_blobs

# Generate sample data
X, _ = make_blobs(n_samples=1000, centers=4, n_features=2, 
                  cluster_std=1.0, random_state=42)

# Create and fit K-means model
kmeans = KMeans(n_clusters=4, random_state=42, n_init=10)
kmeans.fit(X)

# Get cluster labels and centers
labels = kmeans.labels_
centers = kmeans.cluster_centers_
inertia = kmeans.inertia_

print(f"Inertia: {inertia:.2f}")
print(f"Centers shape: {centers.shape}")

# Predict clusters for new data
new_points = np.array([[1, 2], [3, 4]])
new_labels = kmeans.predict(new_points)
distances = kmeans.transform(new_points)

print(f"New point labels: {new_labels}")
print(f"Distances to centers: {distances}")
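The example above does not pass `sample_weight`, which `fit` accepts to let some points count more heavily during centroid updates. A minimal sketch of the effect (stock scikit-learn `KMeans` shown, which exposes the same parameter):

```python
import numpy as np
from sklearn.cluster import KMeans  # same fit(X, sample_weight=...) signature

# Three 1-D points; the third is weighted heavily
X = np.array([[0.0], [1.0], [4.0]])
weights = np.array([1.0, 1.0, 6.0])

# With a single cluster, the centroid is the weighted mean of the points:
# (1*0 + 1*1 + 6*4) / 8 = 3.125
km = KMeans(n_clusters=1, n_init=1, random_state=0)
km.fit(X, sample_weight=weights)
print(km.cluster_centers_)  # approximately [[3.125]]
```

With uniform weights the centroid would sit at the plain mean (about 1.67); the heavy weight on the third point pulls it toward 4.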

DBSCAN Clustering with Noise Detection

import numpy as np
from sklearnex.cluster import DBSCAN
from sklearn.datasets import make_blobs

# Generate data with noise
X, _ = make_blobs(n_samples=300, centers=4, n_features=2,
                  random_state=42, cluster_std=0.60)

# Add uniformly distributed noise points (seeded for reproducibility)
rng = np.random.default_rng(42)
noise = rng.uniform(-6, 6, (50, 2))
X = np.vstack([X, noise])

# Create and fit DBSCAN model
dbscan = DBSCAN(eps=0.3, min_samples=10)
cluster_labels = dbscan.fit_predict(X)

# Analyze results
n_clusters = len(set(cluster_labels)) - (1 if -1 in cluster_labels else 0)
n_noise = list(cluster_labels).count(-1)

print(f"Estimated number of clusters: {n_clusters}")
print(f"Estimated number of noise points: {n_noise}")
print(f"Core samples: {len(dbscan.core_sample_indices_)}")

# Get core samples
core_samples = dbscan.components_
print(f"Core samples shape: {core_samples.shape}")

Comparison with Standard Scikit-learn

import time
import numpy as np
from sklearn.datasets import make_blobs

# Generate large dataset
X, _ = make_blobs(n_samples=100000, centers=10, n_features=50, random_state=42)

# Intel-optimized version
from sklearnex.cluster import KMeans as IntelKMeans

start_time = time.time()
intel_kmeans = IntelKMeans(n_clusters=10, random_state=42)
intel_kmeans.fit(X)
intel_time = time.time() - start_time

print(f"Intel K-means time: {intel_time:.2f} seconds")
print(f"Intel inertia: {intel_kmeans.inertia_:.2f}")

# Standard scikit-learn version (for comparison)
from sklearn.cluster import KMeans as StandardKMeans

start_time = time.time()
standard_kmeans = StandardKMeans(n_clusters=10, random_state=42)
standard_kmeans.fit(X)
standard_time = time.time() - start_time

print(f"Standard K-means time: {standard_time:.2f} seconds")
print(f"Standard inertia: {standard_kmeans.inertia_:.2f}")
print(f"Speedup: {standard_time / intel_time:.1f}x")

Performance Notes

  • K-means shows significant speedups on datasets with >1000 samples
  • DBSCAN benefits most from Intel optimization on high-dimensional data
  • Both algorithms produce results equivalent to the scikit-learn implementations (identical up to floating-point rounding)
  • Memory usage is comparable to standard scikit-learn versions

Install with Tessl CLI

npx tessl i tessl/pypi-scikit-learn-intelex
