Intel Extension for Scikit-learn providing hardware-accelerated implementations of scikit-learn algorithms optimized for Intel CPUs and GPUs.
—
Advanced capabilities including preview APIs and distributed computing with SPMD (Single Program Multiple Data) support. These features provide cutting-edge optimizations and enable distributed machine learning on large-scale datasets.
Preview features are experimental implementations that provide early access to new algorithms and optimizations. Enable preview features by setting the SKLEARNEX_PREVIEW environment variable.
export SKLEARNEX_PREVIEW=1

Advanced utilities for accessing Intel oneDAL hyperparameters for specific algorithms and operations.
def get_hyperparameters(algorithm: str, op: str):
"""
Get the hyperparameter object for a specific Intel oneDAL algorithm operation.
Provides access to low-level hyperparameters for Intel oneDAL algorithms,
allowing fine-tuning of algorithm behavior and performance characteristics.
Parameters:
algorithm (str): Algorithm name (e.g., 'linear_regression', 'covariance')
op (str): Operation name (e.g., 'train', 'compute')
Returns:
HyperParameters: Object with algorithm-specific hyperparameters
None: If oneDAL version < 2024.0.0
Raises:
KeyError: If algorithm/operation combination is not supported
Example:
from sklearnex import get_hyperparameters
# Get hyperparameters for linear regression training
hparams = get_hyperparameters('linear_regression', 'train')
if hparams is not None:
# Access hyperparameter values
current_params = hparams.to_dict()
print(f"Current parameters: {current_params}")
# Modify hyperparameters (if setters available)
# hparams.some_parameter = new_value
"""Currently supported hyperparameter combinations:
# Linear regression training hyperparameters
linear_hparams = get_hyperparameters('linear_regression', 'train')
# Covariance computation hyperparameters (like above, returns None on oneDAL < 2024.0.0)
cov_hparams = get_hyperparameters('covariance', 'compute')

Core utility functions for array handling and validation with Intel optimization.
def get_namespace(x, xp=None):
"""
Get array namespace for input arrays.
Determines the appropriate array namespace (NumPy, CuPy, etc.)
for the given input arrays, enabling cross-library compatibility.
Parameters:
x (array-like): Input array to determine namespace for
xp (module, optional): Preferred array namespace module; presumably used
instead of auto-detection when provided -- TODO confirm against implementation
Returns:
module: Array namespace module (numpy, cupy, etc.)
Example:
from sklearnex.utils import get_namespace
import numpy as np
data = np.array([[1, 2], [3, 4]])
xp = get_namespace(data)
# xp will be numpy module
result = xp.mean(data, axis=0)
"""
def _assert_all_finite(X, allow_nan: bool = False, msg_dtype=None):
"""
Assert that all values in array are finite.
Validates that input arrays contain only finite values,
with Intel-optimized checking for large arrays.
Parameters:
X (array-like): Input array to validate
allow_nan (bool): Whether to allow NaN values (infinities still rejected)
msg_dtype (str): Data type name for error messages
Raises:
ValueError: If array contains non-finite values
Example:
from sklearnex.utils import _assert_all_finite
import numpy as np
# Valid array - no error
valid_data = np.array([[1.0, 2.0], [3.0, 4.0]])
_assert_all_finite(valid_data)
# Invalid array - raises ValueError
invalid_data = np.array([[1.0, np.inf], [3.0, 4.0]])
# _assert_all_finite(invalid_data) # Would raise ValueError
"""Enhanced K-means implementation with advanced optimization techniques.
from sklearnex.preview.cluster import KMeans
class KMeans:
"""
Preview K-means clustering with advanced optimizations.
Features experimental improvements including better initialization,
adaptive convergence criteria, and enhanced memory efficiency.
"""
# Signature mirrors scikit-learn's KMeans -- TODO confirm parameter semantics match
def __init__(
self,
n_clusters=8,  # number of clusters (and centroids) to form
init='k-means++',  # centroid initialization scheme
n_init=10,  # number of runs with different centroid seeds
max_iter=300,  # maximum iterations per run
tol=1e-4,  # convergence tolerance -- presumably on inertia change; confirm
random_state=None,  # seed for reproducible initialization
copy_x=True,  # work on a copy of the input data
algorithm='auto'  # underlying K-means variant selection
):
"""Enhanced K-means with experimental optimizations."""Advanced covariance estimation with improved numerical stability.
from sklearnex.preview.covariance import EmpiricalCovariance
class EmpiricalCovariance:
"""
Preview empirical covariance with enhanced numerical methods.
Provides improved stability for high-dimensional and near-singular
covariance matrices through advanced regularization techniques.
"""
def __init__(
self,
store_precision=True,  # also compute and store the precision (inverse covariance) matrix
assume_centered=False  # if True, data is not centered before estimation
):
"""Enhanced empirical covariance estimation."""Advanced incremental Principal Component Analysis implementation.
from sklearnex.preview.decomposition import IncrementalPCA
class IncrementalPCA:
"""
Preview Incremental PCA with memory and computational optimizations.
Enhanced version supporting larger batch sizes and improved
numerical stability for streaming high-dimensional data.
"""
def __init__(
self,
n_components=None,  # number of components to keep; None keeps all
whiten=False,  # scale components to unit variance when transforming
copy=True,  # avoid overwriting the input data
batch_size=None  # samples per incremental batch; None lets the estimator choose
):
"""Advanced incremental PCA implementation."""Enhanced Ridge regression with advanced solver algorithms.
from sklearnex.preview.linear_model import Ridge
class Ridge:
"""
Preview Ridge regression with experimental solver improvements.
Features advanced optimization techniques for better convergence
and handling of ill-conditioned problems.
"""
def __init__(
self,
alpha=1.0,  # L2 regularization strength
fit_intercept=True,  # learn an intercept term
normalize='deprecated',  # kept for scikit-learn API compatibility; deprecated upstream
copy_X=True,  # work on a copy of X
max_iter=None,  # solver iteration cap; None uses the solver's default
tol=1e-3,  # solver convergence tolerance
solver='auto',  # solver selection
positive=False,  # constrain coefficients to be non-negative -- TODO confirm support here
random_state=None  # seed for stochastic solvers
):
"""Enhanced Ridge regression with advanced solvers."""SPMD provides distributed computing capabilities for large-scale machine learning across multiple nodes. Requires OneDAL SPMD backend and appropriate distributed computing environment.
# SPMD requires distributed computing setup
# Example with mpi4py (Message Passing Interface)
from mpi4py import MPI
import os
# Initialize MPI environment
comm = MPI.COMM_WORLD
rank = comm.Get_rank()
size = comm.Get_size()
# Ensure OneDAL SPMD is available
# NOTE(review): confirm 'ONEAPI_DAAL_SPMD' is the variable oneDAL actually reads
os.environ['ONEAPI_DAAL_SPMD'] = '1'
# Import SPMD modules after MPI setup
from sklearnex.spmd import patch_sklearn
patch_sklearn()

from sklearnex.spmd.basic_statistics import BasicStatistics
class BasicStatistics:
"""
Distributed basic statistics computation across multiple nodes.
Automatically partitions data across available MPI processes and
aggregates results for scalable statistical analysis.
"""
def fit(self, X, y=None):
"""
Compute statistics on distributed data.
Each MPI rank processes its portion of data, with automatic
aggregation of results across all nodes.
X is this rank's local partition; y is presumably ignored and kept
only for scikit-learn API compatibility -- TODO confirm.
"""from sklearnex.spmd.cluster import KMeans, DBSCAN
# SPMD interface stubs: distributed clustering estimators.
class KMeans:
"""
Distributed K-means clustering across multiple nodes.
Scales to very large datasets by distributing computation
and coordinating centroid updates across MPI processes.
"""
class DBSCAN:
"""
Distributed DBSCAN clustering for large-scale density analysis.
Enables clustering of massive datasets through distributed
density computation and neighbor finding.
"""from sklearnex.spmd.linear_model import LinearRegression, LogisticRegression
# SPMD interface stubs: distributed linear models.
class LinearRegression:
"""
Distributed linear regression using distributed gradient computation.
Scales to massive datasets through distributed normal equation
or gradient-based solving across multiple nodes.
"""
class LogisticRegression:
"""
Distributed logistic regression with distributed gradient descent.
Handles very large classification problems through distributed
optimization and coordinated parameter updates.
"""from sklearnex.spmd.ensemble import RandomForestClassifier, RandomForestRegressor
# SPMD interface stubs: distributed tree ensembles.
class RandomForestClassifier:
"""
Distributed Random Forest classification.
Distributes tree construction across nodes while maintaining
ensemble diversity and prediction accuracy.
"""
class RandomForestRegressor:
"""
Distributed Random Forest regression.
Scales tree ensemble training to very large datasets through
distributed bootstrap sampling and tree building.
"""import os
import numpy as np
# Example script: exercises each preview estimator end-to-end on synthetic data.
# Enable preview features
# NOTE(review): SKLEARNEX_PREVIEW must be set before the preview imports below.
os.environ['SKLEARNEX_PREVIEW'] = '1'
from sklearnex.preview.cluster import KMeans as PreviewKMeans
from sklearnex.preview.covariance import EmpiricalCovariance as PreviewCovariance
from sklearnex.preview.decomposition import IncrementalPCA as PreviewIPCA
from sklearnex.preview.linear_model import Ridge as PreviewRidge
from sklearn.datasets import make_blobs, make_regression
# Preview K-Means Example
print("Testing Preview K-Means:")
X_kmeans, _ = make_blobs(n_samples=2000, centers=5, n_features=20, random_state=42)
preview_kmeans = PreviewKMeans(n_clusters=5, random_state=42)
preview_kmeans.fit(X_kmeans)
print(f"Preview K-means inertia: {preview_kmeans.inertia_:.2f}")
print(f"Cluster centers shape: {preview_kmeans.cluster_centers_.shape}")
# Preview Empirical Covariance Example
print("\nTesting Preview Empirical Covariance:")
X_cov = np.random.randn(1000, 50)
preview_cov = PreviewCovariance(store_precision=True)
preview_cov.fit(X_cov)
print(f"Covariance matrix shape: {preview_cov.covariance_.shape}")
print(f"Precision matrix available: {hasattr(preview_cov, 'precision_')}")
print(f"Log-likelihood: {preview_cov.score(X_cov[:100]):.2f}")
# Preview Incremental PCA Example
print("\nTesting Preview Incremental PCA:")
X_pca = np.random.randn(2000, 100)
preview_ipca = PreviewIPCA(n_components=20, batch_size=200)
# Fit in batches (step matches batch_size so each partial_fit sees one full batch)
for i in range(0, X_pca.shape[0], 200):
batch = X_pca[i:i+200]
preview_ipca.partial_fit(batch)
# Transform data
X_transformed = preview_ipca.transform(X_pca[:500])
print(f"Transformed data shape: {X_transformed.shape}")
print(f"Explained variance ratio sum: {preview_ipca.explained_variance_ratio_.sum():.3f}")
# Preview Ridge Regression Example
print("\nTesting Preview Ridge:")
X_ridge, y_ridge = make_regression(n_samples=1500, n_features=50, noise=0.1, random_state=42)
preview_ridge = PreviewRidge(alpha=1.0, solver='auto')
preview_ridge.fit(X_ridge, y_ridge)
print(f"Ridge R² score: {preview_ridge.score(X_ridge, y_ridge):.3f}")
print(f"Coefficients shape: {preview_ridge.coef_.shape}")# Note: This example requires MPI environment and multiple processes
# Run with: mpirun -n 4 python spmd_example.py
# SPMD demo: each MPI rank fits on its own local shard of synthetic data.
try:
from mpi4py import MPI
import numpy as np
# Initialize MPI
comm = MPI.COMM_WORLD
rank = comm.Get_rank()
size = comm.Get_size()
print(f"Process {rank} of {size} started")
# Enable SPMD mode
import os
os.environ['ONEAPI_DAAL_SPMD'] = '1'
from sklearnex.spmd.basic_statistics import BasicStatistics as SPMDStats
from sklearnex.spmd.cluster import KMeans as SPMDKMeans
from sklearnex.spmd.linear_model import LinearRegression as SPMDLinear
# Generate distributed data (each process has its portion)
np.random.seed(42 + rank) # Different seed per process
local_samples = 2500 # Samples per process
n_features = 30
X_local = np.random.randn(local_samples, n_features)
y_local = np.random.randn(local_samples)
# Only rank 0 prints, to avoid duplicated output across processes.
if rank == 0:
print(f"Total dataset: {size * local_samples} samples, {n_features} features")
print(f"Each process handles: {local_samples} samples")
# Distributed Basic Statistics
if rank == 0:
print("\n=== Distributed Basic Statistics ===")
spmd_stats = SPMDStats(result_options='all')
spmd_stats.fit(X_local)
if rank == 0:
print(f"Global mean computed: {spmd_stats.mean_[:5]}...") # Show first 5
print(f"Global variance computed: {spmd_stats.variance_[:5]}...")
print(f"Total samples processed: {spmd_stats.n_samples_seen_}")
# Distributed K-Means
if rank == 0:
print("\n=== Distributed K-Means ===")
spmd_kmeans = SPMDKMeans(n_clusters=8, random_state=42)
spmd_kmeans.fit(X_local)
if rank == 0:
print(f"Global inertia: {spmd_kmeans.inertia_:.2f}")
print(f"Cluster centers shape: {spmd_kmeans.cluster_centers_.shape}")
# Distributed Linear Regression
if rank == 0:
print("\n=== Distributed Linear Regression ===")
spmd_linear = SPMDLinear()
spmd_linear.fit(X_local, y_local)
if rank == 0:
print(f"Global coefficients computed: {spmd_linear.coef_[:5]}...")
print(f"Intercept: {spmd_linear.intercept_:.4f}")
# Performance comparison (simulate)
if rank == 0:
print(f"\n=== Performance Summary ===")
print(f"Distributed processing across {size} processes")
print(f"Each process: {local_samples} samples")
print(f"Total effective dataset: {size * local_samples} samples")
print(f"Memory per process: ~{X_local.nbytes / 1024**2:.1f} MB")
print(f"Total memory distributed: ~{size * X_local.nbytes / 1024**2:.1f} MB")
except ImportError:
print("MPI not available. SPMD examples require mpi4py and MPI environment.")
print("Install with: pip install mpi4py")
print("Run with: mpirun -n 4 python script.py")
# Fallback: Show SPMD API without execution
print("\nSPMD API available for:")
try:
from sklearnex import spmd
print("- Basic Statistics (distributed)")
print("- Clustering (KMeans, DBSCAN)")
print("- Linear Models (LinearRegression, LogisticRegression)")
print("- Ensemble Methods (RandomForest)")
print("- Decomposition (PCA)")
print("- Covariance (EmpiricalCovariance)")
print("- Neighbors (KNeighbors)")
except ImportError as e:
print(f"SPMD modules not available: {e}")# Advanced example combining Preview and SPMD features
import os
import numpy as np
# Hybrid workflow: distributed statistics -> local standardization -> distributed clustering.
# Enable both preview and SPMD
os.environ['SKLEARNEX_PREVIEW'] = '1'
os.environ['ONEAPI_DAAL_SPMD'] = '1'
try:
from mpi4py import MPI
comm = MPI.COMM_WORLD
rank = comm.Get_rank()
size = comm.Get_size()
# Generate large-scale synthetic dataset (different seed per rank -> different shard)
np.random.seed(42 + rank)
local_samples = 5000
n_features = 100
X_local = np.random.randn(local_samples, n_features)
if rank == 0:
print("=== Hybrid Preview + SPMD Workflow ===")
print(f"Dataset: {size * local_samples} samples, {n_features} features")
print(f"Processes: {size}")
# Step 1: Distributed statistics with SPMD
from sklearnex.spmd.basic_statistics import BasicStatistics
stats = BasicStatistics(result_options=['mean', 'variance'])
stats.fit(X_local)
if rank == 0:
print(f"\nStep 1 - Global Statistics:")
print(f"Mean range: [{stats.mean_.min():.3f}, {stats.mean_.max():.3f}]")
print(f"Variance range: [{stats.variance_.min():.3f}, {stats.variance_.max():.3f}]")
# Step 2: Local preprocessing with Preview features
# Standardize using global statistics
X_standardized = (X_local - stats.mean_) / np.sqrt(stats.variance_)
# Step 3: Distributed clustering with enhanced algorithm
from sklearnex.spmd.cluster import KMeans
kmeans = KMeans(n_clusters=10, n_init=3, random_state=42)
kmeans.fit(X_standardized)
if rank == 0:
print(f"\nStep 2 - Distributed Clustering:")
print(f"Global inertia: {kmeans.inertia_:.2f}")
print(f"Iterations: {kmeans.n_iter_}")
# Step 4: Local analysis on cluster assignments
local_labels = kmeans.predict(X_standardized)
local_cluster_counts = np.bincount(local_labels, minlength=10)
# Aggregate cluster counts across all processes
# NOTE(review): comm.allreduce on a NumPy array uses the pickle-based path;
# elementwise MPI.SUM presumably applies -- confirm vs comm.Allreduce.
global_cluster_counts = comm.allreduce(local_cluster_counts, op=MPI.SUM)
if rank == 0:
print(f"\nStep 3 - Global Cluster Analysis:")
for i, count in enumerate(global_cluster_counts):
percentage = 100 * count / (size * local_samples)
print(f"Cluster {i}: {count} samples ({percentage:.1f}%)")
if rank == 0:
print(f"\nWorkflow completed successfully!")
print(f"Total computation distributed across {size} processes")
except ImportError as e:
print(f"Advanced features require MPI: {e}")
print("This example demonstrates the potential of combining:")
print("- Preview APIs for enhanced algorithms")
print("- SPMD for distributed computation")
print("- Hybrid workflows for large-scale ML")import os
import sys
def setup_advanced_features():
    """Enable preview/SPMD modes and report which advanced features are usable.

    Prints a configuration report covering preview-module availability,
    MPI/SPMD readiness, an oneDAL installation hint, and the common
    threading environment variables. All output goes to stdout; returns None.
    """
    print("=== Advanced Features Configuration ===")

    # Preview API: opt in via environment variable, then probe the modules.
    os.environ['SKLEARNEX_PREVIEW'] = '1'
    print("✓ Preview API enabled")
    try:
        from sklearnex import preview
    except ImportError as e:
        print(f"✗ Preview modules error: {e}")
    else:
        print("✓ Preview modules available:")
        for feature_line in (
            " - preview.cluster (enhanced K-means)",
            " - preview.covariance (advanced covariance)",
            " - preview.decomposition (enhanced PCA)",
            " - preview.linear_model (improved Ridge)",
        ):
            print(feature_line)

    # SPMD: requires mpi4py; only enable the mode once MPI is confirmed.
    try:
        from mpi4py import MPI
        comm = MPI.COMM_WORLD
        rank = comm.Get_rank()
        size = comm.Get_size()
        print(f"✓ MPI available: rank {rank} of {size}")
        os.environ['ONEAPI_DAAL_SPMD'] = '1'
        print("✓ SPMD mode enabled")
        try:
            from sklearnex import spmd
        except ImportError as e:
            print(f"✗ SPMD modules error: {e}")
        else:
            print("✓ SPMD modules available:")
            for module_line in (
                " - spmd.basic_statistics",
                " - spmd.cluster",
                " - spmd.linear_model",
                " - spmd.ensemble",
                " - spmd.decomposition",
            ):
                print(module_line)
    except ImportError:
        print("✗ MPI not available (install mpi4py for SPMD)")

    # oneDAL installation hint (DALROOT is typically set by oneAPI env scripts).
    dalroot = os.environ.get('DALROOT')
    if dalroot:
        print(f"✓ OneDAL root: {dalroot}")
    else:
        print("ℹ OneDAL root not set (may use system installation)")

    # Host / interpreter summary plus the usual threading knobs.
    print("\nSystem Information:")
    print(f"Python version: {sys.version}")
    print(f"Available CPU cores: {os.cpu_count()}")
    for var in ('OMP_NUM_THREADS', 'MKL_NUM_THREADS', 'NUMBA_NUM_THREADS'):
        value = os.environ.get(var, 'not set')
        print(f"{var}: {value}")
if __name__ == "__main__":
setup_advanced_features()

Install with the Tessl CLI:
npx tessl i tessl/pypi-scikit-learn-intelex