Intel Extension for Scikit-learn providing hardware-accelerated implementations of scikit-learn algorithms optimized for Intel CPUs and GPUs.

High-performance implementations of statistical analysis and manifold learning algorithms with Intel hardware acceleration. These algorithms provide significant speedups for statistical computations and dimensionality reduction on large datasets.

Intel-accelerated computation of basic statistical metrics with vectorized operations for large datasets.
class BasicStatistics:
    """
    Basic statistics computation with Intel optimization.

    Provides efficient computation of fundamental statistical metrics
    including mean, variance, covariance, correlation, and quantiles.
    """

    def __init__(
        self,
        result_options='all',
        algorithm='by_default'
    ):
        """
        Initialize BasicStatistics estimator.

        Parameters:
            result_options (str or list): Statistics to compute
                ('all', 'mean', 'variance', 'variation', 'sum', 'sum_squares',
                'sum_squares_centered', 'second_order_raw_moment', 'min', 'max')
            algorithm (str): Algorithm implementation to use
        """

    def fit(self, X, y=None):
        """
        Compute basic statistics for the input data.

        Parameters:
            X (array-like): Input data of shape (n_samples, n_features)
            y: Ignored, present for API consistency

        Returns:
            self: Fitted estimator with computed statistics
        """

    def partial_fit(self, X, y=None):
        """
        Update statistics with new batch of data.

        Parameters:
            X (array-like): New batch of data
            y: Ignored

        Returns:
            self: Updated estimator
        """

    def finalize_fit(self):
        """
        Finalize the computation of statistics.

        Returns:
            self: Finalized estimator
        """

    # Attributes available after fitting
    min_: ...  # Minimum values per feature
    max_: ...  # Maximum values per feature
    sum_: ...  # Sum of values per feature
    mean_: ...  # Mean values per feature
    variance_: ...  # Variance per feature
    variation_: ...  # Coefficient of variation per feature
    sum_squares_: ...  # Sum of squares per feature
    sum_squares_centered_: ...  # Centered sum of squares per feature
    second_order_raw_moment_: ...  # Second order raw moments
    n_samples_seen_: ...  # Number of samples processed

# Intel-accelerated incremental computation of basic statistics for streaming data.
class IncrementalBasicStatistics:
    """
    Incremental basic statistics with Intel optimization.

    Enables efficient online computation of statistical metrics
    for streaming data or datasets that don't fit in memory.
    """

    def __init__(
        self,
        result_options='all',
        algorithm='by_default'
    ):
        """
        Initialize IncrementalBasicStatistics estimator.

        Parameters:
            result_options (str or list): Statistics to compute
                ('all', 'mean', 'variance', 'variation', 'sum', 'sum_squares',
                'sum_squares_centered', 'second_order_raw_moment', 'min', 'max')
            algorithm (str): Algorithm implementation to use
        """

    def partial_fit(self, X, y=None):
        """
        Update statistics incrementally with new data batch.

        Parameters:
            X (array-like): New batch of data
            y: Ignored, present for API consistency

        Returns:
            self: Updated estimator
        """

    def fit(self, X, y=None):
        """
        Compute statistics for input data (equivalent to single partial_fit).

        Parameters:
            X (array-like): Input data of shape (n_samples, n_features)
            y: Ignored

        Returns:
            self: Fitted estimator
        """

    def finalize_fit(self):
        """
        Finalize incremental statistics computation.

        Returns:
            self: Finalized estimator with complete statistics
        """

    # Attributes available after fitting
    min_: ...  # Minimum values per feature
    max_: ...  # Maximum values per feature
    sum_: ...  # Sum of values per feature
    mean_: ...  # Mean values per feature
    variance_: ...  # Variance per feature
    variation_: ...  # Coefficient of variation per feature
    sum_squares_: ...  # Sum of squares per feature
    sum_squares_centered_: ...  # Centered sum of squares per feature
    second_order_raw_moment_: ...  # Second order raw moments
    n_samples_seen_: ...  # Total number of samples processed

# Intel-accelerated incremental empirical covariance estimation for streaming data and large datasets.
class IncrementalEmpiricalCovariance:
    """
    Incremental empirical covariance estimation with Intel optimization.

    Efficiently computes sample covariance matrix incrementally, making it
    suitable for streaming data and datasets too large to fit in memory.
    """

    def __init__(
        self,
        store_precision=True,
        assume_centered=False
    ):
        """
        Initialize Incremental Empirical Covariance.

        Parameters:
            store_precision (bool): Whether to store precision matrix
            assume_centered (bool): Whether data is already centered
        """

    def fit(self, X, y=None):
        """
        Fit covariance model to data.

        Parameters:
            X (array-like): Training data of shape (n_samples, n_features)
            y: Ignored, present for API consistency

        Returns:
            self: Fitted estimator
        """

    def partial_fit(self, X, y=None):
        """
        Incrementally fit covariance model.

        Parameters:
            X (array-like): Data batch of shape (n_samples, n_features)
            y: Ignored

        Returns:
            self: Updated estimator
        """

    def score(self, X, y=None):
        """
        Compute log-likelihood under the model.

        Parameters:
            X (array-like): Test data
            y: Ignored

        Returns:
            float: Average log-likelihood
        """

    # Attributes available after fitting
    covariance_: ...  # Estimated covariance matrix
    location_: ...  # Estimated location (mean)
    precision_: ...  # Estimated precision matrix (if store_precision=True)
    n_samples_seen_: ...  # Number of samples processed

# Intel-accelerated t-SNE for non-linear dimensionality reduction and visualization.
class TSNE:
    """
    t-distributed Stochastic Neighbor Embedding with Intel optimization.

    Provides efficient non-linear dimensionality reduction for visualization
    and exploratory data analysis with optimized gradient computations.
    """

    def __init__(
        self,
        n_components=2,
        perplexity=30.0,
        early_exaggeration=12.0,
        learning_rate='warn',
        n_iter=1000,
        n_iter_without_progress=300,
        min_grad_norm=1e-7,
        metric='euclidean',
        init='warn',
        verbose=0,
        random_state=None,
        method='barnes_hut',
        angle=0.5,
        n_jobs=None,
        square_distances='deprecated'
    ):
        """
        Initialize t-SNE estimator.

        Parameters:
            n_components (int): Dimension of embedded space (usually 2 or 3)
            perplexity (float): Related to number of nearest neighbors
            early_exaggeration (float): How tight natural clusters are in original space
            learning_rate (float or str): Learning rate for optimization
            n_iter (int): Maximum number of iterations
            n_iter_without_progress (int): Maximum iterations without progress
            min_grad_norm (float): Minimum gradient norm for early stopping
            metric (str): Distance metric to use
            init (str or array): Initialization method ('random', 'pca', array)
            verbose (int): Verbosity level
            random_state (int): Random state for reproducibility
            method (str): Algorithm to use ('barnes_hut', 'exact')
            angle (float): Trade-off between speed and accuracy for Barnes-Hut
            n_jobs (int): Number of parallel jobs
            square_distances (str): Deprecated parameter
        """

    def fit(self, X, y=None):
        """
        Fit X into an embedded space.

        Parameters:
            X (array-like): Input data of shape (n_samples, n_features)
            y: Ignored, present for API consistency

        Returns:
            self: Fitted estimator
        """

    def fit_transform(self, X, y=None):
        """
        Fit X into an embedded space and return transformed array.

        Parameters:
            X (array-like): Input data of shape (n_samples, n_features)
            y: Ignored

        Returns:
            array: Embedded coordinates of shape (n_samples, n_components)
        """

    # Attributes available after fitting
    embedding_: ...  # Stores embedding vectors
    kl_divergence_: ...  # Kullback-Leibler divergence after optimization
    n_features_in_: ...  # Number of features in input data
    n_iter_: ...  # Number of iterations run
    learning_rate_: ...  # Effective learning rate
# Example: computing basic statistics with the Intel-optimized estimator
# and verifying the results against plain NumPy.
import numpy as np

from sklearnex.basic_statistics import BasicStatistics
from sklearn.datasets import make_regression

# Generate sample data
X, y = make_regression(n_samples=1000, n_features=10, noise=0.1, random_state=42)

# Compute basic statistics
stats = BasicStatistics(result_options='all')
stats.fit(X)

print("Basic Statistics Results:")
print(f"Data shape: {X.shape}")
print(f"Samples processed: {stats.n_samples_seen_}")

# Access computed statistics
print(f"Mean per feature: {stats.mean_}")
print(f"Variance per feature: {stats.variance_}")
print(f"Min values: {stats.min_}")
print(f"Max values: {stats.max_}")
print(f"Sum per feature: {stats.sum_}")

# Coefficient of variation (std/mean)
print(f"Coefficient of variation: {stats.variation_}")

# Statistical moments
print(f"Sum of squares: {stats.sum_squares_}")
print(f"Centered sum of squares: {stats.sum_squares_centered_}")
print(f"Second order raw moment: {stats.second_order_raw_moment_}")

# Compute specific statistics only
stats_subset = BasicStatistics(result_options=['mean', 'variance', 'min', 'max'])
stats_subset.fit(X)

print("\nSubset of statistics:")
print(f"Mean: {stats_subset.mean_}")
print(f"Variance: {stats_subset.variance_}")
print(f"Min: {stats_subset.min_}")
print(f"Max: {stats_subset.max_}")

# Verify against NumPy computations
print(f"\nVerification against NumPy:")
print(f"Mean matches NumPy: {np.allclose(stats.mean_, np.mean(X, axis=0))}")
print(f"Variance matches NumPy: {np.allclose(stats.variance_, np.var(X, axis=0, ddof=0))}")
print(f"Min matches NumPy: {np.allclose(stats.min_, np.min(X, axis=0))}")
print(f"Max matches NumPy: {np.allclose(stats.max_, np.max(X, axis=0))}")
# Example: incremental statistics on streaming batches, checked against a
# single batch computation, plus a memory-efficient large-dataset pass.
import numpy as np

# BasicStatistics is needed below for the batch-vs-incremental comparison.
from sklearnex.basic_statistics import BasicStatistics, IncrementalBasicStatistics

# Simulate streaming data
np.random.seed(42)
total_samples = 5000
batch_size = 500
n_features = 8

# Create incremental statistics estimator
inc_stats = IncrementalBasicStatistics(result_options='all')

# Process data in batches
all_data = []
for batch_idx in range(0, total_samples, batch_size):
    # Generate batch of data
    batch_data = np.random.randn(batch_size, n_features)
    all_data.append(batch_data)
    # Update statistics incrementally
    inc_stats.partial_fit(batch_data)
    print(f"Processed batch {batch_idx//batch_size + 1}: "
          f"{inc_stats.n_samples_seen_} total samples")

# Finalize computation
inc_stats.finalize_fit()

# Compare with batch computation
full_data = np.vstack(all_data)
batch_stats = BasicStatistics(result_options='all')
batch_stats.fit(full_data)

print(f"\nIncremental vs Batch Statistics Comparison:")
print(f"Samples processed - Incremental: {inc_stats.n_samples_seen_}, "
      f"Batch: {batch_stats.n_samples_seen_}")

# Verify results are identical
print(f"Mean identical: {np.allclose(inc_stats.mean_, batch_stats.mean_)}")
print(f"Variance identical: {np.allclose(inc_stats.variance_, batch_stats.variance_)}")
print(f"Min identical: {np.allclose(inc_stats.min_, batch_stats.min_)}")
print(f"Max identical: {np.allclose(inc_stats.max_, batch_stats.max_)}")

# Demonstrate memory efficiency for large datasets
print(f"\nMemory-efficient processing example:")
inc_stats_large = IncrementalBasicStatistics(result_options=['mean', 'variance'])

# Simulate processing very large dataset in small batches
n_batches = 100
batch_size = 1000
for i in range(n_batches):
    # Generate and immediately process batch (no storage)
    batch = np.random.normal(loc=i*0.1, scale=1.0, size=(batch_size, n_features))
    inc_stats_large.partial_fit(batch)
    if (i + 1) % 20 == 0:
        print(f"  Processed {inc_stats_large.n_samples_seen_} samples")

inc_stats_large.finalize_fit()
print(f"Final mean: {inc_stats_large.mean_}")
print(f"Final variance: {inc_stats_large.variance_}")
# Example: t-SNE visualizations — digits dataset in 2D/3D, synthetic blobs,
# and a perplexity sensitivity sweep.
import numpy as np
import matplotlib.pyplot as plt

from sklearnex.manifold import TSNE
from sklearn.datasets import load_digits, make_blobs

# Example 1: Digits dataset visualization
digits = load_digits()
X_digits, y_digits = digits.data, digits.target
print(f"Digits dataset shape: {X_digits.shape}")
print(f"Number of classes: {len(np.unique(y_digits))}")

# Apply t-SNE for 2D visualization
tsne = TSNE(n_components=2, perplexity=30, random_state=42, verbose=1)
X_tsne = tsne.fit_transform(X_digits)

print(f"t-SNE embedding shape: {X_tsne.shape}")
print(f"KL divergence: {tsne.kl_divergence_:.4f}")
print(f"Iterations run: {tsne.n_iter_}")

# Visualize results
plt.figure(figsize=(12, 5))
plt.subplot(1, 2, 1)
plt.scatter(X_tsne[:, 0], X_tsne[:, 1], c=y_digits, cmap='tab10', s=20, alpha=0.7)
plt.colorbar()
plt.title('t-SNE: Digits Dataset (Colored by Digit)')
plt.xlabel('t-SNE Component 1')
plt.ylabel('t-SNE Component 2')

# Example 2: High-dimensional synthetic data
X_synthetic, y_synthetic = make_blobs(
    n_samples=1000, centers=5, n_features=50,
    cluster_std=2.0, random_state=42
)
print(f"\nSynthetic dataset shape: {X_synthetic.shape}")

# t-SNE with different parameters
tsne_synthetic = TSNE(
    n_components=2,
    perplexity=50,
    early_exaggeration=12.0,
    learning_rate=200.0,
    n_iter=1000,
    random_state=42
)
X_tsne_synthetic = tsne_synthetic.fit_transform(X_synthetic)

plt.subplot(1, 2, 2)
plt.scatter(X_tsne_synthetic[:, 0], X_tsne_synthetic[:, 1],
            c=y_synthetic, cmap='viridis', s=20, alpha=0.7)
plt.colorbar()
plt.title('t-SNE: Synthetic High-D Data')
plt.xlabel('t-SNE Component 1')
plt.ylabel('t-SNE Component 2')
plt.tight_layout()
plt.show()

# Example 3: 3D embedding
tsne_3d = TSNE(n_components=3, perplexity=30, random_state=42)
X_tsne_3d = tsne_3d.fit_transform(X_digits[:500])  # Use subset for faster computation
print(f"\n3D t-SNE embedding shape: {X_tsne_3d.shape}")

# 3D visualization
fig = plt.figure(figsize=(10, 8))
ax = fig.add_subplot(111, projection='3d')
scatter = ax.scatter(X_tsne_3d[:, 0], X_tsne_3d[:, 1], X_tsne_3d[:, 2],
                     c=y_digits[:500], cmap='tab10', s=30, alpha=0.7)
ax.set_xlabel('t-SNE Component 1')
ax.set_ylabel('t-SNE Component 2')
ax.set_zlabel('t-SNE Component 3')
ax.set_title('3D t-SNE: Digits Dataset')
plt.colorbar(scatter)
plt.show()

# Parameter sensitivity analysis
perplexity_values = [5, 15, 30, 50, 100]
fig, axes = plt.subplots(2, 3, figsize=(15, 10))
axes = axes.ravel()
for i, perp in enumerate(perplexity_values):
    if i >= len(axes):
        break
    tsne_param = TSNE(n_components=2, perplexity=perp, random_state=42)
    X_param = tsne_param.fit_transform(X_digits[:1000])  # Use subset for speed
    axes[i].scatter(X_param[:, 0], X_param[:, 1], c=y_digits[:1000],
                    cmap='tab10', s=10, alpha=0.7)
    axes[i].set_title(f'Perplexity = {perp}')
    axes[i].set_xlabel('t-SNE Component 1')
    axes[i].set_ylabel('t-SNE Component 2')

# Hide the last subplot if not used
if len(perplexity_values) < len(axes):
    axes[-1].axis('off')
plt.tight_layout()
plt.show()
# Example: combined workflow — statistics-driven feature analysis, scaling,
# and t-SNE visualization on the breast-cancer dataset.
import numpy as np
# matplotlib is required for the visualization section below.
import matplotlib.pyplot as plt

from sklearnex.basic_statistics import BasicStatistics
from sklearnex.manifold import TSNE
from sklearn.datasets import load_breast_cancer
from sklearn.preprocessing import StandardScaler

# Load real-world dataset
cancer = load_breast_cancer()
X_cancer, y_cancer = cancer.data, cancer.target
print(f"Breast cancer dataset shape: {X_cancer.shape}")
print(f"Feature names: {cancer.feature_names[:5]}...")  # Show first 5 features

# Compute basic statistics on raw data
raw_stats = BasicStatistics(result_options='all')
raw_stats.fit(X_cancer)

print("\nRaw data statistics:")
print(f"Mean range: [{raw_stats.mean_.min():.2f}, {raw_stats.mean_.max():.2f}]")
print(f"Variance range: [{raw_stats.variance_.min():.2e}, {raw_stats.variance_.max():.2e}]")
print(f"Min values range: [{raw_stats.min_.min():.2f}, {raw_stats.min_.max():.2f}]")
print(f"Max values range: [{raw_stats.max_.min():.2f}, {raw_stats.max_.max():.2f}]")

# Identify features with high variance
high_var_features = np.where(raw_stats.variance_ > np.percentile(raw_stats.variance_, 90))[0]
print(f"High variance features: {[cancer.feature_names[i] for i in high_var_features]}")

# Standardize data for better t-SNE performance
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_cancer)

# Compute statistics on scaled data
scaled_stats = BasicStatistics(result_options=['mean', 'variance'])
scaled_stats.fit(X_scaled)

print(f"\nScaled data statistics:")
print(f"Mean after scaling: {scaled_stats.mean_}")
print(f"Variance after scaling: {scaled_stats.variance_}")

# Apply t-SNE to scaled data
tsne_cancer = TSNE(
    n_components=2,
    perplexity=30,
    learning_rate=200,
    n_iter=1000,
    random_state=42,
    verbose=1
)
X_tsne_cancer = tsne_cancer.fit_transform(X_scaled)

# Analyze t-SNE embedding statistics
tsne_stats = BasicStatistics(result_options='all')
tsne_stats.fit(X_tsne_cancer)

print(f"\nt-SNE embedding statistics:")
print(f"Embedding mean: {tsne_stats.mean_}")
print(f"Embedding variance: {tsne_stats.variance_}")
print(f"Embedding range: [{tsne_stats.min_}, {tsne_stats.max_}]")

# Visualize results with statistics
plt.figure(figsize=(15, 5))

# Original data: first two features
plt.subplot(1, 3, 1)
plt.scatter(X_cancer[:, 0], X_cancer[:, 1], c=y_cancer, cmap='coolwarm', alpha=0.7)
plt.xlabel(f"{cancer.feature_names[0]}")
plt.ylabel(f"{cancer.feature_names[1]}")
plt.title("Original Data (First 2 Features)")
plt.colorbar(label='Malignant (1) / Benign (0)')

# Scaled data: first two features
plt.subplot(1, 3, 2)
plt.scatter(X_scaled[:, 0], X_scaled[:, 1], c=y_cancer, cmap='coolwarm', alpha=0.7)
plt.xlabel(f"Scaled {cancer.feature_names[0]}")
plt.ylabel(f"Scaled {cancer.feature_names[1]}")
plt.title("Scaled Data (First 2 Features)")
plt.colorbar(label='Malignant (1) / Benign (0)')

# t-SNE embedding
plt.subplot(1, 3, 3)
plt.scatter(X_tsne_cancer[:, 0], X_tsne_cancer[:, 1], c=y_cancer, cmap='coolwarm', alpha=0.7)
plt.xlabel("t-SNE Component 1")
plt.ylabel("t-SNE Component 2")
plt.title(f"t-SNE Embedding (KL={tsne_cancer.kl_divergence_:.2f})")
plt.colorbar(label='Malignant (1) / Benign (0)')
plt.tight_layout()
plt.show()

# Feature correlation analysis using statistics
feature_correlations = []
for i in range(X_cancer.shape[1]):
    for j in range(i+1, X_cancer.shape[1]):
        corr = np.corrcoef(X_cancer[:, i], X_cancer[:, j])[0, 1]
        feature_correlations.append({
            'feature1': cancer.feature_names[i],
            'feature2': cancer.feature_names[j],
            'correlation': abs(corr)
        })

# Find most correlated features
feature_correlations.sort(key=lambda x: x['correlation'], reverse=True)
print(f"\nTop 5 most correlated feature pairs:")
for i in range(5):
    fc = feature_correlations[i]
    print(f"  {fc['feature1']} <-> {fc['feature2']}: {fc['correlation']:.3f}")
# Example: timing the Intel-optimized estimators against NumPy and stock
# scikit-learn on a large synthetic dataset.
import time

import numpy as np

from sklearn.datasets import make_regression

# Generate large dataset for performance testing
X_large, _ = make_regression(n_samples=100000, n_features=50, random_state=42)
print("Performance comparison on large dataset:")
print(f"Dataset shape: {X_large.shape}")

# Test BasicStatistics performance
print("\nBasic Statistics Performance:")

# Intel-optimized version
start_time = time.time()
from sklearnex.basic_statistics import BasicStatistics as IntelStats
intel_stats = IntelStats(result_options='all')
intel_stats.fit(X_large)
intel_time = time.time() - start_time
print(f"Intel BasicStatistics: {intel_time:.3f} seconds")

# NumPy comparison
start_time = time.time()
numpy_mean = np.mean(X_large, axis=0)
numpy_var = np.var(X_large, axis=0)
numpy_min = np.min(X_large, axis=0)
numpy_max = np.max(X_large, axis=0)
numpy_sum = np.sum(X_large, axis=0)
numpy_time = time.time() - start_time
print(f"NumPy equivalent computations: {numpy_time:.3f} seconds")
print(f"Speedup: {numpy_time / intel_time:.1f}x")

# Verify results match
print(f"Results identical:")
print(f"  Mean: {np.allclose(intel_stats.mean_, numpy_mean)}")
print(f"  Variance: {np.allclose(intel_stats.variance_, numpy_var)}")
print(f"  Min: {np.allclose(intel_stats.min_, numpy_min)}")
print(f"  Max: {np.allclose(intel_stats.max_, numpy_max)}")

# Test t-SNE performance (smaller dataset for practical timing)
X_tsne_test = X_large[:5000, :20]  # Reduce size for t-SNE timing
print(f"\nt-SNE Performance (shape: {X_tsne_test.shape}):")

# Intel-optimized version
start_time = time.time()
from sklearnex.manifold import TSNE as IntelTSNE
intel_tsne = IntelTSNE(n_components=2, perplexity=30, random_state=42, verbose=0)
intel_embedding = intel_tsne.fit_transform(X_tsne_test)
intel_tsne_time = time.time() - start_time
print(f"Intel t-SNE: {intel_tsne_time:.2f} seconds")
print(f"KL divergence: {intel_tsne.kl_divergence_:.4f}")

# Standard scikit-learn version
start_time = time.time()
from sklearn.manifold import TSNE as StandardTSNE
standard_tsne = StandardTSNE(n_components=2, perplexity=30, random_state=42, verbose=0)
standard_embedding = standard_tsne.fit_transform(X_tsne_test)
standard_tsne_time = time.time() - start_time
print(f"Standard t-SNE: {standard_tsne_time:.2f} seconds")
print(f"KL divergence: {standard_tsne.kl_divergence_:.4f}")
print(f"Speedup: {standard_tsne_time / intel_tsne_time:.1f}x")

# Compare embedding quality
embedding_diff = np.mean(np.abs(intel_embedding - standard_embedding))
print(f"Mean absolute difference in embeddings: {embedding_diff:.4f}")

# Install with Tessl CLI
# npx tessl i tessl/pypi-scikit-learn-intelex