"""Intel Extension for Scikit-learn: hardware-accelerated implementations of
scikit-learn algorithms optimized for Intel CPUs and GPUs.

High-performance implementations of evaluation metrics and model selection
utilities with Intel hardware acceleration. These functions provide
significant speedups for model evaluation, distance computations, and data
splitting operations.
"""
# Intel-accelerated computation of Area Under the ROC Curve for binary and
# multiclass classification.
def roc_auc_score(
    y_true,
    y_score,
    average='macro',
    sample_weight=None,
    max_fpr=None,
    multi_class='raise',
    labels=None
):
    """
    Compute Area Under the Receiver Operating Characteristic Curve (ROC AUC).

    Intel-optimized implementation providing significant speedup for large
    datasets through vectorized operations and efficient curve computation.

    Parameters:
        y_true (array-like): True binary labels or multiclass labels
        y_score (array-like): Target scores (probabilities or decision values)
        average (str): Averaging strategy for multiclass ('macro', 'weighted', 'micro')
        sample_weight (array-like): Sample weights
        max_fpr (float): Maximum false positive rate for partial AUC
        multi_class (str): Multiclass strategy ('raise', 'ovr', 'ovo')
        labels (array-like): Labels to include for multiclass problems

    Returns:
        float: Area under ROC curve score

    Example:
        >>> from sklearnex.metrics import roc_auc_score
        >>> y_true = [0, 0, 1, 1]
        >>> y_scores = [0.1, 0.4, 0.35, 0.8]
        >>> roc_auc_score(y_true, y_scores)
        0.75
    """
    # NOTE(review): API stub — the accelerated implementation lives in the
    # sklearnex package itself; this file documents the public signature.


# Intel-accelerated computation of pairwise distances between samples.
def pairwise_distances(
    X,
    Y=None,
    metric='euclidean',
    n_jobs=None,
    force_all_finite=True,
    **kwds
):
    """
    Compute pairwise distances between samples.

    Intel-optimized implementation with significant speedup through vectorized
    distance computations and efficient memory access patterns.

    Parameters:
        X (array-like): Input samples of shape (n_samples_X, n_features)
        Y (array-like): Second set of samples (n_samples_Y, n_features), optional
        metric (str or callable): Distance metric to use
        n_jobs (int): Number of parallel jobs
        force_all_finite (bool): Whether to check for finite values
        **kwds: Additional parameters for distance metric

    Returns:
        ndarray: Distance matrix of shape (n_samples_X, n_samples_Y)

    Supported metrics:
        - 'euclidean': L2 norm distance
        - 'manhattan': L1 norm distance
        - 'cosine': Cosine distance
        - 'minkowski': Minkowski distance
        - 'chebyshev': Chebyshev distance
        - 'hamming': Hamming distance
        - 'jaccard': Jaccard distance
        - callable: Custom distance function

    Example:
        >>> from sklearnex.metrics import pairwise_distances
        >>> import numpy as np
        >>> X = np.array([[0, 1], [1, 0], [2, 2]])
        >>> pairwise_distances(X, metric='euclidean')
        array([[0.    , 1.4142, 2.2361],
               [1.4142, 0.    , 1.4142],
               [2.2361, 1.4142, 0.    ]])
    """
    # NOTE(review): API stub — the accelerated implementation lives in the
    # sklearnex package itself; this file documents the public signature.


# Intel-accelerated data splitting for model validation with optimized
# random sampling.
def train_test_split(
    *arrays,
    test_size=None,
    train_size=None,
    random_state=None,
    shuffle=True,
    stratify=None
):
    """
    Split arrays or matrices into random train and test subsets.

    Intel-optimized implementation with efficient random sampling and
    memory-optimized array operations for large datasets.

    Parameters:
        *arrays: Sequence of indexable arrays with same length/shape[0]
        test_size (float or int): Size of test set (0.0-1.0 for proportion, int for absolute)
        train_size (float or int): Size of train set
        random_state (int): Controls random number generation for reproducibility
        shuffle (bool): Whether to shuffle data before splitting
        stratify (array-like): If not None, data split in stratified fashion

    Returns:
        list: List containing train-test split of inputs

    Example:
        >>> from sklearnex.model_selection import train_test_split
        >>> import numpy as np
        >>> X = np.array([[1, 2], [3, 4], [5, 6], [7, 8]])
        >>> y = np.array([1, 2, 1, 2])
        >>> X_train, X_test, y_train, y_test = train_test_split(
        ...     X, y, test_size=0.5, random_state=42)
        >>> X_train.shape, X_test.shape
        ((2, 2), (2, 2))
    """
    # NOTE(review): API stub — the accelerated implementation lives in the
    # sklearnex package itself; this file documents the public signature.


import numpy as np
# ---------------------------------------------------------------------------
# Example: roc_auc_score for binary and multiclass classification.
# ---------------------------------------------------------------------------
from sklearnex.metrics import roc_auc_score
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

# Binary classification example
X, y = make_classification(n_samples=1000, n_features=20, n_classes=2, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train a classifier
clf = RandomForestClassifier(n_estimators=100, random_state=42)
clf.fit(X_train, y_train)

# Prediction probabilities for the positive class only
y_proba = clf.predict_proba(X_test)[:, 1]

# Compute ROC AUC
auc_score = roc_auc_score(y_test, y_proba)
print(f"Binary ROC AUC: {auc_score:.3f}")

# Multiclass example
X_multi, y_multi = make_classification(n_samples=1000, n_features=20, n_classes=3,
                                       n_informative=10, random_state=42)
X_train_multi, X_test_multi, y_train_multi, y_test_multi = train_test_split(
    X_multi, y_multi, test_size=0.2, random_state=42)
clf_multi = RandomForestClassifier(n_estimators=100, random_state=42)
clf_multi.fit(X_train_multi, y_train_multi)

# Prediction probabilities for all classes (shape: n_samples x n_classes)
y_proba_multi = clf_multi.predict_proba(X_test_multi)

# Multiclass ROC AUC with different averaging strategies
auc_macro = roc_auc_score(y_test_multi, y_proba_multi, multi_class='ovr', average='macro')
auc_weighted = roc_auc_score(y_test_multi, y_proba_multi, multi_class='ovr', average='weighted')
print(f"Multiclass ROC AUC (macro): {auc_macro:.3f}")
print(f"Multiclass ROC AUC (weighted): {auc_weighted:.3f}")

# Per-class ROC AUC (average=None returns one score per class)
auc_per_class = roc_auc_score(y_test_multi, y_proba_multi, multi_class='ovr', average=None)
for i, auc in enumerate(auc_per_class):
    print(f"Class {i} ROC AUC: {auc:.3f}")

# One-vs-One strategy
auc_ovo = roc_auc_score(y_test_multi, y_proba_multi, multi_class='ovo', average='macro')
print(f"Multiclass ROC AUC (OvO): {auc_ovo:.3f}")

import numpy as np  # numpy is used by the following example section
# ---------------------------------------------------------------------------
# Example: pairwise_distances with various metrics, timing, and chunked
# computation for memory-constrained settings.
# ---------------------------------------------------------------------------
import time

from sklearnex.metrics import pairwise_distances
from sklearn.datasets import make_blobs

# Generate sample data
X, _ = make_blobs(n_samples=500, centers=3, n_features=10, random_state=42)
Y = X[:100]  # Subset for pairwise comparison

# Compute various distance metrics
metrics = ['euclidean', 'manhattan', 'cosine', 'chebyshev']
for metric in metrics:
    distances = pairwise_distances(X[:5], Y[:5], metric=metric)
    print(f"{metric.capitalize()} distances shape: {distances.shape}")
    print(f"{metric.capitalize()} distance range: [{distances.min():.3f}, {distances.max():.3f}]")

# Self-distance matrix (symmetric, zero diagonal)
euclidean_self = pairwise_distances(X[:10], metric='euclidean')
print(f"Self-distance matrix shape: {euclidean_self.shape}")
print(f"Diagonal elements (should be ~0): {np.diag(euclidean_self)}")

# Minkowski distance with different p values (p=1 manhattan, p=2 euclidean)
for p in [1, 2, 3]:
    minkowski_dist = pairwise_distances(X[:5], Y[:5], metric='minkowski', p=p)
    print(f"Minkowski distance (p={p}) range: [{minkowski_dist.min():.3f}, {minkowski_dist.max():.3f}]")

# Large dataset performance example
X_large = np.random.randn(2000, 50)
Y_large = np.random.randn(1000, 50)
start_time = time.time()
distances_large = pairwise_distances(X_large, Y_large, metric='euclidean')
computation_time = time.time() - start_time
print(f"Large dataset distances shape: {distances_large.shape}")
print(f"Computation time: {computation_time:.2f} seconds")


def chunked_pairwise_distances(X, Y, chunk_size=1000, metric='euclidean'):
    """Compute pairwise distances in chunks to manage memory usage.

    Parameters:
        X (ndarray): Samples of shape (n_samples_X, n_features).
        Y (ndarray): Samples of shape (n_samples_Y, n_features).
        chunk_size (int): Number of rows of X processed per call.
        metric (str): Distance metric forwarded to pairwise_distances.

    Returns:
        ndarray: Full distance matrix of shape (n_samples_X, n_samples_Y).
    """
    n_samples_X = X.shape[0]
    distances = []
    # Only X is chunked; each chunk is compared against all of Y, so the
    # peak memory is chunk_size * n_samples_Y instead of the full matrix.
    for i in range(0, n_samples_X, chunk_size):
        end_idx = min(i + chunk_size, n_samples_X)
        distances.append(pairwise_distances(X[i:end_idx], Y, metric=metric))
    return np.vstack(distances)


# Example with chunked computation
X_very_large = np.random.randn(5000, 20)
Y_subset = np.random.randn(500, 20)
chunked_distances = chunked_pairwise_distances(X_very_large, Y_subset, chunk_size=1000)
print(f"Chunked distances shape: {chunked_distances.shape}")

import numpy as np  # numpy is also used by the following example section
# ---------------------------------------------------------------------------
# Example: train_test_split — test sizes, stratification, multiple arrays,
# shuffle control, fixed train size, and reproducibility.
# ---------------------------------------------------------------------------
import numpy as np
from sklearnex.model_selection import train_test_split
from sklearn.datasets import make_classification, make_regression

# Basic train-test split
X, y = make_classification(n_samples=1000, n_features=20, n_classes=2, random_state=42)

# Split with different test sizes
test_sizes = [0.2, 0.3, 0.5]
for test_size in test_sizes:
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=42
    )
    print(f"Test size {test_size}: Train={X_train.shape[0]}, Test={X_test.shape[0]}")

# Stratified split to preserve class distribution
X_imbalanced, y_imbalanced = make_classification(
    n_samples=1000, n_features=20, n_classes=3,
    weights=[0.6, 0.3, 0.1], random_state=42
)
X_train_strat, X_test_strat, y_train_strat, y_test_strat = train_test_split(
    X_imbalanced, y_imbalanced, test_size=0.2, stratify=y_imbalanced, random_state=42
)

# Check class distributions
from collections import Counter
print("Original distribution:", Counter(y_imbalanced))
print("Train distribution:", Counter(y_train_strat))
print("Test distribution:", Counter(y_test_strat))

# Multiple array splitting (all arrays are split with the same indices)
X_reg, y_reg = make_regression(n_samples=800, n_features=15, random_state=42)
sample_weights = np.random.rand(800)
groups = np.random.randint(0, 5, 800)
X_train, X_test, y_train, y_test, weights_train, weights_test, groups_train, groups_test = train_test_split(
    X_reg, y_reg, sample_weights, groups,
    test_size=0.25, random_state=42
)
print(f"Multiple arrays split:")
print(f"X: {X_train.shape[0]} train, {X_test.shape[0]} test")
print(f"y: {y_train.shape[0]} train, {y_test.shape[0]} test")
print(f"weights: {weights_train.shape[0]} train, {weights_test.shape[0]} test")
print(f"groups: {groups_train.shape[0]} train, {groups_test.shape[0]} test")

# shuffle=False keeps the original row order (first rows go to train)
X_ordered = np.arange(100).reshape(50, 2)
y_ordered = np.arange(50)
X_train_ns, X_test_ns, y_train_ns, y_test_ns = train_test_split(
    X_ordered, y_ordered, test_size=0.2, shuffle=False
)
print("No shuffle - first few train indices:", y_train_ns[:5])
print("No shuffle - first few test indices:", y_test_ns[:5])

# Fixed train size instead of test size
X_train_fixed, X_test_fixed, y_train_fixed, y_test_fixed = train_test_split(
    X, y, train_size=600, random_state=42
)
print(f"Fixed train size: Train={X_train_fixed.shape[0]}, Test={X_test_fixed.shape[0]}")

# Reproducibility check: identical seeds must give identical splits
splits = []
for seed in [42, 42, 42]:  # Same seed
    X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size=0.2, random_state=seed)
    splits.append(y_tr[:5])
print("Reproducibility check (should be identical):")
for i, split in enumerate(splits):
    print(f"Split {i+1}: {split}")

import numpy as np  # numpy is also used by the following example section
# ---------------------------------------------------------------------------
# Example: end-to-end workflow combining train_test_split, roc_auc_score,
# and pairwise_distances for model comparison and evaluation.
# ---------------------------------------------------------------------------
import numpy as np
from sklearnex.model_selection import train_test_split
from sklearnex.metrics import roc_auc_score, pairwise_distances
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier

# Generate an imbalanced binary dataset
X, y = make_classification(
    n_samples=2000, n_features=20, n_informative=15,
    n_classes=2, weights=[0.7, 0.3], random_state=42
)

# Stratified split preserves the 70/30 class ratio in both subsets
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

# Candidate models
models = {
    'RandomForest': RandomForestClassifier(n_estimators=100, random_state=42),
    'LogisticRegression': LogisticRegression(random_state=42),
    'KNN': KNeighborsClassifier(n_neighbors=5)
}
results = {}
for name, model in models.items():
    if name in ('LogisticRegression', 'KNN'):
        # Scale features for scale-sensitive models
        scaler = StandardScaler()
        X_train_scaled = scaler.fit_transform(X_train)
        X_test_scaled = scaler.transform(X_test)
        model.fit(X_train_scaled, y_train)
        y_proba = model.predict_proba(X_test_scaled)[:, 1]
    else:
        model.fit(X_train, y_train)
        y_proba = model.predict_proba(X_test)[:, 1]
    # Compute ROC AUC
    auc = roc_auc_score(y_test, y_proba)
    results[name] = auc
    print(f"{name} ROC AUC: {auc:.3f}")

# Select the model with the highest AUC
best_model = max(results, key=results.get)
print(f"\nBest model: {best_model} (AUC: {results[best_model]:.3f})")

# Distance-based analysis on a subset of the test samples
test_distances = pairwise_distances(X_test[:100], metric='euclidean')
print(f"\nDistance analysis on test set:")
print(f"Mean distance: {test_distances.mean():.3f}")
print(f"Std distance: {test_distances.std():.3f}")
print(f"Min non-zero distance: {test_distances[test_distances > 0].min():.3f}")
print(f"Max distance: {test_distances.max():.3f}")

# Repeated stratified hold-out splits for a more robust AUC estimate
from sklearn.model_selection import cross_val_score
cv_scores = []
for i in range(5):
    X_cv_train, X_cv_test, y_cv_train, y_cv_test = train_test_split(
        X, y, test_size=0.2, stratify=y, random_state=i
    )
    # NOTE(review): this refits the already-fitted estimator object in
    # place; results stay deterministic because random_state is fixed.
    best_clf = models[best_model]
    if best_model in ('LogisticRegression', 'KNN'):
        scaler = StandardScaler()
        X_cv_train_scaled = scaler.fit_transform(X_cv_train)
        X_cv_test_scaled = scaler.transform(X_cv_test)
        best_clf.fit(X_cv_train_scaled, y_cv_train)
        y_cv_proba = best_clf.predict_proba(X_cv_test_scaled)[:, 1]
    else:
        best_clf.fit(X_cv_train, y_cv_train)
        y_cv_proba = best_clf.predict_proba(X_cv_test)[:, 1]
    cv_auc = roc_auc_score(y_cv_test, y_cv_proba)
    cv_scores.append(cv_auc)

print(f"\nCross-validation results ({len(cv_scores)} folds):")
print(f"Mean AUC: {np.mean(cv_scores):.3f} ± {np.std(cv_scores):.3f}")
print(f"Individual scores: {[f'{score:.3f}' for score in cv_scores]}")

import time  # time is used by the following example section
# ---------------------------------------------------------------------------
# Example: timing comparison between Intel-optimized and stock scikit-learn
# implementations of train_test_split and pairwise_distances.
# ---------------------------------------------------------------------------
import time
import numpy as np
from sklearn.datasets import make_classification
# Import both implementations up front so module import cost is not counted
# inside the timed sections (the original benchmark imported inside the
# timers, which inflated whichever implementation was imported first).
from sklearnex.model_selection import train_test_split as intel_split
from sklearn.model_selection import train_test_split as standard_split
from sklearnex.metrics import pairwise_distances as intel_distances
from sklearn.metrics import pairwise_distances as standard_distances

# Generate large dataset for performance testing
X_large, y_large = make_classification(
    n_samples=100000, n_features=50, n_classes=2, random_state=42
)

# Test train_test_split performance
print("Train-test split performance:")

# Intel-optimized version (perf_counter is monotonic and higher resolution
# than time.time for interval measurement)
start_time = time.perf_counter()
X_train_intel, X_test_intel, y_train_intel, y_test_intel = intel_split(
    X_large, y_large, test_size=0.2, random_state=42
)
intel_split_time = time.perf_counter() - start_time

# Standard version
start_time = time.perf_counter()
X_train_std, X_test_std, y_train_std, y_test_std = standard_split(
    X_large, y_large, test_size=0.2, random_state=42
)
standard_split_time = time.perf_counter() - start_time

print(f"Intel train_test_split: {intel_split_time:.3f} seconds")
print(f"Standard train_test_split: {standard_split_time:.3f} seconds")
print(f"Speedup: {standard_split_time / intel_split_time:.1f}x")

# Test pairwise_distances performance
X_dist_test = np.random.randn(2000, 30)
Y_dist_test = np.random.randn(1500, 30)
print("\nPairwise distances performance:")

# Intel-optimized version
start_time = time.perf_counter()
distances_intel = intel_distances(X_dist_test, Y_dist_test, metric='euclidean')
intel_dist_time = time.perf_counter() - start_time

# Standard version
start_time = time.perf_counter()
distances_std = standard_distances(X_dist_test, Y_dist_test, metric='euclidean')
standard_dist_time = time.perf_counter() - start_time

print(f"Intel pairwise_distances: {intel_dist_time:.3f} seconds")
print(f"Standard pairwise_distances: {standard_dist_time:.3f} seconds")
print(f"Speedup: {standard_dist_time / intel_dist_time:.1f}x")

# Sanity check: optimized and stock results agree numerically
print(f"Results identical: {np.allclose(distances_intel, distances_std)}")

# Install with Tessl CLI:
#   npx tessl i tessl/pypi-scikit-learn-intelex