# Decomposition

Principal Component Analysis with Intel acceleration for efficient dimensionality reduction on large datasets. Provides significant performance improvements through optimized matrix decomposition algorithms.

## Capabilities

### Principal Component Analysis

Intel-accelerated PCA with optimized singular value decomposition for fast dimensionality reduction.

```python { .api }
class PCA:
    """
    Principal Component Analysis with Intel optimization.

    Efficient dimensionality reduction using optimized SVD algorithms
    and Intel Math Kernel Library integration.
    """

    def __init__(
        self,
        n_components=None,
        copy=True,
        whiten=False,
        svd_solver='auto',
        tol=0.0,
        iterated_power='auto',
        n_oversamples=10,
        power_iteration_normalizer='auto',
        random_state=None
    ):
        """
        Initialize PCA.

        Parameters:
            n_components (int or float): Number of components to keep
            copy (bool): Whether to copy data
            whiten (bool): Whether to whiten components
            svd_solver (str): SVD solver algorithm
            tol (float): Tolerance for singular values
            iterated_power (int): Number of iterations for randomized SVD
            n_oversamples (int): Additional samples for randomized SVD
            power_iteration_normalizer (str): Normalization method
            random_state (int): Random state for reproducibility
        """

    def fit(self, X, y=None):
        """
        Fit PCA model.

        Parameters:
            X (array-like): Training data of shape (n_samples, n_features)
            y: Ignored, present for API consistency

        Returns:
            self: Fitted estimator
        """

    def transform(self, X):
        """
        Transform data to lower dimensional space.

        Parameters:
            X (array-like): Data to transform

        Returns:
            array: Transformed data
        """

    def fit_transform(self, X, y=None):
        """
        Fit model and transform data.

        Parameters:
            X (array-like): Training data
            y: Ignored

        Returns:
            array: Transformed data
        """

    def inverse_transform(self, X):
        """
        Transform data back to original space.

        Parameters:
            X (array-like): Data in PCA space

        Returns:
            array: Data in original space
        """

    def score(self, X, y=None):
        """
        Return average log-likelihood.

        Parameters:
            X (array-like): Test data
            y: Ignored

        Returns:
            float: Average log-likelihood
        """

    # Attributes available after fitting
    components_: ...  # Principal axes
    explained_variance_: ...  # Variance explained by each component
    explained_variance_ratio_: ...  # Percentage of variance explained
    singular_values_: ...  # Singular values
    mean_: ...  # Per-feature empirical mean
    n_components_: ...  # Number of components
    n_features_in_: ...  # Number of features during fit
    noise_variance_: ...  # Estimated noise covariance
```

## Usage Examples

### Basic PCA for Dimensionality Reduction

```python
import numpy as np
from sklearnex.decomposition import PCA
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

# Generate high-dimensional dataset
X, y = make_classification(
    n_samples=1000, n_features=50, n_informative=30,
    n_redundant=20, random_state=42
)

# Apply PCA for dimensionality reduction
pca = PCA(n_components=10, random_state=42)
X_reduced = pca.fit_transform(X)

print(f"Original shape: {X.shape}")
print(f"Reduced shape: {X_reduced.shape}")
print(f"Explained variance ratio: {pca.explained_variance_ratio_.sum():.3f}")

# Use reduced features for classification
X_train, X_test, y_train, y_test = train_test_split(
    X_reduced, y, test_size=0.2, random_state=42
)

rf = RandomForestClassifier(random_state=42)
rf.fit(X_train, y_train)
accuracy = rf.score(X_test, y_test)

print(f"Classification accuracy with PCA: {accuracy:.3f}")
```

### Explained Variance Analysis

```python
import numpy as np
import matplotlib.pyplot as plt
from sklearnex.decomposition import PCA
from sklearn.datasets import load_digits

# Load digits dataset
digits = load_digits()
X, y = digits.data, digits.target

# Fit PCA with all components
pca_full = PCA()
pca_full.fit(X)

# Calculate cumulative explained variance
cumsum_var = np.cumsum(pca_full.explained_variance_ratio_)

# Find number of components for 95% variance
n_components_95 = np.argmax(cumsum_var >= 0.95) + 1
print(f"Components for 95% variance: {n_components_95}/{len(cumsum_var)}")

# Apply PCA with optimal number of components
pca = PCA(n_components=n_components_95)
X_transformed = pca.fit_transform(X)

print(f"Original dimensions: {X.shape}")
print(f"Reduced dimensions: {X_transformed.shape}")
print(f"Variance preserved: {pca.explained_variance_ratio_.sum():.3f}")

# Analyze top components
print("Top 5 components variance:")
for i in range(min(5, len(pca.explained_variance_ratio_))):
    print(f"  PC{i+1}: {pca.explained_variance_ratio_[i]:.4f}")
```

### Data Reconstruction and Noise Reduction

```python
import numpy as np
from sklearnex.decomposition import PCA
from sklearn.datasets import make_blobs

# Generate data with noise
X_clean, _ = make_blobs(n_samples=500, centers=3, n_features=20, random_state=42)
noise = np.random.normal(0, 0.5, X_clean.shape)
X_noisy = X_clean + noise

# Apply PCA for noise reduction
pca = PCA(n_components=10)  # Keep only top 10 components
X_pca = pca.fit_transform(X_noisy)
X_reconstructed = pca.inverse_transform(X_pca)

# Calculate reconstruction error
reconstruction_error = np.mean((X_noisy - X_reconstructed) ** 2)
denoising_improvement = np.mean((X_clean - X_noisy) ** 2) - np.mean((X_clean - X_reconstructed) ** 2)

print(f"Original data shape: {X_noisy.shape}")
print(f"PCA components: {X_pca.shape[1]}")
print(f"Reconstruction error: {reconstruction_error:.4f}")
print(f"Denoising improvement: {denoising_improvement:.4f}")
print(f"Explained variance: {pca.explained_variance_ratio_.sum():.3f}")
```

### Performance Comparison

```python
import time
import numpy as np
from sklearn.datasets import make_classification

# Generate large dataset
X, y = make_classification(
    n_samples=5000, n_features=200, n_informative=100,
    random_state=42
)

# Intel-optimized PCA
from sklearnex.decomposition import PCA as IntelPCA

start_time = time.time()
intel_pca = IntelPCA(n_components=50)
X_intel = intel_pca.fit_transform(X)
intel_time = time.time() - start_time

print(f"Intel PCA:")
print(f"  Time: {intel_time:.2f} seconds")
print(f"  Shape: {X_intel.shape}")
print(f"  Explained variance: {intel_pca.explained_variance_ratio_.sum():.3f}")

# Standard scikit-learn PCA (for comparison)
from sklearn.decomposition import PCA as StandardPCA

start_time = time.time()
standard_pca = StandardPCA(n_components=50)
X_standard = standard_pca.fit_transform(X)
standard_time = time.time() - start_time

print(f"\nStandard PCA:")
print(f"  Time: {standard_time:.2f} seconds")
print(f"  Shape: {X_standard.shape}")
print(f"  Explained variance: {standard_pca.explained_variance_ratio_.sum():.3f}")
print(f"  Speedup: {standard_time / intel_time:.1f}x")

# Verify results are equivalent
results_close = np.allclose(
    np.abs(X_intel), np.abs(X_standard), rtol=1e-3
)
print(f"  Results equivalent: {results_close}")
```

## Performance Notes

- Significant speedups on datasets with >1000 samples and >50 features
- SVD computation is highly optimized with Intel MKL
- Memory usage is comparable to standard scikit-learn
- Randomized SVD solver provides additional performance benefits for large datasets
- Numerical stability maintained equivalent to scikit-learn implementation