# Decomposition

Principal Component Analysis with Intel acceleration for efficient dimensionality reduction on large datasets. Provides significant performance improvements through optimized matrix decomposition algorithms.

## Capabilities

### Principal Component Analysis

Intel-accelerated PCA with optimized singular value decomposition for fast dimensionality reduction.

```python { .api }
class PCA:
    """
    Principal Component Analysis with Intel optimization.

    Efficient dimensionality reduction using optimized SVD algorithms
    and Intel Math Kernel Library integration.
    """

    def __init__(
        self,
        n_components=None,
        copy=True,
        whiten=False,
        svd_solver='auto',
        tol=0.0,
        iterated_power='auto',
        n_oversamples=10,
        power_iteration_normalizer='auto',
        random_state=None
    ):
        """
        Initialize PCA.

        Parameters:
            n_components (int or float): Number of components to keep
            copy (bool): Whether to copy data
            whiten (bool): Whether to whiten components
            svd_solver (str): SVD solver algorithm
            tol (float): Tolerance for singular values
            iterated_power (int or 'auto'): Number of iterations for randomized SVD
            n_oversamples (int): Additional samples for randomized SVD
            power_iteration_normalizer (str): Normalization method
            random_state (int): Random state for reproducibility
        """

    def fit(self, X, y=None):
        """
        Fit PCA model.

        Parameters:
            X (array-like): Training data of shape (n_samples, n_features)
            y: Ignored, present for API consistency

        Returns:
            self: Fitted estimator
        """

    def transform(self, X):
        """
        Transform data to lower dimensional space.

        Parameters:
            X (array-like): Data to transform

        Returns:
            array: Transformed data
        """

    def fit_transform(self, X, y=None):
        """
        Fit model and transform data.

        Parameters:
            X (array-like): Training data
            y: Ignored

        Returns:
            array: Transformed data
        """

    def inverse_transform(self, X):
        """
        Transform data back to original space.

        Parameters:
            X (array-like): Data in PCA space

        Returns:
            array: Data in original space
        """

    def score(self, X, y=None):
        """
        Return average log-likelihood.

        Parameters:
            X (array-like): Test data
            y: Ignored

        Returns:
            float: Average log-likelihood
        """

    # Attributes available after fitting
    components_: ...                # Principal axes
    explained_variance_: ...        # Variance explained by each component
    explained_variance_ratio_: ...  # Percentage of variance explained
    singular_values_: ...           # Singular values
    mean_: ...                      # Per-feature empirical mean
    n_components_: ...              # Number of components
    n_features_in_: ...             # Number of features during fit
    noise_variance_: ...            # Estimated noise covariance
```

## Usage Examples

### Basic PCA for Dimensionality Reduction

```python
import numpy as np
from sklearnex.decomposition import PCA
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

# Generate high-dimensional dataset
X, y = make_classification(
    n_samples=1000, n_features=50, n_informative=30,
    n_redundant=20, random_state=42
)

# Apply PCA for dimensionality reduction
pca = PCA(n_components=10, random_state=42)
X_reduced = pca.fit_transform(X)

print(f"Original shape: {X.shape}")
print(f"Reduced shape: {X_reduced.shape}")
print(f"Explained variance ratio: {pca.explained_variance_ratio_.sum():.3f}")

# Use reduced features for classification
X_train, X_test, y_train, y_test = train_test_split(
    X_reduced, y, test_size=0.2, random_state=42
)

rf = RandomForestClassifier(random_state=42)
rf.fit(X_train, y_train)
accuracy = rf.score(X_test, y_test)

print(f"Classification accuracy with PCA: {accuracy:.3f}")
```

### Explained Variance Analysis

```python
import numpy as np
import matplotlib.pyplot as plt
from sklearnex.decomposition import PCA
from sklearn.datasets import load_digits

# Load digits dataset
digits = load_digits()
X, y = digits.data, digits.target

# Fit PCA with all components
pca_full = PCA()
pca_full.fit(X)

# Calculate cumulative explained variance
cumsum_var = np.cumsum(pca_full.explained_variance_ratio_)

# Find number of components for 95% variance
n_components_95 = np.argmax(cumsum_var >= 0.95) + 1
print(f"Components for 95% variance: {n_components_95}/{len(cumsum_var)}")

# Apply PCA with optimal number of components
pca = PCA(n_components=n_components_95)
X_transformed = pca.fit_transform(X)

print(f"Original dimensions: {X.shape}")
print(f"Reduced dimensions: {X_transformed.shape}")
print(f"Variance preserved: {pca.explained_variance_ratio_.sum():.3f}")

# Analyze top components
print(f"Top 5 components variance:")
for i in range(min(5, len(pca.explained_variance_ratio_))):
    print(f"  PC{i+1}: {pca.explained_variance_ratio_[i]:.4f}")
```

### Data Reconstruction and Noise Reduction

```python
import numpy as np
from sklearnex.decomposition import PCA
from sklearn.datasets import make_blobs

# Generate data with noise
X_clean, _ = make_blobs(n_samples=500, centers=3, n_features=20, random_state=42)
noise = np.random.normal(0, 0.5, X_clean.shape)
X_noisy = X_clean + noise

# Apply PCA for noise reduction
pca = PCA(n_components=10)  # Keep only top 10 components
X_pca = pca.fit_transform(X_noisy)
X_reconstructed = pca.inverse_transform(X_pca)

# Calculate reconstruction error
reconstruction_error = np.mean((X_noisy - X_reconstructed) ** 2)
denoising_improvement = np.mean((X_clean - X_noisy) ** 2) - np.mean((X_clean - X_reconstructed) ** 2)

print(f"Original data shape: {X_noisy.shape}")
print(f"PCA components: {X_pca.shape[1]}")
print(f"Reconstruction error: {reconstruction_error:.4f}")
print(f"Denoising improvement: {denoising_improvement:.4f}")
print(f"Explained variance: {pca.explained_variance_ratio_.sum():.3f}")
```

### Performance Comparison

```python
import time
import numpy as np
from sklearn.datasets import make_classification

# Generate large dataset
X, y = make_classification(
    n_samples=5000, n_features=200, n_informative=100,
    random_state=42
)

# Intel-optimized PCA
from sklearnex.decomposition import PCA as IntelPCA

start_time = time.time()
intel_pca = IntelPCA(n_components=50)
X_intel = intel_pca.fit_transform(X)
intel_time = time.time() - start_time

print(f"Intel PCA:")
print(f"  Time: {intel_time:.2f} seconds")
print(f"  Shape: {X_intel.shape}")
print(f"  Explained variance: {intel_pca.explained_variance_ratio_.sum():.3f}")

# Standard scikit-learn PCA (for comparison)
from sklearn.decomposition import PCA as StandardPCA

start_time = time.time()
standard_pca = StandardPCA(n_components=50)
X_standard = standard_pca.fit_transform(X)
standard_time = time.time() - start_time

print(f"\nStandard PCA:")
print(f"  Time: {standard_time:.2f} seconds")
print(f"  Shape: {X_standard.shape}")
print(f"  Explained variance: {standard_pca.explained_variance_ratio_.sum():.3f}")
print(f"  Speedup: {standard_time / intel_time:.1f}x")

# Verify results are equivalent
results_close = np.allclose(
    np.abs(X_intel), np.abs(X_standard), rtol=1e-3
)
print(f"  Results equivalent: {results_close}")
```

## Performance Notes

- Significant speedups on datasets with >1000 samples and >50 features
- SVD computation is highly optimized with Intel MKL
- Memory usage is comparable to standard scikit-learn
- Randomized SVD solver provides additional performance benefits for large datasets
- Numerical stability maintained equivalent to scikit-learn implementation