# Feature Analysis

Tools for feature selection, analysis, and visualization to understand data characteristics, identify important features, and guide feature engineering decisions. These visualizers support both supervised and unsupervised feature analysis techniques.

## Capabilities

### Feature Ranking

Univariate and bivariate feature ranking visualizers for identifying the most informative features using various statistical measures and algorithms.

```python { .api }
class Rank1D(Visualizer):
    """
    1D feature ranking visualizer using univariate statistical measures.

    Parameters:
    - algorithm: str, ranking algorithm ('shapiro' for normality, others available)
    - features: list, feature names for display
    - orient: str, orientation of bars ('h' for horizontal, 'v' for vertical)
    """
    def __init__(self, algorithm='shapiro', features=None, orient='h', **kwargs): ...
    def fit(self, X, y=None, **kwargs): ...
    def show(self, **kwargs): ...

class Rank2D(Visualizer):
    """
    2D feature ranking visualizer using bivariate statistical measures.

    Parameters:
    - algorithm: str, ranking algorithm ('pearson', 'covariance', 'spearman', 'kendalltau')
    - features: list, feature names for display
    - colormap: str, matplotlib colormap for heatmap (default: "RdBu_r")
    """
    def __init__(self, algorithm='pearson', features=None, colormap='RdBu_r', **kwargs): ...
    def fit(self, X, y=None, **kwargs): ...
    def show(self, **kwargs): ...

def rank1d(X, y=None, algorithm='shapiro', features=None, **kwargs):
    """
    Functional API for 1D feature ranking visualization.

    Parameters:
    - X: feature matrix
    - y: target vector (optional)
    - algorithm: str, ranking algorithm
    - features: list, feature names

    Returns:
    Rank1D visualizer instance
    """

def rank2d(X, y=None, algorithm='pearson', features=None, **kwargs):
    """
    Functional API for 2D feature ranking visualization.

    Parameters:
    - X: feature matrix
    - y: target vector (optional)
    - algorithm: str, ranking algorithm
    - features: list, feature names

    Returns:
    Rank2D visualizer instance
    """
```

**Usage Example:**

```python
from yellowbrick.features import Rank1D, Rank2D, rank1d, rank2d
from sklearn.datasets import load_wine

# Load sample data
wine = load_wine()
X, y = wine.data, wine.target
features = wine.feature_names

# 1D feature ranking
rank1d_viz = Rank1D(algorithm='shapiro', features=features)
rank1d_viz.fit(X, y)
rank1d_viz.show()

# 2D feature correlation
rank2d_viz = Rank2D(algorithm='pearson', features=features)
rank2d_viz.fit(X, y)
rank2d_viz.show()

# Functional API
rank1d(X, y, features=features, algorithm='shapiro')
rank2d(X, y, features=features, algorithm='spearman')
```

### Parallel Coordinates

Parallel coordinates visualization for multivariate data analysis, showing relationships between features and target classes across multiple dimensions.

```python { .api }
class ParallelCoordinates(Visualizer):
    """
    Parallel coordinates plot for multivariate data visualization.

    Parameters:
    - classes: list, class labels for target
    - features: list, feature names for display
    - normalize: str, normalization method ('standard', 'minmax', 'robust', or None)
    - sample: float or int, sampling strategy for large datasets
    - shuffle: bool, whether to shuffle data before sampling
    - random_state: int, random state for reproducibility
    """
    def __init__(self, classes=None, features=None, normalize=None, sample=1.0, shuffle=False, random_state=None, **kwargs): ...
    def fit(self, X, y=None, **kwargs): ...
    def show(self, **kwargs): ...

def parallel_coordinates(X, y=None, classes=None, features=None, **kwargs):
    """
    Functional API for parallel coordinates visualization.

    Parameters:
    - X: feature matrix
    - y: target vector (optional)
    - classes: list, class labels
    - features: list, feature names

    Returns:
    ParallelCoordinates visualizer instance
    """
```

### Radial Visualization (RadViz)

Radial visualization for projecting multidimensional data onto a 2D plane, useful for identifying clusters and class separability.

```python { .api }
class RadialVisualizer(Visualizer):
    """
    Radial visualization (RadViz) for multidimensional data projection.

    Parameters:
    - classes: list, class labels for target
    - features: list, feature names for anchors
    - alpha: float, transparency of data points
    """
    def __init__(self, classes=None, features=None, alpha=0.75, **kwargs): ...
    def fit(self, X, y=None, **kwargs): ...
    def show(self, **kwargs): ...

# Alias for compatibility
RadViz = RadialVisualizer

def radviz(X, y=None, classes=None, features=None, **kwargs):
    """
    Functional API for radial visualization.

    Parameters:
    - X: feature matrix
    - y: target vector (optional)
    - classes: list, class labels
    - features: list, feature names

    Returns:
    RadialVisualizer instance
    """
```

### Joint Plots

Joint plots showing relationships between pairs of features with marginal distributions, useful for understanding feature interactions and distributions.

```python { .api }
class JointPlot(Visualizer):
    """
    Joint plot visualization for feature pair analysis.

    Parameters:
    - columns: tuple or list, column indices or names for x and y axes
    - classes: list, class labels for target
    - kind: str, plot type ('scatter', 'hex', 'reg')
    """
    def __init__(self, columns=None, classes=None, kind='scatter', **kwargs): ...
    def fit(self, X, y=None, **kwargs): ...
    def show(self, **kwargs): ...

# Alias for compatibility
JointPlotVisualizer = JointPlot

def joint_plot(X, y=None, columns=None, classes=None, **kwargs):
    """
    Functional API for joint plot visualization.

    Parameters:
    - X: feature matrix
    - y: target vector (optional)
    - columns: tuple, column indices or names
    - classes: list, class labels

    Returns:
    JointPlot visualizer instance
    """
```

### PCA Decomposition

Principal Component Analysis visualization for dimensionality reduction, variance explanation, and feature transformation analysis.

```python { .api }
class PCA(Visualizer):
    """
    PCA decomposition visualizer for dimensionality reduction analysis.

    Parameters:
    - scale: bool, whether to scale features before PCA
    - proj_features: bool, whether to project original features
    - biplot: bool, whether to draw biplot with feature vectors
    - classes: list, class labels for target
    """
    def __init__(self, scale=True, proj_features=True, biplot=False, classes=None, **kwargs): ...
    def fit(self, X, y=None, **kwargs): ...
    def show(self, **kwargs): ...

# Alias for compatibility
PCADecomposition = PCA

def pca_decomposition(X, y=None, scale=True, proj_features=True, **kwargs):
    """
    Functional API for PCA decomposition visualization.

    Parameters:
    - X: feature matrix
    - y: target vector (optional)
    - scale: bool, whether to scale features
    - proj_features: bool, whether to project features

    Returns:
    PCA visualizer instance
    """
```

### Manifold Learning

Manifold learning visualization for non-linear dimensionality reduction using various algorithms like t-SNE, ISOMAP, and Locally Linear Embedding.

```python { .api }
class Manifold(Visualizer):
    """
    Manifold learning visualizer for non-linear dimensionality reduction.

    Parameters:
    - manifold: str, manifold algorithm ('lle', 'ltsa', 'hessian', 'modified', 'isomap', 'mds', 'spectral', 'tsne')
    - n_neighbors: int, number of neighbors for local methods
    - classes: list, class labels for target
    - target_type: str, target type ('discrete', 'continuous', 'single', 'auto')
    """
    def __init__(self, manifold='lle', n_neighbors=None, classes=None, target_type='auto', **kwargs): ...
    def fit(self, X, y=None, **kwargs): ...
    def show(self, **kwargs): ...

def manifold_embedding(X, y=None, manifold='lle', classes=None, **kwargs):
    """
    Functional API for manifold learning visualization.

    Parameters:
    - X: feature matrix
    - y: target vector (optional)
    - manifold: str, manifold algorithm
    - classes: list, class labels

    Returns:
    Manifold visualizer instance
    """
```

### Feature Importances (Re-exported)

Feature importance visualization from the model selection module, showing the relative importance of features as determined by tree-based models.

```python { .api }
class FeatureImportances(ModelVisualizer):
    """
    Feature importances visualizer for tree-based models.

    Parameters:
    - estimator: scikit-learn estimator with feature_importances_ attribute
    - labels: list, feature labels for display
    - relative: bool, whether to show relative importance (percentages)
    - absolute: bool, whether to show absolute importance values
    """
    def __init__(self, estimator, labels=None, relative=True, absolute=False, **kwargs): ...
    def fit(self, X, y, **kwargs): ...
    def show(self, **kwargs): ...

def feature_importances(estimator, X, y, labels=None, **kwargs):
    """
    Functional API for feature importances visualization.

    Parameters:
    - estimator: scikit-learn estimator
    - X: feature matrix
    - y: target vector
    - labels: list, feature labels

    Returns:
    FeatureImportances visualizer instance
    """
```

### Recursive Feature Elimination (Re-exported)

Recursive Feature Elimination with Cross-Validation (RFECV) for systematic feature selection using model performance feedback.

```python { .api }
class RFECV(ModelVisualizer):
    """
    Recursive Feature Elimination with Cross-Validation visualizer.

    Parameters:
    - estimator: scikit-learn estimator
    - cv: int or cross-validation generator
    - scoring: str, scoring metric
    - step: int or float, number of features to remove at each step
    """
    def __init__(self, estimator, cv=None, scoring=None, step=1, **kwargs): ...
    def fit(self, X, y, **kwargs): ...
    def show(self, **kwargs): ...

def rfecv(estimator, X, y, cv=None, scoring=None, **kwargs):
    """
    Functional API for RFECV visualization.

    Parameters:
    - estimator: scikit-learn estimator
    - X: feature matrix
    - y: target vector
    - cv: int or cross-validation generator
    - scoring: str, scoring metric

    Returns:
    RFECV visualizer instance
    """
```

## Types

```python { .api }
from enum import Enum

class TargetType(Enum):
    AUTO = "auto"
    SINGLE = "single"
    DISCRETE = "discrete"
    CONTINUOUS = "continuous"
    UNKNOWN = "unknown"
```

## Usage Patterns

### Comprehensive Feature Analysis

```python
from yellowbrick.features import Rank1D, Rank2D, ParallelCoordinates, RadViz, PCA
from sklearn.datasets import load_wine
import matplotlib.pyplot as plt

# Load sample data
wine = load_wine()
X, y = wine.data, wine.target
features = wine.feature_names
classes = wine.target_names

# Feature ranking analysis
fig, axes = plt.subplots(2, 2, figsize=(15, 12))

# 1D feature ranking
rank1d_viz = Rank1D(features=features, ax=axes[0, 0])
rank1d_viz.fit(X, y)
rank1d_viz.finalize()

# 2D feature correlation
rank2d_viz = Rank2D(features=features, ax=axes[0, 1])
rank2d_viz.fit(X, y)
rank2d_viz.finalize()

# Parallel coordinates
pcoords_viz = ParallelCoordinates(classes=classes, ax=axes[1, 0])
pcoords_viz.fit(X, y)
pcoords_viz.finalize()

# RadViz
radviz_viz = RadViz(classes=classes, ax=axes[1, 1])
radviz_viz.fit(X, y)
radviz_viz.finalize()

plt.tight_layout()
plt.show()

# PCA analysis
pca_viz = PCA(scale=True, biplot=True, classes=classes)
pca_viz.fit(X, y)
pca_viz.show()
```

### Dimensionality Reduction Comparison

```python
from yellowbrick.features import PCA, Manifold
from sklearn.datasets import load_digits
import matplotlib.pyplot as plt

# Load high-dimensional data
digits = load_digits()
X, y = digits.data, digits.target

# Compare different dimensionality reduction techniques
fig, axes = plt.subplots(2, 3, figsize=(18, 12))
axes = axes.ravel()

techniques = [
    ('PCA', PCA(scale=True)),
    ('t-SNE', Manifold(manifold='tsne')),
    ('ISOMAP', Manifold(manifold='isomap')),
    ('LLE', Manifold(manifold='lle')),
    ('Spectral', Manifold(manifold='spectral')),
    ('MDS', Manifold(manifold='mds')),
]

for idx, (name, viz) in enumerate(techniques):
    viz.ax = axes[idx]
    viz.fit(X, y)
    viz.finalize()
    axes[idx].set_title(name)

plt.tight_layout()
plt.show()
```

### Feature Selection Pipeline

This example continues from the previous ones: `X`, `y`, and `features` are assumed to be already loaded (e.g. from the wine dataset above).

```python
from yellowbrick.features import RFECV, FeatureImportances
from yellowbrick.model_selection import LearningCurve
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

# Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Step 1: Feature importance analysis
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
fi_viz = FeatureImportances(rf_model, labels=features)
fi_viz.fit(X_train, y_train)
fi_viz.show()

# Step 2: Recursive feature elimination
rfecv_viz = RFECV(rf_model, cv=5, scoring='accuracy')
rfecv_viz.fit(X_train, y_train)
rfecv_viz.show()

# Get optimal features
optimal_features = rfecv_viz.support_
X_train_selected = X_train[:, optimal_features]
X_test_selected = X_test[:, optimal_features]

print(f"Selected {optimal_features.sum()} out of {len(optimal_features)} features")
```

### Multi-Algorithm Feature Ranking

```python
from yellowbrick.features import Rank2D
from sklearn.datasets import make_classification
import matplotlib.pyplot as plt

# Generate sample data
X, y = make_classification(n_samples=1000, n_features=20, n_informative=10,
                           n_redundant=10, random_state=42)

# Compare different ranking algorithms
algorithms = ['pearson', 'covariance', 'spearman', 'kendalltau']
fig, axes = plt.subplots(2, 2, figsize=(12, 10))
axes = axes.ravel()

for idx, algorithm in enumerate(algorithms):
    viz = Rank2D(algorithm=algorithm, ax=axes[idx])
    viz.fit(X, y)
    viz.finalize()
    axes[idx].set_title(f'{algorithm.title()} Correlation')

plt.tight_layout()
plt.show()
```

### Interactive Feature Exploration

```python
from yellowbrick.features import JointPlot, ParallelCoordinates
from sklearn.datasets import load_iris
import matplotlib.pyplot as plt

# Load data
iris = load_iris()
X, y = iris.data, iris.target
features = iris.feature_names
classes = iris.target_names

# Joint plot for feature pairs
feature_pairs = [(0, 1), (0, 2), (1, 3), (2, 3)]
fig, axes = plt.subplots(2, 2, figsize=(12, 10))
axes = axes.ravel()

for idx, (i, j) in enumerate(feature_pairs):
    viz = JointPlot(columns=(i, j), classes=classes, ax=axes[idx])
    viz.fit(X, y)
    viz.finalize()
    axes[idx].set_title(f'{features[i]} vs {features[j]}')

plt.tight_layout()
plt.show()

# Parallel coordinates with different normalizations
normalizations = [None, 'standard', 'minmax', 'robust']
fig, axes = plt.subplots(2, 2, figsize=(15, 10))
axes = axes.ravel()

for idx, norm in enumerate(normalizations):
    viz = ParallelCoordinates(classes=classes, normalize=norm, ax=axes[idx])
    viz.fit(X, y)
    viz.finalize()
    title = f'Normalization: {norm}' if norm else 'No Normalization'
    axes[idx].set_title(title)

plt.tight_layout()
plt.show()
```