or run

npx @tessl/cli init
Log in

Version

Tile

Overview

Evals

Files

Files

docs

classification.md clustering.md data-utilities.md features.md index.md model-selection.md regression.md text.md

docs/features.md

0

# Feature Analysis

1

2

Tools for feature selection, analysis, and visualization to understand data characteristics, identify important features, and guide feature engineering decisions. These visualizers support both supervised and unsupervised feature analysis techniques.

3

4

## Capabilities

5

6

### Feature Ranking

7

8

Univariate and bivariate feature ranking visualizers for identifying the most informative features using various statistical measures and algorithms.

9

10

```python { .api }

11

class Rank1D(Visualizer):

12

"""

13

1D feature ranking visualizer using univariate statistical measures.

14

15

Parameters:

16

- algorithm: str, ranking algorithm ('shapiro' for normality, others available)

17

- features: list, feature names for display

18

- orient: str, orientation of bars ('h' for horizontal, 'v' for vertical)

19

"""

20

def __init__(self, algorithm='shapiro', features=None, orient='h', **kwargs): ...

21

def fit(self, X, y=None, **kwargs): ...

22

def show(self, **kwargs): ...

23

24

class Rank2D(Visualizer):

25

"""

26

2D feature ranking visualizer using bivariate statistical measures.

27

28

Parameters:

29

- algorithm: str, ranking algorithm ('pearson', 'covariance', 'spearman', 'kendalltau')

30

- features: list, feature names for display

31

- colormap: str, matplotlib colormap for heatmap (default: "RdBu_r")

32

"""

33

def __init__(self, algorithm='pearson', features=None, colormap='RdBu_r', **kwargs): ...

34

def fit(self, X, y=None, **kwargs): ...

35

def show(self, **kwargs): ...

36

37

def rank1d(X, y=None, algorithm='shapiro', features=None, **kwargs):

38

"""

39

Functional API for 1D feature ranking visualization.

40

41

Parameters:

42

- X: feature matrix

43

- y: target vector (optional)

44

- algorithm: str, ranking algorithm

45

- features: list, feature names

46

47

Returns:

48

Rank1D visualizer instance

49

"""

50

51

def rank2d(X, y=None, algorithm='pearson', features=None, **kwargs):

52

"""

53

Functional API for 2D feature ranking visualization.

54

55

Parameters:

56

- X: feature matrix

57

- y: target vector (optional)

58

- algorithm: str, ranking algorithm

59

- features: list, feature names

60

61

Returns:

62

Rank2D visualizer instance

63

"""

64

```

65

66

**Usage Example:**

67

68

```python

69

from yellowbrick.features import Rank1D, Rank2D, rank1d, rank2d

70

from sklearn.datasets import load_wine

71

72

# Load sample data

73

wine = load_wine()

74

X, y = wine.data, wine.target

75

features = wine.feature_names

76

77

# 1D feature ranking

78

rank1d_viz = Rank1D(algorithm='shapiro', features=features)

79

rank1d_viz.fit(X, y)

80

rank1d_viz.show()

81

82

# 2D feature correlation

83

rank2d_viz = Rank2D(algorithm='pearson', features=features)

84

rank2d_viz.fit(X, y)

85

rank2d_viz.show()

86

87

# Functional API

88

rank1d(X, y, features=features, algorithm='shapiro')

89

rank2d(X, y, features=features, algorithm='spearman')

90

```

91

92

### Parallel Coordinates

93

94

Parallel coordinates visualization for multivariate data analysis, showing relationships between features and target classes across multiple dimensions.

95

96

```python { .api }

97

class ParallelCoordinates(Visualizer):

98

"""

99

Parallel coordinates plot for multivariate data visualization.

100

101

Parameters:

102

- classes: list, class labels for target

103

- features: list, feature names for display

104

- normalize: str, normalization method ('standard', 'minmax', 'robust', or None)

105

- sample: float or int, sampling strategy for large datasets

106

- shuffle: bool, whether to shuffle data before sampling

107

- random_state: int, random state for reproducibility

108

"""

109

def __init__(self, classes=None, features=None, normalize=None, sample=1.0, shuffle=False, random_state=None, **kwargs): ...

110

def fit(self, X, y=None, **kwargs): ...

111

def show(self, **kwargs): ...

112

113

def parallel_coordinates(X, y=None, classes=None, features=None, **kwargs):

114

"""

115

Functional API for parallel coordinates visualization.

116

117

Parameters:

118

- X: feature matrix

119

- y: target vector (optional)

120

- classes: list, class labels

121

- features: list, feature names

122

123

Returns:

124

ParallelCoordinates visualizer instance

125

"""

126

```

127

128

### Radial Visualization (RadViz)

129

130

Radial visualization for projecting multidimensional data onto a 2D plane, useful for identifying clusters and class separability.

131

132

```python { .api }

133

class RadialVisualizer(Visualizer):

134

"""

135

Radial visualization (RadViz) for multidimensional data projection.

136

137

Parameters:

138

- classes: list, class labels for target

139

- features: list, feature names for anchors

140

- alpha: float, transparency of data points

141

"""

142

def __init__(self, classes=None, features=None, alpha=0.75, **kwargs): ...

143

def fit(self, X, y=None, **kwargs): ...

144

def show(self, **kwargs): ...

145

146

# Alias for compatibility

147

RadViz = RadialVisualizer

148

149

def radviz(X, y=None, classes=None, features=None, **kwargs):

150

"""

151

Functional API for radial visualization.

152

153

Parameters:

154

- X: feature matrix

155

- y: target vector (optional)

156

- classes: list, class labels

157

- features: list, feature names

158

159

Returns:

160

RadialVisualizer instance

161

"""

162

```

163

164

### Joint Plots

165

166

Joint plots showing relationships between pairs of features with marginal distributions, useful for understanding feature interactions and distributions.

167

168

```python { .api }

169

class JointPlot(Visualizer):

170

"""

171

Joint plot visualization for feature pair analysis.

172

173

Parameters:

174

- columns: tuple or list, column indices or names for x and y axes

175

- classes: list, class labels for target

176

- kind: str, plot type ('scatter', 'hex', 'reg')

177

"""

178

def __init__(self, columns=None, classes=None, kind='scatter', **kwargs): ...

179

def fit(self, X, y=None, **kwargs): ...

180

def show(self, **kwargs): ...

181

182

# Alias for compatibility

183

JointPlotVisualizer = JointPlot

184

185

def joint_plot(X, y=None, columns=None, classes=None, **kwargs):

186

"""

187

Functional API for joint plot visualization.

188

189

Parameters:

190

- X: feature matrix

191

- y: target vector (optional)

192

- columns: tuple, column indices or names

193

- classes: list, class labels

194

195

Returns:

196

JointPlot visualizer instance

197

"""

198

```

199

200

### PCA Decomposition

201

202

Principal Component Analysis visualization for dimensionality reduction, variance explanation, and feature transformation analysis.

203

204

```python { .api }

205

class PCA(Visualizer):

206

"""

207

PCA decomposition visualizer for dimensionality reduction analysis.

208

209

Parameters:

210

- scale: bool, whether to scale features before PCA

211

- proj_features: bool, whether to project original features

212

- biplot: bool, whether to draw biplot with feature vectors

213

- classes: list, class labels for target

214

"""

215

def __init__(self, scale=True, proj_features=True, biplot=False, classes=None, **kwargs): ...

216

def fit(self, X, y=None, **kwargs): ...

217

def show(self, **kwargs): ...

218

219

# Alias for compatibility

220

PCADecomposition = PCA

221

222

def pca_decomposition(X, y=None, scale=True, proj_features=True, **kwargs):

223

"""

224

Functional API for PCA decomposition visualization.

225

226

Parameters:

227

- X: feature matrix

228

- y: target vector (optional)

229

- scale: bool, whether to scale features

230

- proj_features: bool, whether to project features

231

232

Returns:

233

PCA visualizer instance

234

"""

235

```

236

237

### Manifold Learning

238

239

Manifold learning visualization for non-linear dimensionality reduction using various algorithms like t-SNE, ISOMAP, and Locally Linear Embedding.

240

241

```python { .api }

242

class Manifold(Visualizer):

243

"""

244

Manifold learning visualizer for non-linear dimensionality reduction.

245

246

Parameters:

247

- manifold: str, manifold algorithm ('lle', 'ltsa', 'hessian', 'modified', 'isomap', 'mds', 'spectral', 'tsne')

248

- n_neighbors: int, number of neighbors for local methods

249

- classes: list, class labels for target

250

- target_type: str, target type ('discrete', 'continuous', 'single', 'auto')

251

"""

252

def __init__(self, manifold='lle', n_neighbors=None, classes=None, target_type='auto', **kwargs): ...

253

def fit(self, X, y=None, **kwargs): ...

254

def show(self, **kwargs): ...

255

256

def manifold_embedding(X, y=None, manifold='lle', classes=None, **kwargs):

257

"""

258

Functional API for manifold learning visualization.

259

260

Parameters:

261

- X: feature matrix

262

- y: target vector (optional)

263

- manifold: str, manifold algorithm

264

- classes: list, class labels

265

266

Returns:

267

Manifold visualizer instance

268

"""

269

```

270

271

### Feature Importances (Re-exported)

272

273

Feature importance visualization from model selection module, showing the relative importance of features as determined by tree-based models.

274

275

```python { .api }

276

class FeatureImportances(ModelVisualizer):

277

"""

278

Feature importances visualizer for tree-based models.

279

280

Parameters:

281

- estimator: scikit-learn estimator with feature_importances_ attribute

282

- labels: list, feature labels for display

283

- relative: bool, whether to show relative importance (percentages)

284

- absolute: bool, whether to show absolute importance values

285

"""

286

def __init__(self, estimator, labels=None, relative=True, absolute=False, **kwargs): ...

287

def fit(self, X, y, **kwargs): ...

288

def show(self, **kwargs): ...

289

290

def feature_importances(estimator, X, y, labels=None, **kwargs):

291

"""

292

Functional API for feature importances visualization.

293

294

Parameters:

295

- estimator: scikit-learn estimator

296

- X: feature matrix

297

- y: target vector

298

- labels: list, feature labels

299

300

Returns:

301

FeatureImportances visualizer instance

302

"""

303

```

304

305

### Recursive Feature Elimination (Re-exported)

306

307

Recursive Feature Elimination with Cross-Validation (RFECV) for systematic feature selection using model performance feedback.

308

309

```python { .api }

310

class RFECV(ModelVisualizer):

311

"""

312

Recursive Feature Elimination with Cross-Validation visualizer.

313

314

Parameters:

315

- estimator: scikit-learn estimator

316

- cv: int or cross-validation generator

317

- scoring: str, scoring metric

318

- step: int or float, number of features to remove at each step

319

"""

320

def __init__(self, estimator, cv=None, scoring=None, step=1, **kwargs): ...

321

def fit(self, X, y, **kwargs): ...

322

def show(self, **kwargs): ...

323

324

def rfecv(estimator, X, y, cv=None, scoring=None, **kwargs):

325

"""

326

Functional API for RFECV visualization.

327

328

Parameters:

329

- estimator: scikit-learn estimator

330

- X: feature matrix

331

- y: target vector

332

- cv: int or cross-validation generator

333

- scoring: str, scoring metric

334

335

Returns:

336

RFECV visualizer instance

337

"""

338

```

339

340

## Types

341

342

```python { .api }

343

from enum import Enum

344

345

class TargetType(Enum):

346

AUTO = "auto"

347

SINGLE = "single"

348

DISCRETE = "discrete"

349

CONTINUOUS = "continuous"

350

UNKNOWN = "unknown"

351

```

352

353

## Usage Patterns

354

355

### Comprehensive Feature Analysis

356

357

```python

358

from yellowbrick.features import Rank1D, Rank2D, ParallelCoordinates, RadViz, PCA

359

from sklearn.datasets import load_wine

360

import matplotlib.pyplot as plt

361

362

# Load sample data

363

wine = load_wine()

364

X, y = wine.data, wine.target

365

features = wine.feature_names

366

classes = wine.target_names

367

368

# Feature ranking analysis

369

fig, axes = plt.subplots(2, 2, figsize=(15, 12))

370

371

# 1D feature ranking

372

rank1d_viz = Rank1D(features=features, ax=axes[0,0])

373

rank1d_viz.fit(X, y)

374

rank1d_viz.finalize()

375

376

# 2D feature correlation

377

rank2d_viz = Rank2D(features=features, ax=axes[0,1])

378

rank2d_viz.fit(X, y)

379

rank2d_viz.finalize()

380

381

# Parallel coordinates

382

pcoords_viz = ParallelCoordinates(classes=classes, ax=axes[1,0])

383

pcoords_viz.fit(X, y)

384

pcoords_viz.finalize()

385

386

# RadViz

387

radviz_viz = RadViz(classes=classes, ax=axes[1,1])

388

radviz_viz.fit(X, y)

389

radviz_viz.finalize()

390

391

plt.tight_layout()

392

plt.show()

393

394

# PCA analysis

395

pca_viz = PCA(scale=True, biplot=True, classes=classes)

396

pca_viz.fit(X, y)

397

pca_viz.show()

398

```

399

400

### Dimensionality Reduction Comparison

401

402

```python

403

from yellowbrick.features import PCA, Manifold

404

from sklearn.datasets import load_digits

405

import matplotlib.pyplot as plt

406

407

# Load high-dimensional data

408

digits = load_digits()

409

X, y = digits.data, digits.target

410

411

# Compare different dimensionality reduction techniques

412

fig, axes = plt.subplots(2, 3, figsize=(18, 12))

413

axes = axes.ravel()

414

415

techniques = [

416

('PCA', PCA(scale=True)),

417

('t-SNE', Manifold(manifold='tsne')),

418

('ISOMAP', Manifold(manifold='isomap')),

419

('LLE', Manifold(manifold='lle')),

420

('Spectral', Manifold(manifold='spectral')),

421

('MDS', Manifold(manifold='mds'))

422

]

423

424

for idx, (name, viz) in enumerate(techniques):

425

viz.ax = axes[idx]

426

viz.fit(X, y)

427

viz.finalize()

428

axes[idx].set_title(name)

429

430

plt.tight_layout()

431

plt.show()

432

```

433

434

### Feature Selection Pipeline

435

436

```python

437

from yellowbrick.features import RFECV, FeatureImportances

438

from yellowbrick.model_selection import LearningCurve

439

from sklearn.ensemble import RandomForestClassifier

440

from sklearn.model_selection import train_test_split

441

442

# Split data

443

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

444

445

# Step 1: Feature importance analysis

446

rf_model = RandomForestClassifier(n_estimators=100, random_state=42)

447

fi_viz = FeatureImportances(rf_model, labels=features)

448

fi_viz.fit(X_train, y_train)

449

fi_viz.show()

450

451

# Step 2: Recursive feature elimination

452

rfecv_viz = RFECV(rf_model, cv=5, scoring='accuracy')

453

rfecv_viz.fit(X_train, y_train)

454

rfecv_viz.show()

455

456

# Get optimal features

457

optimal_features = rfecv_viz.support_

458

X_train_selected = X_train[:, optimal_features]

459

X_test_selected = X_test[:, optimal_features]

460

461

print(f"Selected {optimal_features.sum()} out of {len(optimal_features)} features")

462

```

463

464

### Multi-Algorithm Feature Ranking

465

466

```python

467

from yellowbrick.features import Rank2D

468

from sklearn.datasets import make_classification

469

import matplotlib.pyplot as plt

470

471

# Generate sample data

472

X, y = make_classification(n_samples=1000, n_features=20, n_informative=10,

473

n_redundant=10, random_state=42)

474

475

# Compare different ranking algorithms

476

algorithms = ['pearson', 'covariance', 'spearman', 'kendalltau']

477

fig, axes = plt.subplots(2, 2, figsize=(12, 10))

478

axes = axes.ravel()

479

480

for idx, algorithm in enumerate(algorithms):

481

viz = Rank2D(algorithm=algorithm, ax=axes[idx])

482

viz.fit(X, y)

483

viz.finalize()

484

axes[idx].set_title(f'{algorithm.title()} Correlation')

485

486

plt.tight_layout()

487

plt.show()

488

```

489

490

### Interactive Feature Exploration

491

492

```python

493

from yellowbrick.features import JointPlot, ParallelCoordinates

494

from sklearn.datasets import load_iris

495

496

# Load data

497

iris = load_iris()

498

X, y = iris.data, iris.target

499

features = iris.feature_names

500

classes = iris.target_names

501

502

# Joint plot for feature pairs

503

feature_pairs = [(0, 1), (0, 2), (1, 3), (2, 3)]

504

fig, axes = plt.subplots(2, 2, figsize=(12, 10))

505

axes = axes.ravel()

506

507

for idx, (i, j) in enumerate(feature_pairs):

508

viz = JointPlot(columns=(i, j), classes=classes, ax=axes[idx])

509

viz.fit(X, y)

510

viz.finalize()

511

axes[idx].set_title(f'{features[i]} vs {features[j]}')

512

513

plt.tight_layout()

514

plt.show()

515

516

# Parallel coordinates with different normalizations

517

normalizations = [None, 'standard', 'minmax', 'robust']

518

fig, axes = plt.subplots(2, 2, figsize=(15, 10))

519

axes = axes.ravel()

520

521

for idx, norm in enumerate(normalizations):

522

viz = ParallelCoordinates(classes=classes, normalize=norm, ax=axes[idx])

523

viz.fit(X, y)

524

viz.finalize()

525

title = f'Normalization: {norm}' if norm else 'No Normalization'

526

axes[idx].set_title(title)

527

528

plt.tight_layout()

529

plt.show()

530

```