or run

npx @tessl/cli init
Log in

Version

Tile

Overview

Evals

Files

Files

docs

classification.md clustering.md data-utilities.md features.md index.md model-selection.md regression.md text.md

docs/model-selection.md

0

# Model Selection

1

2

Visualizers for model selection, hyperparameter tuning, and performance evaluation to guide the machine learning development process. These tools help assess model performance, validate model assumptions, and optimize model parameters.

3

4

## Capabilities

5

6

### Learning Curves

7

8

Learning curve analysis to evaluate model performance as a function of training set size, helping identify underfitting, overfitting, and optimal dataset size requirements.

9

10

```python { .api }

11

class LearningCurve(ModelVisualizer):

12

"""

13

Learning curve visualizer for model performance analysis.

14

15

Parameters:

16

- estimator: scikit-learn estimator

17

- cv: int or cross-validation generator, cross-validation strategy

18

- scoring: str, scoring metric for evaluation

19

- train_sizes: array-like, training set sizes to evaluate

20

- n_jobs: int, number of parallel jobs

21

- random_state: int, random state for reproducibility

22

"""

23

def __init__(self, estimator, cv=None, scoring=None, train_sizes=None, n_jobs=None, random_state=None, **kwargs): ...

24

def fit(self, X, y, **kwargs): ...

25

def show(self, **kwargs): ...

26

27

def learning_curve(estimator, X, y, cv=None, scoring=None, **kwargs):

28

"""

29

Functional API for learning curve visualization.

30

31

Parameters:

32

- estimator: scikit-learn estimator

33

- X: feature matrix

34

- y: target vector

35

- cv: int or cross-validation generator

36

- scoring: str, scoring metric

37

38

Returns:

39

LearningCurve visualizer instance

40

"""

41

```

42

43

**Usage Example:**

44

45

```python

46

from yellowbrick.model_selection import LearningCurve, learning_curve

47

from sklearn.ensemble import RandomForestClassifier

48

from sklearn.model_selection import StratifiedKFold

49

50

# Class-based API

51

model = RandomForestClassifier()

52

cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

53

visualizer = LearningCurve(model, cv=cv, scoring='accuracy', n_jobs=4)

54

visualizer.fit(X, y)

55

visualizer.show()

56

57

# Functional API

58

learning_curve(model, X, y, cv=5, scoring='f1_macro')

59

```

60

61

### Validation Curves

62

63

Validation curve analysis for hyperparameter tuning, showing model performance across different parameter values to identify optimal parameter ranges.

64

65

```python { .api }

66

class ValidationCurve(ModelVisualizer):

67

"""

68

Validation curve visualizer for hyperparameter tuning.

69

70

Parameters:

71

- estimator: scikit-learn estimator

72

- param_name: str, parameter name to vary

73

- param_range: array-like, parameter values to test

74

- cv: int or cross-validation generator

75

- scoring: str, scoring metric

76

- n_jobs: int, number of parallel jobs

77

- logx: bool, whether to use log scale for parameter axis

78

"""

79

def __init__(self, estimator, param_name, param_range, cv=None, scoring=None, n_jobs=None, logx=False, **kwargs): ...

80

def fit(self, X, y, **kwargs): ...

81

def show(self, **kwargs): ...

82

83

def validation_curve(estimator, X, y, param_name, param_range, cv=None, scoring=None, **kwargs):

84

"""

85

Functional API for validation curve visualization.

86

87

Parameters:

88

- estimator: scikit-learn estimator

89

- X: feature matrix

90

- y: target vector

91

- param_name: str, parameter name

92

- param_range: array-like, parameter values

93

- cv: int or cross-validation generator

94

- scoring: str, scoring metric

95

96

Returns:

97

ValidationCurve visualizer instance

98

"""

99

```

100

101

**Usage Example:**

102

103

```python

104

from yellowbrick.model_selection import ValidationCurve, validation_curve

105

from sklearn.ensemble import RandomForestClassifier

106

import numpy as np

107

108

# Parameter range for n_estimators

109

param_range = np.arange(10, 200, 20)

110

111

# Class-based API

112

model = RandomForestClassifier()

113

visualizer = ValidationCurve(

114

model,

115

param_name='n_estimators',

116

param_range=param_range,

117

cv=5,

118

scoring='accuracy',

119

n_jobs=4

120

)

121

visualizer.fit(X, y)

122

visualizer.show()

123

124

# Functional API with log scale

125

validation_curve(model, X, y, 'max_depth', [1, 2, 4, 8, 16, 32], logx=True)

126

```

127

128

### Cross-Validation Scores

129

130

Cross-validation score visualization for model evaluation, showing score distributions across different folds to assess model stability and performance variance.

131

132

```python { .api }

133

class CVScores(ModelVisualizer):

134

"""

135

Cross-validation scores visualizer.

136

137

Parameters:

138

- estimator: scikit-learn estimator

139

- cv: int or cross-validation generator

140

- scoring: str, scoring metric

141

"""

142

def __init__(self, estimator, cv=None, scoring=None, **kwargs): ...

143

def fit(self, X, y, **kwargs): ...

144

def show(self, **kwargs): ...

145

146

def cv_scores(estimator, X, y, cv=None, scoring=None, **kwargs):

147

"""

148

Functional API for cross-validation scores visualization.

149

150

Parameters:

151

- estimator: scikit-learn estimator

152

- X: feature matrix

153

- y: target vector

154

- cv: int or cross-validation generator

155

- scoring: str, scoring metric

156

157

Returns:

158

CVScores visualizer instance

159

"""

160

```

161

162

### Feature Dropping Curve

163

164

Feature dropping curve analysis to understand the impact of removing features on model performance, helping identify the minimum viable feature set.

165

166

```python { .api }

167

class DroppingCurve(ModelVisualizer):

168

"""

169

Feature dropping curve visualizer.

170

171

Parameters:

172

- estimator: scikit-learn estimator

173

- cv: int or cross-validation generator

174

- scoring: str, scoring metric

175

"""

176

def __init__(self, estimator, cv=None, scoring=None, **kwargs): ...

177

def fit(self, X, y, **kwargs): ...

178

def show(self, **kwargs): ...

179

180

def dropping_curve(estimator, X, y, cv=None, scoring=None, **kwargs):

181

"""

182

Functional API for dropping curve visualization.

183

184

Parameters:

185

- estimator: scikit-learn estimator

186

- X: feature matrix

187

- y: target vector

188

- cv: int or cross-validation generator

189

- scoring: str, scoring metric

190

191

Returns:

192

DroppingCurve visualizer instance

193

"""

194

```

195

196

### Feature Importances

197

198

Feature importance visualization for tree-based models, showing the relative contribution of each feature to model predictions.

199

200

```python { .api }

201

class FeatureImportances(ModelVisualizer):

202

"""

203

Feature importances visualizer for tree-based models.

204

205

Parameters:

206

- estimator: scikit-learn estimator with feature_importances_ attribute

207

- labels: list, feature labels for display

208

- relative: bool, whether to show relative importance (percentages)

209

- absolute: bool, whether to show absolute importance values

210

- xlabel: str, x-axis label

211

- ylabel: str, y-axis label

212

"""

213

def __init__(self, estimator, labels=None, relative=True, absolute=False, xlabel=None, ylabel=None, **kwargs): ...

214

def fit(self, X, y, **kwargs): ...

215

def show(self, **kwargs): ...

216

217

def feature_importances(estimator, X, y, labels=None, **kwargs):

218

"""

219

Functional API for feature importances visualization.

220

221

Parameters:

222

- estimator: scikit-learn estimator

223

- X: feature matrix

224

- y: target vector

225

- labels: list, feature labels

226

227

Returns:

228

FeatureImportances visualizer instance

229

"""

230

```

231

232

### Recursive Feature Elimination

233

234

Recursive Feature Elimination with Cross-Validation (RFECV) for systematic feature selection using model performance feedback.

235

236

```python { .api }

237

class RFECV(ModelVisualizer):

238

"""

239

Recursive Feature Elimination with Cross-Validation visualizer.

240

241

Parameters:

242

- estimator: scikit-learn estimator

243

- cv: int or cross-validation generator

244

- scoring: str, scoring metric

245

- step: int or float, number of features to remove at each step

246

- groups: array-like, group labels for group cross-validation

247

"""

248

def __init__(self, estimator, cv=None, scoring=None, step=1, groups=None, **kwargs): ...

249

def fit(self, X, y, **kwargs): ...

250

def show(self, **kwargs): ...

251

252

def rfecv(estimator, X, y, cv=None, scoring=None, **kwargs):

253

"""

254

Functional API for RFECV visualization.

255

256

Parameters:

257

- estimator: scikit-learn estimator

258

- X: feature matrix

259

- y: target vector

260

- cv: int or cross-validation generator

261

- scoring: str, scoring metric

262

263

Returns:

264

RFECV visualizer instance

265

"""

266

```

267

268

## Usage Patterns

269

270

### Complete Model Evaluation Workflow

271

272

```python

273

from yellowbrick.model_selection import LearningCurve, ValidationCurve, CVScores, FeatureImportances

274

from sklearn.ensemble import RandomForestClassifier

275

from sklearn.model_selection import train_test_split, StratifiedKFold

276

import numpy as np

277

278

# Prepare data

279

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)

280

cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

281

282

# Step 1: Learning curve analysis

283

print("Step 1: Learning curve analysis")

284

model = RandomForestClassifier(n_estimators=100, random_state=42)

285

learning_viz = LearningCurve(model, cv=cv, scoring='accuracy', n_jobs=4)

286

learning_viz.fit(X_train, y_train)

287

learning_viz.show()

288

289

# Step 2: Hyperparameter tuning with validation curves

290

print("Step 2: Hyperparameter tuning")

291

param_range = np.arange(10, 200, 20)

292

validation_viz = ValidationCurve(

293

model,

294

param_name='n_estimators',

295

param_range=param_range,

296

cv=cv,

297

scoring='accuracy'

298

)

299

validation_viz.fit(X_train, y_train)

300

validation_viz.show()

301

302

# Step 3: Cross-validation score assessment

303

print("Step 3: Cross-validation assessment")

304

cv_viz = CVScores(model, cv=cv, scoring='accuracy')

305

cv_viz.fit(X_train, y_train)

306

cv_viz.show()

307

308

# Step 4: Feature importance analysis

309

print("Step 4: Feature importance analysis")

310

fi_viz = FeatureImportances(model, labels=feature_names)

311

fi_viz.fit(X_train, y_train)

312

fi_viz.show()

313

```

314

315

### Hyperparameter Optimization

316

317

```python

318

from yellowbrick.model_selection import ValidationCurve

319

from sklearn.svm import SVC

320

from sklearn.ensemble import RandomForestClassifier

321

from sklearn.linear_model import LogisticRegression

322

import numpy as np

323

import matplotlib.pyplot as plt

324

325

# Compare hyperparameters across different models

326

models_params = [

327

(SVC(), 'C', np.logspace(-3, 3, 7)),

328

(RandomForestClassifier(), 'n_estimators', np.arange(10, 200, 30)),

329

(LogisticRegression(), 'C', np.logspace(-3, 3, 7))

330

]

331

332

fig, axes = plt.subplots(1, 3, figsize=(18, 6))

333

334

for idx, (model, param_name, param_range) in enumerate(models_params):

335

viz = ValidationCurve(

336

model,

337

param_name=param_name,

338

param_range=param_range,

339

cv=5,

340

scoring='accuracy',

341

ax=axes[idx],

342

logx=(param_name == 'C') # Use log scale for C parameter

343

)

344

viz.fit(X, y)

345

viz.finalize()

346

axes[idx].set_title(f'{model.__class__.__name__} - {param_name}')

347

348

plt.tight_layout()

349

plt.show()

350

```

351

352

### Feature Selection Pipeline

353

354

```python

355

from yellowbrick.model_selection import RFECV, FeatureImportances, DroppingCurve

356

from sklearn.ensemble import RandomForestClassifier

357

from sklearn.feature_selection import SelectKBest, f_classif

358

359

# Step 1: Initial feature importance analysis

360

rf_model = RandomForestClassifier(n_estimators=100, random_state=42)

361

fi_viz = FeatureImportances(rf_model, labels=feature_names)

362

fi_viz.fit(X, y)

363

fi_viz.show()

364

365

# Step 2: Recursive feature elimination

366

rfecv_viz = RFECV(rf_model, cv=5, scoring='accuracy', step=1)

367

rfecv_viz.fit(X, y)

368

rfecv_viz.show()

369

370

# Get optimal number of features

371

n_optimal_features = rfecv_viz.n_features_

372

print(f"Optimal number of features: {n_optimal_features}")

373

374

# Step 3: Feature dropping analysis

375

dropping_viz = DroppingCurve(rf_model, cv=5, scoring='accuracy')

376

dropping_viz.fit(X, y)

377

dropping_viz.show()

378

```

379

380

### Model Comparison and Selection

381

382

```python

383

from yellowbrick.model_selection import LearningCurve, CVScores

384

from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier

385

from sklearn.svm import SVC

386

from sklearn.linear_model import LogisticRegression

387

import matplotlib.pyplot as plt

388

389

# Define models to compare

390

models = {

391

'Logistic Regression': LogisticRegression(),

392

'Random Forest': RandomForestClassifier(n_estimators=100),

393

'Gradient Boosting': GradientBoostingClassifier(n_estimators=100),

394

'SVM': SVC()

395

}

396

397

# Learning curve comparison

398

fig, axes = plt.subplots(2, 2, figsize=(15, 12))

399

axes = axes.ravel()

400

401

for idx, (name, model) in enumerate(models.items()):

402

viz = LearningCurve(model, cv=5, scoring='accuracy', ax=axes[idx])

403

viz.fit(X, y)

404

viz.finalize()

405

axes[idx].set_title(f'{name} - Learning Curve')

406

407

plt.tight_layout()

408

plt.show()

409

410

# Cross-validation scores comparison

411

fig, axes = plt.subplots(2, 2, figsize=(15, 8))

412

axes = axes.ravel()

413

414

for idx, (name, model) in enumerate(models.items()):

415

viz = CVScores(model, cv=10, scoring='accuracy', ax=axes[idx])

416

viz.fit(X, y)

417

viz.finalize()

418

axes[idx].set_title(f'{name} - CV Scores')

419

420

plt.tight_layout()

421

plt.show()

422

```

423

424

### Advanced Hyperparameter Analysis

425

426

```python

427

from yellowbrick.model_selection import ValidationCurve

428

from sklearn.ensemble import RandomForestClassifier

429

import numpy as np

430

import matplotlib.pyplot as plt

431

432

# Multi-parameter validation curves

433

model = RandomForestClassifier(random_state=42)

434

435

parameters = {

436

'n_estimators': np.arange(10, 200, 20),

437

'max_depth': [3, 5, 7, 10, 15, 20, None],

438

'min_samples_split': [2, 5, 10, 20],

439

'min_samples_leaf': [1, 2, 4, 8]

440

}

441

442

fig, axes = plt.subplots(2, 2, figsize=(15, 12))

443

axes = axes.ravel()

444

445

for idx, (param_name, param_range) in enumerate(parameters.items()):

446

# Handle None values in max_depth

447

if param_name == 'max_depth':

448

# Replace None with a large number for plotting

449

plot_range = [x if x is not None else 50 for x in param_range]

450

tick_labels = [str(x) if x is not None else 'None' for x in param_range]

451

else:

452

plot_range = param_range

453

tick_labels = None

454

455

viz = ValidationCurve(

456

model,

457

param_name=param_name,

458

param_range=param_range,

459

cv=5,

460

scoring='accuracy',

461

ax=axes[idx]

462

)

463

viz.fit(X, y)

464

viz.finalize()

465

466

if tick_labels:

467

axes[idx].set_xticks(range(len(plot_range)))

468

axes[idx].set_xticklabels(tick_labels)

469

470

axes[idx].set_title(f'Validation Curve - {param_name}')

471

472

plt.tight_layout()

473

plt.show()

474

```

475

476

### Performance Monitoring

477

478

```python

479

from yellowbrick.model_selection import LearningCurve, CVScores

480

from sklearn.datasets import make_classification

481

from sklearn.ensemble import RandomForestClassifier

482

import numpy as np

483

484

# Generate datasets of different sizes

485

dataset_sizes = [100, 500, 1000, 5000]

486

model = RandomForestClassifier(n_estimators=100, random_state=42)

487

488

for size in dataset_sizes:

489

print(f"Dataset size: {size}")

490

491

# Generate data

492

X_sim, y_sim = make_classification(

493

n_samples=size,

494

n_features=20,

495

n_informative=15,

496

n_redundant=5,

497

random_state=42

498

)

499

500

# Learning curve

501

learning_viz = LearningCurve(model, cv=5, scoring='accuracy')

502

learning_viz.fit(X_sim, y_sim)

503

learning_viz.show()

504

505

# CV scores

506

cv_viz = CVScores(model, cv=5, scoring='accuracy')

507

cv_viz.fit(X_sim, y_sim)

508

cv_viz.show()

509

510

print(f"Mean CV score: {cv_viz.cv_scores_.mean():.3f} ± {cv_viz.cv_scores_.std():.3f}")

511

print("-" * 50)

512

```