# Model Selection

Visualizers for model selection, hyperparameter tuning, and performance evaluation to guide the machine learning development process. These tools help assess model performance, validate model assumptions, and optimize model parameters.

## Capabilities

### Learning Curves

Learning curve analysis to evaluate model performance as a function of training set size, helping identify underfitting, overfitting, and optimal dataset size requirements.

```python { .api }
class LearningCurve(ModelVisualizer):
    """
    Learning curve visualizer for model performance analysis.

    Parameters:
    - estimator: scikit-learn estimator
    - cv: int or cross-validation generator, cross-validation strategy
    - scoring: str, scoring metric for evaluation
    - train_sizes: array-like, training set sizes to evaluate
    - n_jobs: int, number of parallel jobs
    - random_state: int, random state for reproducibility
    """
    def __init__(self, estimator, cv=None, scoring=None, train_sizes=None, n_jobs=None, random_state=None, **kwargs): ...
    def fit(self, X, y, **kwargs): ...
    def show(self, **kwargs): ...

def learning_curve(estimator, X, y, cv=None, scoring=None, **kwargs):
    """
    Functional API for learning curve visualization.

    Parameters:
    - estimator: scikit-learn estimator
    - X: feature matrix
    - y: target vector
    - cv: int or cross-validation generator
    - scoring: str, scoring metric

    Returns:
        LearningCurve visualizer instance
    """
```

**Usage Example:**

```python
from yellowbrick.model_selection import LearningCurve, learning_curve
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import StratifiedKFold

# Class-based API
model = RandomForestClassifier()
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
visualizer = LearningCurve(model, cv=cv, scoring='accuracy', n_jobs=4)
visualizer.fit(X, y)
visualizer.show()

# Functional API
learning_curve(model, X, y, cv=5, scoring='f1_macro')
```
### Validation Curves

Validation curve analysis for hyperparameter tuning, showing model performance across different parameter values to identify optimal parameter ranges.

```python { .api }
class ValidationCurve(ModelVisualizer):
    """
    Validation curve visualizer for hyperparameter tuning.

    Parameters:
    - estimator: scikit-learn estimator
    - param_name: str, parameter name to vary
    - param_range: array-like, parameter values to test
    - cv: int or cross-validation generator
    - scoring: str, scoring metric
    - n_jobs: int, number of parallel jobs
    - logx: bool, whether to use log scale for parameter axis
    """
    def __init__(self, estimator, param_name, param_range, cv=None, scoring=None, n_jobs=None, logx=False, **kwargs): ...
    def fit(self, X, y, **kwargs): ...
    def show(self, **kwargs): ...

def validation_curve(estimator, X, y, param_name, param_range, cv=None, scoring=None, **kwargs):
    """
    Functional API for validation curve visualization.

    Parameters:
    - estimator: scikit-learn estimator
    - X: feature matrix
    - y: target vector
    - param_name: str, parameter name
    - param_range: array-like, parameter values
    - cv: int or cross-validation generator
    - scoring: str, scoring metric

    Returns:
        ValidationCurve visualizer instance
    """
```

**Usage Example:**

```python
from yellowbrick.model_selection import ValidationCurve, validation_curve
from sklearn.ensemble import RandomForestClassifier
import numpy as np

# Parameter range for n_estimators
param_range = np.arange(10, 200, 20)

# Class-based API
model = RandomForestClassifier()
visualizer = ValidationCurve(
    model,
    param_name='n_estimators',
    param_range=param_range,
    cv=5,
    scoring='accuracy',
    n_jobs=4
)
visualizer.fit(X, y)
visualizer.show()

# Functional API with log scale
validation_curve(model, X, y, 'max_depth', [1, 2, 4, 8, 16, 32], logx=True)
```
### Cross-Validation Scores

Cross-validation score visualization for model evaluation, showing score distributions across different folds to assess model stability and performance variance.

```python { .api }
class CVScores(ModelVisualizer):
    """
    Cross-validation scores visualizer.

    Parameters:
    - estimator: scikit-learn estimator
    - cv: int or cross-validation generator
    - scoring: str, scoring metric
    """
    def __init__(self, estimator, cv=None, scoring=None, **kwargs): ...
    def fit(self, X, y, **kwargs): ...
    def show(self, **kwargs): ...

def cv_scores(estimator, X, y, cv=None, scoring=None, **kwargs):
    """
    Functional API for cross-validation scores visualization.

    Parameters:
    - estimator: scikit-learn estimator
    - X: feature matrix
    - y: target vector
    - cv: int or cross-validation generator
    - scoring: str, scoring metric

    Returns:
        CVScores visualizer instance
    """
```
### Feature Dropping Curve

Feature dropping curve analysis to understand the impact of removing features on model performance, helping identify the minimum viable feature set.

```python { .api }
class DroppingCurve(ModelVisualizer):
    """
    Feature dropping curve visualizer.

    Parameters:
    - estimator: scikit-learn estimator
    - cv: int or cross-validation generator
    - scoring: str, scoring metric
    """
    def __init__(self, estimator, cv=None, scoring=None, **kwargs): ...
    def fit(self, X, y, **kwargs): ...
    def show(self, **kwargs): ...

def dropping_curve(estimator, X, y, cv=None, scoring=None, **kwargs):
    """
    Functional API for dropping curve visualization.

    Parameters:
    - estimator: scikit-learn estimator
    - X: feature matrix
    - y: target vector
    - cv: int or cross-validation generator
    - scoring: str, scoring metric

    Returns:
        DroppingCurve visualizer instance
    """
```
### Feature Importances

Feature importance visualization for tree-based models, showing the relative contribution of each feature to model predictions.

```python { .api }
class FeatureImportances(ModelVisualizer):
    """
    Feature importances visualizer for tree-based models.

    Parameters:
    - estimator: scikit-learn estimator with feature_importances_ attribute
    - labels: list, feature labels for display
    - relative: bool, whether to show relative importance (percentages)
    - absolute: bool, whether to show absolute importance values
    - xlabel: str, x-axis label
    - ylabel: str, y-axis label
    """
    def __init__(self, estimator, labels=None, relative=True, absolute=False, xlabel=None, ylabel=None, **kwargs): ...
    def fit(self, X, y, **kwargs): ...
    def show(self, **kwargs): ...

def feature_importances(estimator, X, y, labels=None, **kwargs):
    """
    Functional API for feature importances visualization.

    Parameters:
    - estimator: scikit-learn estimator
    - X: feature matrix
    - y: target vector
    - labels: list, feature labels

    Returns:
        FeatureImportances visualizer instance
    """
```
### Recursive Feature Elimination

Recursive Feature Elimination with Cross-Validation (RFECV) for systematic feature selection using model performance feedback.

```python { .api }
class RFECV(ModelVisualizer):
    """
    Recursive Feature Elimination with Cross-Validation visualizer.

    Parameters:
    - estimator: scikit-learn estimator
    - cv: int or cross-validation generator
    - scoring: str, scoring metric
    - step: int or float, number of features to remove at each step
    - groups: array-like, group labels for group cross-validation
    """
    def __init__(self, estimator, cv=None, scoring=None, step=1, groups=None, **kwargs): ...
    def fit(self, X, y, **kwargs): ...
    def show(self, **kwargs): ...

def rfecv(estimator, X, y, cv=None, scoring=None, **kwargs):
    """
    Functional API for RFECV visualization.

    Parameters:
    - estimator: scikit-learn estimator
    - X: feature matrix
    - y: target vector
    - cv: int or cross-validation generator
    - scoring: str, scoring metric

    Returns:
        RFECV visualizer instance
    """
```
## Usage Patterns

### Complete Model Evaluation Workflow

```python
from yellowbrick.model_selection import LearningCurve, ValidationCurve, CVScores, FeatureImportances
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, StratifiedKFold
import numpy as np

# Prepare data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Step 1: Learning curve analysis
print("Step 1: Learning curve analysis")
model = RandomForestClassifier(n_estimators=100, random_state=42)
learning_viz = LearningCurve(model, cv=cv, scoring='accuracy', n_jobs=4)
learning_viz.fit(X_train, y_train)
learning_viz.show()

# Step 2: Hyperparameter tuning with validation curves
print("Step 2: Hyperparameter tuning")
param_range = np.arange(10, 200, 20)
validation_viz = ValidationCurve(
    model,
    param_name='n_estimators',
    param_range=param_range,
    cv=cv,
    scoring='accuracy'
)
validation_viz.fit(X_train, y_train)
validation_viz.show()

# Step 3: Cross-validation score assessment
print("Step 3: Cross-validation assessment")
cv_viz = CVScores(model, cv=cv, scoring='accuracy')
cv_viz.fit(X_train, y_train)
cv_viz.show()

# Step 4: Feature importance analysis
print("Step 4: Feature importance analysis")
fi_viz = FeatureImportances(model, labels=feature_names)
fi_viz.fit(X_train, y_train)
fi_viz.show()
```
### Hyperparameter Optimization

```python
from yellowbrick.model_selection import ValidationCurve
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
import numpy as np
import matplotlib.pyplot as plt

# Compare hyperparameters across different models
models_params = [
    (SVC(), 'C', np.logspace(-3, 3, 7)),
    (RandomForestClassifier(), 'n_estimators', np.arange(10, 200, 30)),
    (LogisticRegression(), 'C', np.logspace(-3, 3, 7))
]

fig, axes = plt.subplots(1, 3, figsize=(18, 6))

for idx, (model, param_name, param_range) in enumerate(models_params):
    viz = ValidationCurve(
        model,
        param_name=param_name,
        param_range=param_range,
        cv=5,
        scoring='accuracy',
        ax=axes[idx],
        logx=(param_name == 'C')  # Use log scale for C parameter
    )
    viz.fit(X, y)
    viz.finalize()
    axes[idx].set_title(f'{model.__class__.__name__} - {param_name}')

plt.tight_layout()
plt.show()
```
### Feature Selection Pipeline

```python
from yellowbrick.model_selection import RFECV, FeatureImportances, DroppingCurve
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectKBest, f_classif

# Step 1: Initial feature importance analysis
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
fi_viz = FeatureImportances(rf_model, labels=feature_names)
fi_viz.fit(X, y)
fi_viz.show()

# Step 2: Recursive feature elimination
rfecv_viz = RFECV(rf_model, cv=5, scoring='accuracy', step=1)
rfecv_viz.fit(X, y)
rfecv_viz.show()

# Get optimal number of features
n_optimal_features = rfecv_viz.n_features_
print(f"Optimal number of features: {n_optimal_features}")

# Step 3: Feature dropping analysis
dropping_viz = DroppingCurve(rf_model, cv=5, scoring='accuracy')
dropping_viz.fit(X, y)
dropping_viz.show()
```
### Model Comparison and Selection

```python
from yellowbrick.model_selection import LearningCurve, CVScores
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
import matplotlib.pyplot as plt

# Define models to compare
models = {
    'Logistic Regression': LogisticRegression(),
    'Random Forest': RandomForestClassifier(n_estimators=100),
    'Gradient Boosting': GradientBoostingClassifier(n_estimators=100),
    'SVM': SVC()
}

# Learning curve comparison
fig, axes = plt.subplots(2, 2, figsize=(15, 12))
axes = axes.ravel()

for idx, (name, model) in enumerate(models.items()):
    viz = LearningCurve(model, cv=5, scoring='accuracy', ax=axes[idx])
    viz.fit(X, y)
    viz.finalize()
    axes[idx].set_title(f'{name} - Learning Curve')

plt.tight_layout()
plt.show()

# Cross-validation scores comparison
fig, axes = plt.subplots(2, 2, figsize=(15, 8))
axes = axes.ravel()

for idx, (name, model) in enumerate(models.items()):
    viz = CVScores(model, cv=10, scoring='accuracy', ax=axes[idx])
    viz.fit(X, y)
    viz.finalize()
    axes[idx].set_title(f'{name} - CV Scores')

plt.tight_layout()
plt.show()
```
### Advanced Hyperparameter Analysis

```python
from yellowbrick.model_selection import ValidationCurve
from sklearn.ensemble import RandomForestClassifier
import numpy as np
import matplotlib.pyplot as plt

# Multi-parameter validation curves
model = RandomForestClassifier(random_state=42)

parameters = {
    'n_estimators': np.arange(10, 200, 20),
    'max_depth': [3, 5, 7, 10, 15, 20, None],
    'min_samples_split': [2, 5, 10, 20],
    'min_samples_leaf': [1, 2, 4, 8]
}

fig, axes = plt.subplots(2, 2, figsize=(15, 12))
axes = axes.ravel()

for idx, (param_name, param_range) in enumerate(parameters.items()):
    # Handle None values in max_depth
    if param_name == 'max_depth':
        # Replace None with a large number for plotting
        plot_range = [x if x is not None else 50 for x in param_range]
        tick_labels = [str(x) if x is not None else 'None' for x in param_range]
    else:
        plot_range = param_range
        tick_labels = None

    viz = ValidationCurve(
        model,
        param_name=param_name,
        param_range=param_range,
        cv=5,
        scoring='accuracy',
        ax=axes[idx]
    )
    viz.fit(X, y)
    viz.finalize()

    if tick_labels:
        axes[idx].set_xticks(range(len(plot_range)))
        axes[idx].set_xticklabels(tick_labels)

    axes[idx].set_title(f'Validation Curve - {param_name}')

plt.tight_layout()
plt.show()
```
### Performance Monitoring

```python
from yellowbrick.model_selection import LearningCurve, CVScores
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
import numpy as np

# Generate datasets of different sizes
dataset_sizes = [100, 500, 1000, 5000]
model = RandomForestClassifier(n_estimators=100, random_state=42)

for size in dataset_sizes:
    print(f"Dataset size: {size}")

    # Generate data
    X_sim, y_sim = make_classification(
        n_samples=size,
        n_features=20,
        n_informative=15,
        n_redundant=5,
        random_state=42
    )

    # Learning curve
    learning_viz = LearningCurve(model, cv=5, scoring='accuracy')
    learning_viz.fit(X_sim, y_sim)
    learning_viz.show()

    # CV scores
    cv_viz = CVScores(model, cv=5, scoring='accuracy')
    cv_viz.fit(X_sim, y_sim)
    cv_viz.show()

    print(f"Mean CV score: {cv_viz.cv_scores_.mean():.3f} ± {cv_viz.cv_scores_.std():.3f}")
    print("-" * 50)
```