A suite of visual analysis and diagnostic tools for machine learning.
Visualizers for model selection, hyperparameter tuning, and performance evaluation that guide the machine learning development process: they help assess model performance, validate model assumptions, and optimize model parameters.
Learning curve analysis to evaluate model performance as a function of training set size, helping identify underfitting, overfitting, and optimal dataset size requirements.
class LearningCurve(ModelVisualizer):
"""
Learning curve visualizer for model performance analysis.
Parameters:
- estimator: scikit-learn estimator
- cv: int or cross-validation generator, cross-validation strategy
- scoring: str, scoring metric for evaluation
- train_sizes: array-like, training set sizes to evaluate
- n_jobs: int, number of parallel jobs
- random_state: int, random state for reproducibility
"""
def __init__(self, estimator, cv=None, scoring=None, train_sizes=None, n_jobs=None, random_state=None, **kwargs): ...
def fit(self, X, y, **kwargs): ...
def show(self, **kwargs): ...
def learning_curve(estimator, X, y, cv=None, scoring=None, **kwargs):
"""
Functional API for learning curve visualization.
Parameters:
- estimator: scikit-learn estimator
- X: feature matrix
- y: target vector
- cv: int or cross-validation generator
- scoring: str, scoring metric
Returns:
LearningCurve visualizer instance
"""Usage Example:
from yellowbrick.model_selection import LearningCurve, learning_curve
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import StratifiedKFold
# Class-based API (X and y are assumed to be a pre-loaded feature matrix and target vector)
model = RandomForestClassifier()
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
visualizer = LearningCurve(model, cv=cv, scoring='accuracy', n_jobs=4)
visualizer.fit(X, y)
visualizer.show()
# Functional API
learning_curve(model, X, y, cv=5, scoring='f1_macro')

Validation curve analysis for hyperparameter tuning, showing model performance across different parameter values to identify optimal parameter ranges.
class ValidationCurve(ModelVisualizer):
"""
Validation curve visualizer for hyperparameter tuning.
Parameters:
- estimator: scikit-learn estimator
- param_name: str, parameter name to vary
- param_range: array-like, parameter values to test
- cv: int or cross-validation generator
- scoring: str, scoring metric
- n_jobs: int, number of parallel jobs
- logx: bool, whether to use log scale for parameter axis
"""
def __init__(self, estimator, param_name, param_range, cv=None, scoring=None, n_jobs=None, logx=False, **kwargs): ...
def fit(self, X, y, **kwargs): ...
def show(self, **kwargs): ...
def validation_curve(estimator, X, y, param_name, param_range, cv=None, scoring=None, **kwargs):
"""
Functional API for validation curve visualization.
Parameters:
- estimator: scikit-learn estimator
- X: feature matrix
- y: target vector
- param_name: str, parameter name
- param_range: array-like, parameter values
- cv: int or cross-validation generator
- scoring: str, scoring metric
Returns:
ValidationCurve visualizer instance
"""Usage Example:
from yellowbrick.model_selection import ValidationCurve, validation_curve
from sklearn.ensemble import RandomForestClassifier
import numpy as np
# Parameter range for n_estimators
param_range = np.arange(10, 200, 20)
# Class-based API (X and y assumed already loaded)
model = RandomForestClassifier()
visualizer = ValidationCurve(
model,
param_name='n_estimators',
param_range=param_range,
cv=5,
scoring='accuracy',
n_jobs=4
)
visualizer.fit(X, y)
visualizer.show()
# Functional API with log scale
validation_curve(model, X, y, param_name='max_depth', param_range=[1, 2, 4, 8, 16, 32], logx=True)

Cross-validation score visualization for model evaluation, showing score distributions across different folds to assess model stability and performance variance.
class CVScores(ModelVisualizer):
"""
Cross-validation scores visualizer.
Parameters:
- estimator: scikit-learn estimator
- cv: int or cross-validation generator
- scoring: str, scoring metric
"""
def __init__(self, estimator, cv=None, scoring=None, **kwargs): ...
def fit(self, X, y, **kwargs): ...
def show(self, **kwargs): ...
def cv_scores(estimator, X, y, cv=None, scoring=None, **kwargs):
"""
Functional API for cross-validation scores visualization.
Parameters:
- estimator: scikit-learn estimator
- X: feature matrix
- y: target vector
- cv: int or cross-validation generator
- scoring: str, scoring metric
Returns:
CVScores visualizer instance
"""Feature dropping curve analysis to understand the impact of removing features on model performance, helping identify the minimum viable feature set.
Feature dropping curve analysis to understand the impact of removing features on model performance, helping identify the minimum viable feature set.
class DroppingCurve(ModelVisualizer):
"""
Feature dropping curve visualizer.
Parameters:
- estimator: scikit-learn estimator
- cv: int or cross-validation generator
- scoring: str, scoring metric
"""
def __init__(self, estimator, cv=None, scoring=None, **kwargs): ...
def fit(self, X, y, **kwargs): ...
def show(self, **kwargs): ...
def dropping_curve(estimator, X, y, cv=None, scoring=None, **kwargs):
"""
Functional API for dropping curve visualization.
Parameters:
- estimator: scikit-learn estimator
- X: feature matrix
- y: target vector
- cv: int or cross-validation generator
- scoring: str, scoring metric
Returns:
DroppingCurve visualizer instance
"""Feature importance visualization for tree-based models, showing the relative contribution of each feature to model predictions.
Feature importance visualization for models that expose feature_importances_ or coef_, showing the relative contribution of each feature to model predictions.
class FeatureImportances(ModelVisualizer):
"""
Feature importances visualizer for models exposing feature_importances_ or coef_.
Parameters:
- estimator: scikit-learn estimator with a feature_importances_ or coef_ attribute
- labels: list, feature labels for display
- relative: bool, whether to show relative importance (percentages)
- absolute: bool, whether to show absolute importance values
- xlabel: str, x-axis label
- ylabel: str, y-axis label
"""
def __init__(self, estimator, labels=None, relative=True, absolute=False, xlabel=None, ylabel=None, **kwargs): ...
def fit(self, X, y, **kwargs): ...
def show(self, **kwargs): ...
def feature_importances(estimator, X, y, labels=None, **kwargs):
"""
Functional API for feature importances visualization.
Parameters:
- estimator: scikit-learn estimator
- X: feature matrix
- y: target vector
- labels: list, feature labels
Returns:
FeatureImportances visualizer instance
"""Recursive Feature Elimination with Cross-Validation (RFECV) for systematic feature selection using model performance feedback.
Recursive Feature Elimination with Cross-Validation (RFECV) for systematic feature selection using model performance feedback.
class RFECV(ModelVisualizer):
"""
Recursive Feature Elimination with Cross-Validation visualizer.
Parameters:
- estimator: scikit-learn estimator
- cv: int or cross-validation generator
- scoring: str, scoring metric
- step: int or float, number of features to remove at each step
- groups: array-like, group labels for group cross-validation
"""
def __init__(self, estimator, cv=None, scoring=None, step=1, groups=None, **kwargs): ...
def fit(self, X, y, **kwargs): ...
def show(self, **kwargs): ...
def rfecv(estimator, X, y, cv=None, scoring=None, **kwargs):
"""
Functional API for RFECV visualization.
Parameters:
- estimator: scikit-learn estimator
- X: feature matrix
- y: target vector
- cv: int or cross-validation generator
- scoring: str, scoring metric
Returns:
RFECV visualizer instance
"""from yellowbrick.model_selection import LearningCurve, ValidationCurve, CVScores, FeatureImportances
Usage Example — a complete model selection workflow (X, y, and feature_names are assumed already loaded):
from yellowbrick.model_selection import LearningCurve, ValidationCurve, CVScores, FeatureImportances
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, StratifiedKFold
import numpy as np
# Prepare data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
# Step 1: Learning curve analysis
print("Step 1: Learning curve analysis")
model = RandomForestClassifier(n_estimators=100, random_state=42)
learning_viz = LearningCurve(model, cv=cv, scoring='accuracy', n_jobs=4)
learning_viz.fit(X_train, y_train)
learning_viz.show()
# Step 2: Hyperparameter tuning with validation curves
print("Step 2: Hyperparameter tuning")
param_range = np.arange(10, 200, 20)
validation_viz = ValidationCurve(
model,
param_name='n_estimators',
param_range=param_range,
cv=cv,
scoring='accuracy'
)
validation_viz.fit(X_train, y_train)
validation_viz.show()
# Step 3: Cross-validation score assessment
print("Step 3: Cross-validation assessment")
cv_viz = CVScores(model, cv=cv, scoring='accuracy')
cv_viz.fit(X_train, y_train)
cv_viz.show()
# Step 4: Feature importance analysis
print("Step 4: Feature importance analysis")
fi_viz = FeatureImportances(model, labels=feature_names)
fi_viz.fit(X_train, y_train)
fi_viz.show()

Usage Example — comparing a key hyperparameter across different model families (X and y assumed already loaded):
from yellowbrick.model_selection import ValidationCurve
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
import numpy as np
import matplotlib.pyplot as plt
# Compare hyperparameters across different models
models_params = [
(SVC(), 'C', np.logspace(-3, 3, 7)),
(RandomForestClassifier(), 'n_estimators', np.arange(10, 200, 30)),
(LogisticRegression(), 'C', np.logspace(-3, 3, 7))
]
fig, axes = plt.subplots(1, 3, figsize=(18, 6))
for idx, (model, param_name, param_range) in enumerate(models_params):
viz = ValidationCurve(
model,
param_name=param_name,
param_range=param_range,
cv=5,
scoring='accuracy',
ax=axes[idx],
logx=(param_name == 'C') # Use log scale for C parameter
)
viz.fit(X, y)
viz.finalize()
axes[idx].set_title(f'{model.__class__.__name__} - {param_name}')
plt.tight_layout()
plt.show()

Usage Example — a feature selection workflow combining importance ranking, recursive elimination, and dropping-curve analysis (X, y, and feature_names assumed already loaded):
from yellowbrick.model_selection import RFECV, FeatureImportances, DroppingCurve
from sklearn.ensemble import RandomForestClassifier
# Step 1: Initial feature importance analysis
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
fi_viz = FeatureImportances(rf_model, labels=feature_names)
fi_viz.fit(X, y)
fi_viz.show()
# Step 2: Recursive feature elimination
rfecv_viz = RFECV(rf_model, cv=5, scoring='accuracy', step=1)
rfecv_viz.fit(X, y)
rfecv_viz.show()
# Get optimal number of features
n_optimal_features = rfecv_viz.n_features_
print(f"Optimal number of features: {n_optimal_features}")
# Step 3: Feature dropping analysis
dropping_viz = DroppingCurve(rf_model, cv=5, scoring='accuracy')
dropping_viz.fit(X, y)
dropping_viz.show()

Usage Example — side-by-side model comparison with learning curves and cross-validation scores (X and y assumed already loaded):
from yellowbrick.model_selection import LearningCurve, CVScores
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
import matplotlib.pyplot as plt
# Define models to compare
models = {
'Logistic Regression': LogisticRegression(),
'Random Forest': RandomForestClassifier(n_estimators=100),
'Gradient Boosting': GradientBoostingClassifier(n_estimators=100),
'SVM': SVC()
}
# Learning curve comparison
fig, axes = plt.subplots(2, 2, figsize=(15, 12))
axes = axes.ravel()
for idx, (name, model) in enumerate(models.items()):
viz = LearningCurve(model, cv=5, scoring='accuracy', ax=axes[idx])
viz.fit(X, y)
viz.finalize()
axes[idx].set_title(f'{name} - Learning Curve')
plt.tight_layout()
plt.show()
# Cross-validation scores comparison
fig, axes = plt.subplots(2, 2, figsize=(15, 8))
axes = axes.ravel()
for idx, (name, model) in enumerate(models.items()):
viz = CVScores(model, cv=10, scoring='accuracy', ax=axes[idx])
viz.fit(X, y)
viz.finalize()
axes[idx].set_title(f'{name} - CV Scores')
plt.tight_layout()
plt.show()

Usage Example — validation curves for several hyperparameters of a single model (X and y assumed already loaded):
from yellowbrick.model_selection import ValidationCurve
from sklearn.ensemble import RandomForestClassifier
import numpy as np
import matplotlib.pyplot as plt
# Multi-parameter validation curves
model = RandomForestClassifier(random_state=42)
parameters = {
    'n_estimators': np.arange(10, 200, 20),
    'max_depth': [3, 5, 7, 10, 15, 20],  # None omitted: the curve is plotted on a numeric parameter axis
    'min_samples_split': [2, 5, 10, 20],
    'min_samples_leaf': [1, 2, 4, 8]
}
fig, axes = plt.subplots(2, 2, figsize=(15, 12))
axes = axes.ravel()
for idx, (param_name, param_range) in enumerate(parameters.items()):
viz = ValidationCurve(
model,
param_name=param_name,
param_range=param_range,
cv=5,
scoring='accuracy',
ax=axes[idx]
)
viz.fit(X, y)
viz.finalize()
axes[idx].set_title(f'Validation Curve - {param_name}')
plt.tight_layout()
plt.show()

Usage Example — evaluating model stability across datasets of increasing size, using synthetic data:
from yellowbrick.model_selection import LearningCurve, CVScores
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
import numpy as np
# Generate datasets of different sizes
dataset_sizes = [100, 500, 1000, 5000]
model = RandomForestClassifier(n_estimators=100, random_state=42)
for size in dataset_sizes:
print(f"Dataset size: {size}")
# Generate data
X_sim, y_sim = make_classification(
n_samples=size,
n_features=20,
n_informative=15,
n_redundant=5,
random_state=42
)
# Learning curve
learning_viz = LearningCurve(model, cv=5, scoring='accuracy')
learning_viz.fit(X_sim, y_sim)
learning_viz.show()
# CV scores
cv_viz = CVScores(model, cv=5, scoring='accuracy')
cv_viz.fit(X_sim, y_sim)
cv_viz.show()
print(f"Mean CV score: {cv_viz.cv_scores_.mean():.3f} ± {cv_viz.cv_scores_.std():.3f}")
print("-" * 50)Install with Tessl CLI
npx tessl i tessl/pypi-yellowbrick