A suite of visual analysis and diagnostic tools for machine learning.
Built-in datasets, utility functions, and styling tools to support machine learning workflows and visualization customization. These components provide sample data for learning and testing, along with visualization theming and styling capabilities.
Collection of real-world datasets for machine learning experimentation, covering various domains including regression, classification, and text analysis tasks.
def load_concrete(data_home=None, return_dataset=False):
"""
Load the concrete compressive strength dataset.
Parameters:
- data_home: str, optional, path to data directory
- return_dataset: bool, return Dataset object if True
Returns:
tuple or Dataset: (X, y) arrays or Dataset object if return_dataset=True
"""
def load_energy(data_home=None, return_dataset=False):
"""
Load the energy efficiency dataset.
Parameters:
- data_home: str, optional, path to data directory
- return_dataset: bool, return Dataset object if True
Returns:
tuple or Dataset: (X, y) arrays or Dataset object if return_dataset=True
"""
def load_credit(data_home=None, return_dataset=False):
"""
Load the credit approval dataset.
Parameters:
- data_home: str, optional, path to data directory
- return_dataset: bool, return Dataset object if True
Returns:
tuple or Dataset: (X, y) arrays or Dataset object if return_dataset=True
"""
def load_occupancy(data_home=None, return_dataset=False):
"""
Load the occupancy detection dataset.
Parameters:
- data_home: str, optional, path to data directory
- return_dataset: bool, return Dataset object if True
Returns:
tuple or Dataset: (X, y) arrays or Dataset object if return_dataset=True
"""
def load_mushroom(data_home=None, return_dataset=False):
"""
Load the mushroom classification dataset.
Parameters:
- data_home: str, optional, path to data directory
- return_dataset: bool, return Dataset object if True
Returns:
tuple or Dataset: (X, y) arrays or Dataset object if return_dataset=True
"""
def load_hobbies(data_home=None):
"""
Load the hobbies text corpus.
Parameters:
- data_home: str, optional, path to data directory
Returns:
Corpus: Text corpus object with documents and metadata
"""
def load_game(data_home=None, return_dataset=False):
"""
Load the Connect-4 game dataset.
Parameters:
- data_home: str, optional, path to data directory
- return_dataset: bool, return Dataset object if True
Returns:
tuple or Dataset: (X, y) arrays or Dataset object if return_dataset=True
"""
def load_bikeshare(data_home=None, return_dataset=False):
"""
Load the bike sharing dataset.
Parameters:
- data_home: str, optional, path to data directory
- return_dataset: bool, return Dataset object if True
Returns:
tuple or Dataset: (X, y) arrays or Dataset object if return_dataset=True
"""
def load_spam(data_home=None, return_dataset=False):
"""
Load the email spam dataset.
Parameters:
- data_home: str, optional, path to data directory
- return_dataset: bool, return Dataset object if True
Returns:
tuple or Dataset: (X, y) arrays or Dataset object if return_dataset=True
"""
def load_walking(data_home=None, return_dataset=False):
"""
Load the walking activity dataset.
Parameters:
- data_home: str, optional, path to data directory
- return_dataset: bool, return Dataset object if True
Returns:
tuple or Dataset: (X, y) arrays or Dataset object if return_dataset=True
"""
def load_nfl(data_home=None, return_dataset=False):
"""
Load the NFL football receivers dataset.
Parameters:
- data_home: str, optional, path to data directory
- return_dataset: bool, return Dataset object if True
Returns:
tuple or Dataset: (X, y) arrays or Dataset object if return_dataset=True
"""
def get_data_home(data_home=None):
"""
Get the path to yellowbrick data directory.
Parameters:
- data_home: str, optional, specific data directory path
Returns:
str: Path to the yellowbrick data directory
"""

Usage Example:
from yellowbrick.datasets import (
    load_concrete, load_energy, load_credit, load_occupancy,
    load_mushroom, load_hobbies, load_bikeshare, get_data_home
)

# Load regression dataset.
# With the default return_dataset=False the loaders return an (X, y) pair,
# not a Dataset object, so unpack the result directly.
X_concrete, y_concrete = load_concrete()
print(f"Concrete dataset: {X_concrete.shape} features, {y_concrete.shape} targets")
# NOTE(review): assumes X is a pandas DataFrame (pandas installed); a plain
# array carries no column labels.
print(f"Feature names: {list(X_concrete.columns)}")

# Load classification dataset
X_credit, y_credit = load_credit()
print(f"Credit dataset: {X_credit.shape} features, {y_credit.shape} targets")
# The (X, y) pair carries no class names; report the distinct target values.
print(f"Classes: {sorted(set(y_credit))}")

# Load text dataset (load_hobbies returns a Corpus object, not an (X, y) pair)
hobbies = load_hobbies()
texts, labels = hobbies.data, hobbies.target
print(f"Hobbies dataset: {len(texts)} documents, {len(set(labels))} categories")

# Get data directory
data_path = get_data_home()
print(f"Data directory: {data_path}")

Comprehensive styling system for customizing Yellowbrick visualizations, including aesthetic themes, color palettes, and matplotlib integration.
def set_aesthetic(aesthetic='whitegrid', palette='flatui', desat=None, **kwargs):
"""
Set the aesthetic style of matplotlib and yellowbrick.
Parameters:
- aesthetic: str, style name ('whitegrid', 'darkgrid', 'white', 'dark', 'ticks')
- palette: str, color palette name
- desat: float, desaturation factor (0-1)
"""
def set_style(style='whitegrid', **kwargs):
"""
Set the matplotlib and yellowbrick plotting style.
Parameters:
- style: str, style name ('whitegrid', 'darkgrid', 'white', 'dark', 'ticks')
"""
def set_palette(palette='flatui', n_colors=None, desat=None, **kwargs):
"""
Set the color palette for yellowbrick visualizations.
Parameters:
- palette: str or list, palette name or color list
- n_colors: int, number of colors to use
- desat: float, desaturation factor
"""
def color_palette(palette=None, n_colors=None, desat=None):
"""
Return a color palette as a list of colors.
Parameters:
- palette: str or list, palette name or color list
- n_colors: int, number of colors
- desat: float, desaturation factor
Returns:
list: List of color values
"""
def set_color_codes(palette='flatui'):
"""
Set color codes for single-letter color specification.
Parameters:
- palette: str, palette name
"""
def reset_defaults():
"""
Reset yellowbrick and matplotlib to default settings.
"""
def reset_orig():
"""
Reset matplotlib to original settings (before yellowbrick import).
"""

Usage Example:
from yellowbrick.style import (
    set_aesthetic, set_style, set_palette, color_palette,
    set_color_codes, reset_defaults, reset_orig
)
from yellowbrick.classifier import ROCAUC
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification
import matplotlib.pyplot as plt

# Generate sample data
X, y = make_classification(n_samples=1000, n_classes=2, random_state=42)
model = RandomForestClassifier()

# Default yellowbrick style
set_aesthetic()
viz1 = ROCAUC(model, classes=['Class 0', 'Class 1'])
viz1.fit(X, y)
# score() is the step that computes and draws the ROC curves; without it
# show() renders empty axes. Scoring on the training data is only acceptable
# here because the example demonstrates styling, not model evaluation.
viz1.score(X, y)
viz1.show()

# Dark theme with custom palette
set_aesthetic(aesthetic='darkgrid', palette='muted')
viz2 = ROCAUC(model, classes=['Class 0', 'Class 1'])
viz2.fit(X, y)
viz2.score(X, y)
viz2.show()

# Custom color palette
custom_colors = ['#FF6B6B', '#4ECDC4', '#45B7D1', '#96CEB4', '#FFEAA7']
set_palette(custom_colors)
viz3 = ROCAUC(model, classes=['Class 0', 'Class 1'])
viz3.fit(X, y)
viz3.score(X, y)
viz3.show()

# Get current color palette
current_palette = color_palette()
print(f"Current palette: {current_palette}")

# Reset to defaults
reset_defaults()

Interactive demonstration functions that showcase Yellowbrick's capabilities with famous statistical datasets and visualizations.
def anscombe():
"""
Generate Anscombe's quartet visualization demonstrating the importance
of data visualization in statistical analysis.
Shows four datasets with identical statistical properties but
different distributions when visualized.
"""
def datasaurus():
"""
Generate the Datasaurus Dozen visualization showing multiple datasets
with identical summary statistics but vastly different distributions.
Demonstrates why visualization is crucial for understanding data
beyond summary statistics.
"""

Usage Example:
from yellowbrick import anscombe, datasaurus
# Display Anscombe's quartet
print("Anscombe's Quartet - identical statistics, different patterns:")
anscombe()
# Display Datasaurus dozen
print("Datasaurus Dozen - same statistics, different shapes:")
datasaurus()

Core utility types and constants used throughout the Yellowbrick library for consistent behavior and type checking.
from enum import Enum
class TargetType(Enum):
"""
Enumeration of target variable types for visualization adaptation.
"""
AUTO = "auto" # Let the visualizer determine the target type from the data
SINGLE = "single" # NOTE(review): presumably "no target supplied, draw everything in one color" rather than a continuous value - confirm against yellowbrick.utils.target
DISCRETE = "discrete" # Discrete categorical values (class labels)
CONTINUOUS = "continuous" # Continuous numerical values
UNKNOWN = "unknown" # Unknown or undefined type
def target_color_type(target, target_type_override=None):
"""
Determine the appropriate color mapping type for target visualization.
Parameters:
- target: array-like, target values
- target_type_override: TargetType, override automatic detection
Returns:
TargetType: Determined target type for coloring
"""
# Constants
MAX_DISCRETE_CLASSES = 12 # Maximum number of discrete classes for color mapping

from yellowbrick.datasets import load_concrete, load_credit, load_hobbies
from yellowbrick.features import Rank2D, ParallelCoordinates
# ClassBalance is used below with target values only (fit(y)), which is the
# interface of the visualizer documented under yellowbrick.target.
from yellowbrick.target import ClassBalance, FeatureCorrelation
import matplotlib.pyplot as plt

# Regression dataset analysis
print("=== Concrete Dataset Analysis ===")
# With the default return_dataset=False the loader returns an (X, y) pair.
X_concrete, y_concrete = load_concrete()

# Feature correlation analysis
# NOTE(review): assumes X is a pandas DataFrame (pandas installed) so that
# column labels are available.
corr_viz = Rank2D(features=list(X_concrete.columns))
corr_viz.fit(X_concrete, y_concrete)
# The ranking is computed and drawn by transform(); fit() alone plots nothing.
corr_viz.transform(X_concrete)
corr_viz.show()

# Classification dataset analysis
print("\n=== Credit Dataset Analysis ===")
X_credit, y_credit = load_credit()

# Class balance analysis (the (X, y) pair carries no class names, so the
# labels are left to be inferred from y)
balance_viz = ClassBalance()
balance_viz.fit(y_credit)
balance_viz.show()

# Parallel coordinates
pcoords_viz = ParallelCoordinates(normalize='standard')
pcoords_viz.fit_transform(X_credit, y_credit)
pcoords_viz.show()

# Text dataset analysis
print("\n=== Hobbies Dataset Analysis ===")
hobbies = load_hobbies()
# Read the targets once; the category list is derived from them because the
# corpus attributes used elsewhere in this document are only data and target.
hobby_targets = hobbies.target
print(f"Number of documents: {len(hobbies.data)}")
print(f"Number of categories: {len(set(hobby_targets))}")
print(f"Categories: {sorted(set(hobby_targets))}")

from yellowbrick.style import set_aesthetic, set_palette, color_palette
from yellowbrick.classifier import ConfusionMatrix, ROCAUC
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt

# Load data
from yellowbrick.datasets import load_occupancy

# With the default return_dataset=False the loader returns an (X, y) pair.
X, y = load_occupancy()
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Display names for the two target values.
# NOTE(review): assumes target 0 = unoccupied and 1 = occupied; confirm
# against the dataset README.
classes = ["unoccupied", "occupied"]

# Create model
model = RandomForestClassifier(n_estimators=100, random_state=42)

# Each entry is (heading to print, keyword arguments for set_aesthetic).
# The same pair of plots is rendered under every style so that only the
# styling differs between figures.
styles = [
    ("Default Yellowbrick Style:", {}),
    ("Dark Theme with Custom Colors:", {"aesthetic": "darkgrid", "palette": "viridis"}),
    ("Minimal White Theme:", {"aesthetic": "white", "palette": "husl"}),
]

for heading, style_kwargs in styles:
    print(heading)
    set_aesthetic(**style_kwargs)
    fig, axes = plt.subplots(1, 2, figsize=(12, 5))

    # Confusion matrix on the left axes
    cm_viz = ConfusionMatrix(model, classes=classes, ax=axes[0])
    cm_viz.fit(X_train, y_train)
    cm_viz.score(X_test, y_test)
    cm_viz.finalize()

    # ROC curves on the right axes
    roc_viz = ROCAUC(model, classes=classes, ax=axes[1])
    roc_viz.fit(X_train, y_train)
    roc_viz.score(X_test, y_test)
    roc_viz.finalize()

    plt.tight_layout()
    plt.show()

from yellowbrick import anscombe, datasaurus
from yellowbrick.style import set_aesthetic
import matplotlib.pyplot as plt
# Set up educational styling
set_aesthetic(aesthetic='whitegrid', palette='Set2')
# Demonstrate the importance of visualization
print("Educational Demonstrations:")
print("\n1. Anscombe's Quartet:")
print(" Four datasets with identical statistical properties but different patterns")
anscombe()
print("\n2. Datasaurus Dozen:")
print(" Multiple datasets with same summary statistics but different shapes")
datasaurus()
# Additional educational content
print("\n3. Why these demos matter:")
print(" - Summary statistics can be misleading")
print(" - Visualization reveals hidden patterns")
print(" - Always plot your data before analysis")
print(" - Different distributions can have identical means, variances, and correlations")

from yellowbrick.datasets import get_data_home
from yellowbrick.utils.target import target_color_type, TargetType, MAX_DISCRETE_CLASSES
import os
import numpy as np

# Data directory management
data_home = get_data_home()
print(f"Yellowbrick data directory: {data_home}")
# Check the path once and reuse the answer so the printed status and the
# decision to list the directory cannot disagree.
data_home_exists = os.path.exists(data_home)
print(f"Directory exists: {data_home_exists}")
if data_home_exists:
    print(f"Directory contents: {os.listdir(data_home)}")

# Target type determination examples
print("\nTarget Type Analysis:")

# Continuous target
continuous_target = np.random.normal(0, 1, 100)
target_type_cont = target_color_type(continuous_target)
print(f"Continuous target type: {target_type_cont}")

# Discrete target with few classes
discrete_target = np.random.choice([0, 1, 2], 100)
target_type_disc = target_color_type(discrete_target)
print(f"Discrete target type: {target_type_disc}")

# Discrete target with many classes (20 distinct values, more than
# MAX_DISCRETE_CLASSES)
many_classes = np.random.choice(range(20), 100)
target_type_many = target_color_type(many_classes)
print(f"Many classes target type: {target_type_many}")
print(f"Maximum discrete classes: {MAX_DISCRETE_CLASSES}")

# Override target type
target_type_override = target_color_type(continuous_target, TargetType.DISCRETE)
print(f"Overridden target type: {target_type_override}")

from yellowbrick.datasets import load_concrete
from yellowbrick.features import PCA, Rank2D
from yellowbrick.regressor import ResidualsPlot
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
import pandas as pd

# Load yellowbrick dataset.
# With the default return_dataset=False the loader returns an (X, y) pair.
X, y = load_concrete()

# Convert to pandas for easier manipulation. The explicit copy keeps the
# added 'target' column out of X, which is still used as the feature matrix.
df = pd.DataFrame(X).copy()
feature_names = list(df.columns)
df['target'] = y
print("Dataset Information:")
print(f"Shape: {df.shape}")
print(f"Features: {list(df.columns[:-1])}")
print(f"Target: {df.columns[-1]}")
print("\nDataset statistics:")
print(df.describe())

# Feature analysis
rank2d_viz = Rank2D(features=feature_names)
rank2d_viz.fit(X, y)
# The ranking is computed and drawn by transform(); fit() alone plots nothing.
rank2d_viz.transform(X)
rank2d_viz.show()

# PCA analysis
pca_viz = PCA(scale=True, proj_features=True)
# The projection is drawn when the data is transformed, so fit and transform
# in one step before showing.
pca_viz.fit_transform(X, y)
pca_viz.show()

# Model evaluation
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
model = RandomForestRegressor(n_estimators=100, random_state=42)
residuals_viz = ResidualsPlot(model)
residuals_viz.fit(X_train, y_train)
residuals_viz.score(X_test, y_test)
residuals_viz.show()
print(f"\nModel R² Score: {model.score(X_test, y_test):.3f}")

Install with Tessl CLI
npx tessl i tessl/pypi-yellowbrick