0
# Data Loading and Utilities
1
2
Built-in datasets, utility functions, and styling tools to support machine learning workflows and visualization customization. These components provide sample data for learning and testing, along with visualization theming and styling capabilities.
3
4
## Capabilities
5
6
### Dataset Loaders
7
8
A collection of real-world datasets for machine learning experimentation, covering regression, classification, and text analysis tasks.
9
10
```python { .api }
11
def load_concrete(data_home=None, return_dataset=False):
12
"""
13
Load the concrete compressive strength dataset.
14
15
Parameters:
16
- data_home: str, optional, path to data directory
17
- return_dataset: bool, return Dataset object if True
18
19
Returns:
20
tuple or Dataset: (X, y) arrays or Dataset object if return_dataset=True
21
"""
22
23
def load_energy(data_home=None, return_dataset=False):
24
"""
25
Load the energy efficiency dataset.
26
27
Parameters:
28
- data_home: str, optional, path to data directory
29
- return_dataset: bool, return Dataset object if True
30
31
Returns:
32
tuple or Dataset: (X, y) arrays or Dataset object if return_dataset=True
33
"""
34
35
def load_credit(data_home=None, return_dataset=False):
36
"""
37
Load the credit approval dataset.
38
39
Parameters:
40
- data_home: str, optional, path to data directory
41
- return_dataset: bool, return Dataset object if True
42
43
Returns:
44
tuple or Dataset: (X, y) arrays or Dataset object if return_dataset=True
45
"""
46
47
def load_occupancy(data_home=None, return_dataset=False):
48
"""
49
Load the occupancy detection dataset.
50
51
Parameters:
52
- data_home: str, optional, path to data directory
53
- return_dataset: bool, return Dataset object if True
54
55
Returns:
56
tuple or Dataset: (X, y) arrays or Dataset object if return_dataset=True
57
"""
58
59
def load_mushroom(data_home=None, return_dataset=False):
60
"""
61
Load the mushroom classification dataset.
62
63
Parameters:
64
- data_home: str, optional, path to data directory
65
- return_dataset: bool, return Dataset object if True
66
67
Returns:
68
tuple or Dataset: (X, y) arrays or Dataset object if return_dataset=True
69
"""
70
71
def load_hobbies(data_home=None):
72
"""
73
Load the hobbies text corpus.
74
75
Parameters:
76
- data_home: str, optional, path to data directory
77
78
Returns:
79
Corpus: Text corpus object with documents and metadata
80
"""
81
82
def load_game(data_home=None, return_dataset=False):
83
"""
84
Load the Connect-4 game dataset.
85
86
Parameters:
87
- data_home: str, optional, path to data directory
88
- return_dataset: bool, return Dataset object if True
89
90
Returns:
91
tuple or Dataset: (X, y) arrays or Dataset object if return_dataset=True
92
"""
93
94
def load_bikeshare(data_home=None, return_dataset=False):
95
"""
96
Load the bike sharing dataset.
97
98
Parameters:
99
- data_home: str, optional, path to data directory
100
- return_dataset: bool, return Dataset object if True
101
102
Returns:
103
tuple or Dataset: (X, y) arrays or Dataset object if return_dataset=True
104
"""
105
106
def load_spam(data_home=None, return_dataset=False):
107
"""
108
Load the email spam dataset.
109
110
Parameters:
111
- data_home: str, optional, path to data directory
112
- return_dataset: bool, return Dataset object if True
113
114
Returns:
115
tuple or Dataset: (X, y) arrays or Dataset object if return_dataset=True
116
"""
117
118
def load_walking(data_home=None, return_dataset=False):
119
"""
120
Load the walking activity dataset.
121
122
Parameters:
123
- data_home: str, optional, path to data directory
124
- return_dataset: bool, return Dataset object if True
125
126
Returns:
127
tuple or Dataset: (X, y) arrays or Dataset object if return_dataset=True
128
"""
129
130
def load_nfl(data_home=None, return_dataset=False):
131
"""
132
Load the NFL football receivers dataset.
133
134
Parameters:
135
- data_home: str, optional, path to data directory
136
- return_dataset: bool, return Dataset object if True
137
138
Returns:
139
tuple or Dataset: (X, y) arrays or Dataset object if return_dataset=True
140
"""
141
142
def get_data_home(data_home=None):
143
"""
144
Get the path to yellowbrick data directory.
145
146
Parameters:
147
- data_home: str, optional, specific data directory path
148
149
Returns:
150
str: Path to the yellowbrick data directory
151
"""
152
```
153
154
**Usage Example:**
155
156
```python
157
from yellowbrick.datasets import (
158
load_concrete, load_energy, load_credit, load_occupancy,
159
load_mushroom, load_hobbies, load_bikeshare, get_data_home
160
)
161
162
# Load regression dataset
163
concrete = load_concrete(return_dataset=True)
164
X_concrete, y_concrete = concrete.to_data()
165
print(f"Concrete dataset: {X_concrete.shape} features, {y_concrete.shape} targets")
166
print(f"Feature names: {concrete.meta['features']}")
167
168
# Load classification dataset
169
credit = load_credit(return_dataset=True)
170
X_credit, y_credit = credit.to_data()
171
print(f"Credit dataset: {X_credit.shape} features, {y_credit.shape} targets")
172
print(f"Classes: {sorted(set(y_credit.tolist()))}")
173
174
# Load text dataset
175
hobbies = load_hobbies()
176
texts, labels = hobbies.data, hobbies.target
177
print(f"Hobbies dataset: {len(texts)} documents, {len(set(labels))} categories")
178
179
# Get data directory
180
data_path = get_data_home()
181
print(f"Data directory: {data_path}")
182
```
183
184
### Style Management
185
186
Comprehensive styling system for customizing Yellowbrick visualizations, including aesthetic themes, color palettes, and matplotlib integration.
187
188
```python { .api }
189
def set_aesthetic(aesthetic='whitegrid', palette='flatui', desat=None, **kwargs):
190
"""
191
Set the aesthetic style of matplotlib and yellowbrick.
192
193
Parameters:
194
- aesthetic: str, style name ('whitegrid', 'darkgrid', 'white', 'dark', 'ticks')
195
- palette: str, color palette name
196
- desat: float, desaturation factor (0-1)
197
"""
198
199
def set_style(style='whitegrid', **kwargs):
200
"""
201
Set the matplotlib and yellowbrick plotting style.
202
203
Parameters:
204
- style: str, style name ('whitegrid', 'darkgrid', 'white', 'dark', 'ticks')
205
"""
206
207
def set_palette(palette='flatui', n_colors=None, desat=None, **kwargs):
208
"""
209
Set the color palette for yellowbrick visualizations.
210
211
Parameters:
212
- palette: str or list, palette name or color list
213
- n_colors: int, number of colors to use
214
- desat: float, desaturation factor
215
"""
216
217
def color_palette(palette=None, n_colors=None, desat=None):
218
"""
219
Return a color palette as a list of colors.
220
221
Parameters:
222
- palette: str or list, palette name or color list
223
- n_colors: int, number of colors
224
- desat: float, desaturation factor
225
226
Returns:
227
list: List of color values
228
"""
229
230
def set_color_codes(palette='flatui'):
231
"""
232
Set color codes for single-letter color specification.
233
234
Parameters:
235
- palette: str, palette name
236
"""
237
238
def reset_defaults():
239
"""
240
Reset yellowbrick and matplotlib to default settings.
241
"""
242
243
def reset_orig():
244
"""
245
Reset matplotlib to original settings (before yellowbrick import).
246
"""
247
```
248
249
**Usage Example:**
250
251
```python
252
from yellowbrick.style import (
253
set_aesthetic, set_style, set_palette, color_palette,
254
set_color_codes, reset_defaults, reset_orig
255
)
256
from yellowbrick.classifier import ROCAUC
257
from sklearn.ensemble import RandomForestClassifier
258
from sklearn.datasets import make_classification
259
import matplotlib.pyplot as plt
260
261
# Generate sample data
262
X, y = make_classification(n_samples=1000, n_classes=2, random_state=42)
263
model = RandomForestClassifier()
264
265
# Default yellowbrick style
266
set_aesthetic()
267
viz1 = ROCAUC(model, classes=['Class 0', 'Class 1'])
268
viz1.fit(X, y)
viz1.score(X, y)
269
viz1.show()
270
271
# Dark theme with custom palette
272
set_aesthetic(aesthetic='darkgrid', palette='muted')
273
viz2 = ROCAUC(model, classes=['Class 0', 'Class 1'])
274
viz2.fit(X, y)
viz2.score(X, y)
275
viz2.show()
276
277
# Custom color palette
278
custom_colors = ['#FF6B6B', '#4ECDC4', '#45B7D1', '#96CEB4', '#FFEAA7']
279
set_palette(custom_colors)
280
viz3 = ROCAUC(model, classes=['Class 0', 'Class 1'])
281
viz3.fit(X, y)
viz3.score(X, y)
282
viz3.show()
283
284
# Get current color palette
285
current_palette = color_palette()
286
print(f"Current palette: {current_palette}")
287
288
# Reset to defaults
289
reset_defaults()
290
```
291
292
### Demo Functions
293
294
Interactive demonstration functions that showcase Yellowbrick's capabilities with famous statistical datasets and visualizations.
295
296
```python { .api }
297
def anscombe():
298
"""
299
Generate Anscombe's quartet visualization demonstrating the importance
300
of data visualization in statistical analysis.
301
302
Shows four datasets with nearly identical summary statistics but
303
different distributions when visualized.
304
"""
305
306
def datasaurus():
307
"""
308
Generate the Datasaurus Dozen visualization showing multiple datasets
309
with identical summary statistics but vastly different distributions.
310
311
Demonstrates why visualization is crucial for understanding data
312
beyond summary statistics.
313
"""
314
```
315
316
**Usage Example:**
317
318
```python
319
from yellowbrick import anscombe, datasaurus
320
321
# Display Anscombe's quartet
322
print("Anscombe's Quartet - identical statistics, different patterns:")
323
anscombe()
324
325
# Display Datasaurus dozen
326
print("Datasaurus Dozen - same statistics, different shapes:")
327
datasaurus()
328
```
329
330
### Utility Constants and Types
331
332
Core utility types and constants used throughout the Yellowbrick library for consistent behavior and type checking.
333
334
```python { .api }
335
from enum import Enum
336
337
class TargetType(Enum):
338
"""
339
Enumeration of target variable types for visualization adaptation.
340
"""
341
AUTO = "auto" # Automatically determine target type
342
SINGLE = "single" # No target supplied; all points share a single color
343
DISCRETE = "discrete" # Discrete categorical values
344
CONTINUOUS = "continuous" # Continuous numerical values
345
UNKNOWN = "unknown" # Unknown or undefined type
346
347
def target_color_type(target, target_type_override=None):
348
"""
349
Determine the appropriate color mapping type for target visualization.
350
351
Parameters:
352
- target: array-like, target values
353
- target_type_override: TargetType, override automatic detection
354
355
Returns:
356
TargetType: Determined target type for coloring
357
"""
358
359
# Constants
360
MAX_DISCRETE_CLASSES = 12 # Maximum number of discrete classes for color mapping
361
```
362
363
## Usage Patterns
364
365
### Dataset Exploration Workflow
366
367
```python
368
from yellowbrick.datasets import load_concrete, load_credit, load_hobbies
369
from yellowbrick.features import Rank2D, ParallelCoordinates
370
from yellowbrick.target import ClassBalance
371
from yellowbrick.target import FeatureCorrelation
372
import matplotlib.pyplot as plt
373
374
# Regression dataset analysis
375
print("=== Concrete Dataset Analysis ===")
376
concrete = load_concrete(return_dataset=True)
377
X_concrete, y_concrete = concrete.to_data()
378
379
# Feature correlation analysis
380
corr_viz = Rank2D(features=concrete.meta["features"])
381
corr_viz.fit(X_concrete, y_concrete)
382
corr_viz.show()
383
384
# Classification dataset analysis
385
print("\n=== Credit Dataset Analysis ===")
386
credit = load_credit()
387
X_credit, y_credit = credit.data, credit.target
388
389
# Class balance analysis
390
balance_viz = ClassBalance(labels=credit.target_names)
391
balance_viz.fit(y_credit)
392
balance_viz.show()
393
394
# Parallel coordinates
395
pcoords_viz = ParallelCoordinates(classes=credit.target_names, normalize='standard')
396
pcoords_viz.fit(X_credit, y_credit)
397
pcoords_viz.show()
398
399
# Text dataset analysis
400
print("\n=== Hobbies Dataset Analysis ===")
401
hobbies = load_hobbies()
402
print(f"Number of documents: {len(hobbies.data)}")
403
print(f"Number of categories: {len(set(hobbies.target))}")
404
print(f"Categories: {hobbies.labels}")
405
```
406
407
### Custom Styling Workflow
408
409
```python
410
from yellowbrick.style import set_aesthetic, set_palette, color_palette
411
from yellowbrick.classifier import ConfusionMatrix, ROCAUC
412
from sklearn.ensemble import RandomForestClassifier
413
from sklearn.model_selection import train_test_split
414
import matplotlib.pyplot as plt
415
416
# Load data
417
from yellowbrick.datasets import load_occupancy
418
occupancy = load_occupancy()
419
X, y = occupancy.data, occupancy.target
420
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
421
422
# Create model
423
model = RandomForestClassifier(n_estimators=100, random_state=42)
424
425
# Style 1: Default yellowbrick
426
print("Default Yellowbrick Style:")
427
set_aesthetic()
428
fig, axes = plt.subplots(1, 2, figsize=(12, 5))
429
430
cm_viz1 = ConfusionMatrix(model, classes=occupancy.target_names, ax=axes[0])
431
cm_viz1.fit(X_train, y_train)
432
cm_viz1.score(X_test, y_test)
433
cm_viz1.finalize()
434
435
roc_viz1 = ROCAUC(model, classes=occupancy.target_names, ax=axes[1])
436
roc_viz1.fit(X_train, y_train)
437
roc_viz1.score(X_test, y_test)
438
roc_viz1.finalize()
439
440
plt.tight_layout()
441
plt.show()
442
443
# Style 2: Dark theme with custom colors
444
print("Dark Theme with Custom Colors:")
445
set_aesthetic(aesthetic='darkgrid', palette='viridis')
446
fig, axes = plt.subplots(1, 2, figsize=(12, 5))
447
448
cm_viz2 = ConfusionMatrix(model, classes=occupancy.target_names, ax=axes[0])
449
cm_viz2.fit(X_train, y_train)
450
cm_viz2.score(X_test, y_test)
451
cm_viz2.finalize()
452
453
roc_viz2 = ROCAUC(model, classes=occupancy.target_names, ax=axes[1])
454
roc_viz2.fit(X_train, y_train)
455
roc_viz2.score(X_test, y_test)
456
roc_viz2.finalize()
457
458
plt.tight_layout()
459
plt.show()
460
461
# Style 3: Minimal white theme
462
print("Minimal White Theme:")
463
set_aesthetic(aesthetic='white', palette='husl')
464
fig, axes = plt.subplots(1, 2, figsize=(12, 5))
465
466
cm_viz3 = ConfusionMatrix(model, classes=occupancy.target_names, ax=axes[0])
467
cm_viz3.fit(X_train, y_train)
468
cm_viz3.score(X_test, y_test)
469
cm_viz3.finalize()
470
471
roc_viz3 = ROCAUC(model, classes=occupancy.target_names, ax=axes[1])
472
roc_viz3.fit(X_train, y_train)
473
roc_viz3.score(X_test, y_test)
474
roc_viz3.finalize()
475
476
plt.tight_layout()
477
plt.show()
478
```
479
480
### Educational Demo Usage
481
482
```python
483
from yellowbrick import anscombe, datasaurus
484
from yellowbrick.style import set_aesthetic
485
import matplotlib.pyplot as plt
486
487
# Set up educational styling
488
set_aesthetic(aesthetic='whitegrid', palette='Set2')
489
490
# Demonstrate the importance of visualization
491
print("Educational Demonstrations:")
492
print("\n1. Anscombe's Quartet:")
493
print(" Four datasets with identical statistical properties but different patterns")
494
anscombe()
495
496
print("\n2. Datasaurus Dozen:")
497
print(" Multiple datasets with same summary statistics but different shapes")
498
datasaurus()
499
500
# Additional educational content
501
print("\n3. Why these demos matter:")
502
print(" - Summary statistics can be misleading")
503
print(" - Visualization reveals hidden patterns")
504
print(" - Always plot your data before analysis")
505
print(" - Different distributions can have identical means, variances, and correlations")
506
```
507
508
### Data Management Utilities
509
510
```python
511
from yellowbrick.datasets import get_data_home
512
from yellowbrick.utils.target import target_color_type, TargetType, MAX_DISCRETE_CLASSES
513
import os
514
import numpy as np
515
516
# Data directory management
517
data_home = get_data_home()
518
print(f"Yellowbrick data directory: {data_home}")
519
print(f"Directory exists: {os.path.exists(data_home)}")
520
521
if os.path.exists(data_home):
522
print(f"Directory contents: {os.listdir(data_home)}")
523
524
# Target type determination examples
525
print(f"\nTarget Type Analysis:")
526
527
# Continuous target
528
continuous_target = np.random.normal(0, 1, 100)
529
target_type_cont = target_color_type(continuous_target)
530
print(f"Continuous target type: {target_type_cont}")
531
532
# Discrete target with few classes
533
discrete_target = np.random.choice([0, 1, 2], 100)
534
target_type_disc = target_color_type(discrete_target)
535
print(f"Discrete target type: {target_type_disc}")
536
537
# Discrete target with many classes
538
many_classes = np.random.choice(range(20), 100)
539
target_type_many = target_color_type(many_classes)
540
print(f"Many classes target type: {target_type_many}")
541
542
print(f"Maximum discrete classes: {MAX_DISCRETE_CLASSES}")
543
544
# Override target type
545
target_type_override = target_color_type(continuous_target, TargetType.DISCRETE)
546
print(f"Overridden target type: {target_type_override}")
547
```
548
549
### Integration with External Data
550
551
```python
552
from yellowbrick.datasets import load_concrete
553
from yellowbrick.features import PCA, Rank2D
554
from yellowbrick.regressor import ResidualsPlot
555
from sklearn.ensemble import RandomForestRegressor
556
from sklearn.model_selection import train_test_split
557
import pandas as pd
558
559
# Load yellowbrick dataset
560
concrete = load_concrete(return_dataset=True)
560
X, y = concrete.to_data()
561
562
# Convert to pandas for easier manipulation
563
df = pd.DataFrame(X, columns=concrete.meta["features"])
564
df['target'] = y
565
566
print("Dataset Information:")
567
print(f"Shape: {df.shape}")
568
print(f"Features: {list(df.columns[:-1])}")
569
print(f"Target: {df.columns[-1]}")
570
print("\nDataset statistics:")
571
print(df.describe())
572
573
# Feature analysis
574
rank2d_viz = Rank2D(features=concrete.meta["features"])
576
rank2d_viz.fit(X, y)
577
rank2d_viz.show()
578
579
# PCA analysis
580
pca_viz = PCA(scale=True, proj_features=True)
581
pca_viz.fit(X, y)
582
pca_viz.show()
583
584
# Model evaluation
585
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
586
model = RandomForestRegressor(n_estimators=100, random_state=42)
587
588
residuals_viz = ResidualsPlot(model)
589
residuals_viz.fit(X_train, y_train)
590
residuals_viz.score(X_test, y_test)
591
residuals_viz.show()
592
593
print(f"\nModel R² Score: {model.score(X_test, y_test):.3f}")
594
```