Tessl Tile for pypi/yellowbrick@1.5.0

or run

npx @tessl/cli init

Version

Tile

Overview

Evals

Files

docs

classification.md clustering.md data-utilities.md features.md index.md model-selection.md regression.md text.md

docs/data-utilities.md

0
# Data Loading and Utilities
1

2
Built-in datasets, utility functions, and styling tools to support machine learning workflows and visualization customization. These components provide sample data for learning and testing, along with visualization theming and styling capabilities.
3

4
## Capabilities
5

6
### Dataset Loaders
7

8
Collection of real-world datasets for machine learning experimentation, covering various domains including regression, classification, and text analysis tasks.
9

10
```python { .api }
11
def load_concrete(data_home=None, return_dataset=False):
12
    """
13
    Load the concrete compressive strength dataset.
14
    
15
    Parameters:
16
    - data_home: str, optional, path to data directory
17
    - return_dataset: bool, return Dataset object if True
18
    
19
    Returns:
20
    tuple or Dataset: (X, y) arrays or Dataset object if return_dataset=True
21
    """
22

23
def load_energy(data_home=None, return_dataset=False):
24
    """
25
    Load the energy efficiency dataset.
26
    
27
    Parameters:
28
    - data_home: str, optional, path to data directory
29
    - return_dataset: bool, return Dataset object if True
30
    
31
    Returns:
32
    tuple or Dataset: (X, y) arrays or Dataset object if return_dataset=True
33
    """
34

35
def load_credit(data_home=None, return_dataset=False):
36
    """
37
    Load the credit approval dataset.
38
    
39
    Parameters:
40
    - data_home: str, optional, path to data directory
41
    - return_dataset: bool, return Dataset object if True
42
    
43
    Returns:
44
    tuple or Dataset: (X, y) arrays or Dataset object if return_dataset=True
45
    """
46

47
def load_occupancy(data_home=None, return_dataset=False):
48
    """
49
    Load the occupancy detection dataset.
50
    
51
    Parameters:
52
    - data_home: str, optional, path to data directory
53
    - return_dataset: bool, return Dataset object if True
54
    
55
    Returns:
56
    tuple or Dataset: (X, y) arrays or Dataset object if return_dataset=True
57
    """
58

59
def load_mushroom(data_home=None, return_dataset=False):
60
    """
61
    Load the mushroom classification dataset.
62
    
63
    Parameters:
64
    - data_home: str, optional, path to data directory
65
    - return_dataset: bool, return Dataset object if True
66
    
67
    Returns:
68
    tuple or Dataset: (X, y) arrays or Dataset object if return_dataset=True
69
    """
70

71
def load_hobbies(data_home=None):
72
    """
73
    Load the hobbies text corpus.
74
    
75
    Parameters:
76
    - data_home: str, optional, path to data directory
77
    
78
    Returns:
79
    Corpus: Text corpus object with documents and metadata
80
    """
81

82
def load_game(data_home=None, return_dataset=False):
83
    """
84
    Load the Connect-4 game dataset.
85
    
86
    Parameters:
87
    - data_home: str, optional, path to data directory
88
    - return_dataset: bool, return Dataset object if True
89
    
90
    Returns:
91
    tuple or Dataset: (X, y) arrays or Dataset object if return_dataset=True
92
    """
93

94
def load_bikeshare(data_home=None, return_dataset=False):
95
    """
96
    Load the bike sharing dataset.
97
    
98
    Parameters:
99
    - data_home: str, optional, path to data directory
100
    - return_dataset: bool, return Dataset object if True
101
    
102
    Returns:
103
    tuple or Dataset: (X, y) arrays or Dataset object if return_dataset=True
104
    """
105

106
def load_spam(data_home=None, return_dataset=False):
107
    """
108
    Load the email spam dataset.
109
    
110
    Parameters:
111
    - data_home: str, optional, path to data directory
112
    - return_dataset: bool, return Dataset object if True
113
    
114
    Returns:
115
    tuple or Dataset: (X, y) arrays or Dataset object if return_dataset=True
116
    """
117

118
def load_walking(data_home=None, return_dataset=False):
119
    """
120
    Load the walking activity dataset.
121
    
122
    Parameters:
123
    - data_home: str, optional, path to data directory
124
    - return_dataset: bool, return Dataset object if True
125
    
126
    Returns:
127
    tuple or Dataset: (X, y) arrays or Dataset object if return_dataset=True
128
    """
129

130
def load_nfl(data_home=None, return_dataset=False):
131
    """
132
    Load the NFL football receivers dataset.
133
    
134
    Parameters:
135
    - data_home: str, optional, path to data directory
136
    - return_dataset: bool, return Dataset object if True
137
    
138
    Returns:
139
    tuple or Dataset: (X, y) arrays or Dataset object if return_dataset=True
140
    """
141

142
def get_data_home(data_home=None):
143
    """
144
    Get the path to yellowbrick data directory.
145
    
146
    Parameters:
147
    - data_home: str, optional, specific data directory path
148
    
149
    Returns:
150
    str: Path to the yellowbrick data directory
151
    """
152
```
153

154
**Usage Example:**
155

156
```python
157
from yellowbrick.datasets import (
158
    load_concrete, load_energy, load_credit, load_occupancy,
159
    load_mushroom, load_hobbies, load_bikeshare, get_data_home
160
)
161

162
# Load regression dataset
163
concrete = load_concrete()
164
X_concrete, y_concrete = concrete.data, concrete.target
165
print(f"Concrete dataset: {X_concrete.shape} features, {y_concrete.shape} targets")
166
print(f"Feature names: {concrete.feature_names}")
167

168
# Load classification dataset
169
credit = load_credit()
170
X_credit, y_credit = credit.data, credit.target
171
print(f"Credit dataset: {X_credit.shape} features, {y_credit.shape} targets")
172
print(f"Classes: {credit.target_names}")
173

174
# Load text dataset
175
hobbies = load_hobbies()
176
texts, labels = hobbies.data, hobbies.target
177
print(f"Hobbies dataset: {len(texts)} documents, {len(set(labels))} categories")
178

179
# Get data directory
180
data_path = get_data_home()
181
print(f"Data directory: {data_path}")
182
```
183

184
### Style Management
185

186
Comprehensive styling system for customizing Yellowbrick visualizations, including aesthetic themes, color palettes, and matplotlib integration.
187

188
```python { .api }
189
def set_aesthetic(aesthetic='whitegrid', palette='flatui', desat=None, **kwargs):
190
    """
191
    Set the aesthetic style of matplotlib and yellowbrick.
192
    
193
    Parameters:
194
    - aesthetic: str, style name ('whitegrid', 'darkgrid', 'white', 'dark', 'ticks')
195
    - palette: str, color palette name
196
    - desat: float, desaturation factor (0-1)
197
    """
198

199
def set_style(style='whitegrid', **kwargs):
200
    """
201
    Set the matplotlib and yellowbrick plotting style.
202
    
203
    Parameters:
204
    - style: str, style name ('whitegrid', 'darkgrid', 'white', 'dark', 'ticks')
205
    """
206

207
def set_palette(palette='flatui', n_colors=None, desat=None, **kwargs):
208
    """
209
    Set the color palette for yellowbrick visualizations.
210
    
211
    Parameters:
212
    - palette: str or list, palette name or color list
213
    - n_colors: int, number of colors to use
214
    - desat: float, desaturation factor
215
    """
216

217
def color_palette(palette=None, n_colors=None, desat=None):
218
    """
219
    Return a color palette as a list of colors.
220
    
221
    Parameters:
222
    - palette: str or list, palette name or color list
223
    - n_colors: int, number of colors
224
    - desat: float, desaturation factor
225
    
226
    Returns:
227
    list: List of color values
228
    """
229

230
def set_color_codes(palette='flatui'):
231
    """
232
    Set color codes for single-letter color specification.
233
    
234
    Parameters:
235
    - palette: str, palette name
236
    """
237

238
def reset_defaults():
239
    """
240
    Reset yellowbrick and matplotlib to default settings.
241
    """
242

243
def reset_orig():
244
    """
245
    Reset matplotlib to original settings (before yellowbrick import).
246
    """
247
```
248

249
**Usage Example:**
250

251
```python
252
from yellowbrick.style import (
253
    set_aesthetic, set_style, set_palette, color_palette,
254
    set_color_codes, reset_defaults, reset_orig
255
)
256
from yellowbrick.classifier import ROCAUC
257
from sklearn.ensemble import RandomForestClassifier
258
from sklearn.datasets import make_classification
259
import matplotlib.pyplot as plt
260

261
# Generate sample data
262
X, y = make_classification(n_samples=1000, n_classes=2, random_state=42)
263
model = RandomForestClassifier()
264

265
# Default yellowbrick style
266
set_aesthetic()
267
viz1 = ROCAUC(model, classes=['Class 0', 'Class 1'])
268
viz1.fit(X, y)
269
viz1.show()
270

271
# Dark theme with custom palette
272
set_aesthetic(aesthetic='darkgrid', palette='muted')
273
viz2 = ROCAUC(model, classes=['Class 0', 'Class 1'])
274
viz2.fit(X, y)
275
viz2.show()
276

277
# Custom color palette
278
custom_colors = ['#FF6B6B', '#4ECDC4', '#45B7D1', '#96CEB4', '#FFEAA7']
279
set_palette(custom_colors)
280
viz3 = ROCAUC(model, classes=['Class 0', 'Class 1'])
281
viz3.fit(X, y)
282
viz3.show()
283

284
# Get current color palette
285
current_palette = color_palette()
286
print(f"Current palette: {current_palette}")
287

288
# Reset to defaults
289
reset_defaults()
290
```
291

292
### Demo Functions
293

294
Demonstration functions that showcase Yellowbrick's capabilities by plotting famous statistical datasets.
295

296
```python { .api }
297
def anscombe():
298
    """
299
    Generate Anscombe's quartet visualization demonstrating the importance
300
    of data visualization in statistical analysis.
301
    
302
    Shows four datasets with identical statistical properties but
303
    different distributions when visualized.
304
    """
305

306
def datasaurus():
307
    """
308
    Generate the Datasaurus Dozen visualization showing multiple datasets
309
    with identical summary statistics but vastly different distributions.
310
    
311
    Demonstrates why visualization is crucial for understanding data
312
    beyond summary statistics.
313
    """
314
```
315

316
**Usage Example:**
317

318
```python
319
from yellowbrick import anscombe, datasaurus
320

321
# Display Anscombe's quartet
322
print("Anscombe's Quartet - identical statistics, different patterns:")
323
anscombe()
324

325
# Display Datasaurus dozen
326
print("Datasaurus Dozen - same statistics, different shapes:")
327
datasaurus()
328
```
329

330
### Utility Constants and Types
331

332
Core utility types and constants used throughout the Yellowbrick library for consistent behavior and type checking.
333

334
```python { .api }
335
from enum import Enum
336

337
class TargetType(Enum):
338
    """
339
    Enumeration of target variable types for visualization adaptation.
340
    """
341
    AUTO = "auto"           # Automatically determine target type
342
    SINGLE = "single"       # No target supplied; all points drawn in a single color
343
    DISCRETE = "discrete"   # Discrete categorical values
344
    CONTINUOUS = "continuous"  # Continuous numerical values
345
    UNKNOWN = "unknown"     # Unknown or undefined type
346

347
def target_color_type(target, target_type_override=None):
348
    """
349
    Determine the appropriate color mapping type for target visualization.
350
    
351
    Parameters:
352
    - target: array-like, target values
353
    - target_type_override: TargetType, override automatic detection
354
    
355
    Returns:
356
    TargetType: Determined target type for coloring
357
    """
358

359
# Constants
360
MAX_DISCRETE_CLASSES = 12  # Maximum number of discrete classes for color mapping
361
```
362

363
## Usage Patterns
364

365
### Dataset Exploration Workflow
366

367
```python
368
from yellowbrick.datasets import load_concrete, load_credit, load_hobbies
369
from yellowbrick.features import Rank2D, ParallelCoordinates
370
from yellowbrick.classifier import ClassBalance
371
from yellowbrick.target import FeatureCorrelation
372
import matplotlib.pyplot as plt
373

374
# Regression dataset analysis
375
print("=== Concrete Dataset Analysis ===")
376
concrete = load_concrete()
377
X_concrete, y_concrete = concrete.data, concrete.target
378

379
# Feature correlation analysis
380
corr_viz = Rank2D(features=concrete.feature_names)
381
corr_viz.fit(X_concrete, y_concrete)
382
corr_viz.show()
383

384
# Classification dataset analysis
385
print("\n=== Credit Dataset Analysis ===")
386
credit = load_credit()
387
X_credit, y_credit = credit.data, credit.target
388

389
# Class balance analysis
390
balance_viz = ClassBalance(labels=credit.target_names)
391
balance_viz.fit(y_credit)
392
balance_viz.show()
393

394
# Parallel coordinates
395
pcoords_viz = ParallelCoordinates(classes=credit.target_names, normalize='standard')
396
pcoords_viz.fit(X_credit, y_credit)
397
pcoords_viz.show()
398

399
# Text dataset analysis
400
print("\n=== Hobbies Dataset Analysis ===")
401
hobbies = load_hobbies()
402
print(f"Number of documents: {len(hobbies.data)}")
403
print(f"Number of categories: {len(set(hobbies.target))}")
404
print(f"Categories: {hobbies.target_names}")
405
```
406

407
### Custom Styling Workflow
408

409
```python
410
from yellowbrick.style import set_aesthetic, set_palette, color_palette
411
from yellowbrick.classifier import ConfusionMatrix, ROCAUC
412
from sklearn.ensemble import RandomForestClassifier
413
from sklearn.model_selection import train_test_split
414
import matplotlib.pyplot as plt
415

416
# Load data
417
from yellowbrick.datasets import load_occupancy
418
occupancy = load_occupancy()
419
X, y = occupancy.data, occupancy.target
420
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
421

422
# Create model
423
model = RandomForestClassifier(n_estimators=100, random_state=42)
424

425
# Style 1: Default yellowbrick
426
print("Default Yellowbrick Style:")
427
set_aesthetic()
428
fig, axes = plt.subplots(1, 2, figsize=(12, 5))
429

430
cm_viz1 = ConfusionMatrix(model, classes=occupancy.target_names, ax=axes[0])
431
cm_viz1.fit(X_train, y_train)
432
cm_viz1.score(X_test, y_test)
433
cm_viz1.finalize()
434

435
roc_viz1 = ROCAUC(model, classes=occupancy.target_names, ax=axes[1])
436
roc_viz1.fit(X_train, y_train)
437
roc_viz1.score(X_test, y_test)
438
roc_viz1.finalize()
439

440
plt.tight_layout()
441
plt.show()
442

443
# Style 2: Dark theme with custom colors
444
print("Dark Theme with Custom Colors:")
445
set_aesthetic(aesthetic='darkgrid', palette='viridis')
446
fig, axes = plt.subplots(1, 2, figsize=(12, 5))
447

448
cm_viz2 = ConfusionMatrix(model, classes=occupancy.target_names, ax=axes[0])
449
cm_viz2.fit(X_train, y_train)
450
cm_viz2.score(X_test, y_test)
451
cm_viz2.finalize()
452

453
roc_viz2 = ROCAUC(model, classes=occupancy.target_names, ax=axes[1])
454
roc_viz2.fit(X_train, y_train)
455
roc_viz2.score(X_test, y_test)
456
roc_viz2.finalize()
457

458
plt.tight_layout()
459
plt.show()
460

461
# Style 3: Minimal white theme
462
print("Minimal White Theme:")
463
set_aesthetic(aesthetic='white', palette='husl')
464
fig, axes = plt.subplots(1, 2, figsize=(12, 5))
465

466
cm_viz3 = ConfusionMatrix(model, classes=occupancy.target_names, ax=axes[0])
467
cm_viz3.fit(X_train, y_train)
468
cm_viz3.score(X_test, y_test)
469
cm_viz3.finalize()
470

471
roc_viz3 = ROCAUC(model, classes=occupancy.target_names, ax=axes[1])
472
roc_viz3.fit(X_train, y_train)
473
roc_viz3.score(X_test, y_test)
474
roc_viz3.finalize()
475

476
plt.tight_layout()
477
plt.show()
478
```
479

480
### Educational Demo Usage
481

482
```python
483
from yellowbrick import anscombe, datasaurus
484
from yellowbrick.style import set_aesthetic
485
import matplotlib.pyplot as plt
486

487
# Set up educational styling
488
set_aesthetic(aesthetic='whitegrid', palette='Set2')
489

490
# Demonstrate the importance of visualization
491
print("Educational Demonstrations:")
492
print("\n1. Anscombe's Quartet:")
493
print("   Four datasets with identical statistical properties but different patterns")
494
anscombe()
495

496
print("\n2. Datasaurus Dozen:")  
497
print("   Multiple datasets with same summary statistics but different shapes")
498
datasaurus()
499

500
# Additional educational content
501
print("\n3. Why these demos matter:")
502
print("   - Summary statistics can be misleading")
503
print("   - Visualization reveals hidden patterns") 
504
print("   - Always plot your data before analysis")
505
print("   - Different distributions can have identical means, variances, and correlations")
506
```
507

508
### Data Management Utilities
509

510
```python
511
from yellowbrick.datasets import get_data_home
512
from yellowbrick.utils.target import target_color_type, TargetType, MAX_DISCRETE_CLASSES
513
import os
514
import numpy as np
515

516
# Data directory management
517
data_home = get_data_home()
518
print(f"Yellowbrick data directory: {data_home}")
519
print(f"Directory exists: {os.path.exists(data_home)}")
520

521
if os.path.exists(data_home):
522
    print(f"Directory contents: {os.listdir(data_home)}")
523

524
# Target type determination examples
525
print(f"\nTarget Type Analysis:")
526

527
# Continuous target
528
continuous_target = np.random.normal(0, 1, 100)
529
target_type_cont = target_color_type(continuous_target)
530
print(f"Continuous target type: {target_type_cont}")
531

532
# Discrete target with few classes
533
discrete_target = np.random.choice([0, 1, 2], 100)
534
target_type_disc = target_color_type(discrete_target)
535
print(f"Discrete target type: {target_type_disc}")
536

537
# Discrete target with many classes
538
many_classes = np.random.choice(range(20), 100)
539
target_type_many = target_color_type(many_classes)
540
print(f"Many classes target type: {target_type_many}")
541

542
print(f"Maximum discrete classes: {MAX_DISCRETE_CLASSES}")
543

544
# Override target type
545
target_type_override = target_color_type(continuous_target, TargetType.DISCRETE)
546
print(f"Overridden target type: {target_type_override}")
547
```
548

549
### Integration with External Data
550

551
```python
552
from yellowbrick.datasets import load_concrete
553
from yellowbrick.features import PCA, Rank2D
554
from yellowbrick.regressor import ResidualsPlot
555
from sklearn.ensemble import RandomForestRegressor
556
from sklearn.model_selection import train_test_split
557
import pandas as pd
558

559
# Load yellowbrick dataset
560
concrete = load_concrete()
561
X, y = concrete.data, concrete.target
562

563
# Convert to pandas for easier manipulation
564
df = pd.DataFrame(X, columns=concrete.feature_names)
565
df['target'] = y
566

567
print("Dataset Information:")
568
print(f"Shape: {df.shape}")
569
print(f"Features: {list(df.columns[:-1])}")
570
print(f"Target: {df.columns[-1]}")
571
print("\nDataset statistics:")
572
print(df.describe())
573

574
# Feature analysis
575
rank2d_viz = Rank2D(features=concrete.feature_names)
576
rank2d_viz.fit(X, y)
577
rank2d_viz.show()
578

579
# PCA analysis
580
pca_viz = PCA(scale=True, proj_features=True)
581
pca_viz.fit(X, y)
582
pca_viz.show()
583

584
# Model evaluation
585
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
586
model = RandomForestRegressor(n_estimators=100, random_state=42)
587

588
residuals_viz = ResidualsPlot(model)
589
residuals_viz.fit(X_train, y_train)
590
residuals_viz.score(X_test, y_test)
591
residuals_viz.show()
592

593
print(f"\nModel R² Score: {model.score(X_test, y_test):.3f}")
594
```

Version

Tile

Files

docs/data-utilities.md

docs/data-utilities.md