or run

npx @tessl/cli init
Log in

Version

Tile

Overview

Evals

Files

Files

docs

classification.md clustering.md data-utilities.md features.md index.md model-selection.md regression.md text.md

data-utilities.md docs/

0

# Data Loading and Utilities

1

2

Built-in datasets, utility functions, and styling tools to support machine learning workflows and visualization customization. These components provide sample data for learning and testing, along with visualization theming and styling capabilities.

3

4

## Capabilities

5

6

### Dataset Loaders

7

8

Collection of real-world datasets for machine learning experimentation, covering various domains including regression, classification, and text analysis tasks.

9

10

```python { .api }

11

def load_concrete(data_home=None, return_dataset=False):

12

"""

13

Load the concrete compressive strength dataset.

14

15

Parameters:

16

- data_home: str, optional, path to data directory

17

- return_dataset: bool, return Dataset object if True

18

19

Returns:

20

tuple or Dataset: (X, y) arrays or Dataset object if return_dataset=True

21

"""

22

23

def load_energy(data_home=None, return_dataset=False):

24

"""

25

Load the energy efficiency dataset.

26

27

Parameters:

28

- data_home: str, optional, path to data directory

29

- return_dataset: bool, return Dataset object if True

30

31

Returns:

32

tuple or Dataset: (X, y) arrays or Dataset object if return_dataset=True

33

"""

34

35

def load_credit(data_home=None, return_dataset=False):

36

"""

37

Load the credit approval dataset.

38

39

Parameters:

40

- data_home: str, optional, path to data directory

41

- return_dataset: bool, return Dataset object if True

42

43

Returns:

44

tuple or Dataset: (X, y) arrays or Dataset object if return_dataset=True

45

"""

46

47

def load_occupancy(data_home=None, return_dataset=False):

48

"""

49

Load the occupancy detection dataset.

50

51

Parameters:

52

- data_home: str, optional, path to data directory

53

- return_dataset: bool, return Dataset object if True

54

55

Returns:

56

tuple or Dataset: (X, y) arrays or Dataset object if return_dataset=True

57

"""

58

59

def load_mushroom(data_home=None, return_dataset=False):

60

"""

61

Load the mushroom classification dataset.

62

63

Parameters:

64

- data_home: str, optional, path to data directory

65

- return_dataset: bool, return Dataset object if True

66

67

Returns:

68

tuple or Dataset: (X, y) arrays or Dataset object if return_dataset=True

69

"""

70

71

def load_hobbies(data_home=None):

72

"""

73

Load the hobbies text corpus.

74

75

Parameters:

76

- data_home: str, optional, path to data directory

77

78

Returns:

79

Corpus: Text corpus object with documents and metadata

80

"""

81

82

def load_game(data_home=None, return_dataset=False):

83

"""

84

Load the Connect-4 game dataset.

85

86

Parameters:

87

- data_home: str, optional, path to data directory

88

- return_dataset: bool, return Dataset object if True

89

90

Returns:

91

tuple or Dataset: (X, y) arrays or Dataset object if return_dataset=True

92

"""

93

94

def load_bikeshare(data_home=None, return_dataset=False):

95

"""

96

Load the bike sharing dataset.

97

98

Parameters:

99

- data_home: str, optional, path to data directory

100

- return_dataset: bool, return Dataset object if True

101

102

Returns:

103

tuple or Dataset: (X, y) arrays or Dataset object if return_dataset=True

104

"""

105

106

def load_spam(data_home=None, return_dataset=False):

107

"""

108

Load the email spam dataset.

109

110

Parameters:

111

- data_home: str, optional, path to data directory

112

- return_dataset: bool, return Dataset object if True

113

114

Returns:

115

tuple or Dataset: (X, y) arrays or Dataset object if return_dataset=True

116

"""

117

118

def load_walking(data_home=None, return_dataset=False):

119

"""

120

Load the walking activity dataset.

121

122

Parameters:

123

- data_home: str, optional, path to data directory

124

- return_dataset: bool, return Dataset object if True

125

126

Returns:

127

tuple or Dataset: (X, y) arrays or Dataset object if return_dataset=True

128

"""

129

130

def load_nfl(data_home=None, return_dataset=False):

131

"""

132

Load the NFL football receivers dataset.

133

134

Parameters:

135

- data_home: str, optional, path to data directory

136

- return_dataset: bool, return Dataset object if True

137

138

Returns:

139

tuple or Dataset: (X, y) arrays or Dataset object if return_dataset=True

140

"""

141

142

def get_data_home(data_home=None):

143

"""

144

Get the path to yellowbrick data directory.

145

146

Parameters:

147

- data_home: str, optional, specific data directory path

148

149

Returns:

150

str: Path to the yellowbrick data directory

151

"""

152

```

153

154

**Usage Example:**

155

156

```python

157

from yellowbrick.datasets import (

158

load_concrete, load_energy, load_credit, load_occupancy,

159

load_mushroom, load_hobbies, load_bikeshare, get_data_home

160

)

161

162

# Load regression dataset

163

concrete = load_concrete()

164

X_concrete, y_concrete = concrete.data, concrete.target

165

print(f"Concrete dataset: {X_concrete.shape} features, {y_concrete.shape} targets")

166

print(f"Feature names: {concrete.feature_names}")

167

168

# Load classification dataset

169

credit = load_credit()

170

X_credit, y_credit = credit.data, credit.target

171

print(f"Credit dataset: {X_credit.shape} features, {y_credit.shape} targets")

172

print(f"Classes: {credit.target_names}")

173

174

# Load text dataset

175

hobbies = load_hobbies()

176

texts, labels = hobbies.data, hobbies.target

177

print(f"Hobbies dataset: {len(texts)} documents, {len(set(labels))} categories")

178

179

# Get data directory

180

data_path = get_data_home()

181

print(f"Data directory: {data_path}")

182

```

183

184

### Style Management

185

186

Comprehensive styling system for customizing Yellowbrick visualizations, including aesthetic themes, color palettes, and matplotlib integration.

187

188

```python { .api }

189

def set_aesthetic(aesthetic='whitegrid', palette='flatui', desat=None, **kwargs):

190

"""

191

Set the aesthetic style of matplotlib and yellowbrick.

192

193

Parameters:

194

- aesthetic: str, style name ('whitegrid', 'darkgrid', 'white', 'dark', 'ticks')

195

- palette: str, color palette name

196

- desat: float, desaturation factor (0-1)

197

"""

198

199

def set_style(style='whitegrid', **kwargs):

200

"""

201

Set the matplotlib and yellowbrick plotting style.

202

203

Parameters:

204

- style: str, style name ('whitegrid', 'darkgrid', 'white', 'dark', 'ticks')

205

"""

206

207

def set_palette(palette='flatui', n_colors=None, desat=None, **kwargs):

208

"""

209

Set the color palette for yellowbrick visualizations.

210

211

Parameters:

212

- palette: str or list, palette name or color list

213

- n_colors: int, number of colors to use

214

- desat: float, desaturation factor

215

"""

216

217

def color_palette(palette=None, n_colors=None, desat=None):

218

"""

219

Return a color palette as a list of colors.

220

221

Parameters:

222

- palette: str or list, palette name or color list

223

- n_colors: int, number of colors

224

- desat: float, desaturation factor

225

226

Returns:

227

list: List of color values

228

"""

229

230

def set_color_codes(palette='flatui'):

231

"""

232

Set color codes for single-letter color specification.

233

234

Parameters:

235

- palette: str, palette name

236

"""

237

238

def reset_defaults():

239

"""

240

Reset yellowbrick and matplotlib to default settings.

241

"""

242

243

def reset_orig():

244

"""

245

Reset matplotlib to original settings (before yellowbrick import).

246

"""

247

```

248

249

**Usage Example:**

250

251

```python

252

from yellowbrick.style import (

253

set_aesthetic, set_style, set_palette, color_palette,

254

set_color_codes, reset_defaults, reset_orig

255

)

256

from yellowbrick.classifier import ROCAUC

257

from sklearn.ensemble import RandomForestClassifier

258

from sklearn.datasets import make_classification

259

import matplotlib.pyplot as plt

260

261

# Generate sample data

262

X, y = make_classification(n_samples=1000, n_classes=2, random_state=42)

263

model = RandomForestClassifier()

264

265

# Default yellowbrick style

266

set_aesthetic()

267

viz1 = ROCAUC(model, classes=['Class 0', 'Class 1'])

268

viz1.fit(X, y)

269

viz1.show()

270

271

# Dark theme with custom palette

272

set_aesthetic(aesthetic='darkgrid', palette='muted')

273

viz2 = ROCAUC(model, classes=['Class 0', 'Class 1'])

274

viz2.fit(X, y)

275

viz2.show()

276

277

# Custom color palette

278

custom_colors = ['#FF6B6B', '#4ECDC4', '#45B7D1', '#96CEB4', '#FFEAA7']

279

set_palette(custom_colors)

280

viz3 = ROCAUC(model, classes=['Class 0', 'Class 1'])

281

viz3.fit(X, y)

282

viz3.show()

283

284

# Get current color palette

285

current_palette = color_palette()

286

print(f"Current palette: {current_palette}")

287

288

# Reset to defaults

289

reset_defaults()

290

```

291

292

### Demo Functions

293

294

Interactive demonstration functions that showcase Yellowbrick's capabilities with famous statistical datasets and visualizations.

295

296

```python { .api }

297

def anscombe():

298

"""

299

Generate Anscombe's quartet visualization demonstrating the importance

300

of data visualization in statistical analysis.

301

302

Shows four datasets with identical statistical properties but

303

different distributions when visualized.

304

"""

305

306

def datasaurus():

307

"""

308

Generate the Datasaurus Dozen visualization showing multiple datasets

309

with identical summary statistics but vastly different distributions.

310

311

Demonstrates why visualization is crucial for understanding data

312

beyond summary statistics.

313

"""

314

```

315

316

**Usage Example:**

317

318

```python

319

from yellowbrick import anscombe, datasaurus

320

321

# Display Anscombe's quartet

322

print("Anscombe's Quartet - identical statistics, different patterns:")

323

anscombe()

324

325

# Display Datasaurus dozen

326

print("Datasaurus Dozen - same statistics, different shapes:")

327

datasaurus()

328

```

329

330

### Utility Constants and Types

331

332

Core utility types and constants used throughout the Yellowbrick library for consistent behavior and type checking.

333

334

```python { .api }

335

from enum import Enum

336

337

class TargetType(Enum):

338

"""

339

Enumeration of target variable types for visualization adaptation.

340

"""

341

AUTO = "auto" # Automatically determine target type

342

SINGLE = "single" # No target supplied; all points share a single color

343

DISCRETE = "discrete" # Discrete categorical values

344

CONTINUOUS = "continuous" # Continuous numerical values

345

UNKNOWN = "unknown" # Unknown or undefined type

346

347

def target_color_type(target, target_type_override=None):

348

"""

349

Determine the appropriate color mapping type for target visualization.

350

351

Parameters:

352

- target: array-like, target values

353

- target_type_override: TargetType, override automatic detection

354

355

Returns:

356

TargetType: Determined target type for coloring

357

"""

358

359

# Constants

360

MAX_DISCRETE_CLASSES = 12 # Maximum number of discrete classes for color mapping

361

```

362

363

## Usage Patterns

364

365

### Dataset Exploration Workflow

366

367

```python

368

from yellowbrick.datasets import load_concrete, load_credit, load_hobbies

369

from yellowbrick.features import Rank2D, ParallelCoordinates

370

from yellowbrick.classifier import ClassBalance

371

from yellowbrick.target import FeatureCorrelation

372

import matplotlib.pyplot as plt

373

374

# Regression dataset analysis

375

print("=== Concrete Dataset Analysis ===")

376

concrete = load_concrete()

377

X_concrete, y_concrete = concrete.data, concrete.target

378

379

# Feature correlation analysis

380

corr_viz = Rank2D(features=concrete.feature_names)

381

corr_viz.fit(X_concrete, y_concrete)

382

corr_viz.show()

383

384

# Classification dataset analysis

385

print("\n=== Credit Dataset Analysis ===")

386

credit = load_credit()

387

X_credit, y_credit = credit.data, credit.target

388

389

# Class balance analysis

390

balance_viz = ClassBalance(labels=credit.target_names)

391

balance_viz.fit(y_credit)

392

balance_viz.show()

393

394

# Parallel coordinates

395

pcoords_viz = ParallelCoordinates(classes=credit.target_names, normalize='standard')

396

pcoords_viz.fit(X_credit, y_credit)

397

pcoords_viz.show()

398

399

# Text dataset analysis

400

print("\n=== Hobbies Dataset Analysis ===")

401

hobbies = load_hobbies()

402

print(f"Number of documents: {len(hobbies.data)}")

403

print(f"Number of categories: {len(set(hobbies.target))}")

404

print(f"Categories: {hobbies.target_names}")

405

```

406

407

### Custom Styling Workflow

408

409

```python

410

from yellowbrick.style import set_aesthetic, set_palette, color_palette

411

from yellowbrick.classifier import ConfusionMatrix, ROCAUC

412

from sklearn.ensemble import RandomForestClassifier

413

from sklearn.model_selection import train_test_split

414

import matplotlib.pyplot as plt

415

416

# Load data

417

from yellowbrick.datasets import load_occupancy

418

occupancy = load_occupancy()

419

X, y = occupancy.data, occupancy.target

420

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

421

422

# Create model

423

model = RandomForestClassifier(n_estimators=100, random_state=42)

424

425

# Style 1: Default yellowbrick

426

print("Default Yellowbrick Style:")

427

set_aesthetic()

428

fig, axes = plt.subplots(1, 2, figsize=(12, 5))

429

430

cm_viz1 = ConfusionMatrix(model, classes=occupancy.target_names, ax=axes[0])

431

cm_viz1.fit(X_train, y_train)

432

cm_viz1.score(X_test, y_test)

433

cm_viz1.finalize()

434

435

roc_viz1 = ROCAUC(model, classes=occupancy.target_names, ax=axes[1])

436

roc_viz1.fit(X_train, y_train)

437

roc_viz1.score(X_test, y_test)

438

roc_viz1.finalize()

439

440

plt.tight_layout()

441

plt.show()

442

443

# Style 2: Dark theme with custom colors

444

print("Dark Theme with Custom Colors:")

445

set_aesthetic(aesthetic='darkgrid', palette='viridis')

446

fig, axes = plt.subplots(1, 2, figsize=(12, 5))

447

448

cm_viz2 = ConfusionMatrix(model, classes=occupancy.target_names, ax=axes[0])

449

cm_viz2.fit(X_train, y_train)

450

cm_viz2.score(X_test, y_test)

451

cm_viz2.finalize()

452

453

roc_viz2 = ROCAUC(model, classes=occupancy.target_names, ax=axes[1])

454

roc_viz2.fit(X_train, y_train)

455

roc_viz2.score(X_test, y_test)

456

roc_viz2.finalize()

457

458

plt.tight_layout()

459

plt.show()

460

461

# Style 3: Minimal white theme

462

print("Minimal White Theme:")

463

set_aesthetic(aesthetic='white', palette='husl')

464

fig, axes = plt.subplots(1, 2, figsize=(12, 5))

465

466

cm_viz3 = ConfusionMatrix(model, classes=occupancy.target_names, ax=axes[0])

467

cm_viz3.fit(X_train, y_train)

468

cm_viz3.score(X_test, y_test)

469

cm_viz3.finalize()

470

471

roc_viz3 = ROCAUC(model, classes=occupancy.target_names, ax=axes[1])

472

roc_viz3.fit(X_train, y_train)

473

roc_viz3.score(X_test, y_test)

474

roc_viz3.finalize()

475

476

plt.tight_layout()

477

plt.show()

478

```

479

480

### Educational Demo Usage

481

482

```python

483

from yellowbrick import anscombe, datasaurus

484

from yellowbrick.style import set_aesthetic

485

import matplotlib.pyplot as plt

486

487

# Set up educational styling

488

set_aesthetic(aesthetic='whitegrid', palette='Set2')

489

490

# Demonstrate the importance of visualization

491

print("Educational Demonstrations:")

492

print("\n1. Anscombe's Quartet:")

493

print(" Four datasets with identical statistical properties but different patterns")

494

anscombe()

495

496

print("\n2. Datasaurus Dozen:")

497

print(" Multiple datasets with same summary statistics but different shapes")

498

datasaurus()

499

500

# Additional educational content

501

print("\n3. Why these demos matter:")

502

print(" - Summary statistics can be misleading")

503

print(" - Visualization reveals hidden patterns")

504

print(" - Always plot your data before analysis")

505

print(" - Different distributions can have identical means, variances, and correlations")

506

```

507

508

### Data Management Utilities

509

510

```python

511

from yellowbrick.datasets import get_data_home

512

from yellowbrick.utils.target import target_color_type, TargetType, MAX_DISCRETE_CLASSES

513

import os

514

import numpy as np

515

516

# Data directory management

517

data_home = get_data_home()

518

print(f"Yellowbrick data directory: {data_home}")

519

print(f"Directory exists: {os.path.exists(data_home)}")

520

521

if os.path.exists(data_home):

522

print(f"Directory contents: {os.listdir(data_home)}")

523

524

# Target type determination examples

525

print(f"\nTarget Type Analysis:")

526

527

# Continuous target

528

continuous_target = np.random.normal(0, 1, 100)

529

target_type_cont = target_color_type(continuous_target)

530

print(f"Continuous target type: {target_type_cont}")

531

532

# Discrete target with few classes

533

discrete_target = np.random.choice([0, 1, 2], 100)

534

target_type_disc = target_color_type(discrete_target)

535

print(f"Discrete target type: {target_type_disc}")

536

537

# Discrete target with many classes

538

many_classes = np.random.choice(range(20), 100)

539

target_type_many = target_color_type(many_classes)

540

print(f"Many classes target type: {target_type_many}")

541

542

print(f"Maximum discrete classes: {MAX_DISCRETE_CLASSES}")

543

544

# Override target type

545

target_type_override = target_color_type(continuous_target, TargetType.DISCRETE)

546

print(f"Overridden target type: {target_type_override}")

547

```

548

549

### Integration with External Data

550

551

```python

552

from yellowbrick.datasets import load_concrete

553

from yellowbrick.features import PCA, Rank2D

554

from yellowbrick.regressor import ResidualsPlot

555

from sklearn.ensemble import RandomForestRegressor

556

from sklearn.model_selection import train_test_split

557

import pandas as pd

558

559

# Load yellowbrick dataset

560

concrete = load_concrete()

561

X, y = concrete.data, concrete.target

562

563

# Convert to pandas for easier manipulation

564

df = pd.DataFrame(X, columns=concrete.feature_names)

565

df['target'] = y

566

567

print("Dataset Information:")

568

print(f"Shape: {df.shape}")

569

print(f"Features: {list(df.columns[:-1])}")

570

print(f"Target: {df.columns[-1]}")

571

print("\nDataset statistics:")

572

print(df.describe())

573

574

# Feature analysis

575

rank2d_viz = Rank2D(features=concrete.feature_names)

576

rank2d_viz.fit(X, y)

577

rank2d_viz.show()

578

579

# PCA analysis

580

pca_viz = PCA(scale=True, proj_features=True)

581

pca_viz.fit(X, y)

582

pca_viz.show()

583

584

# Model evaluation

585

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

586

model = RandomForestRegressor(n_estimators=100, random_state=42)

587

588

residuals_viz = ResidualsPlot(model)

589

residuals_viz.fit(X_train, y_train)

590

residuals_viz.score(X_test, y_test)

591

residuals_viz.show()

592

593

print(f"\nModel R² Score: {model.score(X_test, y_test):.3f}")

594

```