or run

npx @tessl/cli init
Log in

Version

Tile

Overview

Evals

Files

Files

docs

explainers.md · index.md · utilities.md · visualization.md

docs/utilities.md

0

# Data Utilities and Helpers

1

2

SHAP provides comprehensive utilities including built-in datasets, masking strategies, helper functions, and model wrappers to support explainability workflows across different data types and use cases.

3

4

## Capabilities

5

6

### Built-in Datasets

7

8

Ready-to-use datasets for testing, benchmarking, and educational purposes, covering various domains and data types.

9

10

```python { .api }

11

# Real-world datasets

12

def adult(display=False, n_points=None) -> tuple[pd.DataFrame, np.ndarray]:

13

"""

14

Census income prediction dataset (>50K income classification).

15

16

Parameters:

17

- display: Return human-readable labels instead of encoded values (bool)

18

- n_points: Sample n data points (int, optional)

19

20

Returns:

21

(features, targets) - DataFrame with 14 features, binary target array

22

"""

23

24

def california(n_points=None) -> tuple[pd.DataFrame, np.ndarray]:

25

"""

26

California housing regression dataset.

27

28

Median house values for California districts with geographic and

29

demographic features.

30

31

Returns:

32

(features, targets) - DataFrame with 8 features, continuous target array

33

"""

34

35

def imagenet50(resolution=224, n_points=None) -> tuple[np.ndarray, np.ndarray]:

36

"""

37

50 representative ImageNet images for background distributions.

38

39

Parameters:

40

- resolution: Image resolution (currently only 224 supported)

41

- n_points: Sample n images (optional)

42

43

Returns:

44

(images, labels) - Image array (N, H, W, C), label array

45

"""

46

47

def imdb(n_points=None) -> tuple[list[str], np.ndarray]:

48

"""

49

Movie review sentiment classification dataset.

50

51

Returns:

52

(reviews, sentiments) - List of review text strings, binary sentiment array

53

"""

54

55

def diabetes(n_points=None) -> tuple[pd.DataFrame, np.ndarray]:

56

"""

57

Diabetes progression prediction dataset.

58

59

Physiological measurements predicting diabetes progression after one year.

60

61

Returns:

62

(features, targets) - DataFrame with 10 features, continuous target array

63

"""

64

65

def iris(display=False, n_points=None) -> tuple[pd.DataFrame, np.ndarray]:

66

"""

67

Classic iris flower classification dataset.

68

69

Parameters:

70

- display: Return species names instead of encoded labels (bool)

71

72

Returns:

73

(features, targets) - DataFrame with 4 features, class labels

74

"""

75

76

def linnerud(n_points=None) -> tuple[pd.DataFrame, pd.DataFrame]:

77

"""

78

Multi-target physiological/exercise dataset.

79

80

Exercise measurements predicting physiological parameters.

81

82

Returns:

83

(exercise_features, physiological_targets) - Both as DataFrames

84

"""

85

86

def nhanesi(display=False, n_points=None) -> tuple[pd.DataFrame, np.ndarray]:

87

"""

88

NHANES I survival analysis dataset.

89

90

National Health and Nutrition Examination Survey data with survival times

91

as labels, used for survival analysis and mortality prediction tasks.

92

93

Parameters:

94

- display: Return features with modified display format (bool)

95

- n_points: Number of data points to sample (int, optional)

96

97

Returns:

98

(features, survival_times) - DataFrame with health measurements, survival time array

99

"""

100

101

def communitiesandcrime(n_points=None) -> tuple[pd.DataFrame, np.ndarray]:

102

"""

103

Communities and Crime regression dataset from UCI ML Repository.

104

105

Community demographic and social features for predicting total number

106

of violent crimes per 100K population.

107

108

Parameters:

109

- n_points: Number of data points to sample (int, optional)

110

111

Returns:

112

(features, crime_rates) - DataFrame with community features, crime rate targets

113

"""

114

115

# Sparse and ranking datasets

116

def a1a(n_points=None) -> tuple[scipy.sparse.csr_matrix, np.ndarray]:

117

"""

118

Sparse binary classification dataset in SVM light format.

119

120

High-dimensional sparse feature matrix for binary classification,

121

commonly used for testing sparse algorithms.

122

123

Parameters:

124

- n_points: Number of data points to sample (int, optional)

125

126

Returns:

127

(sparse_features, binary_targets) - CSR sparse matrix and binary labels

128

"""

129

130

def rank() -> tuple[scipy.sparse.csr_matrix, np.ndarray, scipy.sparse.csr_matrix,

131

np.ndarray, np.ndarray, np.ndarray]:

132

"""

133

Learning-to-rank datasets from LightGBM repository.

134

135

Ranking datasets with query-document pairs and relevance judgments,

136

used for learning-to-rank model evaluation.

137

138

Returns:

139

(train_X, train_y, test_X, test_y, train_queries, test_queries) -

140

Training/test sparse matrices, relevance labels, and query group IDs

141

"""

142

143

# Synthetic datasets

144

def corrgroups60(n_points=1000) -> tuple[pd.DataFrame, np.ndarray]:

145

"""

146

Synthetic dataset with 60 features organized in correlated groups.

147

148

Generated dataset with known correlation structure between distinct

149

feature groups, useful for testing correlation-aware algorithms.

150

151

Parameters:

152

- n_points: Number of data points to generate (int, default: 1000)

153

154

Returns:

155

(features, targets) - DataFrame with correlated features, linear targets

156

"""

157

158

def independentlinear60(n_points=1000) -> tuple[pd.DataFrame, np.ndarray]:

159

"""

160

Synthetic dataset with 60 independent linear features.

161

162

Generated dataset with independent Gaussian features and linear

163

target relationships, used for benchmarking linear methods.

164

165

Parameters:

166

- n_points: Number of data points to generate (int, default: 1000)

167

168

Returns:

169

(features, targets) - DataFrame with independent features, linear targets

170

"""

171

```

172

173

**Usage Example:**

174

175

```python

176

import shap

177

178

# Load real-world dataset

179

X, y = shap.datasets.adult()

180

print(f"Adult dataset: {X.shape[0]} samples, {X.shape[1]} features")

181

182

# Load image dataset for computer vision

183

images, labels = shap.datasets.imagenet50(n_points=10)

184

print(f"ImageNet sample: {images.shape}")

185

186

# Load text dataset for NLP

187

reviews, sentiments = shap.datasets.imdb(n_points=100)

188

print(f"IMDB sample: {len(reviews)} reviews")

189

```

190

191

### Masking Strategies

192

193

Sophisticated masking approaches for different data types, handling feature dependencies and realistic perturbations.

194

195

```python { .api }

196

class Masker:

197

"""Abstract base class for all maskers."""

198

def __call__(self, mask, *args):

199

"""Apply masking with binary mask array."""

200

201

@property

202

def shape(self):

203

"""Expected input dimensions."""

204

205

@property

206

def supports_delta_masking(self):

207

"""Whether masker supports efficient delta masking."""

208

209

# Tabular data maskers

210

class Independent:

211

"""

212

Independent feature masking with background data integration.

213

214

Replaces masked features with values sampled independently

215

from background distribution.

216

"""

217

def __init__(self, data, max_samples=100):

218

"""

219

Parameters:

220

- data: Background dataset for sampling replacement values

221

- max_samples: Maximum background samples to use

222

"""

223

224

class Partition:

225

"""

226

Hierarchical feature masking respecting feature correlations.

227

228

Groups correlated features and masks them together to maintain

229

realistic feature relationships.

230

"""

231

def __init__(self, data, max_samples=100, clustering="correlation"):

232

"""

233

Parameters:

234

- data: Background dataset for correlation analysis

- max_samples: Maximum background samples to use

235

- clustering: Clustering method ("correlation", "tree", custom)

236

"""

237

238

class Impute:

239

"""

240

Missing value imputation for masking.

241

242

Uses feature correlations to impute realistic values for

243

masked features instead of random sampling.

244

"""

245

def __init__(self, data, method="linear"):

246

"""

247

Parameters:

248

- data: Training data for imputation model

249

- method: Imputation method ("linear", "tree", "knn")

250

"""

251

252

# Specialized maskers

253

class Text:

254

"""

255

Text tokenization and masking for NLP models.

256

257

Handles tokenization, special tokens, and text-specific

258

masking strategies for language models.

259

"""

260

def __init__(self, tokenizer=None, mask_token=None,

261

collapse_mask_token="auto", output_type="string"):

262

"""

263

Parameters:

264

- tokenizer: Custom tokenizer (optional, uses default splitting)

265

- mask_token: Token to use for masking (e.g., "[MASK]")

266

- collapse_mask_token: How to handle consecutive masked tokens

267

- output_type: Output format ("string", "token_ids", "tokens")

268

"""

269

270

class Image:

271

"""

272

Image region masking with realistic perturbations.

273

274

Supports various masking strategies including blur, inpainting,

275

and noise for computer vision models.

276

"""

277

def __init__(self, mask_value, shape=None):

278

"""

279

Parameters:

280

- mask_value: Value/strategy for masked regions (scalar, "blur", "inpaint", "noise")

281

- shape: Expected image shape (optional, inferred from data)

282

"""

283

284

class Fixed:

285

"""

286

Fixed background values for masking.

287

288

Simple masking strategy using predetermined values

289

for all masked features.

290

"""

291

def __init__(self, mask_value):

292

"""

293

Parameters:

294

- mask_value: Fixed value(s) to use for masking

295

"""

296

297

# Composite maskers

298

class Composite:

299

"""

300

Combine multiple maskers for different feature groups.

301

302

Allows different masking strategies for different parts

303

of the input (e.g., tabular + text + image).

304

"""

305

def __init__(self, **maskers):

306

"""

307

Parameters:

308

- **maskers: Named maskers for different feature groups

309

"""

310

311

class FixedComposite:

312

"""Fixed composite masking with predetermined feature groups."""

313

def __init__(self, **maskers):

314

"""Initialize with fixed feature-to-masker mapping."""

315

316

class OutputComposite:

317

"""Output-specific masking for multi-output models."""

318

def __init__(self, **maskers):

319

"""Initialize with output-specific masking strategies."""

320

```

321

322

### Model Wrappers

323

324

Standardized model interfaces for consistent explainer usage across different frameworks.

325

326

```python { .api }

327

class Model:

328

"""

329

Universal model wrapper with automatic tensor conversion.

330

331

Standardizes model interfaces and handles tensor conversions

332

between NumPy arrays and framework-specific tensors.

333

"""

334

def __init__(self, model=None):

335

"""

336

Parameters:

337

- model: Model object to wrap (optional, can be set later)

338

"""

339

340

def __call__(self, *args):

341

"""

342

Call wrapped model with automatic tensor conversion.

343

344

Converts NumPy inputs to appropriate framework tensors,

345

calls model, and converts outputs back to NumPy arrays.

346

"""

347

348

def save(self, out_file):

349

"""Serialize model to file."""

350

351

@staticmethod

352

def load(in_file, instantiate=True):

353

"""Load model from file."""

354

355

class TeacherForcing:

356

"""

357

Model wrapper for teacher forcing in sequence models.

358

359

Handles sequence generation with known target sequences

360

during training/explanation phases.

361

"""

362

def __init__(self, model, similarity_model=None, masker=None):

363

"""Initialize teacher forcing wrapper for sequence models."""

364

365

class TextGeneration:

366

"""

367

Wrapper for text generation models.

368

369

Standardizes interface for autoregressive text models

370

with generation parameters and stopping criteria.

371

"""

372

def __init__(self, model, masker=None, similarity_model=None):

373

"""Initialize text generation model wrapper."""

374

375

class TopKLM:

376

"""

377

Top-K language model wrapper.

378

379

Restricts language model outputs to top-K most likely tokens

380

for more stable explanations.

381

"""

382

def __init__(self, model, similarity_model=None, masker=None):

383

"""Initialize top-K language model wrapper."""

384

385

class TransformersPipeline:

386

"""

387

HuggingFace transformers pipeline wrapper.

388

389

Integrates with HuggingFace pipelines for standardized

390

transformer model interfaces.

391

"""

392

def __init__(self, pipeline):

393

"""

394

Parameters:

395

- pipeline: HuggingFace pipeline object

396

"""

397

```

398

399

### Utility Functions

400

401

Helper functions for data manipulation, sampling, and analysis workflows.

402

403

```python { .api }

404

# Sampling and data manipulation

405

def sample(X, nsamples=100, random_state=0):

406

"""

407

Sample data points without replacement.

408

409

Parameters:

410

- X: Input data (array, DataFrame, sparse matrix)

411

- nsamples: Number of samples to draw

412

- random_state: Random seed for reproducibility

413

414

Returns:

415

Sampled data in same format as input

416

"""

417

418

def approximate_interactions(index, shap_values, X, feature_names=None) -> np.ndarray:

419

"""

420

Find the features that interact most strongly with the target feature.

421

422

Parameters:

423

- index: Target feature index or name

424

- shap_values: SHAP values array or Explanation object

425

- X: Input feature data

426

- feature_names: List of feature names (optional)

427

428

Returns:

429

Array of interaction strength scores for each feature

430

"""

431

432

# Clustering functions

433

def hclust(data, metric="sqeuclidean"):

434

"""

435

Hierarchical clustering of features.

436

437

Parameters:

438

- data: Feature data for clustering

439

- metric: Distance metric for clustering

440

441

Returns:

442

Clustering linkage matrix

443

"""

444

445

def hclust_ordering(X, metric="sqeuclidean"):

446

"""

447

Optimal leaf ordering for hierarchical clustering dendrograms.

448

449

Minimizes distances between adjacent leaves in dendrogram.

450

"""

451

452

def delta_minimization_order():

453

"""Compute ordering that minimizes partition tree delta."""

454

455

def partition_tree():

456

"""Create hierarchical partition tree for feature grouping."""

457

458

def partition_tree_shuffle():

459

"""Shuffle partition tree leaves while preserving structure."""

460

461

# Mathematical utilities

462

def shapley_coefficients(n) -> np.ndarray:

463

"""

464

Compute Shapley coefficients for n players.

465

466

Parameters:

467

- n: Number of features/players

468

469

Returns:

470

Array of Shapley coefficients

471

"""

472

473

# Utility classes

474

class OpChain:

475

"""

476

Chainable operations for delayed execution.

477

478

Enables method chaining on Explanation objects with

479

lazy evaluation for performance optimization.

480

"""

481

def __init__(self, op, *args, **kwargs):

482

"""Initialize operation chain."""

483

484

def __call__(self, obj):

485

"""Apply operation chain to object."""

486

487

class MaskedModel:

488

"""

489

Wrapper for masked model evaluation.

490

491

Handles feature masking during model evaluation with

492

efficient batching and caching.

493

"""

494

def __init__(self, model, masker, *args, **kwargs):

495

"""

496

Parameters:

497

- model: Model function to wrap

498

- masker: Masker object for feature perturbation

499

"""

500

501

def __call__(self, masks, *args, **kwargs):

502

"""Evaluate model with masked inputs."""

503

504

def make_masks(cluster_matrix):

505

"""Generate binary masks for features."""

506

507

# Display and progress utilities

508

def show_progress():

509

"""Display progress bars for long computations."""

510

511

# Import and error handling

512

def assert_import(package_name):

513

"""Assert that required package is available."""

514

515

def record_import_error(package_name, msg, e):

516

"""Record import errors for debugging."""

517

518

def safe_isinstance(obj, class_path_str) -> bool:

519

"""Safe type checking without importing classes."""

520

521

# String formatting utilities

522

def format_value(s, format_str):

523

"""Format values for display in plots and outputs."""

524

525

def ordinal_str(n):

526

"""Convert numbers to ordinal strings (1st, 2nd, 3rd, etc.)."""

527

528

def convert_name():

529

"""Convert feature names between different formats."""

530

531

def potential_interactions(shap_values_column, shap_values_matrix):

532

"""

533

Order features by interaction strength with target feature.

534

535

Bins SHAP values for a feature along that feature's value to identify

536

potential interactions. For exact Shapley interaction values, use

537

interaction_contribs in XGBoost.

538

539

Parameters:

540

- shap_values_column: SHAP values for target feature

541

- shap_values_matrix: SHAP values matrix for all features

542

543

Returns:

544

Feature ordering by interaction strength

545

"""

546

547

def make_masks(cluster_matrix):

548

"""

549

Build sparse CSR mask matrix from hierarchical clustering.

550

551

Optimized function for creating binary masks from clustering results,

552

particularly useful for large image datasets and tree structures.

553

554

Parameters:

555

- cluster_matrix: Hierarchical clustering matrix

556

557

Returns:

558

scipy.sparse.csr_matrix: Binary mask matrix for feature groups

559

"""

560

561

def suppress_stderr():

562

"""Context manager to suppress stderr output during operations."""

563

```

564

565

### Action Optimization

566

567

Framework for constrained optimization and action recommendation.

568

569

```python { .api }

570

class Action:

571

"""

572

Abstract action class with cost parameter.

573

574

Base class for defining actions in optimization problems

575

with associated costs and execution logic.

576

"""

577

def __init__(self, cost):

578

"""

579

Parameters:

580

- cost: Cost of executing this action (numeric)

581

"""

582

583

def __call__(self, *args):

584

"""Execute the action - must be implemented by subclasses."""

585

586

def __lt__(self, other_action):

587

"""Compare actions by cost for priority queue ordering."""

588

589

class ActionOptimizer:

590

"""

591

Optimize action sequences to satisfy model constraints.

592

593

Uses priority queue search to find minimum-cost action sequences

594

that satisfy specified model constraints.

595

596

Warning:

597

ActionOptimizer is in alpha state and subject to API changes.

598

"""

599

def __init__(self, model, actions):

600

"""

601

Parameters:

602

- model: Function returning True when constraints are satisfied

603

- actions: List of Action objects or lists of mutually exclusive actions

604

"""

605

606

def __call__(self, *args, max_evals=10000):

607

"""

608

Find optimal action sequence.

609

610

Parameters:

611

- max_evals: Maximum evaluations before raising ConvergenceError

612

613

Returns:

614

List of actions that satisfy constraints with minimum cost

615

"""

616

```

617

618

### Link Functions

619

620

Output transformation functions for different model types and scales.

621

622

```python { .api }

623

def identity(x):

624

"""

625

Identity link function (no transformation).

626

627

Returns input unchanged. Used for regression models

628

and when no output transformation is needed.

629

630

Parameters:

631

- x: Input values

632

633

Returns:

634

Unchanged input values

635

"""

636

637

identity.inverse = lambda x: x # Inverse transformation

638

639

def logit(x):

640

"""

641

Logit link function for probability to log-odds conversion.

642

643

Transforms probabilities in [0,1] to log-odds in (-∞,∞); the endpoints 0 and 1 map to -∞ and +∞ respectively.

644

Useful for binary classification models.

645

646

Parameters:

647

- x: Probability values in [0,1]

648

649

Returns:

650

Log-odds values log(x/(1-x))

651

"""

652

653

logit.inverse = lambda x: 1 / (1 + np.exp(-x)) # Sigmoid inverse

654

```

655

656

## Usage Patterns

657

658

### Dataset Loading and Preprocessing

659

660

```python

661

import shap

662

663

# Load dataset with optional sampling

664

X, y = shap.datasets.adult(n_points=1000)

665

666

# Use for model training

667

from sklearn.ensemble import RandomForestClassifier

668

model = RandomForestClassifier()

669

model.fit(X, y)

670

671

# Background data for explanations

672

X_background = shap.utils.sample(X, 100)

673

```

674

675

### Masking Strategy Selection

676

677

```python

678

# Tabular data with correlations

679

masker = shap.maskers.Partition(X_background, clustering="correlation")

680

681

# Text data

682

masker = shap.maskers.Text(mask_token="[MASK]", output_type="string")

683

684

# Image data

685

masker = shap.maskers.Image(mask_value="blur")

686

687

# Composite data (tabular + text)

688

masker = shap.maskers.Composite(

689

tabular=shap.maskers.Independent(X_tabular),

690

text=shap.maskers.Text()

691

)

692

```

693

694

### Model Wrapping and Standardization

695

696

```python

697

# Wrap PyTorch model for consistent interface

698

wrapped_model = shap.models.Model(pytorch_model)

699

700

# Use with any explainer

701

explainer = shap.KernelExplainer(wrapped_model, X_background)

702

shap_values = explainer(X_test)

703

```

704

705

### Error Handling

706

707

Common utility errors and their typical causes:

708

709

- **DataError**: Invalid data format or empty dataset

710

- **DimensionError**: Incompatible data dimensions between components

711

- **ImportError**: Missing optional dependencies for specific maskers/models

712

- **ValueError**: Invalid parameters for utility functions

713

- **ConvergenceError**: Action optimization failed to find solution (ActionOptimizer)