or run

npx @tessl/cli init
Log in

Version

Tile

Overview

Evals

Files

Files

docs

blackbox.md data.md glassbox.md greybox.md index.md performance.md privacy.md utils.md visualization.md

docs/utils.md

0

# Utilities and Advanced Features

1

2

Utility functions for data preprocessing, feature interaction analysis, synthetic data generation, and development tools to support machine learning interpretability workflows.

3

4

## Capabilities

5

6

### Data Preprocessing

7

8

Specialized preprocessing tools optimized for interpretable machine learning models.

9

10

```python { .api }

11

class EBMPreprocessor:

12

def __init__(self, feature_names=None, feature_types=None, **kwargs):

13

"""

14

EBM-optimized data preprocessor.

15

16

Parameters:

17

feature_names (list, optional): Names for features

18

feature_types (list, optional): Types for features

19

**kwargs: Additional preprocessing parameters

20

"""

21

22

def fit(self, X, y=None):

23

"""Fit preprocessor to data."""

24

25

def transform(self, X):

26

"""Transform data for EBM models."""

27

28

def fit_transform(self, X, y=None):

29

"""Fit and transform data in one step."""

30

31

def inverse_transform(self, X):

32

"""Inverse transform preprocessed data."""

33

34

def purify(X, y, feature_names=None, **kwargs):

35

"""

36

Data purification and cleaning utilities.

37

38

Parameters:

39

X (array-like): Feature data

40

y (array-like): Target data

41

feature_names (list, optional): Names for features

42

**kwargs: Purification options

43

44

Returns:

45

tuple: (X_purified, y_purified, metadata)

46

"""

47

```

48

49

### Feature Analysis

50

51

Tools for analyzing feature relationships and interactions in datasets.

52

53

```python { .api }

54

def measure_interactions(X, y, feature_names=None, n_jobs=-1, **kwargs):

55

"""

56

Measure pairwise feature interactions in dataset.

57

58

Parameters:

59

X (array-like): Feature data

60

y (array-like): Target data

61

feature_names (list, optional): Names for features

62

n_jobs (int): Number of parallel jobs

63

**kwargs: Additional parameters

64

65

Returns:

66

dict: Interaction strengths between feature pairs

67

"""

68

```

69

70

### Synthetic Data Generation

71

72

Generate synthetic datasets for testing and validation of interpretability methods.

73

74

```python { .api }

75

def make_synthetic(

76

n_samples=1000,

77

n_features=10,

78

n_informative=5,

79

n_redundant=2,

80

n_clusters_per_class=1,

81

class_sep=1.0,

82

noise=0.1,

83

random_state=None,

84

**kwargs

85

):

86

"""

87

Generate synthetic dataset for interpretability testing.

88

89

Parameters:

90

n_samples (int): Number of samples

91

n_features (int): Total number of features

92

n_informative (int): Number of informative features

93

n_redundant (int): Number of redundant features

94

n_clusters_per_class (int): Clusters per class

95

class_sep (float): Class separation factor

96

noise (float): Noise level

97

random_state (int, optional): Random seed

98

**kwargs: Additional generation parameters

99

100

Returns:

101

tuple: (X, y, feature_names, true_coefficients)

102

"""

103

```

104

105

### Selection and Optimization

106

107

Advanced algorithms for feature selection and model optimization.

108

109

```python { .api }

110

class SPOT_GreedySubsetSelection:

111

def __init__(self, k=10, **kwargs):

112

"""

113

SPOT greedy subset selection algorithm.

114

115

Parameters:

116

k (int): Number of features to select

117

**kwargs: Algorithm parameters

118

"""

119

120

def fit(self, X, y):

121

"""Fit selection algorithm."""

122

123

def transform(self, X):

124

"""Transform data using selected features."""

125

126

def fit_transform(self, X, y):

127

"""Fit and transform in one step."""

128

129

def get_selected_features(self):

130

"""Get indices of selected features."""

131

```

132

133

### Link Functions

134

135

Mathematical link functions for generalized linear models and probability transformations.

136

137

```python { .api }

138

def link_func(link):

139

"""

140

Get link function by name.

141

142

Parameters:

143

link (str): Link function name ('identity', 'logit', 'log', etc.)

144

145

Returns:

146

callable: Link function

147

"""

148

149

def inv_link(link):

150

"""

151

Get inverse link function by name.

152

153

Parameters:

154

link (str): Link function name

155

156

Returns:

157

callable: Inverse link function

158

"""

159

```

160

161

## Usage Examples

162

163

### Feature Interaction Analysis

164

165

```python

166

from interpret.utils import measure_interactions

167

from sklearn.datasets import load_breast_cancer

168

import numpy as np

169

170

# Load dataset

171

data = load_breast_cancer()

172

X, y = data.data, data.target

173

174

# Measure feature interactions

175

interactions = measure_interactions(

176

X, y,

177

feature_names=data.feature_names,

178

n_jobs=-1

179

)

180

181

# Display top interactions

182

sorted_interactions = sorted(interactions.items(), key=lambda x: x[1], reverse=True)

183

print("Top 10 Feature Interactions:")

184

for (feat1, feat2), strength in sorted_interactions[:10]:

185

print(f"{feat1} <-> {feat2}: {strength:.4f}")

186

```

187

188

### EBM Preprocessing Pipeline

189

190

```python

191

from interpret.utils import EBMPreprocessor

192

from interpret import show
from interpret.glassbox import ExplainableBoostingClassifier

193

from sklearn.model_selection import train_test_split

194

195

# Create preprocessing pipeline

196

preprocessor = EBMPreprocessor(

197

feature_names=data.feature_names,

198

feature_types=['continuous'] * len(data.feature_names)

199

)

200

201

# Split and preprocess data

202

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

203

X_train_processed = preprocessor.fit_transform(X_train, y_train)

204

X_test_processed = preprocessor.transform(X_test)

205

206

# Train EBM on processed data

207

ebm = ExplainableBoostingClassifier(

208

feature_names=data.feature_names,

209

random_state=42

210

)

211

ebm.fit(X_train_processed, y_train)

212

213

# Evaluate and explain

214

print(f"Accuracy: {ebm.score(X_test_processed, y_test):.4f}")

215

global_exp = ebm.explain_global()

216

show(global_exp)

217

```

218

219

### Synthetic Data for Testing

220

221

```python

222

from interpret.utils import make_synthetic

223

from interpret.glassbox import ExplainableBoostingClassifier

224

from interpret import show

225

226

# Generate synthetic dataset with known ground truth

227

X_synth, y_synth, feature_names, true_coefs = make_synthetic(

228

n_samples=2000,

229

n_features=15,

230

n_informative=8,

231

n_redundant=3,

232

noise=0.05,

233

random_state=42

234

)

235

236

print(f"Generated dataset: {X_synth.shape}")

237

print(f"True coefficients: {true_coefs[:5]}...")

238

239

# Train model on synthetic data

240

ebm_synth = ExplainableBoostingClassifier(

241

feature_names=feature_names,

242

random_state=42

243

)

244

ebm_synth.fit(X_synth, y_synth)

245

246

# Compare learned vs true importance

247

global_exp = ebm_synth.explain_global(name="Synthetic Data EBM")

248

show(global_exp)

249

250

# Validate that important features match ground truth

251

print("Ground truth vs learned importance correlation analysis...")

252

```

253

254

### Feature Selection with SPOT

255

256

```python

257

from interpret.utils import SPOT_GreedySubsetSelection

258

from sklearn.metrics import accuracy_score

259

260

# Feature selection with SPOT algorithm

261

selector = SPOT_GreedySubsetSelection(k=10)

262

X_train_selected = selector.fit_transform(X_train, y_train)

263

X_test_selected = selector.transform(X_test)

264

265

# Get selected features

266

selected_features = selector.get_selected_features()

267

selected_names = [data.feature_names[i] for i in selected_features]

268

print(f"Selected features: {selected_names}")

269

270

# Train model on selected features

271

ebm_selected = ExplainableBoostingClassifier(

272

feature_names=selected_names,

273

random_state=42

274

)

275

ebm_selected.fit(X_train_selected, y_train)

276

277

# Compare performance

278

full_acc = ebm.score(X_test_processed, y_test)

279

selected_acc = ebm_selected.score(X_test_selected, y_test)

280

print(f"Full features accuracy: {full_acc:.4f}")

281

print(f"Selected features accuracy: {selected_acc:.4f}")

282

283

# Show explanations for selected model

284

selected_exp = ebm_selected.explain_global(name="Selected Features EBM")

285

show(selected_exp)

286

```

287

288

### Data Purification

289

290

```python

291

from interpret.utils import purify

292

import pandas as pd

293

294

# Purify dataset (handle missing values, outliers, etc.)

295

X_purified, y_purified, metadata = purify(

296

X, y,

297

feature_names=data.feature_names,

298

handle_missing=True,

299

remove_outliers=True,

300

outlier_method='iqr'

301

)

302

303

print(f"Original shape: {X.shape}")

304

print(f"Purified shape: {X_purified.shape}")

305

print(f"Purification metadata: {metadata}")

306

307

# Train model on purified data

308

ebm_purified = ExplainableBoostingClassifier(

309

feature_names=data.feature_names,

310

random_state=42

311

)

312

ebm_purified.fit(X_purified, y_purified)

313

314

purified_exp = ebm_purified.explain_global(name="Purified Data EBM")

315

show(purified_exp)

316

```

317

318

### Link Functions for GLMs

319

320

```python

321

from interpret.utils import link_func, inv_link

322

import numpy as np

323

324

# Get link functions

325

logit = link_func('logit')

326

inv_logit = inv_link('logit')

327

328

# Example transformations

329

probabilities = np.array([0.1, 0.5, 0.9])

330

logits = logit(probabilities)

331

recovered_probs = inv_logit(logits)

332

333

print(f"Original probabilities: {probabilities}")

334

print(f"Logits: {logits}")

335

print(f"Recovered probabilities: {recovered_probs}")

336

337

# Use with custom models

338

log_link = link_func('log')

339

inv_log = inv_link('log')

340

341

positive_values = np.array([1, 10, 100])

342

log_values = log_link(positive_values)

343

recovered_values = inv_log(log_values)

344

345

print(f"Original values: {positive_values}")

346

print(f"Log transformed: {log_values}")

347

print(f"Recovered values: {recovered_values}")

348

```