0
# Utilities and Advanced Features
1
2
Utility functions for data preprocessing, feature interaction analysis, synthetic data generation, feature selection, and link functions, plus development tools that support machine learning interpretability workflows.
3
4
## Capabilities
5
6
### Data Preprocessing
7
8
Specialized preprocessing tools optimized for interpretable machine learning models.
9
10
```python { .api }
11
class EBMPreprocessor:
12
def __init__(self, feature_names=None, feature_types=None, **kwargs):
13
"""
14
EBM-optimized data preprocessor.
15
16
Parameters:
17
feature_names (list, optional): Names for features
18
feature_types (list, optional): Types for features
19
**kwargs: Additional preprocessing parameters
20
"""
21
22
def fit(self, X, y=None):
23
"""Fit preprocessor to data."""
24
25
def transform(self, X):
26
"""Transform data for EBM models."""
27
28
def fit_transform(self, X, y=None):
29
"""Fit and transform data in one step."""
30
31
def inverse_transform(self, X):
32
"""Inverse transform preprocessed data."""
33
34
def purify(X, y, feature_names=None, **kwargs):
35
"""
36
Data purification and cleaning utilities.
37
38
Parameters:
39
X (array-like): Feature data
40
y (array-like): Target data
41
feature_names (list, optional): Names for features
42
**kwargs: Purification options (e.g. handle_missing, remove_outliers, outlier_method)
43
44
Returns:
45
tuple: (X_purified, y_purified, metadata)
46
"""
47
```
48
49
### Feature Analysis
50
51
Tools for analyzing feature relationships and interactions in datasets.
52
53
```python { .api }
54
def measure_interactions(X, y, feature_names=None, n_jobs=-1, **kwargs):
55
"""
56
Measure pairwise feature interactions in dataset.
57
58
Parameters:
59
X (array-like): Feature data
60
y (array-like): Target data
61
feature_names (list, optional): Names for features
62
n_jobs (int): Number of parallel jobs
63
**kwargs: Additional parameters
64
65
Returns:
66
dict: Maps each (feature_a, feature_b) pair to its interaction strength
67
"""
68
```
69
70
### Synthetic Data Generation
71
72
Generate synthetic datasets for testing and validation of interpretability methods.
73
74
```python { .api }
75
def make_synthetic(
76
n_samples=1000,
77
n_features=10,
78
n_informative=5,
79
n_redundant=2,
80
n_clusters_per_class=1,
81
class_sep=1.0,
82
noise=0.1,
83
random_state=None,
84
**kwargs
85
):
86
"""
87
Generate synthetic dataset for interpretability testing.
88
89
Parameters:
90
n_samples (int): Number of samples
91
n_features (int): Total number of features
92
n_informative (int): Number of informative features
93
n_redundant (int): Number of redundant features
94
n_clusters_per_class (int): Clusters per class
95
class_sep (float): Class separation factor
96
noise (float): Noise level
97
random_state (int, optional): Random seed
98
**kwargs: Additional generation parameters
99
100
Returns:
101
tuple: (X, y, feature_names, true_coefficients)
102
"""
103
```
104
105
### Selection and Optimization
106
107
Advanced algorithms for feature selection and model optimization.
108
109
```python { .api }
110
class SPOT_GreedySubsetSelection:
111
def __init__(self, k=10, **kwargs):
112
"""
113
SPOT greedy subset selection algorithm.
114
115
Parameters:
116
k (int): Number of features to select
117
**kwargs: Algorithm parameters
118
"""
119
120
def fit(self, X, y):
121
"""Fit selection algorithm."""
122
123
def transform(self, X):
124
"""Transform data using selected features."""
125
126
def fit_transform(self, X, y):
127
"""Fit and transform in one step."""
128
129
def get_selected_features(self):
130
"""Get indices of selected features."""
131
```
132
133
### Link Functions
134
135
Mathematical link functions for generalized linear models and probability transformations.
136
137
```python { .api }
138
def link_func(link):
139
"""
140
Get link function by name.
141
142
Parameters:
143
link (str): Link function name ('identity', 'logit', 'log', etc.)
144
145
Returns:
146
callable: Link function
147
"""
148
149
def inv_link(link):
150
"""
151
Get inverse link function by name.
152
153
Parameters:
154
link (str): Link function name
155
156
Returns:
157
callable: Inverse link function
158
"""
159
```
160
161
## Usage Examples
162
163
### Feature Interaction Analysis
164
165
```python
166
from interpret.utils import measure_interactions
167
from sklearn.datasets import load_breast_cancer
168
import numpy as np
169
170
# Load dataset
171
data = load_breast_cancer()
172
X, y = data.data, data.target
173
174
# Measure feature interactions
175
interactions = measure_interactions(
176
X, y,
177
feature_names=data.feature_names,
178
n_jobs=-1
179
)
180
181
# Display top interactions
182
sorted_interactions = sorted(interactions.items(), key=lambda x: x[1], reverse=True)
183
print("Top 10 Feature Interactions:")
184
for (feat1, feat2), strength in sorted_interactions[:10]:
185
print(f"{feat1} <-> {feat2}: {strength:.4f}")
186
```
187
188
### EBM Preprocessing Pipeline
189
190
```python
191
from interpret.utils import EBMPreprocessor
192
from interpret.glassbox import ExplainableBoostingClassifier
193
from sklearn.model_selection import train_test_split
from interpret import show
194
195
# Create the preprocessor (reuses `data`, `X`, and `y` from the Feature Interaction Analysis example)
196
preprocessor = EBMPreprocessor(
197
feature_names=data.feature_names,
198
feature_types=['continuous'] * len(data.feature_names)
199
)
200
201
# Split and preprocess data
202
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
203
X_train_processed = preprocessor.fit_transform(X_train, y_train)
204
X_test_processed = preprocessor.transform(X_test)
205
206
# Train EBM on processed data
207
ebm = ExplainableBoostingClassifier(
208
feature_names=data.feature_names,
209
random_state=42
210
)
211
ebm.fit(X_train_processed, y_train)
212
213
# Evaluate and explain
214
print(f"Accuracy: {ebm.score(X_test_processed, y_test):.4f}")
215
global_exp = ebm.explain_global()
216
show(global_exp)
217
```
218
219
### Synthetic Data for Testing
220
221
```python
222
from interpret.utils import make_synthetic
223
from interpret.glassbox import ExplainableBoostingClassifier
224
from interpret import show
225
226
# Generate synthetic dataset with known ground truth
227
X_synth, y_synth, feature_names, true_coefs = make_synthetic(
228
n_samples=2000,
229
n_features=15,
230
n_informative=8,
231
n_redundant=3,
232
noise=0.05,
233
random_state=42
234
)
235
236
print(f"Generated dataset: {X_synth.shape}")
237
print(f"True coefficients: {true_coefs[:5]}...")
238
239
# Train model on synthetic data
240
ebm_synth = ExplainableBoostingClassifier(
241
feature_names=feature_names,
242
random_state=42
243
)
244
ebm_synth.fit(X_synth, y_synth)
245
246
# Compare learned vs true importance
247
global_exp = ebm_synth.explain_global(name="Synthetic Data EBM")
248
show(global_exp)
249
250
# Placeholder: compare the learned importances against `true_coefs` here
251
print("Ground truth vs learned importance correlation analysis...")
252
```
253
254
### Feature Selection with SPOT
255
256
```python
257
from interpret.utils import SPOT_GreedySubsetSelection
258
from sklearn.metrics import accuracy_score
259
260
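# Feature selection with the SPOT algorithm
# (reuses `data`, `X_train`, `X_test`, `y_train`, `y_test`, `ebm`, `X_test_processed`,
# `ExplainableBoostingClassifier`, and `show` from the earlier examples)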
261
selector = SPOT_GreedySubsetSelection(k=10)
262
X_train_selected = selector.fit_transform(X_train, y_train)
263
X_test_selected = selector.transform(X_test)
264
265
# Get selected features
266
selected_features = selector.get_selected_features()
267
selected_names = [data.feature_names[i] for i in selected_features]
268
print(f"Selected features: {selected_names}")
269
270
# Train model on selected features
271
ebm_selected = ExplainableBoostingClassifier(
272
feature_names=selected_names,
273
random_state=42
274
)
275
ebm_selected.fit(X_train_selected, y_train)
276
277
# Compare performance
278
full_acc = ebm.score(X_test_processed, y_test)
279
selected_acc = ebm_selected.score(X_test_selected, y_test)
280
print(f"Full features accuracy: {full_acc:.4f}")
281
print(f"Selected features accuracy: {selected_acc:.4f}")
282
283
# Show explanations for selected model
284
selected_exp = ebm_selected.explain_global(name="Selected Features EBM")
285
show(selected_exp)
286
```
287
288
### Data Purification
289
290
```python
291
from interpret.utils import purify
292
import pandas as pd
293
294
# Purify dataset (handle missing values, outliers, etc.)
# (reuses `data`, `X`, `y`, `ExplainableBoostingClassifier`, and `show` from the earlier examples)
295
X_purified, y_purified, metadata = purify(
296
X, y,
297
feature_names=data.feature_names,
298
handle_missing=True,
299
remove_outliers=True,
300
outlier_method='iqr'
301
)
302
303
print(f"Original shape: {X.shape}")
304
print(f"Purified shape: {X_purified.shape}")
305
print(f"Purification metadata: {metadata}")
306
307
# Train model on purified data
308
ebm_purified = ExplainableBoostingClassifier(
309
feature_names=data.feature_names,
310
random_state=42
311
)
312
ebm_purified.fit(X_purified, y_purified)
313
314
purified_exp = ebm_purified.explain_global(name="Purified Data EBM")
315
show(purified_exp)
316
```
317
318
### Link Functions for GLMs
319
320
```python
321
from interpret.utils import link_func, inv_link
322
import numpy as np
323
324
# Get link functions
325
logit = link_func('logit')
326
inv_logit = inv_link('logit')
327
328
# Example transformations
329
probabilities = np.array([0.1, 0.5, 0.9])
330
logits = logit(probabilities)
331
recovered_probs = inv_logit(logits)
332
333
print(f"Original probabilities: {probabilities}")
334
print(f"Logits: {logits}")
335
print(f"Recovered probabilities: {recovered_probs}")
336
337
# The same round trip works for other links, e.g. the log link on positive values
338
log_link = link_func('log')
339
inv_log = inv_link('log')
340
341
positive_values = np.array([1, 10, 100])
342
log_values = log_link(positive_values)
343
recovered_values = inv_log(log_values)
344
345
print(f"Original values: {positive_values}")
346
print(f"Log transformed: {log_values}")
347
print(f"Recovered values: {recovered_values}")
348
```