Tessl Tile for pypi/interpret@0.7.0

or run

npx @tessl/cli init

Version

Tile

Overview

Evals

Files

docs

blackbox.md data.md glassbox.md greybox.md index.md performance.md privacy.md utils.md visualization.md

docs/utils.md

0
# Utilities and Advanced Features
1

2
Utility functions and development tools that support machine learning interpretability workflows: data preprocessing, feature interaction analysis, synthetic data generation, feature selection, and link functions.
3

4
## Capabilities
5

6
### Data Preprocessing
7

8
Specialized preprocessing tools optimized for interpretable machine learning models.
9

10
```python { .api }
11
class EBMPreprocessor:
12
    def __init__(self, feature_names=None, feature_types=None, **kwargs):
13
        """
14
        EBM-optimized data preprocessor.
15
        
16
        Parameters:
17
            feature_names (list, optional): Names for features
18
            feature_types (list, optional): Types for features
19
            **kwargs: Additional preprocessing parameters
20
        """
21
    
22
    def fit(self, X, y=None):
23
        """Fit preprocessor to data."""
24
    
25
    def transform(self, X):
26
        """Transform data for EBM models."""
27
    
28
    def fit_transform(self, X, y=None):
29
        """Fit and transform data in one step."""
30
    
31
    def inverse_transform(self, X):
32
        """Inverse transform preprocessed data."""
33

34
def purify(X, y, feature_names=None, **kwargs):
35
    """
36
    Data purification and cleaning utilities.
37
    
38
    Parameters:
39
        X (array-like): Feature data
40
        y (array-like): Target data
41
        feature_names (list, optional): Names for features
42
        **kwargs: Purification options
43
        
44
    Returns:
45
        tuple: (X_purified, y_purified, metadata)
46
    """
47
```
48

49
### Feature Analysis
50

51
Tools for analyzing feature relationships and interactions in datasets.
52

53
```python { .api }
54
def measure_interactions(X, y, feature_names=None, n_jobs=-1, **kwargs):
55
    """
56
    Measure pairwise feature interactions in dataset.
57
    
58
    Parameters:
59
        X (array-like): Feature data
60
        y (array-like): Target data
61
        feature_names (list, optional): Names for features
62
        n_jobs (int): Number of parallel jobs
63
        **kwargs: Additional parameters
64
        
65
    Returns:
66
        dict: Interaction strengths between feature pairs
67
    """
68
```
69

70
### Synthetic Data Generation
71

72
Generate synthetic datasets for testing and validation of interpretability methods.
73

74
```python { .api }
75
def make_synthetic(
76
    n_samples=1000,
77
    n_features=10,
78
    n_informative=5,
79
    n_redundant=2,
80
    n_clusters_per_class=1,
81
    class_sep=1.0,
82
    noise=0.1,
83
    random_state=None,
84
    **kwargs
85
):
86
    """
87
    Generate synthetic dataset for interpretability testing.
88
    
89
    Parameters:
90
        n_samples (int): Number of samples
91
        n_features (int): Total number of features
92
        n_informative (int): Number of informative features
93
        n_redundant (int): Number of redundant features
94
        n_clusters_per_class (int): Clusters per class
95
        class_sep (float): Class separation factor
96
        noise (float): Noise level
97
        random_state (int, optional): Random seed
98
        **kwargs: Additional generation parameters
99
        
100
    Returns:
101
        tuple: (X, y, feature_names, true_coefficients)
102
    """
103
```
104

105
### Selection and Optimization
106

107
Advanced algorithms for feature selection and model optimization.
108

109
```python { .api }
110
class SPOT_GreedySubsetSelection:
111
    def __init__(self, k=10, **kwargs):
112
        """
113
        SPOT greedy subset selection algorithm.
114
        
115
        Parameters:
116
            k (int): Number of features to select
117
            **kwargs: Algorithm parameters
118
        """
119
    
120
    def fit(self, X, y):
121
        """Fit selection algorithm."""
122
    
123
    def transform(self, X):
124
        """Transform data using selected features."""
125
    
126
    def fit_transform(self, X, y):
127
        """Fit and transform in one step."""
128
    
129
    def get_selected_features(self):
130
        """Get indices of selected features."""
131
```
132

133
### Link Functions
134

135
Mathematical link functions for generalized linear models and probability transformations.
136

137
```python { .api }
138
def link_func(link):
139
    """
140
    Get link function by name.
141
    
142
    Parameters:
143
        link (str): Link function name ('identity', 'logit', 'log', etc.)
144
        
145
    Returns:
146
        callable: Link function
147
    """
148

149
def inv_link(link):
150
    """
151
    Get inverse link function by name.
152
    
153
    Parameters:
154
        link (str): Link function name
155
        
156
    Returns:
157
        callable: Inverse link function
158
    """
159
```
160

161
## Usage Examples
162

163
### Feature Interaction Analysis
164

165
```python
166
from interpret.utils import measure_interactions
167
from sklearn.datasets import load_breast_cancer
168
import numpy as np
169

170
# Load dataset
171
data = load_breast_cancer()
172
X, y = data.data, data.target
173

174
# Measure feature interactions
175
interactions = measure_interactions(
176
    X, y, 
177
    feature_names=data.feature_names,
178
    n_jobs=-1
179
)
180

181
# Display top interactions
182
sorted_interactions = sorted(interactions.items(), key=lambda x: x[1], reverse=True)
183
print("Top 10 Feature Interactions:")
184
for (feat1, feat2), strength in sorted_interactions[:10]:
185
    print(f"{feat1} <-> {feat2}: {strength:.4f}")
186
```
187

188
### EBM Preprocessing Pipeline
189

190
```python
191
from interpret.utils import EBMPreprocessor
192
from interpret.glassbox import ExplainableBoostingClassifier
193
from sklearn.model_selection import train_test_split
194

195
# Create preprocessing pipeline
196
preprocessor = EBMPreprocessor(
197
    feature_names=data.feature_names,
198
    feature_types=['continuous'] * len(data.feature_names)
199
)
200

201
# Split and preprocess data
202
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
203
X_train_processed = preprocessor.fit_transform(X_train, y_train)
204
X_test_processed = preprocessor.transform(X_test)
205

206
# Train EBM on processed data
207
ebm = ExplainableBoostingClassifier(
208
    feature_names=data.feature_names,
209
    random_state=42
210
)
211
ebm.fit(X_train_processed, y_train)
212

213
# Evaluate and explain
214
print(f"Accuracy: {ebm.score(X_test_processed, y_test):.4f}")
215
global_exp = ebm.explain_global()
216
show(global_exp)
217
```
218

219
### Synthetic Data for Testing
220

221
```python
222
from interpret.utils import make_synthetic
223
from interpret.glassbox import ExplainableBoostingClassifier
224
from interpret import show
225

226
# Generate synthetic dataset with known ground truth
227
X_synth, y_synth, feature_names, true_coefs = make_synthetic(
228
    n_samples=2000,
229
    n_features=15,
230
    n_informative=8,
231
    n_redundant=3,
232
    noise=0.05,
233
    random_state=42
234
)
235

236
print(f"Generated dataset: {X_synth.shape}")
237
print(f"True coefficients: {true_coefs[:5]}...")
238

239
# Train model on synthetic data
240
ebm_synth = ExplainableBoostingClassifier(
241
    feature_names=feature_names,
242
    random_state=42
243
)
244
ebm_synth.fit(X_synth, y_synth)
245

246
# Compare learned vs true importance
247
global_exp = ebm_synth.explain_global(name="Synthetic Data EBM")
248
show(global_exp)
249

250
# Validate that important features match ground truth
251
print("Ground truth vs learned importance correlation analysis...")
252
```
253

254
### Feature Selection with SPOT
255

256
```python
257
from interpret.utils import SPOT_GreedySubsetSelection
258
from sklearn.metrics import accuracy_score
259

260
# Feature selection with SPOT algorithm
261
selector = SPOT_GreedySubsetSelection(k=10)
262
X_train_selected = selector.fit_transform(X_train, y_train)
263
X_test_selected = selector.transform(X_test)
264

265
# Get selected features
266
selected_features = selector.get_selected_features()
267
selected_names = [data.feature_names[i] for i in selected_features]
268
print(f"Selected features: {selected_names}")
269

270
# Train model on selected features
271
ebm_selected = ExplainableBoostingClassifier(
272
    feature_names=selected_names,
273
    random_state=42
274
)
275
ebm_selected.fit(X_train_selected, y_train)
276

277
# Compare performance
278
full_acc = ebm.score(X_test_processed, y_test)
279
selected_acc = ebm_selected.score(X_test_selected, y_test)
280
print(f"Full features accuracy: {full_acc:.4f}")
281
print(f"Selected features accuracy: {selected_acc:.4f}")
282

283
# Show explanations for selected model
284
selected_exp = ebm_selected.explain_global(name="Selected Features EBM")
285
show(selected_exp)
286
```
287

288
### Data Purification
289

290
```python
291
from interpret.utils import purify
292
import pandas as pd
293

294
# Purify dataset (handle missing values, outliers, etc.)
295
X_purified, y_purified, metadata = purify(
296
    X, y,
297
    feature_names=data.feature_names,
298
    handle_missing=True,
299
    remove_outliers=True,
300
    outlier_method='iqr'
301
)
302

303
print(f"Original shape: {X.shape}")
304
print(f"Purified shape: {X_purified.shape}")
305
print(f"Purification metadata: {metadata}")
306

307
# Train model on purified data
308
ebm_purified = ExplainableBoostingClassifier(
309
    feature_names=data.feature_names,
310
    random_state=42
311
)
312
ebm_purified.fit(X_purified, y_purified)
313

314
purified_exp = ebm_purified.explain_global(name="Purified Data EBM")
315
show(purified_exp)
316
```
317

318
### Link Functions for GLMs
319

320
```python
321
from interpret.utils import link_func, inv_link
322
import numpy as np
323

324
# Get link functions
325
logit = link_func('logit')
326
inv_logit = inv_link('logit')
327

328
# Example transformations
329
probabilities = np.array([0.1, 0.5, 0.9])
330
logits = logit(probabilities)
331
recovered_probs = inv_logit(logits)
332

333
print(f"Original probabilities: {probabilities}")
334
print(f"Logits: {logits}")
335
print(f"Recovered probabilities: {recovered_probs}")
336

337
# Use with custom models
338
log_link = link_func('log')
339
inv_log = inv_link('log')
340

341
positive_values = np.array([1, 10, 100])
342
log_values = log_link(positive_values)
343
recovered_values = inv_log(log_values)
344

345
print(f"Original values: {positive_values}")
346
print(f"Log transformed: {log_values}")
347
print(f"Recovered values: {recovered_values}")
348
```

Version

Tile

Files

docs/utils.md

docs/utils.md