# Privacy-Preserving ML

Differentially private machine learning models that provide formal privacy guarantees while maintaining interpretability for sensitive data applications.

## Capabilities

### Differentially Private EBM

Explainable Boosting Machine with formal differential privacy guarantees, suitable for sensitive datasets in healthcare, finance, and other privacy-critical domains.

```python { .api }
class DPExplainableBoostingClassifier:
    def __init__(
        self,
        epsilon=1.0,
        delta=None,
        feature_names=None,
        feature_types=None,
        max_bins=1024,
        interactions=0,
        validation_size=0.15,
        outer_bags=16,
        learning_rate=0.01,
        max_rounds=25000,
        early_stopping_rounds=50,
        random_state=None,
        n_jobs=-2,
        **kwargs
    ):
        """
        Differentially private EBM classifier.

        Parameters:
            epsilon (float): Privacy budget parameter
            delta (float, optional): Privacy parameter for approximate DP
            feature_names (list, optional): Names for features
            feature_types (list, optional): Types for features
            max_bins (int): Maximum bins for continuous features
            interactions (int): Number of feature interactions (limited for privacy)
            validation_size (float): Proportion for validation set
            outer_bags (int): Number of outer bags
            learning_rate (float): Learning rate
            max_rounds (int): Maximum boosting rounds
            early_stopping_rounds (int): Early stopping patience
            random_state (int, optional): Random seed
            n_jobs (int): Parallel jobs
            **kwargs: Additional EBM parameters
        """

    def fit(self, X, y, sample_weight=None):
        """Fit DP-EBM classifier with privacy guarantees."""

    def predict(self, X):
        """Make predictions."""

    def predict_proba(self, X):
        """Predict class probabilities."""

    def explain_global(self, name=None):
        """Get global explanation with privacy considerations."""

    def explain_local(self, X, y=None, name=None):
        """Get local explanations with privacy considerations."""

class DPExplainableBoostingRegressor:
    def __init__(
        self,
        epsilon=1.0,
        delta=None,
        feature_names=None,
        feature_types=None,
        max_bins=1024,
        interactions=0,
        validation_size=0.15,
        outer_bags=16,
        learning_rate=0.01,
        max_rounds=25000,
        early_stopping_rounds=50,
        random_state=None,
        n_jobs=-2,
        **kwargs
    ):
        """
        Differentially private EBM regressor.

        Parameters: Same as DPExplainableBoostingClassifier
        """

    def fit(self, X, y, sample_weight=None):
        """Fit DP-EBM regressor with privacy guarantees."""

    def predict(self, X):
        """Make predictions."""

    def explain_global(self, name=None):
        """Get global explanation with privacy considerations."""

    def explain_local(self, X, y=None, name=None):
        """Get local explanations with privacy considerations."""
```

## Usage Examples

### Basic DP-EBM Usage

```python
from interpret.privacy import DPExplainableBoostingClassifier
from interpret import show
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split

# Load sensitive dataset
data = load_breast_cancer()
X_train, X_test, y_train, y_test = train_test_split(
    data.data, data.target, test_size=0.2, random_state=42
)

# Train with differential privacy
dp_ebm = DPExplainableBoostingClassifier(
    epsilon=1.0,  # Privacy budget
    feature_names=data.feature_names,
    interactions=0,  # Disable interactions for stronger privacy
    random_state=42
)
dp_ebm.fit(X_train, y_train)

# Get explanations (privacy-preserving)
global_exp = dp_ebm.explain_global(name="DP-EBM Global")
show(global_exp)

local_exp = dp_ebm.explain_local(X_test[:5], name="DP-EBM Local")
show(local_exp)
```

### Privacy Budget Analysis

```python
import numpy as np
from sklearn.metrics import accuracy_score

# Compare different epsilon values
epsilons = [0.1, 0.5, 1.0, 2.0, 5.0]
results = []

for eps in epsilons:
    dp_model = DPExplainableBoostingClassifier(
        epsilon=eps,
        random_state=42,
        interactions=0
    )
    dp_model.fit(X_train, y_train)

    pred = dp_model.predict(X_test)
    acc = accuracy_score(y_test, pred)

    results.append({
        'epsilon': eps,
        'accuracy': acc,
        'privacy_strength': 'High' if eps < 1.0 else 'Medium' if eps < 5.0 else 'Low'
    })

    print(f"ε={eps}: Accuracy={acc:.4f}, Privacy={results[-1]['privacy_strength']}")

# Visualize trade-off
for result in results:
    model = DPExplainableBoostingClassifier(epsilon=result['epsilon'], random_state=42)
    model.fit(X_train, y_train)
    exp = model.explain_global(name=f"ε={result['epsilon']}")
    show(exp)
```

### Regression with Privacy

```python
from interpret.privacy import DPExplainableBoostingRegressor
from sklearn.datasets import load_diabetes
from sklearn.metrics import mean_squared_error

# Load regression dataset
diabetes = load_diabetes()
X_train, X_test, y_train, y_test = train_test_split(
    diabetes.data, diabetes.target, test_size=0.2, random_state=42
)

# Train DP regressor
dp_regressor = DPExplainableBoostingRegressor(
    epsilon=2.0,
    feature_names=diabetes.feature_names,
    random_state=42
)
dp_regressor.fit(X_train, y_train)

# Evaluate privacy-utility trade-off
pred = dp_regressor.predict(X_test)
mse = mean_squared_error(y_test, pred)
print(f"DP-EBM MSE: {mse:.2f}")

# Get explanations
global_exp = dp_regressor.explain_global(name="DP Regression Global")
show(global_exp)
```
201
202
### Privacy-Preserving Model Comparison
203
204
```python
205
from interpret.glassbox import ExplainableBoostingClassifier
206
207
# Compare standard EBM vs DP-EBM
208
models = {
209
'Standard EBM': ExplainableBoostingClassifier(random_state=42),
210
'DP-EBM (ε=1.0)': DPExplainableBoostingClassifier(epsilon=1.0, random_state=42),
211
'DP-EBM (ε=0.5)': DPExplainableBoostingClassifier(epsilon=0.5, random_state=42)
212
}
213
214
for name, model in models.items():
215
model.fit(X_train, y_train)
216
pred = model.predict(X_test)
217
acc = accuracy_score(y_test, pred)
218
219
print(f"{name}: Accuracy = {acc:.4f}")
220
221
# Show global explanations
222
global_exp = model.explain_global(name=f"{name} Global")
223
show(global_exp)
224
```
225
226
## Privacy Considerations
227
228
### Epsilon Selection Guidelines
229
230
- **ε < 1.0**: Strong privacy protection, may reduce model utility
231
- **ε = 1.0**: Standard choice balancing privacy and utility
232
- **ε > 1.0**: Weaker privacy protection, better model utility
233
- **ε > 10**: Minimal privacy protection
234
235
### Privacy-Utility Trade-offs
236
237
```python
238
# Analyze privacy-utility curve
239
privacy_results = []
240
241
for eps in np.logspace(-1, 1, 10): # 0.1 to 10
242
dp_model = DPExplainableBoostingClassifier(
243
epsilon=eps,
244
interactions=0, # Safer for privacy
245
random_state=42
246
)
247
dp_model.fit(X_train, y_train)
248
249
accuracy = accuracy_score(y_test, dp_model.predict(X_test))
250
privacy_results.append((eps, accuracy))
251
252
# Plot privacy-utility curve (conceptual)
253
for eps, acc in privacy_results:
254
print(f"ε={eps:.2f}: Accuracy={acc:.4f}")
255
```
256
257
### Best Practices
258
259
1. **Minimize interactions**: Set `interactions=0` for stronger privacy
260
2. **Validate epsilon choice**: Consider sensitivity of your data
261
3. **Use composition theorems**: Track cumulative privacy budget
262
4. **Validate explanations**: Ensure explanations don't leak private information
263
5. **Consider delta parameter**: Use for approximate DP when needed