Fit interpretable models and explain blackbox machine learning with comprehensive interpretability tools.
Utility functions for data preprocessing, feature-interaction analysis, and synthetic data generation, plus development tools that support machine learning interpretability workflows.
Specialized preprocessing tools optimized for interpretable machine learning models.
class EBMPreprocessor:
    def __init__(self, feature_names=None, feature_types=None, **kwargs):
        """
        EBM-optimized data preprocessor.

        Parameters:
            feature_names (list, optional): Names for features
            feature_types (list, optional): Types for features
            **kwargs: Additional preprocessing parameters
        """

    def fit(self, X, y=None):
        """Fit the preprocessor to data."""

    def transform(self, X):
        """Transform data for EBM models."""

    def fit_transform(self, X, y=None):
        """Fit and transform data in one step."""

    def inverse_transform(self, X):
        """Inverse-transform preprocessed data."""
def purify(X, y, feature_names=None, **kwargs):
    """
    Data purification and cleaning utilities.

    Parameters:
        X (array-like): Feature data
        y (array-like): Target data
        feature_names (list, optional): Names for features
        **kwargs: Purification options

    Returns:
        tuple: (X_purified, y_purified, metadata)
    """

Tools for analyzing feature relationships and interactions in datasets.
def measure_interactions(X, y, feature_names=None, n_jobs=-1, **kwargs):
    """
    Measure pairwise feature interactions in a dataset.

    Parameters:
        X (array-like): Feature data
        y (array-like): Target data
        feature_names (list, optional): Names for features
        n_jobs (int): Number of parallel jobs
        **kwargs: Additional parameters

    Returns:
        dict: Interaction strengths between feature pairs
    """

Generate synthetic datasets for testing and validation of interpretability methods.
def make_synthetic(
    n_samples=1000,
    n_features=10,
    n_informative=5,
    n_redundant=2,
    n_clusters_per_class=1,
    class_sep=1.0,
    noise=0.1,
    random_state=None,
    **kwargs
):
    """
    Generate a synthetic dataset for interpretability testing.

    Parameters:
        n_samples (int): Number of samples
        n_features (int): Total number of features
        n_informative (int): Number of informative features
        n_redundant (int): Number of redundant features
        n_clusters_per_class (int): Clusters per class
        class_sep (float): Class separation factor
        noise (float): Noise level
        random_state (int, optional): Random seed
        **kwargs: Additional generation parameters

    Returns:
        tuple: (X, y, feature_names, true_coefficients)
    """

Advanced algorithms for feature selection and model optimization.
class SPOT_GreedySubsetSelection:
    def __init__(self, k=10, **kwargs):
        """
        SPOT greedy subset selection algorithm.

        Parameters:
            k (int): Number of features to select
            **kwargs: Algorithm parameters
        """

    def fit(self, X, y):
        """Fit the selection algorithm."""

    def transform(self, X):
        """Transform data using the selected features."""

    def fit_transform(self, X, y):
        """Fit and transform in one step."""

    def get_selected_features(self):
        """Get indices of selected features."""

Mathematical link functions for generalized linear models and probability transformations.
def link_func(link):
    """
    Get a link function by name.

    Parameters:
        link (str): Link function name ('identity', 'logit', 'log', etc.)

    Returns:
        callable: Link function
    """

def inv_link(link):
    """
    Get an inverse link function by name.

    Parameters:
        link (str): Link function name

    Returns:
        callable: Inverse link function
    """

Measuring feature interactions:

from interpret.utils import measure_interactions
from sklearn.datasets import load_breast_cancer
# Load dataset
data = load_breast_cancer()
X, y = data.data, data.target
# Measure feature interactions
interactions = measure_interactions(
X, y,
feature_names=data.feature_names,
n_jobs=-1
)
# Display top interactions
sorted_interactions = sorted(interactions.items(), key=lambda x: x[1], reverse=True)
print("Top 10 Feature Interactions:")
for (feat1, feat2), strength in sorted_interactions[:10]:
print(f"{feat1} <-> {feat2}: {strength:.4f}")from interpret.utils import EBMPreprocessor
Preprocessing data for an EBM:

from interpret.utils import EBMPreprocessor
from interpret.glassbox import ExplainableBoostingClassifier
from interpret import show
# Continues from the previous example (reuses X, y, and data)
from sklearn.model_selection import train_test_split
# Create preprocessing pipeline
preprocessor = EBMPreprocessor(
feature_names=data.feature_names,
feature_types=['continuous'] * len(data.feature_names)
)
# Split and preprocess data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train_processed = preprocessor.fit_transform(X_train, y_train)
X_test_processed = preprocessor.transform(X_test)
# Train EBM on processed data
ebm = ExplainableBoostingClassifier(
feature_names=data.feature_names,
random_state=42
)
ebm.fit(X_train_processed, y_train)
# Evaluate and explain
print(f"Accuracy: {ebm.score(X_test_processed, y_test):.4f}")
global_exp = ebm.explain_global()
show(global_exp)
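Beyond the global view, individual predictions can be inspected with explain_local, a short follow-up using the same fitted model:

# Explain the first five test predictions
local_exp = ebm.explain_local(X_test_processed[:5], y_test[:5])
show(local_exp)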
Generating synthetic data with known ground truth:

from interpret.utils import make_synthetic
from interpret.glassbox import ExplainableBoostingClassifier
from interpret import show
# Generate synthetic dataset with known ground truth
X_synth, y_synth, feature_names, true_coefs = make_synthetic(
n_samples=2000,
n_features=15,
n_informative=8,
n_redundant=3,
noise=0.05,
random_state=42
)
print(f"Generated dataset: {X_synth.shape}")
print(f"True coefficients: {true_coefs[:5]}...")
# Train model on synthetic data
ebm_synth = ExplainableBoostingClassifier(
feature_names=feature_names,
random_state=42
)
ebm_synth.fit(X_synth, y_synth)
# Compare learned vs true importance
global_exp = ebm_synth.explain_global(name="Synthetic Data EBM")
show(global_exp)
# Validate that important features match ground truth
Feature selection with SPOT:

from interpret.utils import SPOT_GreedySubsetSelection
from interpret.glassbox import ExplainableBoostingClassifier
# Reuses X_train, X_test, data, ebm, and X_test_processed from the preprocessing example above
# Feature selection with SPOT algorithm
selector = SPOT_GreedySubsetSelection(k=10)
X_train_selected = selector.fit_transform(X_train, y_train)
X_test_selected = selector.transform(X_test)
# Get selected features
selected_features = selector.get_selected_features()
selected_names = [data.feature_names[i] for i in selected_features]
print(f"Selected features: {selected_names}")
# Train model on selected features
ebm_selected = ExplainableBoostingClassifier(
feature_names=selected_names,
random_state=42
)
ebm_selected.fit(X_train_selected, y_train)
# Compare performance
full_acc = ebm.score(X_test_processed, y_test)
selected_acc = ebm_selected.score(X_test_selected, y_test)
print(f"Full features accuracy: {full_acc:.4f}")
print(f"Selected features accuracy: {selected_acc:.4f}")
# Show explanations for selected model
selected_exp = ebm_selected.explain_global(name="Selected Features EBM")
show(selected_exp)
Purifying a dataset:

from interpret.utils import purify
# Continues from the earlier examples (reuses X, y, data, and the interpret imports)
# Purify dataset (handle missing values, outliers, etc.)
X_purified, y_purified, metadata = purify(
X, y,
feature_names=data.feature_names,
handle_missing=True,
remove_outliers=True,
outlier_method='iqr'
)
print(f"Original shape: {X.shape}")
print(f"Purified shape: {X_purified.shape}")
print(f"Purification metadata: {metadata}")
# Train model on purified data
ebm_purified = ExplainableBoostingClassifier(
feature_names=data.feature_names,
random_state=42
)
ebm_purified.fit(X_purified, y_purified)
purified_exp = ebm_purified.explain_global(name="Purified Data EBM")
show(purified_exp)
Working with link functions:

from interpret.utils import link_func, inv_link
import numpy as np
# Get link functions
logit = link_func('logit')
inv_logit = inv_link('logit')
# Example transformations
probabilities = np.array([0.1, 0.5, 0.9])
logits = logit(probabilities)
recovered_probs = inv_logit(logits)
print(f"Original probabilities: {probabilities}")
print(f"Logits: {logits}")
print(f"Recovered probabilities: {recovered_probs}")
# Use with custom models
log_link = link_func('log')
inv_log = inv_link('log')
positive_values = np.array([1, 10, 100])
log_values = log_link(positive_values)
recovered_values = inv_log(log_values)
print(f"Original values: {positive_values}")
print(f"Log transformed: {log_values}")
print(f"Recovered values: {recovered_values}")Install with Tessl CLI
npx tessl i tessl/pypi-interpret