A Python package to assess and improve fairness of machine learning models
—
Preprocessing techniques that transform features to reduce correlation with sensitive attributes, addressing fairness at the data preparation stage. These methods modify the input data before model training to reduce potential for discriminatory outcomes.
Removes correlations between non-sensitive features and sensitive attributes using linear projection. This preprocessing technique helps ensure that the model cannot infer sensitive attributes from the remaining features.
import numpy as np


class CorrelationRemover:
    """Remove linear correlations between features and sensitive attributes.

    Each non-sensitive column is regressed onto the mean-centered sensitive
    columns by least squares; the fitted linear component is then subtracted,
    leaving residuals with no linear correlation to the sensitive features.

    Parameters
    ----------
    sensitive_feature_ids : list of int or str
        Column indices (array input) or column names (DataFrame input)
        of the sensitive features.
    alpha : float, default=1.0
        Strength of correlation removal: 0.0 keeps the original
        non-sensitive features unchanged, 1.0 removes all linear correlation.
    """

    def __init__(self, *, sensitive_feature_ids, alpha=1.0):
        self.sensitive_feature_ids = sensitive_feature_ids
        self.alpha = alpha

    def _resolve(self, X):
        """Return ``(X as float ndarray, positional indices of sensitive columns)``."""
        if hasattr(X, "columns"):  # DataFrame: map names (or ints) to positions
            cols = list(X.columns)
            sens_idx = [cols.index(f) if f in cols else int(f)
                        for f in self.sensitive_feature_ids]
        else:
            sens_idx = [int(f) for f in self.sensitive_feature_ids]
        return np.asarray(X, dtype=float), sens_idx

    def fit(self, X, y=None):
        """Learn the centering vector and projection matrix.

        Parameters
        ----------
        X : array-like or DataFrame
            Feature matrix including the sensitive features.
        y : array-like, optional
            Ignored; present for scikit-learn compatibility.

        Returns
        -------
        self
        """
        X_arr, sens_idx = self._resolve(X)
        self._sens_idx = sens_idx
        self._other_idx = [i for i in range(X_arr.shape[1]) if i not in sens_idx]
        X_sens = X_arr[:, self._sens_idx]
        X_other = X_arr[:, self._other_idx]
        self._mean = X_sens.mean(axis=0)
        # Least-squares coefficients predicting the non-sensitive columns
        # from the centered sensitive columns.
        self._beta, _, _, _ = np.linalg.lstsq(
            X_sens - self._mean, X_other, rcond=None
        )
        return self

    def transform(self, X):
        """Subtract the linear sensitive-feature component from the data.

        Parameters
        ----------
        X : array-like or DataFrame
            Feature matrix to transform (same column layout as in ``fit``).

        Returns
        -------
        numpy.ndarray
            The non-sensitive columns with reduced correlation; the
            sensitive columns themselves are dropped from the output.
        """
        X_arr, _ = self._resolve(X)
        X_sens = X_arr[:, self._sens_idx]
        X_other = X_arr[:, self._other_idx]
        X_filtered = X_other - (X_sens - self._mean) @ self._beta
        # alpha blends between the original (alpha=0) and the fully
        # decorrelated (alpha=1) non-sensitive features.
        return self.alpha * X_filtered + (1.0 - self.alpha) * X_other

    def fit_transform(self, X, y=None):
        """Fit and transform the data in one step; see ``fit`` and ``transform``."""
        return self.fit(X, y).transform(X)

    @property
    def mean_(self):
        """Mean values of the sensitive columns used for centering."""
        return self._mean

    @property
    def projection_matrix_(self):
        """Least-squares coefficient matrix mapping sensitive to other columns."""
        return self._beta


import pandas as pd
from fairlearn.preprocessing import CorrelationRemover
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

# Load data with the sensitive features included as ordinary columns.
data = pd.DataFrame({
    'feature1': [1, 2, 3, 4, 5],
    'feature2': [2, 4, 6, 8, 10],
    'sensitive_gender': [0, 1, 0, 1, 0],
    'sensitive_age': [25, 35, 45, 30, 40],
})
target = [0, 1, 0, 1, 1]

# Name the columns that should be decorrelated away.
cr = CorrelationRemover(
    sensitive_feature_ids=['sensitive_gender', 'sensitive_age'],
    alpha=1.0,  # full correlation removal
)

# Fit and transform the data in one step.
data_transformed = cr.fit_transform(data)

# The remaining features now have reduced linear correlation with the
# sensitive columns; continue with a normal ML pipeline.
X_train, X_test, y_train, y_test = train_test_split(
    data_transformed, target, test_size=0.3, random_state=42
)
model = LogisticRegression()
model.fit(X_train, y_train)
predictions = model.predict(X_test)
import numpy as np
from fairlearn.preprocessing import CorrelationRemover
# Data as numpy array where columns 2 and 3 are sensitive
X = np.array([
[1.0, 2.0, 0, 25], # features + gender + age
[2.0, 4.0, 1, 35],
[3.0, 6.0, 0, 45],
[4.0, 8.0, 1, 30]
])
# Use numeric indices for sensitive features
cr = CorrelationRemover(
sensitive_feature_ids=[2, 3], # Gender and age columns
alpha=0.8 # Partial correlation removal
)
X_transformed = cr.fit_transform(X)The CorrelationRemover works by:
The mathematical approach: the non-sensitive features are regressed onto the mean-centered sensitive features by least squares, and the fitted linear component is subtracted, leaving residuals that are linearly uncorrelated with the sensitive attributes.
The alpha parameter controls the trade-off between fairness and utility:
# Example of testing different alpha values
alphas = [0.0, 0.3, 0.6, 1.0]
results = {}
for alpha in alphas:
cr = CorrelationRemover(sensitive_feature_ids=[2, 3], alpha=alpha)
X_transformed = cr.fit_transform(X)
# Train model and evaluate fairness/accuracy
model = LogisticRegression()
model.fit(X_transformed, y)
# Store results for comparison
results[alpha] = evaluate_model(model, X_transformed, y)CorrelationRemover follows scikit-learn conventions and can be used in pipelines:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression

# Preprocessing pipeline: decorrelate, scale, then classify.
pipeline = Pipeline([
    ('correlation_removal', CorrelationRemover(sensitive_feature_ids=[2, 3])),
    ('scaling', StandardScaler()),
    ('classifier', LogisticRegression()),
])

# Fit the entire pipeline in one call.
pipeline.fit(X_train, y_train)
predictions = pipeline.predict(X_test)
# Recommended workflow
from sklearn.model_selection import GridSearchCV
from fairlearn.metrics import MetricFrame
# Grid search over alpha values
param_grid = {'correlation_removal__alpha': [0.0, 0.3, 0.6, 1.0]}
pipeline = Pipeline([
('correlation_removal', CorrelationRemover(sensitive_feature_ids=[2, 3])),
('classifier', LogisticRegression())
])
# Find best alpha balancing accuracy and fairness
grid_search = GridSearchCV(pipeline, param_grid, cv=5)
grid_search.fit(X_train, y_train)
# Evaluate fairness of best model
best_model = grid_search.best_estimator_
predictions = best_model.predict(X_test)
fairness_metrics = MetricFrame(
metrics={'accuracy': lambda y_true, y_pred: (y_true == y_pred).mean()},
y_true=y_test,
y_pred=predictions,
sensitive_features=sensitive_features_test
)Install with Tessl CLI
npx tessl i tessl/pypi-fairlearn