A Python package to assess and improve fairness of machine learning models.

Comprehensive tools for measuring fairness through disaggregated metrics across sensitive groups. The assessment module provides the MetricFrame class for computing metrics across subgroups and specialized fairness functions for measuring specific fairness criteria.
The central class for fairness assessment that computes metrics across subgroups defined by sensitive features. Provides disaggregated views of any metric function and supports comparison methods for fairness evaluation.
class MetricFrame:
    """Collection of disaggregated metric values.

    Central class for fairness assessment: computes metrics across
    subgroups defined by sensitive features and supports comparison
    methods (difference, ratio, group min/max) for fairness evaluation.
    """

    def __init__(self, *, metrics, y_true, y_pred,
                 sensitive_features, control_features=None,
                 sample_params=None, n_boot=None, ci_quantiles=None,
                 random_state=None):
        """Build the collection of disaggregated metric values.

        Parameters
        ----------
        metrics : callable or dict
            Metric function(s) to compute.
        y_true : array-like
            True target values.
        y_pred : array-like
            Predicted values.
        sensitive_features : array-like
            Sensitive feature values used for grouping.
        control_features : array-like, optional
            Control feature values.
        sample_params : dict, optional
            Extra parameters passed to the metric functions.
        n_boot : int, optional
            Number of bootstrap samples for confidence intervals.
        ci_quantiles : list[float], optional
            Quantiles for confidence intervals.
        random_state : int or RandomState, optional
            Controls bootstrap sample generation.
        """

    @property
    def overall(self):
        """Overall metrics computed on the entire dataset."""

    @property
    def by_group(self):
        """Metrics computed for each sensitive feature group."""

    def group_max(self):
        """Maximum metric value across groups."""

    def group_min(self):
        """Minimum metric value across groups."""

    def difference(self, method="between_groups"):
        """Difference between group metrics."""

    def ratio(self, method="between_groups"):
        """Ratio between group metrics."""

    @property
    def overall_ci(self):
        """Confidence intervals for overall metrics."""

    @property
    def by_group_ci(self):
        """Confidence intervals for group metrics."""

    def group_max_ci(self):
        """Confidence intervals for maximum metric values."""

    def group_min_ci(self):
        """Confidence intervals for minimum metric values."""

    def difference_ci(self, method="between_groups"):
        """Confidence intervals for differences between groups."""

    def ratio_ci(self, method="between_groups"):
        """Confidence intervals for ratios between groups."""
from fairlearn.metrics import MetricFrame
from sklearn.metrics import accuracy_score, precision_score

# Define multiple metrics
metrics = {
    'accuracy': accuracy_score,
    'precision': precision_score,
}

# Create MetricFrame
mf = MetricFrame(
    metrics=metrics,
    y_true=y_test,
    y_pred=y_pred,
    sensitive_features=sensitive_features,
)

# Access results
print(mf.overall)       # Overall metrics
print(mf.by_group)      # Metrics by group
print(mf.difference())  # Differences between groups
print(mf.ratio())       # Ratios between groups

# Functions for measuring demographic parity, which requires equal positive prediction rates across groups.
def demographic_parity_difference(y_true, y_pred, *, sensitive_features,
                                  method="between_groups", sample_weight=None):
    """Calculate the difference in selection rates between groups.

    Parameters
    ----------
    y_true : array-like
        True target values (ignored: selection rate depends only on predictions).
    y_pred : array-like
        Predicted values (binary).
    sensitive_features : array-like
        Sensitive feature values.
    method : str
        Comparison method ("between_groups" or "to_overall").
    sample_weight : array-like, optional
        Sample weights.

    Returns
    -------
    float
        Maximum difference in selection rates between any two groups.
    """
def demographic_parity_ratio(y_true, y_pred, *, sensitive_features,
                             method="between_groups", sample_weight=None):
    """Calculate the ratio of selection rates between groups.

    Parameters
    ----------
    y_true : array-like
        True target values (ignored: selection rate depends only on predictions).
    y_pred : array-like
        Predicted values (binary).
    sensitive_features : array-like
        Sensitive feature values.
    method : str
        Comparison method ("between_groups" or "to_overall").
    sample_weight : array-like, optional
        Sample weights.

    Returns
    -------
    float
        Minimum ratio of selection rates between any two groups.
    """


# Functions for measuring equalized odds, which requires equal true positive and false positive rates across groups.
def equalized_odds_difference(y_true, y_pred, *, sensitive_features,
                              method="between_groups", sample_weight=None,
                              agg="worst_case"):
    """Calculate the maximum difference in true positive and false positive rates.

    Parameters
    ----------
    y_true : array-like
        True target values (binary).
    y_pred : array-like
        Predicted values (binary).
    sensitive_features : array-like
        Sensitive feature values.
    method : str
        Comparison method ("between_groups" or "to_overall").
    sample_weight : array-like, optional
        Sample weights.
    agg : str
        Aggregation method ("worst_case" or "mean").

    Returns
    -------
    float
        Maximum difference in TPR and FPR between any two groups.
    """
def equalized_odds_ratio(y_true, y_pred, *, sensitive_features,
                         method="between_groups", sample_weight=None,
                         agg="worst_case"):
    """Calculate the minimum ratio in true positive and false positive rates.

    Parameters
    ----------
    y_true : array-like
        True target values (binary).
    y_pred : array-like
        Predicted values (binary).
    sensitive_features : array-like
        Sensitive feature values.
    method : str
        Comparison method ("between_groups" or "to_overall").
    sample_weight : array-like, optional
        Sample weights.
    agg : str
        Aggregation method ("worst_case" or "mean").

    Returns
    -------
    float
        Minimum ratio in TPR and FPR between any two groups.
    """


# Functions for measuring equal opportunity, which requires equal true positive rates across groups.
def equal_opportunity_difference(y_true, y_pred, *, sensitive_features,
                                 method="between_groups", sample_weight=None):
    """Calculate the difference in true positive rates between groups.

    Parameters
    ----------
    y_true : array-like
        True target values (binary).
    y_pred : array-like
        Predicted values (binary).
    sensitive_features : array-like
        Sensitive feature values.
    method : str
        Comparison method ("between_groups" or "to_overall").
    sample_weight : array-like, optional
        Sample weights.

    Returns
    -------
    float
        Maximum difference in TPR between any two groups.
    """
def equal_opportunity_ratio(y_true, y_pred, *, sensitive_features,
                            method="between_groups", sample_weight=None):
    """Calculate the ratio of true positive rates between groups.

    Parameters
    ----------
    y_true : array-like
        True target values (binary).
    y_pred : array-like
        Predicted values (binary).
    sensitive_features : array-like
        Sensitive feature values.
    method : str
        Comparison method ("between_groups" or "to_overall").
    sample_weight : array-like, optional
        Sample weights.

    Returns
    -------
    float
        Minimum ratio in TPR between any two groups.
    """


# Fundamental metric functions that can be used with MetricFrame or independently.
def true_positive_rate(y_true, y_pred, *, sample_weight=None, pos_label=1):
    """Calculate the true positive rate (sensitivity / recall).

    Parameters
    ----------
    y_true : array-like
        True target values.
    y_pred : array-like
        Predicted values.
    sample_weight : array-like, optional
        Sample weights.
    pos_label : scalar
        Label considered as positive.

    Returns
    -------
    float
        True positive rate.
    """


def false_positive_rate(y_true, y_pred, *, sample_weight=None, pos_label=1):
    """Calculate the false positive rate."""


def true_negative_rate(y_true, y_pred, *, sample_weight=None, pos_label=1):
    """Calculate the true negative rate (specificity)."""


def false_negative_rate(y_true, y_pred, *, sample_weight=None, pos_label=1):
    """Calculate the false negative rate."""


def selection_rate(y_true, y_pred, *, sample_weight=None, pos_label=1):
    """Calculate the selection rate (positive prediction rate)."""


def mean_prediction(y_true, y_pred, *, sample_weight=None):
    """Calculate the mean of the predictions."""


def count(y_true, y_pred, *, sample_weight=None):
    """Count the number of samples."""


# Create new fairness metrics from existing metric functions.
def make_derived_metric(*, metric, transform, sample_weight_names=None):
    """Create a derived metric with the specified aggregation method.

    Parameters
    ----------
    metric : callable
        Base metric function.
    transform : str
        Aggregation method ('difference', 'ratio', 'group_min', 'group_max').
    sample_weight_names : list, optional
        Parameter names for sample weights.

    Returns
    -------
    callable
        New derived metric function.
    """


# Plotting functions for visualizing model comparison across fairness metrics.
def plot_model_comparison(dashboard_predicted, *,
                          sensitive_features,
                          conf_intervals=False):
    """Plot a radar chart comparing models across fairness and performance metrics.

    Parameters
    ----------
    dashboard_predicted : dict
        Mapping of model names to prediction dictionaries.
    sensitive_features : array-like
        Sensitive feature values.
    conf_intervals : bool
        Whether to show confidence intervals.

    Returns
    -------
    matplotlib figure object
    """


# The metrics module dynamically generates additional fairness metrics for many base
# metrics using the pattern <metric>_{difference,ratio,group_min,group_max}. For example:
- accuracy_score_difference
- precision_score_ratio
- recall_score_group_min
- f1_score_group_max

These generated metrics provide convenient access to common fairness assessments without manually using make_derived_metric.
Install with Tessl CLI
npx tessl i tessl/pypi-fairlearn