Python library with 44+ transformers for feature engineering and selection following scikit-learn API
—
Quality
Pending
Does it follow best practices?
Impact
Pending
No eval scenarios have been run
Transformers for removing or selecting features based on various criteria including variance, correlation, performance metrics, and statistical tests to improve model performance and reduce dimensionality.
Drops a list of variables indicated by the user from the dataframe.
class DropFeatures:
def __init__(self, features_to_drop):
"""
Initialize DropFeatures.
Parameters:
- features_to_drop (list): Variable names to be dropped from dataframe
"""
def fit(self, X, y=None):
"""
Validate that features exist in dataset (no parameters learned).
Parameters:
- X (pandas.DataFrame): Training dataset
- y (pandas.Series, optional): Target variable (not used)
Returns:
- self
"""
def transform(self, X):
"""
Drop indicated features from dataset.
Parameters:
- X (pandas.DataFrame): Dataset to transform
Returns:
- pandas.DataFrame: Dataset with specified features removed
"""
def fit_transform(self, X, y=None):
"""Fit to data, then transform it."""Usage Example:
from feature_engine.selection import DropFeatures
import pandas as pd
# Sample data
data = {'var1': [1, 2, 3], 'var2': [4, 5, 6], 'var3': [7, 8, 9]}
df = pd.DataFrame(data)
# Drop specific features
selector = DropFeatures(['var1', 'var3'])
df_reduced = selector.fit_transform(df)
# Result: only var2 remains
print(selector.features_to_drop_) # Shows features that will be dropped
Removes constant and quasi-constant features that provide little information.
class DropConstantFeatures:
def __init__(self, variables=None, tol=1, missing_values='raise'):
"""
Initialize DropConstantFeatures.
Parameters:
- variables (list): List of variables to evaluate. If None, evaluates all variables
- tol (float): Threshold for quasi-constant detection (0-1). Variables with tol fraction of most frequent value are dropped
- missing_values (str): How to handle missing values - 'raise' or 'ignore'
"""
def fit(self, X, y=None):
"""
Identify constant and quasi-constant features.
Parameters:
- X (pandas.DataFrame): Training dataset
- y (pandas.Series, optional): Target variable (not used)
Returns:
- self
"""
def transform(self, X):
"""
Remove constant and quasi-constant features.
Parameters:
- X (pandas.DataFrame): Dataset to transform
Returns:
- pandas.DataFrame: Dataset with constant features removed
"""
def fit_transform(self, X, y=None):
"""Fit to data, then transform it."""Usage Example:
from feature_engine.selection import DropConstantFeatures
# Drop truly constant features (default)
selector = DropConstantFeatures()
df_reduced = selector.fit_transform(df)
# Drop quasi-constant features (>95% same value)
selector = DropConstantFeatures(tol=0.95)
df_reduced = selector.fit_transform(df)
print(selector.features_to_drop_) # Features identified as constant/quasi-constant
Removes duplicate features from dataframe based on identical values.
class DropDuplicateFeatures:
def __init__(self, variables=None, missing_values='raise'):
"""
Initialize DropDuplicateFeatures.
Parameters:
- variables (list): List of variables to evaluate. If None, evaluates all variables
- missing_values (str): How to handle missing values - 'raise' or 'ignore'
"""
def fit(self, X, y=None):
"""
Identify duplicate features.
Parameters:
- X (pandas.DataFrame): Training dataset
- y (pandas.Series, optional): Target variable (not used)
Returns:
- self
"""
def transform(self, X):
"""
Remove duplicate features, keeping first occurrence.
Parameters:
- X (pandas.DataFrame): Dataset to transform
Returns:
- pandas.DataFrame: Dataset with duplicate features removed
"""
def fit_transform(self, X, y=None):
"""Fit to data, then transform it."""Removes correlated features from dataframe to reduce multicollinearity.
class DropCorrelatedFeatures:
def __init__(self, variables=None, method='pearson', threshold=0.8, missing_values='raise'):
"""
Initialize DropCorrelatedFeatures.
Parameters:
- variables (list): List of numerical variables to evaluate. If None, selects all numerical variables
- method (str): Correlation method - 'pearson', 'spearman', or 'kendall'
- threshold (float): Correlation threshold (0-1) above which features are considered correlated
- missing_values (str): How to handle missing values - 'raise' or 'ignore'
"""
def fit(self, X, y=None):
"""
Identify correlated features to remove.
Parameters:
- X (pandas.DataFrame): Training dataset
- y (pandas.Series, optional): Target variable (not used)
Returns:
- self
"""
def transform(self, X):
"""
Remove correlated features.
Parameters:
- X (pandas.DataFrame): Dataset to transform
Returns:
- pandas.DataFrame: Dataset with correlated features removed
"""
def fit_transform(self, X, y=None):
"""Fit to data, then transform it."""Usage Example:
from feature_engine.selection import DropCorrelatedFeatures
# Drop features with Pearson correlation > 0.8
selector = DropCorrelatedFeatures(threshold=0.8, method='pearson')
df_reduced = selector.fit_transform(df)
# Use Spearman correlation
selector = DropCorrelatedFeatures(threshold=0.9, method='spearman')
df_reduced = selector.fit_transform(df)
print(selector.correlated_feature_sets_) # Shows groups of correlated features
print(selector.features_to_drop_) # Features selected for removal
Selects features from correlated groups based on performance with target variable.
class SmartCorrelatedSelection:
def __init__(self, variables=None, method='pearson', threshold=0.8,
selection_method='variance', estimator=None, scoring='accuracy', cv=3):
"""
Initialize SmartCorrelatedSelection.
Parameters:
- variables (list): List of numerical variables to evaluate. If None, selects all numerical variables
- method (str): Correlation method - 'pearson', 'spearman', or 'kendall'
- threshold (float): Correlation threshold (0-1) for grouping correlated features
- selection_method (str): Method to select from correlated groups - 'variance' or 'model_performance'
- estimator: Sklearn estimator for performance-based selection
- scoring (str): Scoring metric for model performance evaluation
- cv (int): Cross-validation folds
"""
def fit(self, X, y=None):
"""
Identify correlated groups and select best feature from each group.
Parameters:
- X (pandas.DataFrame): Training dataset
- y (pandas.Series): Target variable (required for model_performance selection)
Returns:
- self
"""
def transform(self, X):
"""
Keep only selected features from correlated groups.
Parameters:
- X (pandas.DataFrame): Dataset to transform
Returns:
- pandas.DataFrame: Dataset with smart feature selection applied
"""
def fit_transform(self, X, y=None):
"""Fit to data, then transform it."""Selects features based on individual performance metrics.
class SelectBySingleFeaturePerformance:
def __init__(self, estimator, scoring='accuracy', cv=3, threshold=0.5, variables=None):
"""
Initialize SelectBySingleFeaturePerformance.
Parameters:
- estimator: Sklearn estimator to evaluate feature performance
- scoring (str): Scoring metric for performance evaluation
- cv (int): Cross-validation folds
- threshold (float): Performance threshold for feature selection
- variables (list): List of variables to evaluate. If None, evaluates all variables
"""
def fit(self, X, y):
"""
Evaluate individual performance of each feature.
Parameters:
- X (pandas.DataFrame): Training dataset
- y (pandas.Series): Target variable (required)
Returns:
- self
"""
def transform(self, X):
"""
Select features that meet performance threshold.
Parameters:
- X (pandas.DataFrame): Dataset to transform
Returns:
- pandas.DataFrame: Dataset with only high-performing features
"""
def fit_transform(self, X, y):
"""Fit to data, then transform it."""Usage Example:
from feature_engine.selection import SelectBySingleFeaturePerformance
from sklearn.ensemble import RandomForestClassifier
# Select features based on individual performance
selector = SelectBySingleFeaturePerformance(
estimator=RandomForestClassifier(n_estimators=10),
scoring='accuracy',
cv=3,
threshold=0.6
)
df_selected = selector.fit_transform(df, y)
print(selector.feature_performance_) # Performance score per feature
print(selector.features_to_drop_) # Features below threshold
Selects features by recursively eliminating worst performing features.
class RecursiveFeatureElimination:
def __init__(self, estimator, scoring='accuracy', cv=3, threshold=0.01, variables=None):
"""
Initialize RecursiveFeatureElimination.
Parameters:
- estimator: Sklearn estimator with feature_importances_ or coef_ attribute
- scoring (str): Scoring metric for performance evaluation
- cv (int): Cross-validation folds
- threshold (float): Performance drop threshold for stopping elimination
- variables (list): List of variables to evaluate. If None, evaluates all variables
"""
def fit(self, X, y):
"""
Perform recursive feature elimination.
Parameters:
- X (pandas.DataFrame): Training dataset
- y (pandas.Series): Target variable (required)
Returns:
- self
"""
def transform(self, X):
"""
Select features identified by recursive elimination.
Parameters:
- X (pandas.DataFrame): Dataset to transform
Returns:
- pandas.DataFrame: Dataset with selected features only
"""
def fit_transform(self, X, y):
"""Fit to data, then transform it."""Selects features by recursively adding best performing features.
class RecursiveFeatureAddition:
def __init__(self, estimator, scoring='accuracy', cv=3, threshold=0.01, variables=None):
"""
Initialize RecursiveFeatureAddition.
Parameters:
- estimator: Sklearn estimator for performance evaluation
- scoring (str): Scoring metric for performance evaluation
- cv (int): Cross-validation folds
- threshold (float): Performance improvement threshold for stopping addition
- variables (list): List of variables to evaluate. If None, evaluates all variables
"""
def fit(self, X, y):
"""
Perform recursive feature addition.
Parameters:
- X (pandas.DataFrame): Training dataset
- y (pandas.Series): Target variable (required)
Returns:
- self
"""
def transform(self, X):
"""
Select features identified by recursive addition.
Parameters:
- X (pandas.DataFrame): Dataset to transform
Returns:
- pandas.DataFrame: Dataset with selected features only
"""
def fit_transform(self, X, y):
"""Fit to data, then transform it."""Selects features by evaluating performance drop after shuffling feature values.
class SelectByShuffling:
def __init__(self, estimator, scoring='accuracy', cv=3, threshold=0.01, variables=None):
"""
Initialize SelectByShuffling.
Parameters:
- estimator: Sklearn estimator for performance evaluation
- scoring (str): Scoring metric for performance evaluation
- cv (int): Cross-validation folds
- threshold (float): Performance drop threshold for feature importance
- variables (list): List of variables to evaluate. If None, evaluates all variables
"""
def fit(self, X, y):
"""
Evaluate feature importance by shuffling.
Parameters:
- X (pandas.DataFrame): Training dataset
- y (pandas.Series): Target variable (required)
Returns:
- self
"""
def transform(self, X):
"""
Select features that show significant performance drop when shuffled.
Parameters:
- X (pandas.DataFrame): Dataset to transform
Returns:
- pandas.DataFrame: Dataset with important features only
"""
def fit_transform(self, X, y):
"""Fit to data, then transform it."""Removes features with high Population Stability Index, indicating significant data drift.
class DropHighPSIFeatures:
def __init__(self, variables=None, split_frac=0.5, threshold=0.25,
missing_values='raise', switch=False):
"""
Initialize DropHighPSIFeatures.
Parameters:
- variables (list): List of variables to evaluate. If None, evaluates all variables
- split_frac (float): Fraction of data to use for reference vs comparison
- threshold (float): PSI threshold above which features are dropped
- missing_values (str): How to handle missing values - 'raise' or 'ignore'
- switch (bool): Whether to switch reference and comparison datasets
"""
def fit(self, X, y=None):
"""
Calculate PSI for each variable and identify features to drop.
Parameters:
- X (pandas.DataFrame): Training dataset
- y (pandas.Series, optional): Target variable (not used)
Returns:
- self
"""
def transform(self, X):
"""
Remove features with high PSI.
Parameters:
- X (pandas.DataFrame): Dataset to transform
Returns:
- pandas.DataFrame: Dataset with high PSI features removed
"""
def fit_transform(self, X, y=None):
"""Fit to data, then transform it."""Usage Example:
from feature_engine.selection import DropHighPSIFeatures
# Drop features with PSI > 0.25 indicating significant data drift
selector = DropHighPSIFeatures(threshold=0.25, split_frac=0.6)
df_stable = selector.fit_transform(df)
print(selector.features_to_drop_) # Features with high PSI
print(selector.psi_values_) # PSI values per feature
Selects features based on target mean performance for univariate analysis.
class SelectByTargetMeanPerformance:
def __init__(self, variables=None, scoring='roc_auc', threshold=0.5, bins=5):
"""
Initialize SelectByTargetMeanPerformance.
Parameters:
- variables (list): List of variables to evaluate. If None, evaluates all numerical variables
- scoring (str): Performance metric to use for feature evaluation
- threshold (float): Performance threshold for feature selection
- bins (int): Number of bins for discretizing continuous variables
"""
def fit(self, X, y):
"""
Evaluate target mean performance for each variable.
Parameters:
- X (pandas.DataFrame): Training dataset
- y (pandas.Series): Target variable (required)
Returns:
- self
"""
def transform(self, X):
"""
Select features that meet target mean performance threshold.
Parameters:
- X (pandas.DataFrame): Dataset to transform
Returns:
- pandas.DataFrame: Dataset with selected features only
"""
def fit_transform(self, X, y):
"""Fit to data, then transform it."""Usage Example:
from feature_engine.selection import SelectByTargetMeanPerformance
# Select features based on target mean performance
selector = SelectByTargetMeanPerformance(
scoring='roc_auc',
threshold=0.6,
bins=5
)
df_selected = selector.fit_transform(df, y)
print(selector.feature_performance_) # Performance scores per feature
print(selector.features_to_drop_) # Features below threshold
from sklearn.pipeline import Pipeline
from feature_engine.selection import (
DropConstantFeatures,
DropCorrelatedFeatures,
SelectBySingleFeaturePerformance
)
from sklearn.ensemble import RandomForestClassifier
# Multi-step feature selection pipeline
selection_pipeline = Pipeline([
('drop_constant', DropConstantFeatures(tol=0.99)),
('drop_correlated', DropCorrelatedFeatures(threshold=0.95)),
('performance_selection', SelectBySingleFeaturePerformance(
estimator=RandomForestClassifier(n_estimators=10),
threshold=0.6
))
])
df_selected = selection_pipeline.fit_transform(df, y)
from sklearn.model_selection import cross_val_score
from feature_engine.selection import RecursiveFeatureElimination
# Feature selection with proper evaluation
selector = RecursiveFeatureElimination(
estimator=RandomForestClassifier(),
cv=5,
threshold=0.01
)
# Fit selector
selector.fit(X_train, y_train)
# Transform datasets
X_train_selected = selector.transform(X_train)
X_test_selected = selector.transform(X_test)
# Evaluate selected features
scores = cross_val_score(
RandomForestClassifier(),
X_train_selected,
y_train,
cv=5
)
print(f"CV Score with selected features: {scores.mean():.3f}")All selection transformers share these fitted attributes:
features_to_drop_ (list): Features identified for removal
n_features_in_ (int): Number of features in training set
Selector-specific attributes:
correlated_feature_sets_ (list): Groups of correlated features (correlation-based selectors)
feature_performance_ (dict): Performance scores per feature (performance-based selectors)
performance_drifts_ (dict): Performance changes during selection process (recursive selectors)
Install with Tessl CLI
npx tessl i tessl/pypi-feature-engine