Python library with 44+ transformers for feature engineering and selection following scikit-learn API
—
Quality
Pending
Does it follow best practices?
Impact
Pending
No eval scenarios have been run
Transformers for identifying and handling outliers using statistical methods including Winsorization, capping, and trimming techniques to improve data quality and model robustness.
Caps outliers to percentile values, replacing extreme values with less extreme percentile values.
class Winsorizer:
def __init__(self, capping_method='gaussian', tail='right', fold=3, variables=None, missing_values='raise'):
"""
Initialize Winsorizer.
Parameters:
- capping_method (str): Method to identify outliers - 'gaussian', 'iqr', 'mad', or 'quantiles'
- tail (str): Which tail to cap - 'right', 'left', or 'both'
- fold (int/float): Factor for outlier boundary calculation (used with gaussian, iqr, mad methods)
- variables (list): List of numerical variables to process. If None, selects all numerical variables
- missing_values (str): How to handle missing values - 'raise' or 'ignore'
"""
def fit(self, X, y=None):
"""
Learn outlier boundaries for each variable.
Parameters:
- X (pandas.DataFrame): Training dataset
- y (pandas.Series, optional): Target variable (not used)
Returns:
- self
"""
def transform(self, X):
"""
Cap outliers to learned boundaries.
Parameters:
- X (pandas.DataFrame): Dataset to transform
Returns:
- pandas.DataFrame: Dataset with outliers capped to boundary values
"""
def fit_transform(self, X, y=None):
"""Fit to data, then transform it."""
Usage Example:
from feature_engine.outliers import Winsorizer
import pandas as pd
import numpy as np
# Sample data with outliers
np.random.seed(42)
data = {
'var1': np.concatenate([np.random.normal(50, 10, 95), [120, 130, 140, 150, 160]]),
'var2': np.concatenate([np.random.normal(100, 20, 97), [200, 220, 250]])
}
df = pd.DataFrame(data)
# Gaussian method (mean ± 3*std)
winsorizer = Winsorizer(capping_method='gaussian', fold=3, tail='both')
df_capped = winsorizer.fit_transform(df)
# IQR method (Q1 - 1.5*IQR, Q3 + 1.5*IQR)
winsorizer = Winsorizer(capping_method='iqr', fold=1.5, tail='both')
df_capped = winsorizer.fit_transform(df)
# Only cap right tail (upper outliers)
winsorizer = Winsorizer(capping_method='gaussian', fold=2, tail='right')
df_capped = winsorizer.fit_transform(df)
# Access learned boundaries
print(winsorizer.right_tail_caps_) # Upper boundaries per variable
print(winsorizer.left_tail_caps_) # Lower boundaries per variable
Caps outliers to arbitrary values defined by the user.
class ArbitraryOutlierCapper:
def __init__(self, max_capping_dict=None, min_capping_dict=None, variables=None, missing_values='raise'):
"""
Initialize ArbitraryOutlierCapper.
Parameters:
- max_capping_dict (dict): Dictionary mapping variables to maximum allowed values
- min_capping_dict (dict): Dictionary mapping variables to minimum allowed values
- variables (list): List of numerical variables to process. If None, uses variables from capping dictionaries
- missing_values (str): How to handle missing values - 'raise' or 'ignore'
"""
def fit(self, X, y=None):
"""
Validate capping dictionaries and variables.
Parameters:
- X (pandas.DataFrame): Training dataset
- y (pandas.Series, optional): Target variable (not used)
Returns:
- self
"""
def transform(self, X):
"""
Cap outliers using user-defined boundaries.
Parameters:
- X (pandas.DataFrame): Dataset to transform
Returns:
- pandas.DataFrame: Dataset with outliers capped to specified values
"""
def fit_transform(self, X, y=None):
"""Fit to data, then transform it."""
Usage Example:
from feature_engine.outliers import ArbitraryOutlierCapper
# Define custom capping values
max_capping_dict = {'var1': 80, 'var2': 150}
min_capping_dict = {'var1': 20, 'var2': 50}
capper = ArbitraryOutlierCapper(
max_capping_dict=max_capping_dict,
min_capping_dict=min_capping_dict
)
df_capped = capper.fit_transform(df)
# Cap only maximum values
capper = ArbitraryOutlierCapper(max_capping_dict={'var1': 100})
df_capped = capper.fit_transform(df)
# Access capping dictionaries
print(capper.right_tail_caps_) # Maximum capping values
print(capper.left_tail_caps_) # Minimum capping values
Removes outlier observations from the dataset instead of capping them.
class OutlierTrimmer:
def __init__(self, capping_method='gaussian', tail='right', fold=3, variables=None, missing_values='raise'):
"""
Initialize OutlierTrimmer.
Parameters:
- capping_method (str): Method to identify outliers - 'gaussian', 'iqr', 'mad', or 'quantiles'
- tail (str): Which tail to consider for outlier detection - 'right', 'left', or 'both'
- fold (int/float): Factor for outlier boundary calculation
- variables (list): List of numerical variables to evaluate. If None, selects all numerical variables
- missing_values (str): How to handle missing values - 'raise' or 'ignore'
"""
def fit(self, X, y=None):
"""
Learn outlier boundaries for each variable.
Parameters:
- X (pandas.DataFrame): Training dataset
- y (pandas.Series, optional): Target variable (not used)
Returns:
- self
"""
def transform(self, X):
"""
Remove observations that are outliers in any of the specified variables.
Parameters:
- X (pandas.DataFrame): Dataset to transform
Returns:
- pandas.DataFrame: Dataset with outlier observations removed
"""
def fit_transform(self, X, y=None):
"""Fit to data, then transform it."""
def return_outliers(self, X):
"""
Return observations that would be removed as outliers.
Parameters:
- X (pandas.DataFrame): Dataset to evaluate
Returns:
- pandas.DataFrame: Outlier observations that would be removed
"""
Usage Example:
from feature_engine.outliers import OutlierTrimmer
# Remove outliers using IQR method
trimmer = OutlierTrimmer(capping_method='iqr', fold=1.5, tail='both')
df_trimmed = trimmer.fit_transform(df)
# Remove only upper outliers
trimmer = OutlierTrimmer(capping_method='gaussian', fold=3, tail='right')
df_trimmed = trimmer.fit_transform(df)
# See which observations would be removed
outliers = trimmer.return_outliers(df)
print(f"Number of outliers detected: {len(outliers)}")
# Access outlier boundaries
print(trimmer.right_tail_caps_) # Upper boundaries
print(trimmer.left_tail_caps_) # Lower boundaries
Gaussian method — based on mean and standard deviation: mean ± fold * std
# Example: 3-sigma rule
winsorizer = Winsorizer(capping_method='gaussian', fold=3)
# Outliers: values beyond mean ± 3*std
IQR method — based on the interquartile range: Q1 - fold * IQR and Q3 + fold * IQR
# Example: Standard IQR rule
winsorizer = Winsorizer(capping_method='iqr', fold=1.5)
# Outliers: values beyond Q1 - 1.5*IQR or Q3 + 1.5*IQR
MAD method — based on the median absolute deviation: median ± fold * MAD
# Example: MAD-based detection
winsorizer = Winsorizer(capping_method='mad', fold=3.5)
# More robust to outliers than the gaussian method
Quantiles method — based on specific percentiles:
# Example: 5th and 95th percentiles
winsorizer = Winsorizer(capping_method='quantiles', fold=0.05)
# Caps values below the 5th percentile and above the 95th percentile
import matplotlib.pyplot as plt
methods = {
'gaussian': Winsorizer(capping_method='gaussian', fold=3),
'iqr': Winsorizer(capping_method='iqr', fold=1.5),
'mad': Winsorizer(capping_method='mad', fold=3.5),
'quantiles': Winsorizer(capping_method='quantiles', fold=0.05)
}
results = {}
for name, method in methods.items():
method.fit(df)
results[name] = {
'lower': method.left_tail_caps_,
'upper': method.right_tail_caps_
}
# Compare boundaries for each method
for var in df.columns:
print(f"\n{var} boundaries:")
for method_name, boundaries in results.items():
lower = boundaries['lower'].get(var, float('nan'))  # nan (not a string) so the :.2f format below cannot fail
upper = boundaries['upper'].get(var, float('nan'))
print(f"  {method_name}: [{lower:.2f}, {upper:.2f}]")
from sklearn.pipeline import Pipeline
from feature_engine.imputation import MeanMedianImputer
from feature_engine.outliers import Winsorizer
from sklearn.preprocessing import StandardScaler
# Preprocessing pipeline with outlier handling
pipeline = Pipeline([
('imputer', MeanMedianImputer()),
('outlier_capper', Winsorizer(capping_method='iqr', fold=1.5)),
('scaler', StandardScaler())
])
df_processed = pipeline.fit_transform(df)
from feature_engine.outliers import OutlierTrimmer
# Analyze outlier patterns
trimmer = OutlierTrimmer(capping_method='iqr', tail='both')
trimmer.fit(df)
# Get outlier observations
outliers = trimmer.return_outliers(df)
# Analyze outlier characteristics
print("Outlier Statistics:")
print(outliers.describe())
# Count outliers per variable
outlier_counts = {}
for var in df.columns:
lower_bound = trimmer.left_tail_caps_.get(var, float('-inf'))
upper_bound = trimmer.right_tail_caps_.get(var, float('inf'))
outliers_count = ((df[var] < lower_bound) | (df[var] > upper_bound)).sum()
outlier_counts[var] = outliers_count
print("\nOutliers per variable:", outlier_counts)
# Compare gaussian (sensitive) vs MAD (robust) methods
gaussian_winsorizer = Winsorizer(capping_method='gaussian', fold=3)
mad_winsorizer = Winsorizer(capping_method='mad', fold=3.5)
# Fit both methods
gaussian_winsorizer.fit(df)
mad_winsorizer.fit(df)
# Compare how many observations would be capped
for var in df.columns:
# Gaussian boundaries
g_lower = gaussian_winsorizer.left_tail_caps_[var]
g_upper = gaussian_winsorizer.right_tail_caps_[var]
g_outliers = ((df[var] < g_lower) | (df[var] > g_upper)).sum()
# MAD boundaries
m_lower = mad_winsorizer.left_tail_caps_[var]
m_upper = mad_winsorizer.right_tail_caps_[var]
m_outliers = ((df[var] < m_lower) | (df[var] > m_upper)).sum()
print(f"{var}: Gaussian={g_outliers}, MAD={m_outliers} outliers")
All outlier transformers share these fitted attributes:
- variables_ (list): Variables that will be processed
- n_features_in_ (int): Number of features in the training set
- right_tail_caps_ (dict): Upper boundary values per variable
- left_tail_caps_ (dict): Lower boundary values per variable
The boundaries define the thresholds beyond which observations are considered outliers and will be capped (Winsorizer, ArbitraryOutlierCapper) or removed (OutlierTrimmer).
Install with Tessl CLI
npx tessl i tessl/pypi-feature-engine