Python library with 44+ transformers for feature engineering and selection following scikit-learn API
—
Quality
Pending
Does it follow best practices?
Impact
Pending
No eval scenarios have been run
Transformers for applying mathematical functions to numerical variables including logarithmic, power, reciprocal, Box-Cox, and Yeo-Johnson transformations to improve data distribution and model performance.
Applies natural logarithm or base 10 logarithm to numerical variables.
class LogTransformer:
def __init__(self, variables=None, base='e'):
"""
Initialize LogTransformer.
Parameters:
- variables (list): List of numerical variables to transform. If None, selects all numerical variables
- base (str): 'e' for natural logarithm or '10' for base 10 logarithm
"""
def fit(self, X, y=None):
"""
Validate that variables are positive (no parameters learned).
Parameters:
- X (pandas.DataFrame): Training dataset
- y (pandas.Series, optional): Target variable (not used)
Returns:
- self
"""
def transform(self, X):
"""
Apply logarithm transformation to variables.
Parameters:
- X (pandas.DataFrame): Dataset to transform
Returns:
- pandas.DataFrame: Dataset with log-transformed variables
"""
def fit_transform(self, X, y=None):
"""Fit to data, then transform it."""
def inverse_transform(self, X):
"""
Convert back to original representation using exponential.
Parameters:
- X (pandas.DataFrame): Dataset with log-transformed values
Returns:
- pandas.DataFrame: Dataset with original scale restored
"""

Usage Example:
from feature_engine.transformation import LogTransformer
import pandas as pd
import numpy as np
# Sample data with positive values
data = {'price': [100, 200, 500, 1000, 2000],
'volume': [10, 25, 50, 100, 200]}
df = pd.DataFrame(data)
# Natural log transformation
transformer = LogTransformer(base='e')
df_transformed = transformer.fit_transform(df)
# Base 10 log transformation
transformer = LogTransformer(base='10')
df_transformed = transformer.fit_transform(df)
# Inverse transformation
df_original = transformer.inverse_transform(df_transformed)

Applies log(x + C) transformation where C is a positive constant, useful for data with zeros or negative values.
class LogCpTransformer:
def __init__(self, variables=None, base='e', C='auto'):
"""
Initialize LogCpTransformer.
Parameters:
- variables (list): List of numerical variables to transform. If None, selects all numerical variables
- base (str): 'e' for natural logarithm or '10' for base 10 logarithm
- C (int/float/str/dict): Constant to add before log. 'auto' calculates optimal C
"""
def fit(self, X, y=None):
"""
Learn constant C if C='auto', otherwise validate input.
Parameters:
- X (pandas.DataFrame): Training dataset
- y (pandas.Series, optional): Target variable (not used)
Returns:
- self
"""
def transform(self, X):
"""
Apply log(x + C) transformation to variables.
Parameters:
- X (pandas.DataFrame): Dataset to transform
Returns:
- pandas.DataFrame: Dataset with log(x + C) transformed variables
"""
def fit_transform(self, X, y=None):
"""Fit to data, then transform it."""
def inverse_transform(self, X):
"""
Convert back to original representation using exp(x) - C.
Parameters:
- X (pandas.DataFrame): Dataset with log-transformed values
Returns:
- pandas.DataFrame: Dataset with original scale restored
"""

Usage Example:
from feature_engine.transformation import LogCpTransformer
# Auto-calculate C (makes minimum value positive)
transformer = LogCpTransformer(C='auto')
df_transformed = transformer.fit_transform(df)
# Specify constant C
transformer = LogCpTransformer(C=1)
df_transformed = transformer.fit_transform(df)
# Different C per variable
transformer = LogCpTransformer(C={'var1': 1, 'var2': 5})
df_transformed = transformer.fit_transform(df)
# Access learned C values
print(transformer.C_)  # Shows C value per variable

Applies Box-Cox transformation to numerical variables to achieve normality.
class BoxCoxTransformer:
def __init__(self, variables=None):
"""
Initialize BoxCoxTransformer.
Parameters:
- variables (list): List of numerical variables to transform. If None, selects all numerical variables
"""
def fit(self, X, y=None):
"""
Learn optimal lambda parameter for Box-Cox transformation per variable.
Parameters:
- X (pandas.DataFrame): Training dataset (must contain positive values)
- y (pandas.Series, optional): Target variable (not used)
Returns:
- self
"""
def transform(self, X):
"""
Apply Box-Cox transformation using learned lambda values.
Parameters:
- X (pandas.DataFrame): Dataset to transform
Returns:
- pandas.DataFrame: Dataset with Box-Cox transformed variables
"""
def fit_transform(self, X, y=None):
"""Fit to data, then transform it."""
def inverse_transform(self, X):
"""
Convert back to original representation using inverse Box-Cox.
Parameters:
- X (pandas.DataFrame): Dataset with Box-Cox transformed values
Returns:
- pandas.DataFrame: Dataset with original scale restored
"""

Usage Example:
from feature_engine.transformation import BoxCoxTransformer
# Box-Cox transformation (requires positive values)
transformer = BoxCoxTransformer()
df_transformed = transformer.fit_transform(df)
# Access learned lambda parameters
print(transformer.lambda_dict_) # Shows optimal lambda per variable
# Inverse transformation
df_original = transformer.inverse_transform(df_transformed)

Applies Yeo-Johnson transformation to numerical variables, which works with positive and negative values.
class YeoJohnsonTransformer:
def __init__(self, variables=None):
"""
Initialize YeoJohnsonTransformer.
Parameters:
- variables (list): List of numerical variables to transform. If None, selects all numerical variables
"""
def fit(self, X, y=None):
"""
Learn optimal lambda parameter for Yeo-Johnson transformation per variable.
Parameters:
- X (pandas.DataFrame): Training dataset
- y (pandas.Series, optional): Target variable (not used)
Returns:
- self
"""
def transform(self, X):
"""
Apply Yeo-Johnson transformation using learned lambda values.
Parameters:
- X (pandas.DataFrame): Dataset to transform
Returns:
- pandas.DataFrame: Dataset with Yeo-Johnson transformed variables
"""
def fit_transform(self, X, y=None):
"""Fit to data, then transform it."""
def inverse_transform(self, X):
"""
Convert back to original representation using inverse Yeo-Johnson.
Parameters:
- X (pandas.DataFrame): Dataset with Yeo-Johnson transformed values
Returns:
- pandas.DataFrame: Dataset with original scale restored
"""

Usage Example:
from feature_engine.transformation import YeoJohnsonTransformer
# Yeo-Johnson transformation (works with positive and negative values)
transformer = YeoJohnsonTransformer()
df_transformed = transformer.fit_transform(df)
# Access learned lambda parameters
print(transformer.lambda_dict_) # Shows optimal lambda per variable
# Inverse transformation
df_original = transformer.inverse_transform(df_transformed)

Applies power transformation (x^lambda) to numerical variables.
class PowerTransformer:
def __init__(self, variables=None, exp=2):
"""
Initialize PowerTransformer.
Parameters:
- variables (list): List of numerical variables to transform. If None, selects all numerical variables
- exp (int/float/list/dict): Exponent for power transformation
"""
def fit(self, X, y=None):
"""
Validate input data (no parameters learned).
Parameters:
- X (pandas.DataFrame): Training dataset
- y (pandas.Series, optional): Target variable (not used)
Returns:
- self
"""
def transform(self, X):
"""
Apply power transformation to variables.
Parameters:
- X (pandas.DataFrame): Dataset to transform
Returns:
- pandas.DataFrame: Dataset with power-transformed variables
"""
def fit_transform(self, X, y=None):
"""Fit to data, then transform it."""
def inverse_transform(self, X):
"""
Convert back to original representation using root transformation.
Parameters:
- X (pandas.DataFrame): Dataset with power-transformed values
Returns:
- pandas.DataFrame: Dataset with original scale restored
"""

Usage Example:
from feature_engine.transformation import PowerTransformer
# Square transformation (default)
transformer = PowerTransformer(exp=2)
df_transformed = transformer.fit_transform(df)
# Square root transformation
transformer = PowerTransformer(exp=0.5)
df_transformed = transformer.fit_transform(df)
# Different exponents per variable
transformer = PowerTransformer(exp={'var1': 2, 'var2': 3, 'var3': 0.5})
df_transformed = transformer.fit_transform(df)
# Inverse transformation
df_original = transformer.inverse_transform(df_transformed)

Applies reciprocal transformation (1/x) to numerical variables.
class ReciprocalTransformer:
def __init__(self, variables=None):
"""
Initialize ReciprocalTransformer.
Parameters:
- variables (list): List of numerical variables to transform. If None, selects all numerical variables
"""
def fit(self, X, y=None):
"""
Validate that variables don't contain zeros (no parameters learned).
Parameters:
- X (pandas.DataFrame): Training dataset
- y (pandas.Series, optional): Target variable (not used)
Returns:
- self
"""
def transform(self, X):
"""
Apply reciprocal transformation (1/x) to variables.
Parameters:
- X (pandas.DataFrame): Dataset to transform
Returns:
- pandas.DataFrame: Dataset with reciprocal-transformed variables
"""
def fit_transform(self, X, y=None):
"""Fit to data, then transform it."""
def inverse_transform(self, X):
"""
Convert back to original representation using reciprocal (1/x).
Parameters:
- X (pandas.DataFrame): Dataset with reciprocal-transformed values
Returns:
- pandas.DataFrame: Dataset with original scale restored
"""

Usage Example:
from feature_engine.transformation import ReciprocalTransformer
# Reciprocal transformation (1/x)
transformer = ReciprocalTransformer()
df_transformed = transformer.fit_transform(df)
# Inverse transformation (also 1/x)
df_original = transformer.inverse_transform(df_transformed)

import matplotlib.pyplot as plt
from scipy import stats
# Assess data distribution before transformation
def assess_normality(data, variable):
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 4))
# Histogram
ax1.hist(data[variable], bins=30)
ax1.set_title(f'{variable} Distribution')
# Q-Q plot
stats.probplot(data[variable], dist="norm", plot=ax2)
ax2.set_title(f'{variable} Q-Q Plot')
plt.tight_layout()
plt.show()
# Shapiro-Wilk test
stat, p_value = stats.shapiro(data[variable].dropna())
print(f"Shapiro-Wilk test p-value: {p_value}")
# Test different transformations
from feature_engine.transformation import LogTransformer, BoxCoxTransformer
transformers = {
'log': LogTransformer(),
'boxcox': BoxCoxTransformer()
}
for name, transformer in transformers.items():
try:
df_transformed = transformer.fit_transform(df)
print(f"{name} transformation successful")
except Exception as e:
print(f"{name} transformation failed: {e}")

from sklearn.pipeline import Pipeline
from feature_engine.imputation import MeanMedianImputer
from feature_engine.transformation import LogCpTransformer
from sklearn.preprocessing import StandardScaler
# Preprocessing pipeline with transformation
pipeline = Pipeline([
('imputer', MeanMedianImputer()),
('transformer', LogCpTransformer(C='auto')),
('scaler', StandardScaler())
])
df_processed = pipeline.fit_transform(df)

All transformation transformers share these fitted attributes:
- variables_ (list): Variables that will be transformed
- n_features_in_ (int): Number of features in the training set

Transformer-specific attributes:
- C_ (dict): Constant C values per variable (LogCpTransformer)
- lambda_dict_ (dict): Lambda parameters per variable (BoxCoxTransformer, YeoJohnsonTransformer)
- exp_ (dict): Exponent values per variable (PowerTransformer)

Install with Tessl CLI
npx tessl i tessl/pypi-feature-engine