Python library with 44+ transformers for feature engineering and selection following scikit-learn API
—
Quality
Pending
Does it follow best practices?
Impact
Pending
No eval scenarios have been run
Transformers for converting categorical variables into numerical representations using various encoding methods including one-hot, ordinal, target-based, frequency-based, and weight of evidence encoders.
Replaces categorical variables by binary variables representing each category.
class OneHotEncoder:
def __init__(self, top_categories=None, drop_last=False, drop_last_binary=False,
variables=None, ignore_format=False):
"""
Initialize OneHotEncoder.
Parameters:
- top_categories (int): Number of most frequent categories to encode. If None, encodes all categories
- drop_last (bool): Whether to create k-1 dummy variables (drop last category to avoid multicollinearity)
- drop_last_binary (bool): Whether to return 1 dummy for binary variables instead of 2
- variables (list): List of categorical variables to encode. If None, selects all object variables
- ignore_format (bool): Whether to ignore variable format and accept numerical variables
"""
def fit(self, X, y=None):
"""
Learn unique categories per variable.
Parameters:
- X (pandas.DataFrame): Training dataset
- y (pandas.Series, optional): Target variable (not used)
Returns:
- self
"""
def transform(self, X):
"""
Replace categorical variables with binary dummy variables.
Parameters:
- X (pandas.DataFrame): Dataset to transform
Returns:
- pandas.DataFrame: Dataset with categorical variables replaced by dummy variables
"""
def fit_transform(self, X, y=None):
"""Fit to data, then transform it."""Usage Example:
from feature_engine.encoding import OneHotEncoder
import pandas as pd
# Sample categorical data
data = {'color': ['red', 'blue', 'green', 'red', 'blue'],
'size': ['S', 'M', 'L', 'M', 'S']}
df = pd.DataFrame(data)
# Basic one-hot encoding
encoder = OneHotEncoder()
df_encoded = encoder.fit_transform(df)
# Creates columns: color_blue, color_green, color_red, size_L, size_M, size_S
# Drop last category to avoid multicollinearity
encoder = OneHotEncoder(drop_last=True)
df_encoded = encoder.fit_transform(df)
# Creates columns: color_blue, color_green, size_L, size_M
# Encode only top N categories
encoder = OneHotEncoder(top_categories=2)
df_encoded = encoder.fit_transform(df)
# Access learned categories
print(encoder.encoder_dict_) # Shows categories for each variable

Replaces categories by ordinal numbers (0, 1, 2, 3, etc).
class OrdinalEncoder:
def __init__(self, encoding_method='ordered', variables=None, ignore_format=False, errors='ignore'):
"""
Initialize OrdinalEncoder.
Parameters:
- encoding_method (str): 'ordered' (requires target y) or 'arbitrary' (lexicographic order)
- variables (list): List of categorical variables to encode. If None, selects all object variables
- ignore_format (bool): Whether to ignore variable format and accept numerical variables
- errors (str): How to handle unseen categories - 'ignore' or 'raise'
"""
def fit(self, X, y=None):
"""
Learn integer mappings for categories.
Parameters:
- X (pandas.DataFrame): Training dataset
- y (pandas.Series): Target variable (required if encoding_method='ordered')
Returns:
- self
"""
def transform(self, X):
"""
Encode categories to ordinal numbers.
Parameters:
- X (pandas.DataFrame): Dataset to transform
Returns:
- pandas.DataFrame: Dataset with categories replaced by ordinal numbers
"""
def fit_transform(self, X, y=None):
"""Fit to data, then transform it."""
def inverse_transform(self, X):
"""
Encode numbers back to original categories.
Parameters:
- X (pandas.DataFrame): Dataset with encoded values
Returns:
- pandas.DataFrame: Dataset with original category labels
"""Usage Example:
from feature_engine.encoding import OrdinalEncoder
# Arbitrary encoding (alphabetical order)
encoder = OrdinalEncoder(encoding_method='arbitrary')
df_encoded = encoder.fit_transform(df)
# Categories encoded in lexicographic order: blue=0, green=1, red=2
# Ordered encoding based on target mean
encoder = OrdinalEncoder(encoding_method='ordered')
df_encoded = encoder.fit_transform(df, y)
# Categories ordered by target mean value
# Reverse the encoding
df_original = encoder.inverse_transform(df_encoded)

Replaces categories by the mean value of the target for each category.
class MeanEncoder:
def __init__(self, variables=None, ignore_format=False, errors='ignore'):
"""
Initialize MeanEncoder.
Parameters:
- variables (list): List of categorical variables to encode. If None, selects all object variables
- ignore_format (bool): Whether to ignore variable format and accept numerical variables
- errors (str): How to handle unseen categories - 'ignore' or 'raise'
"""
def fit(self, X, y):
"""
Learn target mean value per category per variable.
Parameters:
- X (pandas.DataFrame): Training dataset
- y (pandas.Series): Target variable (required)
Returns:
- self
"""
def transform(self, X):
"""
Encode categories to target mean values.
Parameters:
- X (pandas.DataFrame): Dataset to transform
Returns:
- pandas.DataFrame: Dataset with categories replaced by target means
"""
def fit_transform(self, X, y):
"""Fit to data, then transform it."""
def inverse_transform(self, X):
"""
Encode numbers back to original categories (approximate).
Parameters:
- X (pandas.DataFrame): Dataset with encoded values
Returns:
- pandas.DataFrame: Dataset with closest matching category labels
"""Usage Example:
from feature_engine.encoding import MeanEncoder
# Target encoding
encoder = MeanEncoder()
df_encoded = encoder.fit_transform(df, y)
# Each category replaced by mean target value for that category
# Access learned mappings
print(encoder.encoder_dict_) # Shows target mean per category per variable

Replaces categories by their count or frequency in the dataset.
class CountFrequencyEncoder:
def __init__(self, encoding_method='count', variables=None, ignore_format=False):
"""
Initialize CountFrequencyEncoder.
Parameters:
- encoding_method (str): 'count' (absolute count) or 'frequency' (relative frequency)
- variables (list): List of categorical variables to encode. If None, selects all object variables
- ignore_format (bool): Whether to ignore variable format and accept numerical variables
"""
def fit(self, X, y=None):
"""
Learn count or frequency for each category per variable.
Parameters:
- X (pandas.DataFrame): Training dataset
- y (pandas.Series, optional): Target variable (not used)
Returns:
- self
"""
def transform(self, X):
"""
Encode categories to counts or frequencies.
Parameters:
- X (pandas.DataFrame): Dataset to transform
Returns:
- pandas.DataFrame: Dataset with categories replaced by counts or frequencies
"""
def fit_transform(self, X, y=None):
"""Fit to data, then transform it."""Usage Example:
from feature_engine.encoding import CountFrequencyEncoder
# Count encoding
encoder = CountFrequencyEncoder(encoding_method='count')
df_encoded = encoder.fit_transform(df)
# Each category replaced by its count in training data
# Frequency encoding
encoder = CountFrequencyEncoder(encoding_method='frequency')
df_encoded = encoder.fit_transform(df)
# Each category replaced by its relative frequency (0-1)

Replaces categories with predictions of a decision tree trained to predict the target.
class DecisionTreeEncoder:
def __init__(self, variables=None, ignore_format=False, cv=3, scoring='accuracy',
param_grid=None, regression=False, random_state=None):
"""
Initialize DecisionTreeEncoder.
Parameters:
- variables (list): List of categorical variables to encode. If None, selects all object variables
- ignore_format (bool): Whether to ignore variable format and accept numerical variables
- cv (int): Cross-validation folds for hyperparameter tuning
- scoring (str): Scoring metric for model selection
- param_grid (dict): Parameter grid for decision tree hyperparameter tuning
- regression (bool): Whether target is continuous (True) or categorical (False)
- random_state (int): Random state for reproducibility
"""
def fit(self, X, y):
"""
Train decision trees per variable to predict target from categories.
Parameters:
- X (pandas.DataFrame): Training dataset
- y (pandas.Series): Target variable (required)
Returns:
- self
"""
def transform(self, X):
"""
Encode categories using decision tree predictions.
Parameters:
- X (pandas.DataFrame): Dataset to transform
Returns:
- pandas.DataFrame: Dataset with categories replaced by decision tree predictions
"""
def fit_transform(self, X, y):
"""Fit to data, then transform it."""Usage Example:
from feature_engine.encoding import DecisionTreeEncoder
from sklearn.ensemble import RandomForestClassifier
# Decision tree encoding for classification
encoder = DecisionTreeEncoder(cv=5, scoring='accuracy')
df_encoded = encoder.fit_transform(df, y)
# For regression tasks
encoder = DecisionTreeEncoder(
regression=True,
scoring='neg_mean_squared_error',
random_state=42
)
df_encoded = encoder.fit_transform(df, y_continuous)
# Access trained models
print(encoder.encoder_) # Shows trained decision trees per variable

Groups infrequent categories into a single category.
class RareLabelEncoder:
def __init__(self, tol=0.05, n_categories=10, max_n_categories=None,
variables=None, ignore_format=False):
"""
Initialize RareLabelEncoder.
Parameters:
- tol (float): Minimum frequency threshold (0-1) for category to be kept separate
- n_categories (int): Minimum number of distinct categories a variable must have for the rare-label grouping to be applied
- max_n_categories (int): Maximum number of most frequent categories to keep per variable; the rest are grouped as rare
- variables (list): List of categorical variables to encode. If None, selects all object variables
- ignore_format (bool): Whether to ignore variable format and accept numerical variables
"""
def fit(self, X, y=None):
"""
Identify frequent categories per variable.
Parameters:
- X (pandas.DataFrame): Training dataset
- y (pandas.Series, optional): Target variable (not used)
Returns:
- self
"""
def transform(self, X):
"""
Replace rare categories with 'Rare' label.
Parameters:
- X (pandas.DataFrame): Dataset to transform
Returns:
- pandas.DataFrame: Dataset with rare categories grouped as 'Rare'
"""
def fit_transform(self, X, y=None):
"""Fit to data, then transform it."""Usage Example:
from feature_engine.encoding import RareLabelEncoder
# Group categories appearing in less than 5% of observations
encoder = RareLabelEncoder(tol=0.05)
df_encoded = encoder.fit_transform(df)
# Keep at most the 3 most frequent categories, grouping the rest as 'Rare'
encoder = RareLabelEncoder(max_n_categories=3)
df_encoded = encoder.fit_transform(df)
# Access frequent categories
print(encoder.encoder_dict_) # Shows kept categories per variable

Replaces categories with Weight of Evidence (WoE) values for binary classification.
class WoEEncoder:
def __init__(self, variables=None, ignore_format=False, errors='ignore'):
"""
Initialize WoEEncoder.
Parameters:
- variables (list): List of categorical variables to encode. If None, selects all object variables
- ignore_format (bool): Whether to ignore variable format and accept numerical variables
- errors (str): How to handle unseen categories - 'ignore' or 'raise'
"""
def fit(self, X, y):
"""
Calculate Weight of Evidence for each category.
Parameters:
- X (pandas.DataFrame): Training dataset
- y (pandas.Series): Binary target variable (required)
Returns:
- self
"""
def transform(self, X):
"""
Encode categories to Weight of Evidence values.
Parameters:
- X (pandas.DataFrame): Dataset to transform
Returns:
- pandas.DataFrame: Dataset with categories replaced by WoE values
"""
def fit_transform(self, X, y):
"""Fit to data, then transform it."""Usage Example:
from feature_engine.encoding import WoEEncoder
# Weight of Evidence encoding for binary classification
encoder = WoEEncoder()
df_encoded = encoder.fit_transform(df, y_binary)
# Access learned WoE values
print(encoder.encoder_dict_) # Shows WoE values per category per variable

Replaces categories with probability ratios for binary classification.
class PRatioEncoder:
def __init__(self, variables=None, ignore_format=False, errors='ignore'):
"""
Initialize PRatioEncoder.
Parameters:
- variables (list): List of categorical variables to encode. If None, selects all object variables
- ignore_format (bool): Whether to ignore variable format and accept numerical variables
- errors (str): How to handle unseen categories - 'ignore' or 'raise'
"""
def fit(self, X, y):
"""
Calculate probability ratios for each category.
Parameters:
- X (pandas.DataFrame): Training dataset
- y (pandas.Series): Binary target variable (required)
Returns:
- self
"""
def transform(self, X):
"""
Encode categories to probability ratio values.
Parameters:
- X (pandas.DataFrame): Dataset to transform
Returns:
- pandas.DataFrame: Dataset with categories replaced by probability ratios
"""
def fit_transform(self, X, y):
"""Fit to data, then transform it."""Usage Example:
from feature_engine.encoding import PRatioEncoder
# Probability ratio encoding for binary classification
encoder = PRatioEncoder()
df_encoded = encoder.fit_transform(df, y_binary)
# Access learned probability ratios
print(encoder.encoder_dict_) # Shows probability ratios per category per variable

All encoding transformers share these fitted attributes:
- variables_ (list): Variables that will be transformed
- n_features_in_ (int): Number of features in training set
- encoder_dict_ (dict): Dictionary with category mappings per variable

Additional attributes for specific encoders:
- variables_binary_ (list): Binary variables identified in data (OneHotEncoder)
- encoder_ (dict): Trained models per variable (DecisionTreeEncoder)

Install with Tessl CLI
npx tessl i tessl/pypi-feature-engine