Python library with 44+ transformers for feature engineering and selection following scikit-learn API
—
Quality
Pending
Does it follow best practices?
Impact
Pending
No eval scenarios have been run
Transformers for converting continuous variables into discrete intervals using equal width, equal frequency, decision tree-based, or user-defined boundaries.
Sorts continuous variables into intervals of equal width.
class EqualWidthDiscretiser:
def __init__(self, bins=5, variables=None, return_object=False, return_boundaries=False):
"""
Initialize EqualWidthDiscretiser.
Parameters:
- bins (int): Number of equal-width intervals to create
- variables (list): List of numerical variables to discretise. If None, selects all numerical variables
- return_object (bool): Whether to return discretised variables as object type
- return_boundaries (bool): Whether to return interval boundaries as part of labels
"""
def fit(self, X, y=None):
"""
Learn interval boundaries for each variable.
Parameters:
- X (pandas.DataFrame): Training dataset
- y (pandas.Series, optional): Target variable (not used)
Returns:
- self
"""
def transform(self, X):
"""
Discretise continuous variables into equal width intervals.
Parameters:
- X (pandas.DataFrame): Dataset to transform
Returns:
- pandas.DataFrame: Dataset with continuous variables replaced by interval labels
"""
def fit_transform(self, X, y=None):
"""Fit to data, then transform it."""

Usage Example:
from feature_engine.discretisation import EqualWidthDiscretiser
import pandas as pd
import numpy as np
# Sample continuous data
data = {'age': np.random.normal(35, 10, 1000),
'income': np.random.normal(50000, 15000, 1000)}
df = pd.DataFrame(data)
# Create 5 equal width intervals
discretiser = EqualWidthDiscretiser(q=5)
df_discretised = discretiser.fit_transform(df)
# Creates intervals like: (18.5, 25.2], (25.2, 31.9], etc.
# Return with boundaries in labels
discretiser = EqualWidthDiscretiser(bins=3, return_boundaries=True)
df_discretised = discretiser.fit_transform(df)
# Access learned boundaries
print(discretiser.binner_dict_)  # Shows interval boundaries per variable

Sorts continuous variables into intervals of equal frequency (quantiles).
class EqualFrequencyDiscretiser:
def __init__(self, q=5, variables=None, return_object=False, return_boundaries=False):
"""
Initialize EqualFrequencyDiscretiser.
Parameters:
- q (int): Number of intervals to create (quantiles)
- variables (list): List of numerical variables to discretise. If None, selects all numerical variables
- return_object (bool): Whether to return discretised variables as object type
- return_boundaries (bool): Whether to return interval boundaries as part of labels
"""
def fit(self, X, y=None):
"""
Learn quantile boundaries for each variable.
Parameters:
- X (pandas.DataFrame): Training dataset
- y (pandas.Series, optional): Target variable (not used)
Returns:
- self
"""
def transform(self, X):
"""
Discretise continuous variables into equal frequency intervals.
Parameters:
- X (pandas.DataFrame): Dataset to transform
Returns:
- pandas.DataFrame: Dataset with continuous variables replaced by interval labels
"""
def fit_transform(self, X, y=None):
"""Fit to data, then transform it."""

Usage Example:
from feature_engine.discretisation import EqualFrequencyDiscretiser
# Create 5 quantile-based intervals
discretiser = EqualFrequencyDiscretiser(q=5)
df_discretised = discretiser.fit_transform(df)
# Each interval contains approximately 20% of the data
# Create quartiles (4 intervals)
discretiser = EqualFrequencyDiscretiser(q=4)
df_discretised = discretiser.fit_transform(df)
# Creates Q1, Q2, Q3, Q4 intervals

Sorts continuous variables into intervals defined by user-specified boundaries.
class ArbitraryDiscretiser:
def __init__(self, binning_dict, return_object=False, return_boundaries=False):
"""
Initialize ArbitraryDiscretiser.
Parameters:
- binning_dict (dict): Dictionary mapping variables to lists of cut points
- return_object (bool): Whether to return discretised variables as object type
- return_boundaries (bool): Whether to return interval boundaries as part of labels
"""
def fit(self, X, y=None):
"""
Validate binning dictionary and variables.
Parameters:
- X (pandas.DataFrame): Training dataset
- y (pandas.Series, optional): Target variable (not used)
Returns:
- self
"""
def transform(self, X):
"""
Discretise continuous variables using user-defined boundaries.
Parameters:
- X (pandas.DataFrame): Dataset to transform
Returns:
- pandas.DataFrame: Dataset with continuous variables replaced by interval labels
"""
def fit_transform(self, X, y=None):
"""Fit to data, then transform it."""

Usage Example:
from feature_engine.discretisation import ArbitraryDiscretiser
# Define custom intervals for each variable
binning_dict = {
'age': [18, 30, 45, 60, 100],
'income': [0, 25000, 50000, 75000, 100000, float('inf')]
}
discretiser = ArbitraryDiscretiser(binning_dict=binning_dict)
df_discretised = discretiser.fit_transform(df)
# Creates intervals: (18,30], (30,45], (45,60], (60,100] for age
# Creates intervals: (0,25000], (25000,50000], etc. for income
# Return as object type with boundaries
discretiser = ArbitraryDiscretiser(
binning_dict=binning_dict,
return_object=True,
return_boundaries=True
)
df_discretised = discretiser.fit_transform(df)

Uses a decision tree to find optimal cut points for discretisation based on the target variable.
class DecisionTreeDiscretiser:
def __init__(self, variables=None, cv=3, scoring='accuracy', param_grid=None,
regression=False, random_state=None, return_object=False,
return_boundaries=False):
"""
Initialize DecisionTreeDiscretiser.
Parameters:
- variables (list): List of numerical variables to discretise. If None, selects all numerical variables
- cv (int): Cross-validation folds for hyperparameter tuning
- scoring (str): Scoring metric for model selection
- param_grid (dict): Parameter grid for decision tree hyperparameter tuning
- regression (bool): Whether target is continuous (True) or categorical (False)
- random_state (int): Random state for reproducibility
- return_object (bool): Whether to return discretised variables as object type
- return_boundaries (bool): Whether to return interval boundaries as part of labels
"""
def fit(self, X, y):
"""
Train decision trees to find optimal cut points per variable.
Parameters:
- X (pandas.DataFrame): Training dataset
- y (pandas.Series): Target variable (required)
Returns:
- self
"""
def transform(self, X):
"""
Discretise variables using decision tree-derived cut points.
Parameters:
- X (pandas.DataFrame): Dataset to transform
Returns:
- pandas.DataFrame: Dataset with continuous variables replaced by interval labels
"""
def fit_transform(self, X, y):
"""Fit to data, then transform it."""

Usage Example:
from feature_engine.discretisation import DecisionTreeDiscretiser
# Automatic discretisation based on target
discretiser = DecisionTreeDiscretiser(cv=5, scoring='accuracy')
df_discretised = discretiser.fit_transform(df, y)
# Finds optimal cut points that best separate target classes
# For regression tasks
discretiser = DecisionTreeDiscretiser(
regression=True,
scoring='neg_mean_squared_error'
)
df_discretised = discretiser.fit_transform(df, y_continuous)
# Access learned boundaries
print(discretiser.binner_dict_) # Shows tree-derived cut points per variable
print(discretiser.scores_dict_)  # Shows cross-validation scores

Usage in a scikit-learn pipeline:

from sklearn.pipeline import Pipeline
from feature_engine.imputation import MeanMedianImputer
from feature_engine.discretisation import EqualFrequencyDiscretiser
from feature_engine.encoding import OneHotEncoder
# Pipeline for preprocessing continuous variables
pipeline = Pipeline([
('imputer', MeanMedianImputer()),
('discretiser', EqualFrequencyDiscretiser(q=5)),
('encoder', OneHotEncoder()) # Convert intervals to dummy variables
])
df_processed = pipeline.fit_transform(df)

Discretising only selected variables:

from feature_engine.discretisation import EqualWidthDiscretiser
# Specify only numerical variables to discretise
discretiser = EqualWidthDiscretiser(
bins=4,
variables=['age', 'income', 'score'] # Only these will be discretised
)
df_mixed = discretiser.fit_transform(df_with_mixed_types)
# Categorical variables remain unchanged

All discretisation transformers share these fitted attributes:

- variables_ (list): Variables that will be transformed
- n_features_in_ (int): Number of features in the training set
- binner_dict_ (dict): Dictionary with interval boundaries per variable

Additional attributes for specific discretisers:

- scores_dict_ (dict): Cross-validation scores per variable (DecisionTreeDiscretiser)
- models_dict_ (dict): Trained decision tree models per variable (DecisionTreeDiscretiser)

Install with Tessl CLI:
npx tessl i tessl/pypi-feature-engine