Python library with 44+ transformers for feature engineering and selection following scikit-learn API
—
Quality
Pending
Does it follow best practices?
Impact
Pending
No eval scenarios have been run
Transformers for generating new features through mathematical combinations, cyclical transformations, and reference feature combinations to enrich the dataset and improve model performance.
Applies basic mathematical operations to multiple features, returning additional features.
class MathematicalCombination:
def __init__(self, variables_to_combine, math_operations=None, new_variables_names=None,
missing_values='raise', drop_original=False):
"""
Initialize MathematicalCombination.
Parameters:
- variables_to_combine (list): List of numerical variables to combine mathematically
- math_operations (list): Operations to perform - 'sum', 'prod', 'mean', 'std', 'max', 'min'
- new_variables_names (list): Names for new variables. If None, auto-generated
- missing_values (str): How to handle missing values - 'raise' or 'ignore'
- drop_original (bool): Whether to drop original variables after combination
"""
def fit(self, X, y=None):
"""
Validate input and create operation dictionary.
Parameters:
- X (pandas.DataFrame): Training dataset
- y (pandas.Series, optional): Target variable (not used)
Returns:
- self
"""
def transform(self, X):
"""
Combine variables with mathematical operations and add new features.
Parameters:
- X (pandas.DataFrame): Dataset to transform
Returns:
- pandas.DataFrame: Dataset with additional combined features
"""
def fit_transform(self, X, y=None):
"""Fit to data, then transform it."""
Usage Example:
from feature_engine.creation import MathematicalCombination
import pandas as pd
# Sample numerical data
data = {
'height': [170, 175, 180, 165, 190],
'weight': [70, 80, 85, 60, 95],
'age': [25, 30, 35, 22, 45]
}
df = pd.DataFrame(data)
# Create combinations of height and weight
combiner = MathematicalCombination(
variables_to_combine=['height', 'weight'],
math_operations=['sum', 'mean', 'prod'],
new_variables_names=['height_weight_sum', 'height_weight_mean', 'height_weight_prod']
)
df_combined = combiner.fit_transform(df)
# Auto-generate variable names
combiner = MathematicalCombination(
variables_to_combine=['height', 'weight', 'age'],
math_operations=['mean', 'std', 'max', 'min']
)
df_combined = combiner.fit_transform(df)
# Creates: height_weight_age_mean, height_weight_age_std, etc.
# Access operation mappings
print(combiner.combination_dict_) # Shows operation to variable name mapping
Combines multiple features with a reference feature using mathematical operations.
class CombineWithReferenceFeature:
def __init__(self, variables_to_combine, reference_variables, operations_list,
new_variables_names=None, missing_values='raise', drop_original=False):
"""
Initialize CombineWithReferenceFeature.
Parameters:
- variables_to_combine (list): List of variables to combine with reference
- reference_variables (list): List of reference variables for combination
- operations_list (list): Mathematical operations - 'sub', 'div', 'add', 'mul'
- new_variables_names (list): Names for new variables. If None, auto-generated
- missing_values (str): How to handle missing values - 'raise' or 'ignore'
- drop_original (bool): Whether to drop original variables
"""
def fit(self, X, y=None):
"""
Validate input variables and operations.
Parameters:
- X (pandas.DataFrame): Training dataset
- y (pandas.Series, optional): Target variable (not used)
Returns:
- self
"""
def transform(self, X):
"""
Combine variables with reference features using specified operations.
Parameters:
- X (pandas.DataFrame): Dataset to transform
Returns:
- pandas.DataFrame: Dataset with additional combined features
"""
def fit_transform(self, X, y=None):
"""Fit to data, then transform it."""
Usage Example:
from feature_engine.creation import CombineWithReferenceFeature
# Combine features with reference features
combiner = CombineWithReferenceFeature(
variables_to_combine=['height', 'weight'],
reference_variables=['age'],
operations_list=['div', 'mul'],
new_variables_names=['height_per_age', 'weight_per_age', 'height_times_age', 'weight_times_age']
)
df_combined = combiner.fit_transform(df)
# Multiple reference variables
combiner = CombineWithReferenceFeature(
variables_to_combine=['height'],
reference_variables=['weight', 'age'],
operations_list=['div', 'sub']
)
df_combined = combiner.fit_transform(df)
# Creates: height_div_weight, height_div_age, height_sub_weight, height_sub_age
Creates cyclical features from numerical variables to capture periodic patterns.
class CyclicalTransformer:
def __init__(self, variables=None, max_values=None, drop_original=False):
"""
Initialize CyclicalTransformer.
Parameters:
- variables (list): List of numerical variables to transform. If None, selects all numerical variables
- max_values (dict/int/float): Maximum values for each variable to define cycle. Auto-detected if None
- drop_original (bool): Whether to drop original variables after transformation
"""
def fit(self, X, y=None):
"""
Learn maximum values for cyclical transformation if not provided.
Parameters:
- X (pandas.DataFrame): Training dataset
- y (pandas.Series, optional): Target variable (not used)
Returns:
- self
"""
def transform(self, X):
"""
Create sine and cosine features from numerical variables.
Parameters:
- X (pandas.DataFrame): Dataset to transform
Returns:
- pandas.DataFrame: Dataset with sine and cosine cyclical features
"""
def fit_transform(self, X, y=None):
"""Fit to data, then transform it."""
Usage Example:
from feature_engine.creation import CyclicalTransformer
import numpy as np
# Sample cyclical data (e.g., time-based)
data = {
'hour': np.random.randint(0, 24, 100),
'day_of_week': np.random.randint(0, 7, 100),
'month': np.random.randint(1, 13, 100)
}
df = pd.DataFrame(data)
# Auto-detect maximum values
transformer = CyclicalTransformer()
df_cyclical = transformer.fit_transform(df)
# Creates: hour_sin, hour_cos, day_of_week_sin, day_of_week_cos, etc.
# Specify maximum values for proper cycles
transformer = CyclicalTransformer(
max_values={'hour': 24, 'day_of_week': 7, 'month': 12}
)
df_cyclical = transformer.fit_transform(df)
# Transform specific variables only
transformer = CyclicalTransformer(
variables=['hour', 'month'],
max_values={'hour': 24, 'month': 12},
drop_original=True
)
df_cyclical = transformer.fit_transform(df)
# Access learned max values
print(transformer.max_values_) # Maximum values per variable
from sklearn.pipeline import Pipeline
from feature_engine.creation import MathematicalCombination, CyclicalTransformer
from feature_engine.imputation import MeanMedianImputer
# Multi-step feature creation pipeline
creation_pipeline = Pipeline([
('imputer', MeanMedianImputer()),
('math_combinations', MathematicalCombination(
variables_to_combine=['var1', 'var2'],
math_operations=['sum', 'prod', 'mean']
)),
('cyclical_features', CyclicalTransformer(
variables=['hour', 'day'],
max_values={'hour': 24, 'day': 365}
))
])
df_enhanced = creation_pipeline.fit_transform(df)
# Financial ratios
financial_combiner = CombineWithReferenceFeature(
variables_to_combine=['assets'],
reference_variables=['liabilities'],
operations_list=['sub'], # Assets - Liabilities = Equity
new_variables_names=['equity']
)
# BMI calculation
bmi_combiner = CombineWithReferenceFeature(
variables_to_combine=['weight'],
reference_variables=['height'],
operations_list=['div'],
new_variables_names=['weight_per_height'] # Weight / Height (need to square height separately)
)
# Time-based cyclical features for seasonality
time_transformer = CyclicalTransformer(
variables=['month', 'hour', 'day_of_week'],
max_values={'month': 12, 'hour': 24, 'day_of_week': 7}
)
import numpy as np
# Custom data with multiple numerical variables
data = {
'x1': np.random.normal(10, 2, 1000),
'x2': np.random.normal(20, 5, 1000),
'x3': np.random.normal(5, 1, 1000),
'x4': np.random.normal(100, 15, 1000)
}
df = pd.DataFrame(data)
# Create comprehensive feature combinations
combiner = MathematicalCombination(
variables_to_combine=['x1', 'x2', 'x3', 'x4'],
math_operations=['sum', 'prod', 'mean', 'std', 'max', 'min'],
new_variables_names=[
'total_sum', 'total_product', 'average_value',
'value_std', 'max_value', 'min_value'
]
)
df_enhanced = combiner.fit_transform(df)
print(f"Original features: {len(df.columns)}")
print(f"Enhanced features: {len(df_enhanced.columns)}")
print(f"New features: {list(df_enhanced.columns[-6:])}") # Last 6 are new features
# Data with missing values
data_with_na = {
'feature1': [1, 2, None, 4, 5],
'feature2': [10, None, 30, 40, 50],
'feature3': [100, 200, 300, None, 500]
}
df_na = pd.DataFrame(data_with_na)
# Ignore missing values in calculations
combiner_ignore = MathematicalCombination(
variables_to_combine=['feature1', 'feature2', 'feature3'],
math_operations=['mean', 'sum'],
missing_values='ignore' # Skip NaN values in calculations
)
df_combined_ignore = combiner_ignore.fit_transform(df_na)
# Raise error on missing values (default)
try:
combiner_raise = MathematicalCombination(
variables_to_combine=['feature1', 'feature2'],
math_operations=['sum'],
missing_values='raise'
)
df_combined_raise = combiner_raise.fit_transform(df_na)
except ValueError as e:
print(f"Error with missing values: {e}")
All creation transformers share these fitted attributes:
- variables_ (list): Variables that will be used for feature creation
- n_features_in_ (int): Number of features in training set
Transformer-specific attributes:
- combination_dict_ (dict): Mapping of operations to new variable names (MathematicalCombination)
- max_values_ (dict): Maximum values used for cyclical transformation (CyclicalTransformer)
- math_operations_ (list): Mathematical operations applied (MathematicalCombination)
- operations_list_ (list): Operations applied between variables and references (CombineWithReferenceFeature)
Install with Tessl CLI
npx tessl i tessl/pypi-feature-engine