Python library with 44+ transformers for feature engineering and selection following scikit-learn API
npx @tessl/cli install tessl/pypi-feature-engine@1.2.0
A Python library with multiple transformers to engineer and select features for machine learning. All transformers follow the scikit-learn API pattern, enabling seamless integration with existing machine learning pipelines.
pip install feature-engine
import feature_engine
Common import patterns for specific modules:
from feature_engine.imputation import MeanMedianImputer, CategoricalImputer
from feature_engine.encoding import OneHotEncoder, OrdinalEncoder
from feature_engine.transformation import LogTransformer, BoxCoxTransformer
from feature_engine.selection import DropFeatures, DropConstantFeatures, DropHighPSIFeatures, SelectByTargetMeanPerformance
from feature_engine.outliers import Winsorizer
import pandas as pd
from feature_engine.imputation import MeanMedianImputer
from feature_engine.encoding import OrdinalEncoder
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
# Create sample data
data = {
'numeric_var1': [1.0, 2.0, None, 4.0, 5.0],
'numeric_var2': [10, 20, 30, None, 50],
'categorical_var': ['A', 'B', 'A', 'C', 'B']
}
df = pd.DataFrame(data)
y = [0, 1, 0, 1, 0]
# Create transformers
imputer = MeanMedianImputer(imputation_method='median')
encoder = OrdinalEncoder(encoding_method='arbitrary')
# Fit and transform data
X_imputed = imputer.fit_transform(df)
X_encoded = encoder.fit_transform(X_imputed)
# Or use in pipeline
pipeline = Pipeline([
('imputer', MeanMedianImputer()),
('encoder', OrdinalEncoder(encoding_method='arbitrary')),
('classifier', RandomForestClassifier())
])
pipeline.fit(df, y)
predictions = pipeline.predict(df)
Feature-Engine follows the scikit-learn API design pattern with consistent interfaces across all transformers:
All transformers inherit from base classes that provide:
Handle missing values in numerical and categorical variables using statistical methods, arbitrary values, or advanced techniques like random sampling.
class MeanMedianImputer:
def __init__(self, imputation_method='median', variables=None): ...
def fit(self, X, y=None): ...
def transform(self, X): ...
class CategoricalImputer:
def __init__(self, imputation_method='missing', fill_value='Missing', variables=None): ...
def fit(self, X, y=None): ...
def transform(self, X): ...
class ArbitraryNumberImputer:
def __init__(self, arbitrary_number=999, variables=None): ...
def fit(self, X, y=None): ...
def transform(self, X): ...
Transform categorical variables into numerical representations using various encoding methods including one-hot, ordinal, target-based, and frequency-based encoders.
class OneHotEncoder:
def __init__(self, top_categories=None, drop_last=False, variables=None): ...
def fit(self, X, y=None): ...
def transform(self, X): ...
class OrdinalEncoder:
def __init__(self, encoding_method='ordered', variables=None): ...
def fit(self, X, y=None): ...
def transform(self, X): ...
class MeanEncoder:
def __init__(self, variables=None, ignore_format=False): ...
def fit(self, X, y): ...
def transform(self, X): ...
Convert continuous variables into discrete intervals using equal width, equal frequency, decision tree-based, or user-defined boundaries.
class EqualWidthDiscretiser:
def __init__(self, variables=None, return_object=False, return_boundaries=False): ...
def fit(self, X, y=None): ...
def transform(self, X): ...
class EqualFrequencyDiscretiser:
def __init__(self, variables=None, return_object=False, return_boundaries=False): ...
def fit(self, X, y=None): ...
def transform(self, X): ...
class ArbitraryDiscretiser:
def __init__(self, binning_dict, return_object=False, return_boundaries=False): ...
def fit(self, X, y=None): ...
def transform(self, X): ...
Apply mathematical functions to numerical variables including logarithmic, power, reciprocal, Box-Cox, and Yeo-Johnson transformations.
class LogTransformer:
def __init__(self, variables=None, base='e'): ...
def fit(self, X, y=None): ...
def transform(self, X): ...
def inverse_transform(self, X): ...
class BoxCoxTransformer:
def __init__(self, variables=None): ...
def fit(self, X, y=None): ...
def transform(self, X): ...
def inverse_transform(self, X): ...
class PowerTransformer:
def __init__(self, variables=None, exp=2): ...
def fit(self, X, y=None): ...
def transform(self, X): ...
Remove or select features based on various criteria including variance, correlation, performance metrics, and statistical tests.
class DropFeatures:
def __init__(self, features_to_drop): ...
def fit(self, X, y=None): ...
def transform(self, X): ...
class DropConstantFeatures:
def __init__(self, variables=None, tol=1, missing_values='raise'): ...
def fit(self, X, y=None): ...
def transform(self, X): ...
class DropCorrelatedFeatures:
def __init__(self, variables=None, method='pearson', threshold=0.8): ...
def fit(self, X, y=None): ...
def transform(self, X): ...
Identify and handle outliers using statistical methods including Winsorization, capping, and trimming techniques.
class Winsorizer:
def __init__(self, capping_method='gaussian', tail='right', fold=3, variables=None): ...
def fit(self, X, y=None): ...
def transform(self, X): ...
class ArbitraryOutlierCapper:
def __init__(self, max_capping_dict=None, min_capping_dict=None, variables=None): ...
def fit(self, X, y=None): ...
def transform(self, X): ...
class OutlierTrimmer:
def __init__(self, capping_method='gaussian', tail='right', fold=3, variables=None): ...
def fit(self, X, y=None): ...
def transform(self, X): ...
Outlier Detection and Handling
Generate new features through mathematical combinations, cyclical transformations, and reference feature combinations.
class MathematicalCombination:
def __init__(self, variables_to_combine, math_operations=None, new_variables_names=None): ...
def fit(self, X, y=None): ...
def transform(self, X): ...
class CyclicalTransformer:
def __init__(self, variables=None, max_values=None, drop_original=False): ...
def fit(self, X, y=None): ...
def transform(self, X): ...
class CombineWithReferenceFeature:
def __init__(self, variables_to_combine, reference_variables, operations_list): ...
def fit(self, X, y=None): ...
def transform(self, X): ...
Extract meaningful features from datetime variables including time components, periods, and date-related boolean flags.
class DatetimeFeatures:
def __init__(self, variables=None, features_to_extract=None, drop_original=True): ...
def fit(self, X, y=None): ...
def transform(self, X): ...
Apply scikit-learn transformers to specific subsets of variables while maintaining DataFrame structure and column names.
class SklearnTransformerWrapper:
def __init__(self, transformer, variables=None): ...
def fit(self, X, y=None): ...
def transform(self, X): ...
def fit_transform(self, X, y=None): ...
General preprocessing functions for data preparation and variable matching between datasets.
class MatchVariables:
def __init__(self, missing_values='raise'): ...
def fit(self, X, y=None): ...
def transform(self, X): ...