Python library with 44+ transformers for feature engineering and selection following scikit-learn API
—
Quality
Pending
Does it follow best practices?
Impact
Pending
No eval scenarios have been run
Transformers for applying scikit-learn transformers to specific subsets of variables while maintaining DataFrame structure and column names, enabling seamless integration of scikit-learn functionality within feature-engine workflows.
Wrapper to apply any Scikit-learn transformer to a selected group of variables while preserving DataFrame structure.
class SklearnTransformerWrapper:
def __init__(self, transformer, variables=None):
"""
Initialize SklearnTransformerWrapper.
Parameters:
- transformer: Instance of a scikit-learn transformer (must have fit, transform methods)
- variables (list): List of variables to be transformed. If None, transforms all numerical variables
"""
def fit(self, X, y=None):
"""
Fit the scikit-learn transformer on selected variables.
Parameters:
- X (pandas.DataFrame): Training dataset
- y (pandas.Series, optional): Target variable (passed to transformer if needed)
Returns:
- self
"""
def transform(self, X):
"""
Transform data using the fitted scikit-learn transformer.
Parameters:
- X (pandas.DataFrame): Dataset to transform
Returns:
- pandas.DataFrame: Dataset with transformed variables, maintaining DataFrame structure
"""
def fit_transform(self, X, y=None):
"""Fit to data, then transform it."""
def inverse_transform(self, X):
"""
Inverse transform using the scikit-learn transformer (if supported).
Parameters:
- X (pandas.DataFrame): Dataset with transformed values
Returns:
- pandas.DataFrame: Dataset with original scale restored
"""Usage Examples:
from feature_engine.wrappers import SklearnTransformerWrapper
from sklearn.preprocessing import StandardScaler
import pandas as pd
import numpy as np
# Sample numerical data
data = {
'feature1': np.random.normal(100, 20, 1000),
'feature2': np.random.normal(50, 10, 1000),
'feature3': np.random.normal(200, 50, 1000),
'categorical': np.random.choice(['A', 'B', 'C'], 1000)
}
df = pd.DataFrame(data)
# Apply StandardScaler to specific numerical variables
scaler_wrapper = SklearnTransformerWrapper(
transformer=StandardScaler(),
variables=['feature1', 'feature2']
)
df_scaled = scaler_wrapper.fit_transform(df)
# feature3 and categorical remain unchanged
# feature1 and feature2 are standardized
print(df_scaled.describe())
print(df_scaled.dtypes)  # DataFrame structure preserved

from sklearn.decomposition import PCA
# Apply PCA to selected variables
pca_wrapper = SklearnTransformerWrapper(
transformer=PCA(n_components=2),
variables=['feature1', 'feature2', 'feature3']
)
df_pca = pca_wrapper.fit_transform(df)
# Note: PCA creates new features, original variables are replaced
# with principal components (PC1, PC2, etc.)
print("PCA explained variance ratio:",
      pca_wrapper.transformer_.explained_variance_ratio_)

from sklearn.preprocessing import RobustScaler
# Apply RobustScaler (less sensitive to outliers)
robust_wrapper = SklearnTransformerWrapper(
transformer=RobustScaler(),
variables=['feature1', 'feature3']
)
df_robust = robust_wrapper.fit_transform(df)
# Inverse transformation
df_original = robust_wrapper.inverse_transform(df_robust)

from sklearn.preprocessing import PolynomialFeatures
# Generate polynomial features
poly_wrapper = SklearnTransformerWrapper(
transformer=PolynomialFeatures(degree=2, include_bias=False),
variables=['feature1', 'feature2']
)
df_poly = poly_wrapper.fit_transform(df)
# Creates additional polynomial combination features
print(f"Original features: {len(df.columns)}")
print(f"With polynomial features: {len(df_poly.columns)}")from sklearn.preprocessing import QuantileTransformer
# Apply quantile transformation for normalization
quantile_wrapper = SklearnTransformerWrapper(
transformer=QuantileTransformer(output_distribution='normal'),
variables=['feature1', 'feature2', 'feature3']
)
df_quantile = quantile_wrapper.fit_transform(df)
# Transforms to normal distribution

from sklearn.pipeline import Pipeline
from feature_engine.imputation import MeanMedianImputer
from feature_engine.wrappers import SklearnTransformerWrapper
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.ensemble import RandomForestClassifier
# Complex preprocessing pipeline
preprocessing_pipeline = Pipeline([
('imputer', MeanMedianImputer()),
('polynomial', SklearnTransformerWrapper(
transformer=PolynomialFeatures(degree=2),
variables=['feature1', 'feature2']
)),
('scaler', SklearnTransformerWrapper(
transformer=StandardScaler(),
variables=None # Scale all numerical variables
)),
('classifier', RandomForestClassifier())
])
# Fit and predict
preprocessing_pipeline.fit(X_train, y_train)
predictions = preprocessing_pipeline.predict(X_test)

from sklearn.preprocessing import StandardScaler, MinMaxScaler
# Apply different scalers to different variable groups
standard_scaler_wrapper = SklearnTransformerWrapper(
transformer=StandardScaler(),
variables=['feature1', 'feature2']
)
minmax_scaler_wrapper = SklearnTransformerWrapper(
transformer=MinMaxScaler(),
variables=['feature3']
)
# Sequential application
df_multi_scaled = standard_scaler_wrapper.fit_transform(df)
df_multi_scaled = minmax_scaler_wrapper.fit_transform(df_multi_scaled)

from sklearn.base import BaseEstimator, TransformerMixin
import numpy as np
# Custom transformer
class LogTransformer(BaseEstimator, TransformerMixin):
def fit(self, X, y=None):
return self
def transform(self, X):
return np.log1p(X) # log(1 + x)
def inverse_transform(self, X):
return np.expm1(X) # exp(x) - 1
# Use with wrapper
log_wrapper = SklearnTransformerWrapper(
transformer=LogTransformer(),
variables=['feature1', 'feature2']
)
df_log = log_wrapper.fit_transform(df)
df_original = log_wrapper.inverse_transform(df_log)

from sklearn.preprocessing import LabelEncoder, OrdinalEncoder
# For categorical variables with sklearn transformers
categorical_data = {
'category1': ['A', 'B', 'C', 'A', 'B'],
'category2': ['X', 'Y', 'Z', 'X', 'Y'],
'numerical': [1, 2, 3, 4, 5]
}
df_cat = pd.DataFrame(categorical_data)
# Use OrdinalEncoder for multiple categorical variables
ordinal_wrapper = SklearnTransformerWrapper(
transformer=OrdinalEncoder(),
variables=['category1', 'category2']
)
df_encoded = ordinal_wrapper.fit_transform(df_cat)

from sklearn.model_selection import cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor
# Create pipeline with wrapper
pipeline_with_wrapper = Pipeline([
('scaler', SklearnTransformerWrapper(
transformer=StandardScaler(),
variables=['feature1', 'feature2', 'feature3']
)),
('regressor', RandomForestRegressor())
])
# Cross-validation
cv_scores = cross_val_score(
pipeline_with_wrapper,
X_train,
y_train,
cv=5,
scoring='neg_mean_squared_error'
)
print(f"CV RMSE: {np.sqrt(-cv_scores.mean()):.3f}")SklearnTransformerWrapper has these fitted attributes:
- transformer_ (sklearn transformer): Fitted scikit-learn transformer instance
- variables_ (list): Variables that were transformed
- n_features_in_ (int): Number of features in the training set

The wrapper provides access to the underlying transformer's attributes through the transformer_ attribute, enabling access to learned parameters like feature names, explained variance, etc.
Install with Tessl CLI
npx tessl i tessl/pypi-feature-engine