Python library with 44+ transformers for feature engineering and selection following scikit-learn API
—
Quality
Pending
Does it follow best practices?
Impact
Pending
No eval scenarios have been run
General preprocessing functions and transformers for data preparation and variable matching between datasets to ensure consistency and compatibility in machine learning workflows.
Ensures that variables in a dataset match those in a reference dataset, handling missing columns and maintaining consistent structure across training and prediction datasets.
class MatchVariables:
    """
    Ensure that the variables in a dataset match a reference set learned
    during ``fit``: missing reference columns are added (filled with NaN)
    and extra columns are dropped, so training and prediction datasets
    keep a consistent structure.
    """

    def __init__(self, missing_values='raise'):
        """
        Initialize MatchVariables.

        Parameters:
        - missing_values (str): How to handle missing variables - 'raise' or 'ignore'

        Raises:
        - ValueError: If missing_values is not 'raise' or 'ignore'.
        """
        if missing_values not in ('raise', 'ignore'):
            raise ValueError("missing_values must be 'raise' or 'ignore'.")
        self.missing_values = missing_values

    def fit(self, X, y=None):
        """
        Learn the reference set of variables from training data.

        Parameters:
        - X (pandas.DataFrame): Reference dataset (typically training data)
        - y (pandas.Series, optional): Target variable (not used)

        Returns:
        - self
        """
        # Trailing-underscore names follow the scikit-learn fitted-attribute
        # convention described for this transformer.
        self.variables_to_match_ = list(X.columns)
        self.n_features_in_ = len(self.variables_to_match_)
        return self

    def transform(self, X):
        """
        Transform dataset to match reference variables.

        Parameters:
        - X (pandas.DataFrame): Dataset to transform

        Returns:
        - pandas.DataFrame: Dataset with variables matching reference set

        Raises:
        - ValueError: If reference variables are absent from X and
          missing_values='raise'.
        """
        missing = [var for var in self.variables_to_match_ if var not in X.columns]
        if missing and self.missing_values == 'raise':
            raise ValueError(
                f"The following variables are missing from the dataset: {missing}"
            )
        # reindex adds absent reference columns as NaN, drops extras, and
        # preserves the reference column order.
        return X.reindex(columns=self.variables_to_match_)

    def fit_transform(self, X, y=None):
        """Fit to data, then transform it."""
        return self.fit(X, y).transform(X)

# Usage Example:
from feature_engine.preprocessing import MatchVariables
import pandas as pd
import numpy as np

# Training dataset
train_data = {
    'feature1': np.random.randn(100),
    'feature2': np.random.randn(100),
    'feature3': np.random.randn(100),
    'target': np.random.randint(0, 2, 100)
}
df_train = pd.DataFrame(train_data)

# Test dataset with missing feature and extra feature
test_data = {
    'feature1': np.random.randn(50),
    'feature2': np.random.randn(50),
    # feature3 is missing
    'feature4': np.random.randn(50)  # Extra feature
}
df_test = pd.DataFrame(test_data)

# Match test data to training data structure
matcher = MatchVariables(missing_values='ignore')
matcher.fit(df_train.drop('target', axis=1))  # Fit on features only
df_test_matched = matcher.transform(df_test)

print("Training features:", df_train.drop('target', axis=1).columns.tolist())
print("Original test features:", df_test.columns.tolist())
print("Matched test features:", df_test_matched.columns.tolist())
# Result: df_test_matched will have feature1, feature2, feature3 (with NaN)
# feature4 is dropped

from sklearn.pipeline import Pipeline
from feature_engine.imputation import MeanMedianImputer
from feature_engine.preprocessing import MatchVariables
from feature_engine.encoding import OneHotEncoder
from sklearn.ensemble import RandomForestClassifier

# Training pipeline
training_pipeline = Pipeline([
    ('imputer', MeanMedianImputer()),
    ('encoder', OneHotEncoder()),
    ('classifier', RandomForestClassifier())
])

# Fit on training data
training_pipeline.fit(X_train, y_train)

# Deployment pipeline with variable matching
deployment_pipeline = Pipeline([
    ('matcher', MatchVariables()),  # Ensure consistent variables
    ('imputer', MeanMedianImputer()),
    ('encoder', OneHotEncoder()),
    ('classifier', RandomForestClassifier())
])

# Fit matcher on training features
deployment_pipeline.named_steps['matcher'].fit(X_train)

# Copy trained parameters from training pipeline
deployment_pipeline.named_steps['imputer'] = training_pipeline.named_steps['imputer']
deployment_pipeline.named_steps['encoder'] = training_pipeline.named_steps['encoder']
deployment_pipeline.named_steps['classifier'] = training_pipeline.named_steps['classifier']

# Now can handle new data with different column structure
predictions = deployment_pipeline.predict(X_new)

# Different datasets with potentially different features
dataset1 = pd.DataFrame({
    'age': [25, 30, 35],
    'income': [50000, 60000, 70000],
    'education': ['BS', 'MS', 'PhD']
})
dataset2 = pd.DataFrame({
    'age': [28, 32],
    'income': [55000, 65000],
    'experience': [3, 5]  # Different feature
})
dataset3 = pd.DataFrame({
    'income': [45000, 75000],
    'education': ['BS', 'MS'],
    'location': ['NYC', 'LA']  # Different feature
})

# Use first dataset as reference
matcher = MatchVariables(missing_values='ignore')
matcher.fit(dataset1)

# Transform other datasets to match
dataset2_matched = matcher.transform(dataset2)
dataset3_matched = matcher.transform(dataset3)

print("Reference columns:", dataset1.columns.tolist())
print("Dataset2 matched:", dataset2_matched.columns.tolist())
print("Dataset3 matched:", dataset3_matched.columns.tolist())
# All will have: age, income, education (with NaN where missing)

from feature_engine.creation import MathematicalCombination
from feature_engine.datetime import DatetimeFeatures

# Complex feature engineering pipeline
feature_pipeline = Pipeline([
    ('datetime_features', DatetimeFeatures(
        features_to_extract=['month', 'day_of_week']
    )),
    ('math_combinations', MathematicalCombination(
        variables_to_combine=['feature1', 'feature2'],
        math_operations=['sum', 'prod']
    )),
    ('matcher', MatchVariables())  # Ensure final consistency
])

# Fit on training data
feature_pipeline.fit(X_train)

# Apply to validation/test data with potential missing features
X_val_processed = feature_pipeline.transform(X_val)
X_test_processed = feature_pipeline.transform(X_test)
# All datasets will have consistent feature structure

# Original model trained on v1 data schema
v1_schema = ['customer_id', 'purchase_amount', 'product_category', 'region']
v1_data = pd.DataFrame({col: np.random.randn(100) for col in v1_schema})

# New data has updated schema
v2_schema = ['customer_id', 'purchase_amount', 'product_category', 'region', 'channel', 'discount']
v2_data = pd.DataFrame({col: np.random.randn(50) for col in v2_schema})

# Legacy data missing new columns
legacy_schema = ['customer_id', 'purchase_amount', 'product_category']  # Missing region
legacy_data = pd.DataFrame({col: np.random.randn(25) for col in legacy_schema})

# Train matcher on original schema
schema_matcher = MatchVariables(missing_values='ignore')
schema_matcher.fit(v1_data)

# All datasets can be processed consistently
v2_matched = schema_matcher.transform(v2_data)  # Extra columns removed
legacy_matched = schema_matcher.transform(legacy_data)  # Missing column added with NaN

print("V1 schema:", v1_data.columns.tolist())
print("V2 matched:", v2_matched.columns.tolist())
print("Legacy matched:", legacy_matched.columns.tolist())
# All have same columns: customer_id, purchase_amount, product_category, region

import json
def preprocess_api_data(api_response, trained_matcher):
    """
    Preprocess data from API response to match model expectations.

    Parameters:
    - api_response (str): JSON-encoded payload describing a single record.
    - trained_matcher: Fitted transformer whose ``transform`` aligns columns
      to the expected schema.

    Returns:
    - pandas.DataFrame: One-row frame matched to the expected schema.
    """
    # Decode the payload and wrap it as a single-row DataFrame before
    # delegating the column alignment to the fitted matcher.
    payload = json.loads(api_response)
    single_row = pd.DataFrame([payload])
    return trained_matcher.transform(single_row)
# Example API responses with different structures
api_response_1 = '{"feature1": 1.0, "feature2": 2.0, "feature3": 3.0}'
api_response_2 = '{"feature1": 1.5, "feature2": 2.5}'  # Missing feature3
api_response_3 = '{"feature1": 2.0, "feature2": 3.0, "feature3": 4.0, "extra_field": 5.0}'

# Trained matcher expects feature1, feature2, feature3.
# missing_values='ignore' is required here: with the default 'raise',
# api_response_2 (missing feature3) would raise instead of being handled.
matcher = MatchVariables(missing_values='ignore')
matcher.fit(pd.DataFrame(columns=['feature1', 'feature2', 'feature3']))

# All API responses can be handled consistently
for i, response in enumerate([api_response_1, api_response_2, api_response_3], 1):
    processed = preprocess_api_data(response, matcher)
    print(f"API response {i} processed shape:", processed.shape)
    print(f"Columns: {processed.columns.tolist()}")

# Strict mode - raise error on missing variables
# Strict handling: a missing reference variable is a hard failure.
strict_matcher = MatchVariables(missing_values='raise')
strict_matcher.fit(df_train)
try:
    result = strict_matcher.transform(df_missing_features)
except ValueError as e:
    print(f"Strict mode error: {e}")

# Lenient mode - ignore missing variables
lenient_matcher = MatchVariables(missing_values='ignore')
lenient_matcher.fit(df_train)
result = lenient_matcher.transform(df_missing_features)  # Succeeds with NaN

Always include MatchVariables in production pipelines to handle schema changes gracefully.
Fit the matcher on training data to establish the canonical variable set.
Use missing_values='ignore' and handle NaN values with appropriate imputation strategies.
Keep track of expected schemas when deploying models to different environments.
Log when MatchVariables adds or removes columns to detect data drift.
MatchVariables has these fitted attributes:
variables_to_match_ (list): Reference set of variables established during fit
n_features_in_ (int): Number of features in the training set
The transformer ensures that output datasets always have exactly the variables specified in variables_to_match_, adding missing variables as NaN columns and dropping extra variables.
Install with Tessl CLI
npx tessl i tessl/pypi-feature-engine