Python library with 44+ transformers for feature engineering and selection following scikit-learn API
—
Quality
Pending
Does it follow best practices?
Impact
Pending
No eval scenarios have been run
Transformers for extracting meaningful features from datetime variables including time components, periods, and date-related boolean flags to capture temporal patterns in machine learning models.
Extracts date and time features from datetime variables, creating multiple new features from each datetime column.
class DatetimeFeatures:
def __init__(self, variables=None, features_to_extract=None, drop_original=True,
missing_values='raise', dayfirst=False, yearfirst=False, utc=None):
"""
Initialize DatetimeFeatures.

Parameters:
- variables (list, default None): Datetime variables to extract features from. If None, datetime columns are auto-detected
- features_to_extract (list/str, default None): Specific features to extract, or 'all' for all available features
- drop_original (bool, default True): Whether to drop the original datetime variables after extraction
- missing_values (str, default 'raise'): How to handle missing values - 'raise' or 'ignore'
- dayfirst (bool, default False): Parse dates with day first (DD/MM/YYYY format)
- yearfirst (bool, default False): Parse dates with year first (YYYY/MM/DD format)
- utc (bool, default None): Return UTC DatetimeIndex. If None, keeps original timezone
"""
def fit(self, X, y=None):
"""
Validate the datetime variables and the features to extract.

Parameters:
- X (pandas.DataFrame): Training dataset
- y (pandas.Series, optional, default None): Target variable (not used)

Returns:
- self
"""
def transform(self, X):
"""
Extract datetime features and add them to the dataframe.

Parameters:
- X (pandas.DataFrame): Dataset to transform

Returns:
- pandas.DataFrame: Dataset with the extracted datetime features
"""
def fit_transform(self, X, y=None):
"""Fit to data, then transform it."""

Supported feature names for features_to_extract:
- "month": Month of the year (1-12)
- "quarter": Quarter of the year (1-4)
- "semester": Semester of the year (1-2)
- "year": Year value
- "week": Week of the year (1-52/53)
- "day_of_week": Day of the week (0=Monday, 6=Sunday)
- "day_of_month": Day of the month (1-31)
- "day_of_year": Day of the year (1-365/366)
- "hour": Hour of the day (0-23)
- "minute": Minute of the hour (0-59)
- "second": Second of the minute (0-59)
- "weekend": Whether date falls on weekend (Saturday/Sunday)
- "month_start": Whether date is first day of month
- "month_end": Whether date is last day of month
- "quarter_start": Whether date is first day of quarter
- "quarter_end": Whether date is last day of quarter
- "year_start": Whether date is first day of year
- "year_end": Whether date is last day of year
- "leap_year": Whether year is a leap year
- "days_in_month": Number of days in the month (28-31)

Usage Example:
from feature_engine.datetime import DatetimeFeatures
import pandas as pd

# Sample datetime data: a daily column, an hourly column and a numeric column
dates = pd.date_range('2023-01-01', periods=100, freq='D')
data = {
    'transaction_date': dates,
    # Lowercase 'h' is the hourly alias; uppercase 'H' is deprecated since pandas 2.2
    'created_at': pd.date_range('2023-01-01 09:00:00', periods=100, freq='h'),
    'amount': range(100)
}
df = pd.DataFrame(data)

# Extract common datetime features, keeping the original datetime columns
extractor = DatetimeFeatures(
    features_to_extract=['month', 'day_of_week', 'hour', 'weekend'],
    drop_original=False
)
df_enhanced = extractor.fit_transform(df)

# Extract all available features
extractor_all = DatetimeFeatures(features_to_extract='all')
df_all_features = extractor_all.fit_transform(df)

# Access extracted feature information
print(extractor.variables_) # Datetime variables processed
print(extractor.features_to_extract_) # Features that were extracted

import pandas as pd
import numpy as np
# Create time series data
# NOTE: np.random is not seeded here, so 'sales' differs between runs
dates = pd.date_range('2022-01-01', '2023-12-31', freq='D')
ts_data = {
'date': dates,
# Normal noise (mean 1000, std 200) plus a sinusoid with a 365-day period
'sales': np.random.normal(1000, 200, len(dates)) +
100 * np.sin(2 * np.pi * dates.dayofyear / 365), # Seasonal pattern
# Deterministic sinusoid with the same 365-day period, centred on 20
'temperature': 20 + 10 * np.sin(2 * np.pi * dates.dayofyear / 365)
}
df_ts = pd.DataFrame(ts_data)
# Extract comprehensive datetime features for time series analysis
ts_extractor = DatetimeFeatures(
# Restrict extraction to the 'date' column instead of auto-detecting
variables=['date'],
features_to_extract=[
'month', 'quarter', 'day_of_week', 'day_of_month',
'weekend', 'month_start', 'month_end', 'quarter_start', 'quarter_end'
]
)
# drop_original is left at its default (True in the signature shown earlier)
df_ts_enhanced = ts_extractor.fit_transform(df_ts)
# Compare the column counts before and after extraction
print(f"Original columns: {len(df_ts.columns)}")
print(f"Enhanced columns: {len(df_ts_enhanced.columns)}")
print("New datetime features:", [col for col in df_ts_enhanced.columns if col not in df_ts.columns])

# E-commerce transaction data
transaction_data = {
    # Hourly timestamps; lowercase 'h' replaces the 'H' alias deprecated since pandas 2.2
    'order_date': pd.date_range('2023-01-01', periods=1000, freq='h'),
    'customer_id': np.random.randint(1, 100, 1000),
    'order_amount': np.random.normal(150, 50, 1000)
}
df_ecommerce = pd.DataFrame(transaction_data)

# Extract business-relevant datetime features and drop the raw datetime column
business_extractor = DatetimeFeatures(
    features_to_extract=[
        'month', 'day_of_week', 'hour',
        'weekend', 'month_start', 'month_end'
    ],
    drop_original=True
)
df_business = business_extractor.fit_transform(df_ecommerce)

# Now we can analyze patterns like:
# - Monthly seasonality (month feature)
# - Day of week effects (day_of_week, weekend features)
# - Hourly patterns (hour feature)
# - End/start of month effects (month_start, month_end features)

# Data with timezone-aware datetimes
# Six-hourly UTC timestamps; lowercase '6h' replaces the '6H' alias deprecated since pandas 2.2
utc_dates = pd.date_range('2023-01-01', periods=100, freq='6h', tz='UTC')
# The same instants expressed in the US/Eastern timezone
est_dates = utc_dates.tz_convert('US/Eastern')
multi_tz_data = {
    'utc_timestamp': utc_dates,
    'local_timestamp': est_dates,
    'value': np.random.randn(100)
}
df_tz = pd.DataFrame(multi_tz_data)

# Extract features preserving timezone info
tz_extractor = DatetimeFeatures(
    features_to_extract=['hour', 'day_of_week'],
    utc=None # Preserve original timezone
)
df_tz_features = tz_extractor.fit_transform(df_tz)

# Compare UTC vs local time features
print("UTC hours:", df_tz_features['utc_timestamp_hour'].unique())
print("Local hours:", df_tz_features['local_timestamp_hour'].unique())

# Financial data requiring specific datetime features
financial_data = {
'trade_date': pd.bdate_range('2023-01-01', periods=250), # Business days only
# Cumulative sum of standard-normal steps, offset by 100 (unseeded, so it varies per run)
'stock_price': 100 + np.random.randn(250).cumsum(),
'volume': np.random.randint(1000, 10000, 250)
}
df_financial = pd.DataFrame(financial_data)
# Extract only relevant features for financial analysis
financial_extractor = DatetimeFeatures(
# Restrict extraction to the 'trade_date' column instead of auto-detecting
variables=['trade_date'],
features_to_extract=[
'month', 'quarter', 'day_of_week',
'month_start', 'month_end', 'quarter_start', 'quarter_end'
]
)
# drop_original is left at its default (True in the signature shown earlier)
df_financial_enhanced = financial_extractor.fit_transform(df_financial)
# Features useful for:
# - Monthly/quarterly reporting periods
# - Day of week trading patterns (Monday effect, Friday effect)
# - Period start/end effects (window dressing, rebalancing)

# Data with various date formats
# The three date columns hold the same three dates written as strings in different formats
mixed_format_data = {
'date_american': ['01/15/2023', '02/20/2023', '03/25/2023'], # MM/DD/YYYY
'date_european': ['15/01/2023', '20/02/2023', '25/03/2023'], # DD/MM/YYYY
'date_iso': ['2023-01-15', '2023-02-20', '2023-03-25'], # YYYY-MM-DD
'value': [100, 200, 300]
}
df_formats = pd.DataFrame(mixed_format_data)
# Convert to datetime with appropriate parsing
# Explicit format string for the month-first column
df_formats['date_american'] = pd.to_datetime(df_formats['date_american'], format='%m/%d/%Y')
# dayfirst=True tells pandas to read the first field as the day
df_formats['date_european'] = pd.to_datetime(df_formats['date_european'], dayfirst=True)
# ISO strings are parsed with pandas' default settings
df_formats['date_iso'] = pd.to_datetime(df_formats['date_iso'])
# Extract features from properly parsed dates
# No variables are passed, so the datetime columns are auto-detected (per the variables parameter docs)
format_extractor = DatetimeFeatures(
features_to_extract=['month', 'day_of_month', 'year']
)
df_formats_enhanced = format_extractor.fit_transform(df_formats)

from sklearn.pipeline import Pipeline
from feature_engine.imputation import MeanMedianImputer
from feature_engine.datetime import DatetimeFeatures
from feature_engine.encoding import OneHotEncoder
# Preprocessing pipeline with datetime feature extraction
# Steps run in the listed order: datetime extraction, imputation, then encoding
datetime_pipeline = Pipeline([
('datetime_features', DatetimeFeatures(
features_to_extract=['month', 'day_of_week', 'hour', 'weekend']
)),
('imputer', MeanMedianImputer()), # Handle any missing values
# NOTE(review): the extracted datetime features look numeric; confirm that
# OneHotEncoder() with default arguments finds variables to encode here. It may
# need variables=[...] together with ignore_format=True - verify against the
# feature_engine OneHotEncoder documentation.
('encoder', OneHotEncoder()) # Encode categorical datetime features
])
df_processed = datetime_pipeline.fit_transform(df)

DatetimeFeatures creates new columns following the pattern: {original_column_name}_{feature_name}
Examples:
- transaction_date + month → transaction_date_month
- created_at + hour → created_at_hour
- timestamp + weekend → timestamp_weekend

DatetimeFeatures has these fitted attributes:
- variables_ (list): Datetime variables from which features will be extracted
- features_to_extract_ (list): Features that will be extracted from each datetime variable
- n_features_in_ (int): Number of features in training set

The transformer automatically handles pandas datetime types and can parse string dates during the transform process when proper parsing parameters are provided.
Install with Tessl CLI
npx tessl i tessl/pypi-feature-engine