# sklearn-pandas

Pandas integration with scikit-learn, providing DataFrameMapper to bridge DataFrame columns to sklearn transformations.

### Installation

```
npx @tessl/cli install tessl/pypi-sklearn-pandas@2.2.0
```

Or install from PyPI or conda-forge:

```
pip install sklearn-pandas
conda install -c conda-forge sklearn-pandas
```

### Overview

Pandas integration with scikit-learn providing a bridge between pandas DataFrames and sklearn's machine learning transformations. The core component is DataFrameMapper, which maps DataFrame columns to different sklearn transformations that are later recombined into features.

```python
from sklearn_pandas import DataFrameMapper
```

Additional utilities:

```python
from sklearn_pandas import gen_features, NumericalTransformer
```

### Quick Start

```python
import pandas as pd
import numpy as np
from sklearn_pandas import DataFrameMapper
from sklearn.preprocessing import StandardScaler, LabelBinarizer
# Create sample data
data = pd.DataFrame({
    'pet': ['cat', 'dog', 'dog', 'fish', 'cat', 'dog'],
    'children': [4., 6, 3, 3, 2, 3],
    'salary': [90., 24, 44, 27, 32, 59]
})
# Define feature mappings
mapper = DataFrameMapper([
    ('pet', LabelBinarizer()),
    (['children'], StandardScaler()),
    (['salary'], StandardScaler())
])
# Fit and transform the data
X_transformed = mapper.fit_transform(data)
# Or use in sklearn pipeline
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
pipeline = Pipeline([
    ('mapper', mapper),
    ('classifier', LogisticRegression())
])
```

sklearn-pandas provides several key components:

### DataFrameMapper

The core functionality for mapping pandas DataFrame columns to sklearn transformations, with support for flexible column selection, multiple transformers per column, and comprehensive output options.

```python { .api }
class DataFrameMapper:
    def __init__(
        self,
        features,
        default=False,
        sparse=False,
        df_out=False,
        input_df=False,
        drop_cols=None
    ):
        """
        Map pandas DataFrame columns to sklearn transformations.

        Parameters:
        - features: List of tuples with feature definitions [(columns, transformer, options), ...]
        - default: Default transformer for unselected columns (False=discard, None=passthrough, transformer=apply)
        - sparse: Return sparse matrix if True and any features are sparse
        - df_out: Return pandas DataFrame with named columns
        - input_df: Pass DataFrame/Series to transformers instead of numpy arrays
        - drop_cols: List of columns to drop entirely
        """

    def fit(self, X, y=None):
        """
        Fit transformations to the data.

        Parameters:
        - X: pandas DataFrame to fit
        - y: Target vector (optional)

        Returns:
        DataFrameMapper instance
        """

    def transform(self, X):
        """
        Transform data using fitted transformations.

        Parameters:
        - X: pandas DataFrame to transform

        Returns:
        numpy array, sparse matrix, or pandas DataFrame based on configuration
        """

    def fit_transform(self, X, y=None):
        """
        Fit transformations and transform data in one step.

        Parameters:
        - X: pandas DataFrame to fit and transform
        - y: Target vector (optional)

        Returns:
        numpy array, sparse matrix, or pandas DataFrame based on configuration
        """

    def get_names(self, columns, transformer, x, alias=None, prefix='', suffix=''):
        """
        Generate verbose names for transformed columns.

        Parameters:
        - columns: Original column name(s)
        - transformer: Applied transformer
        - x: Transformed data
        - alias: Custom base name for columns
        - prefix: Prefix for column names
        - suffix: Suffix for column names

        Returns:
        List of column names
        """

    def get_dtypes(self, extracted):
        """
        Get data types for all extracted features.

        Parameters:
        - extracted: List of extracted feature arrays/DataFrames

        Returns:
        List of data types for all features
        """

    def get_dtype(self, ex):
        """
        Get data type(s) for a single extracted feature.

        Parameters:
        - ex: Single extracted feature (numpy array, sparse matrix, or DataFrame)

        Returns:
        List of data types (one per column)
        """

    # Attributes (set after fitting/transforming)
    transformed_names_: list
    """
    List of column names for transformed features.
    Set automatically after calling transform() or fit_transform().
    """

    built_features: list
    """
    List of built feature definitions after calling fit().
    Contains tuples of (columns, transformer, options).
    """

    built_default: object
    """
    Built default transformer for unselected columns, if any.
    Set after calling fit().
    """
```
### Feature Generation Utilities
Helper functions for programmatically generating feature definitions and applying transformations.
```python { .api }
def gen_features(columns, classes=None, prefix='', suffix=''):
    """
    Generate a feature definition list for DataFrameMapper.

    Parameters:
    - columns: List of column names to generate features for
    - classes: List of transformer classes or dicts with class and params
    - prefix: Prefix for transformed column names
    - suffix: Suffix for transformed column names

    Returns:
    List of feature definition tuples
    """
```
### Pipeline Utilities

Custom pipeline components for transformer chaining and cross-validation compatibility. These must be imported from submodules.

```python
from sklearn_pandas.pipeline import TransformerPipeline, make_transformer_pipeline, _call_fit
```

```python { .api }
class TransformerPipeline(Pipeline):
    def __init__(self, steps):
        """
        Pipeline expecting all steps to be transformers.
        Inherits from sklearn.pipeline.Pipeline.

        Parameters:
        - steps: List of (name, transformer) tuples
        """

    def fit(self, X, y=None, **fit_params):
        """Fit the pipeline."""

    def transform(self, X):
        """Transform data using the pipeline."""

    def fit_transform(self, X, y=None, **fit_params):
        """Fit and transform using the pipeline."""
def make_transformer_pipeline(*steps):
    """
    Construct a TransformerPipeline from the given estimators.

    Parameters:
    - steps: Transformer instances

    Returns:
    TransformerPipeline instance
    """
def _call_fit(fit_method, X, y=None, **kwargs):
    """
    Helper function for calling fit or fit_transform methods with the correct parameters.
    Handles transformers that may or may not accept a y parameter.

    Parameters:
    - fit_method: fit or fit_transform method of the transformer
    - X: Data to fit
    - y: Target vector relative to X (optional)
    - **kwargs: Keyword arguments to the fit method

    Returns:
    Result of the fit or fit_transform method
    """
```
### Deprecated Transformers

Deprecated numerical transformers maintained for backward compatibility.

```python { .api }
class NumericalTransformer:
"""
DEPRECATED: Will be removed in version 3.0.
Use sklearn.base.TransformerMixin for custom transformers.
"""
SUPPORTED_FUNCTIONS = ['log', 'log1p']
def __init__(self, func):
"""
Parameters:
- func: Function name ('log' or 'log1p')
"""
def fit(self, X, y=None):
"""Fit transformer (no-op)."""
def transform(self, X, y=None):
"""Apply numerical transformation."""Compatibility wrapper for older sklearn versions. Must be imported from submodule.
### Cross-Validation Compatibility

Compatibility wrapper for older sklearn versions. Must be imported from its submodule.

```python
from sklearn_pandas.cross_validation import DataWrapper
```

```python { .api }
class DataWrapper:
    def __init__(self, df):
        """
        Wrapper for DataFrame with indexing support.

        Parameters:
        - df: pandas DataFrame to wrap
        """

    def __len__(self):
        """Get length of wrapped DataFrame."""

    def __getitem__(self, key):
        """Get item using iloc indexing."""
```
### Feature Definitions

Feature definitions are tuples with one to three elements:

```python
# Single column as string - passes 1D array to transformer
('column_name', StandardScaler())
# Single column as list - passes 2D array to transformer
(['column_name'], StandardScaler())
# Multiple columns
(['col1', 'col2', 'col3'], StandardScaler())
# Callable column selector
(lambda df: df.select_dtypes(include=[np.number]).columns, StandardScaler())
```

Options are passed as an optional third tuple element:

```python
# Custom column naming
('salary', StandardScaler(), {'alias': 'normalized_salary'})
# Column prefixes and suffixes
('category', LabelBinarizer(), {'prefix': 'cat_', 'suffix': '_flag'})
# Input format control
('text_col', CountVectorizer(), {'input_df': True})
```

A list of transformers as the second element chains them via TransformerPipeline:

```python
# Chain transformers with TransformerPipeline
('numeric_col', [StandardScaler(), PCA(n_components=2)])
# Equivalent using make_transformer_pipeline
from sklearn_pandas.pipeline import make_transformer_pipeline
('numeric_col', make_transformer_pipeline(StandardScaler(), PCA(n_components=2)))
```

```python
# Handle categorical and numerical columns differently
mapper = DataFrameMapper([
    # Categorical columns - use LabelBinarizer for 1D input
    ('category', LabelBinarizer()),
    ('status', LabelBinarizer()),
    # Numerical columns - use list notation for 2D input
    (['price'], StandardScaler()),
    (['quantity'], StandardScaler()),
    # Text columns with custom options
    ('description', CountVectorizer(), {'input_df': True})
])
```

```python
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier
# Create complete ML pipeline
pipeline = Pipeline([
    ('features', DataFrameMapper([
        ('category', LabelBinarizer()),
        (['numerical_col'], StandardScaler())
    ])),
    ('classifier', RandomForestClassifier())
])

# Use in cross-validation
scores = cross_val_score(pipeline, df, target, cv=5)
```

```python
# Return transformed data as DataFrame with named columns
mapper = DataFrameMapper([
    ('cat_col', LabelBinarizer()),
    (['num_col'], StandardScaler())
], df_out=True)

transformed_df = mapper.fit_transform(data)
# Result is a pandas DataFrame with meaningful column names
```

```python
# Apply default transformation to unselected columns
mapper = DataFrameMapper([
    ('specific_col', StandardScaler())
], default=StandardScaler())  # Apply StandardScaler to all other columns

# Or pass through unselected columns unchanged
mapper = DataFrameMapper([
    ('specific_col', StandardScaler())
], default=None)  # Keep other columns as-is
```

DataFrameMapper provides enhanced error messages that include column names for easier debugging:

```python
# If transformation fails, error message includes problematic column names
try:
    mapper.fit_transform(data)
except Exception as e:
    # Error message will include column names like: "['column_name']: Original error message"
    print(e)
```

```python
# Feature definition tuple format
FeatureDefinition = tuple  # Format: (column_selector, transformer(s), options)
# column_selector: str, list of str, or callable
# transformer(s): sklearn transformer instance, list of transformers, or None
# options: dict or None (optional third element)

# Common option keys
TransformationOptions = dict  # {
#     'alias': str,      # Custom name for transformed features
#     'prefix': str,     # Prefix for column names
#     'suffix': str,     # Suffix for column names
#     'input_df': bool,  # Pass DataFrame instead of numpy array
# }
```
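Tying the option keys together, a short sketch of how they shape output column names; the exact generated names depend on the transformer, so the printed value is illustrative:

```python
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn_pandas import DataFrameMapper

df = pd.DataFrame({'salary': [90., 24, 44]})

# 'alias' replaces the base column name; 'prefix' is prepended to it
mapper = DataFrameMapper([
    (['salary'], StandardScaler(), {'alias': 'norm_salary', 'prefix': 'f_'}),
], df_out=True)

out = mapper.fit_transform(df)
print(out.columns.tolist())  # e.g. ['f_norm_salary']
```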