A comprehensive machine learning library providing supervised and unsupervised learning algorithms with consistent APIs and extensive tools for data preprocessing, model evaluation, and deployment.
Pipeline utilities for building composite estimators that chain together preprocessing steps and learning algorithms. These tools enable creating robust, reproducible machine learning workflows.
Chain transformers and estimators together in a single workflow.
from sklearn.pipeline import Pipeline
Pipeline(
steps: list[tuple[str, estimator]],
memory: str | object | None = None,
verbose: bool = False
)

Combine multiple transformer objects into a single transformer.
from sklearn.pipeline import FeatureUnion
FeatureUnion(
transformer_list: list[tuple[str, transformer]],
n_jobs: int | None = None,
transformer_weights: dict | None = None,
verbose: bool = False
)

Apply different transformers to different columns of the data.
from sklearn.compose import ColumnTransformer
ColumnTransformer(
transformers: list[tuple[str, transformer, columns]],
remainder: str | transformer = "drop",
sparse_threshold: float = 0.3,
n_jobs: int | None = None,
transformer_weights: dict | None = None,
verbose: bool = False,
verbose_feature_names_out: bool = True
)

Meta-estimator to regress on a transformed target.
from sklearn.compose import TransformedTargetRegressor
TransformedTargetRegressor(
regressor: estimator | None = None,
transformer: transformer | None = None,
func: callable | None = None,
inverse_func: callable | None = None,
check_inverse: bool = True
)

Create a Pipeline using abbreviated syntax.
from sklearn.pipeline import make_pipeline
def make_pipeline(
*steps: estimator,
memory: str | object | None = None,
verbose: bool = False
) -> Pipeline: ...

Create a FeatureUnion using abbreviated syntax.
from sklearn.pipeline import make_union
def make_union(
*transformers: transformer,
n_jobs: int | None = None,
verbose: bool = False
) -> FeatureUnion: ...

Create a ColumnTransformer using abbreviated syntax.
from sklearn.compose import make_column_transformer
def make_column_transformer(
*transformers: tuple[transformer, columns],
remainder: str | transformer = "drop",
sparse_threshold: float = 0.3,
n_jobs: int | None = None,
verbose: bool = False,
verbose_feature_names_out: bool = True
) -> ColumnTransformer: ...

Create a callable to select columns based on column properties.
from sklearn.compose import make_column_selector
def make_column_selector(
pattern: str | None = None,
dtype_include: type | list[type] | None = None,
dtype_exclude: type | list[type] | None = None
) -> callable: ...

from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
# Explicit pipeline creation
pipe = Pipeline([
('scaler', StandardScaler()),
('classifier', LogisticRegression())
])
# Abbreviated syntax
pipe = make_pipeline(
StandardScaler(),
LogisticRegression()
)
# Fit and predict
pipe.fit(X_train, y_train)
y_pred = pipe.predict(X_test)
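
Because the pipeline behaves as a single estimator, it can be passed directly to model-selection utilities. A minimal sketch, assuming the pipe and the X_train/y_train data from the snippet above:

from sklearn.model_selection import cross_val_score
# Preprocessing is re-fit inside each fold, so test-fold statistics do not leak into training
scores = cross_val_score(pipe, X_train, y_train, cv=5)
print(scores.mean())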

from sklearn.compose import ColumnTransformer, make_column_transformer, make_column_selector
from sklearn.preprocessing import StandardScaler, OneHotEncoder
import numpy as np
from sklearn.impute import SimpleImputer
# Explicit column transformer
preprocessor = ColumnTransformer([
('num', StandardScaler(), ['age', 'income']),
('cat', OneHotEncoder(), ['category', 'region'])
])
# Using make_column_transformer
preprocessor = make_column_transformer(
(StandardScaler(), ['age', 'income']),
(OneHotEncoder(), ['category', 'region'])
)
# Using column selectors
preprocessor = ColumnTransformer([
('num', StandardScaler(), make_column_selector(dtype_include=np.number)),
('cat', OneHotEncoder(), make_column_selector(dtype_include='object'))
])
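
The remainder and pattern options from the signatures above are not shown here; a minimal sketch of both (the column names and the "num_" prefix are illustrative):

# Keep the unlisted columns instead of dropping them (remainder defaults to "drop")
preprocessor = ColumnTransformer([
    ('num', StandardScaler(), ['age', 'income'])
], remainder='passthrough')
# Select columns by name with a regular expression
preprocessor = ColumnTransformer([
    ('scaled', StandardScaler(), make_column_selector(pattern='^num_'))
])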

from sklearn.pipeline import FeatureUnion, make_union
from sklearn.decomposition import PCA
from sklearn.feature_selection import SelectKBest
# Combine multiple feature transformations
feature_union = FeatureUnion([
('pca', PCA(n_components=2)),
('select_best', SelectKBest(k=3))
])
# Abbreviated syntax
feature_union = make_union(
PCA(n_components=2),
SelectKBest(k=3)
)
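
The transformer_weights parameter from the FeatureUnion signature rescales each transformer's output, and a union is itself a transformer that can serve as a pipeline step; a minimal sketch:

# Down-weight the PCA features relative to the univariate selection
feature_union = FeatureUnion([
    ('pca', PCA(n_components=2)),
    ('select_best', SelectKBest(k=3))
], transformer_weights={'pca': 0.5, 'select_best': 1.0})
# Use the union as a preprocessing step inside a Pipeline
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
pipe = Pipeline([
    ('features', feature_union),
    ('classifier', LogisticRegression())
])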

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier
# Preprocessing for numerical columns
numeric_features = ['age', 'income', 'score']
numeric_transformer = Pipeline(steps=[
('imputer', SimpleImputer(strategy='median')),
('scaler', StandardScaler())
])
# Preprocessing for categorical columns
categorical_features = ['category', 'region', 'type']
categorical_transformer = Pipeline(steps=[
('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
('onehot', OneHotEncoder(handle_unknown='ignore'))
])
# Combine preprocessing steps
preprocessor = ColumnTransformer(
transformers=[
('num', numeric_transformer, numeric_features),
('cat', categorical_transformer, categorical_features)
]
)
# Create full pipeline
clf = Pipeline(steps=[
('preprocessor', preprocessor),
('classifier', RandomForestClassifier())
])
# Train the pipeline
clf.fit(X_train, y_train)
accuracy = clf.score(X_test, y_test)

from sklearn.compose import TransformedTargetRegressor
from sklearn.preprocessing import QuantileTransformer
from sklearn.linear_model import LinearRegression
import numpy as np
# Apply log transformation to target variable
regressor = TransformedTargetRegressor(
regressor=LinearRegression(),
func=np.log,
inverse_func=np.exp
)
# Or use a transformer
regressor = TransformedTargetRegressor(
regressor=LinearRegression(),
transformer=QuantileTransformer()
)
regressor.fit(X_train, y_train)
y_pred = regressor.predict(X_test)
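
np.log is undefined at zero, so targets that may contain zeros are often handled with log1p/expm1 instead; a minimal sketch of that variant:

# log1p/expm1 tolerate zero-valued targets; check_inverse verifies the round trip on a sample
regressor = TransformedTargetRegressor(
    regressor=LinearRegression(),
    func=np.log1p,
    inverse_func=np.expm1,
    check_inverse=True
)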

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from tempfile import mkdtemp
# Cache intermediate results for faster re-fitting
cachedir = mkdtemp()
pipe = Pipeline([
('scale', StandardScaler()),
('reduce_dim', PCA()),
('classify', LogisticRegression())
], memory=cachedir)
# First fit will cache intermediate results
pipe.fit(X_train, y_train)
# Subsequent fits with same early steps will use cache
pipe.set_params(classify__C=0.1)
pipe.fit(X_train, y_train) # Only refits the classifier
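
The cache directory created by mkdtemp() is not cleaned up automatically; a minimal cleanup sketch for when the pipeline is no longer needed:

from shutil import rmtree
# Remove the cached intermediate results
rmtree(cachedir)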

# Access steps by name (the pipe from the first example, with 'scaler' and 'classifier' steps)
pipe['scaler'] # Returns the scaler step
pipe[0] # Returns first step
pipe[:-1] # Returns all steps except the last
# Get step names
pipe.named_steps.keys()
# Set parameters for specific steps
pipe.set_params(scaler__with_mean=False)
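
The same step__parameter syntax works for hyperparameter search; a minimal sketch, assuming the 'scaler'/'classifier' pipeline above:

from sklearn.model_selection import GridSearchCV
# Pipeline step parameters are addressed as <step name>__<parameter name>
param_grid = {'classifier__C': [0.1, 1.0, 10.0]}
search = GridSearchCV(pipe, param_grid, cv=5)
search.fit(X_train, y_train)
print(search.best_params_)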

# Get feature names from transformers
preprocessor.get_feature_names_out()
# Get transformed feature names
pipe[:-1].get_feature_names_out()
# Feature selection with pipelines
from sklearn.feature_selection import SelectKBest
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
pipe = Pipeline([
('select', SelectKBest(k=10)),
('classify', LogisticRegression())
])
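
The verbose_feature_names_out flag from the ColumnTransformer signature controls whether generated names carry the "num__"/"cat__" transformer prefix; a minimal sketch reusing the transformers defined in the composite example above:

# With the default (True), names look like "num__age"; False keeps the original names where possible
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ],
    verbose_feature_names_out=False
)
preprocessor.fit(X_train)
print(preprocessor.get_feature_names_out())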

Install with Tessl CLI:
npx tessl i tessl/pypi-scikit-learndocs