# Pipelines and Composition

Pipeline utilities for building composite estimators that chain together preprocessing steps and learning algorithms. These tools enable creating robust, reproducible machine learning workflows.

## Core Pipeline Classes

### Pipeline

Chain transformers and estimators together in a single workflow.

```python { .api }
from sklearn.pipeline import Pipeline

Pipeline(
    steps: list[tuple[str, estimator]],
    memory: str | object | None = None,
    verbose: bool = False
)
```

### FeatureUnion

Combine multiple transformer objects into a single transformer.

```python { .api }
from sklearn.pipeline import FeatureUnion

FeatureUnion(
    transformer_list: list[tuple[str, transformer]],
    n_jobs: int | None = None,
    transformer_weights: dict | None = None,
    verbose: bool = False
)
```

### ColumnTransformer

Apply different transformers to different columns of the data.

```python { .api }
from sklearn.compose import ColumnTransformer

ColumnTransformer(
    transformers: list[tuple[str, transformer, columns]],
    remainder: str | transformer = "drop",
    sparse_threshold: float = 0.3,
    n_jobs: int | None = None,
    transformer_weights: dict | None = None,
    verbose: bool = False,
    verbose_feature_names_out: bool = True
)
```

### TransformedTargetRegressor

Meta-estimator to regress on a transformed target.

```python { .api }
from sklearn.compose import TransformedTargetRegressor

TransformedTargetRegressor(
    regressor: estimator | None = None,
    transformer: transformer | None = None,
    func: callable | None = None,
    inverse_func: callable | None = None,
    check_inverse: bool = True
)
```

## Convenience Functions

### make_pipeline

Create a Pipeline using abbreviated syntax.

```python { .api }
from sklearn.pipeline import make_pipeline

def make_pipeline(
    *steps: estimator,
    memory: str | object | None = None,
    verbose: bool = False
) -> Pipeline: ...
```

### make_union

Create a FeatureUnion using abbreviated syntax.

```python { .api }
from sklearn.pipeline import make_union

def make_union(
    *transformers: transformer,
    n_jobs: int | None = None,
    verbose: bool = False
) -> FeatureUnion: ...
```

### make_column_transformer

Create a ColumnTransformer using abbreviated syntax.

```python { .api }
from sklearn.compose import make_column_transformer

def make_column_transformer(
    *transformers: tuple[transformer, columns],
    remainder: str | transformer = "drop",
    sparse_threshold: float = 0.3,
    n_jobs: int | None = None,
    verbose: bool = False,
    verbose_feature_names_out: bool = True
) -> ColumnTransformer: ...
```

### make_column_selector

Create a callable to select columns based on column properties.

```python { .api }
from sklearn.compose import make_column_selector

def make_column_selector(
    pattern: str | None = None,
    dtype_include: type | list[type] | None = None,
    dtype_exclude: type | list[type] | None = None
) -> callable: ...
```

## Usage Examples

### Basic Pipeline

```python
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression

# Explicit pipeline creation
pipe = Pipeline([
    ('scaler', StandardScaler()),
    ('classifier', LogisticRegression())
])

# Abbreviated syntax
pipe = make_pipeline(
    StandardScaler(),
    LogisticRegression()
)

# Fit and predict
pipe.fit(X_train, y_train)
y_pred = pipe.predict(X_test)
```

### Column-wise Transformations

```python
import numpy as np

from sklearn.compose import ColumnTransformer, make_column_transformer, make_column_selector
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer

# Explicit column transformer
preprocessor = ColumnTransformer([
    ('num', StandardScaler(), ['age', 'income']),
    ('cat', OneHotEncoder(), ['category', 'region'])
])

# Using make_column_transformer
preprocessor = make_column_transformer(
    (StandardScaler(), ['age', 'income']),
    (OneHotEncoder(), ['category', 'region'])
)

# Using column selectors
preprocessor = ColumnTransformer([
    ('num', StandardScaler(), make_column_selector(dtype_include=np.number)),
    ('cat', OneHotEncoder(), make_column_selector(dtype_include='object'))
])
```

### Feature Union

```python
from sklearn.pipeline import FeatureUnion, make_union
from sklearn.decomposition import PCA
from sklearn.feature_selection import SelectKBest

# Combine multiple feature transformations
feature_union = FeatureUnion([
    ('pca', PCA(n_components=2)),
    ('select_best', SelectKBest(k=3))
])

# Abbreviated syntax
feature_union = make_union(
    PCA(n_components=2),
    SelectKBest(k=3)
)
```

### Complex Pipeline with Column Transformer

```python
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier

# Preprocessing for numerical columns
numeric_features = ['age', 'income', 'score']
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

# Preprocessing for categorical columns
categorical_features = ['category', 'region', 'type']
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Combine preprocessing steps
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ]
)

# Create full pipeline
clf = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier())
])

# Train the pipeline
clf.fit(X_train, y_train)
accuracy = clf.score(X_test, y_test)
```

### Target Transformation

```python
from sklearn.compose import TransformedTargetRegressor
from sklearn.preprocessing import QuantileTransformer
from sklearn.linear_model import LinearRegression
import numpy as np

# Apply log transformation to target variable
regressor = TransformedTargetRegressor(
    regressor=LinearRegression(),
    func=np.log,
    inverse_func=np.exp
)

# Or use a transformer
regressor = TransformedTargetRegressor(
    regressor=LinearRegression(),
    transformer=QuantileTransformer()
)

regressor.fit(X_train, y_train)
y_pred = regressor.predict(X_test)
```

### Pipeline with Memory Caching

```python
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from tempfile import mkdtemp

# Cache intermediate results for faster re-fitting
cachedir = mkdtemp()
pipe = Pipeline([
    ('scale', StandardScaler()),
    ('reduce_dim', PCA()),
    ('classify', LogisticRegression())
], memory=cachedir)

# First fit will cache intermediate results
pipe.fit(X_train, y_train)

# Subsequent fits with same early steps will use cache
pipe.set_params(classify__C=0.1)
pipe.fit(X_train, y_train)  # Only refits the classifier
```

## Pipeline Properties and Methods

### Accessing Pipeline Steps

```python
# Access steps by name
pipe['scaler']  # Returns the scaler step
pipe[0]  # Returns first step
pipe[:-1]  # Returns all steps except the last

# Get step names
pipe.named_steps.keys()

# Set parameters for specific steps
pipe.set_params(scaler__with_mean=False)
```

### Feature Names and Selection

```python
# Get feature names from transformers
preprocessor.get_feature_names_out()

# Get transformed feature names
pipe[:-1].get_feature_names_out()

# Feature selection with pipelines
from sklearn.feature_selection import SelectKBest
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline

pipe = Pipeline([
    ('select', SelectKBest(k=10)),
    ('classify', LogisticRegression())
])
```