or run

npx @tessl/cli init
Log in

Version

Tile

Overview

Evals

Files

Files

docs

datasets.md feature-extraction.md index.md metrics.md model-selection.md neighbors.md pipelines.md preprocessing.md supervised-learning.md unsupervised-learning.md utilities.md

docs/pipelines.md

# Pipelines and Composition

1

2

Pipeline utilities for building composite estimators that chain together preprocessing steps and learning algorithms. These tools enable creating robust, reproducible machine learning workflows.

3

4

## Core Pipeline Classes

5

6

### Pipeline

7

8

Chain transformers and estimators together in a single workflow.

9

10

```python { .api }

11

from sklearn.pipeline import Pipeline

12

13

Pipeline(

14

steps: list[tuple[str, estimator]],

15

memory: str | object | None = None,

16

verbose: bool = False

17

)

18

```

19

20

### FeatureUnion

21

22

Combine multiple transformer objects into a single transformer.

23

24

```python { .api }

25

from sklearn.pipeline import FeatureUnion

26

27

FeatureUnion(

28

transformer_list: list[tuple[str, transformer]],

29

n_jobs: int | None = None,

30

transformer_weights: dict | None = None,

31

verbose: bool = False

32

)

33

```

34

35

### ColumnTransformer

36

37

Apply different transformers to different columns of the data.

38

39

```python { .api }

40

from sklearn.compose import ColumnTransformer

41

42

ColumnTransformer(

43

transformers: list[tuple[str, transformer, columns]],

44

remainder: str | transformer = "drop",

45

sparse_threshold: float = 0.3,

46

n_jobs: int | None = None,

47

transformer_weights: dict | None = None,

48

verbose: bool = False,

49

verbose_feature_names_out: bool = True

50

)

51

```

52

53

### TransformedTargetRegressor

54

55

Meta-estimator to regress on a transformed target.

56

57

```python { .api }

58

from sklearn.compose import TransformedTargetRegressor

59

60

TransformedTargetRegressor(

61

regressor: estimator | None = None,

62

transformer: transformer | None = None,

63

func: callable | None = None,

64

inverse_func: callable | None = None,

65

check_inverse: bool = True

66

)

67

```

68

69

## Convenience Functions

70

71

### make_pipeline

72

73

Create a Pipeline using abbreviated syntax.

74

75

```python { .api }

76

from sklearn.pipeline import make_pipeline

77

78

def make_pipeline(

79

*steps: estimator,

80

memory: str | object | None = None,

81

verbose: bool = False

82

) -> Pipeline: ...

83

```

84

85

### make_union

86

87

Create a FeatureUnion using abbreviated syntax.

88

89

```python { .api }

90

from sklearn.pipeline import make_union

91

92

def make_union(

93

*transformers: transformer,

94

n_jobs: int | None = None,

95

verbose: bool = False

96

) -> FeatureUnion: ...

97

```

98

99

### make_column_transformer

100

101

Create a ColumnTransformer using abbreviated syntax.

102

103

```python { .api }

104

from sklearn.compose import make_column_transformer

105

106

def make_column_transformer(

107

*transformers: tuple[transformer, columns],

108

remainder: str | transformer = "drop",

109

sparse_threshold: float = 0.3,

110

n_jobs: int | None = None,

111

verbose: bool = False,

112

verbose_feature_names_out: bool = True

113

) -> ColumnTransformer: ...

114

```

115

116

### make_column_selector

117

118

Create a callable to select columns based on column properties.

119

120

```python { .api }

121

from sklearn.compose import make_column_selector

122

123

def make_column_selector(

124

pattern: str | None = None,

125

dtype_include: type | list[type] | None = None,

126

dtype_exclude: type | list[type] | None = None

127

) -> callable: ...

128

```

129

130

## Usage Examples

131

132

### Basic Pipeline

133

134

```python

135

from sklearn.pipeline import Pipeline, make_pipeline

136

from sklearn.preprocessing import StandardScaler

137

from sklearn.linear_model import LogisticRegression

138

139

# Explicit pipeline creation

140

pipe = Pipeline([

141

('scaler', StandardScaler()),

142

('classifier', LogisticRegression())

143

])

144

145

# Abbreviated syntax

146

pipe = make_pipeline(

147

StandardScaler(),

148

LogisticRegression()

149

)

150

151

# Fit and predict

152

pipe.fit(X_train, y_train)

153

y_pred = pipe.predict(X_test)

154

```

155

156

### Column-wise Transformations

157

158

```python

159

import numpy as np
from sklearn.compose import ColumnTransformer, make_column_transformer, make_column_selector

160

from sklearn.preprocessing import StandardScaler, OneHotEncoder

161

from sklearn.impute import SimpleImputer

162

163

# Explicit column transformer

164

preprocessor = ColumnTransformer([

165

('num', StandardScaler(), ['age', 'income']),

166

('cat', OneHotEncoder(), ['category', 'region'])

167

])

168

169

# Using make_column_transformer

170

preprocessor = make_column_transformer(

171

(StandardScaler(), ['age', 'income']),

172

(OneHotEncoder(), ['category', 'region'])

173

)

174

175

# Using column selectors

176

preprocessor = ColumnTransformer([

177

('num', StandardScaler(), make_column_selector(dtype_include=np.number)),

178

('cat', OneHotEncoder(), make_column_selector(dtype_include='object'))

179

])

180

```

181

182

### Feature Union

183

184

```python

185

from sklearn.pipeline import FeatureUnion, make_union

186

from sklearn.decomposition import PCA

187

from sklearn.feature_selection import SelectKBest

188

189

# Combine multiple feature transformations

190

feature_union = FeatureUnion([

191

('pca', PCA(n_components=2)),

192

('select_best', SelectKBest(k=3))

193

])

194

195

# Abbreviated syntax

196

feature_union = make_union(

197

PCA(n_components=2),

198

SelectKBest(k=3)

199

)

200

```

201

202

### Complex Pipeline with Column Transformer

203

204

```python

205

from sklearn.pipeline import Pipeline

206

from sklearn.compose import ColumnTransformer

207

from sklearn.preprocessing import StandardScaler, OneHotEncoder

208

from sklearn.impute import SimpleImputer

209

from sklearn.ensemble import RandomForestClassifier

210

211

# Preprocessing for numerical columns

212

numeric_features = ['age', 'income', 'score']

213

numeric_transformer = Pipeline(steps=[

214

('imputer', SimpleImputer(strategy='median')),

215

('scaler', StandardScaler())

216

])

217

218

# Preprocessing for categorical columns

219

categorical_features = ['category', 'region', 'type']

220

categorical_transformer = Pipeline(steps=[

221

('imputer', SimpleImputer(strategy='constant', fill_value='missing')),

222

('onehot', OneHotEncoder(handle_unknown='ignore'))

223

])

224

225

# Combine preprocessing steps

226

preprocessor = ColumnTransformer(

227

transformers=[

228

('num', numeric_transformer, numeric_features),

229

('cat', categorical_transformer, categorical_features)

230

]

231

)

232

233

# Create full pipeline

234

clf = Pipeline(steps=[

235

('preprocessor', preprocessor),

236

('classifier', RandomForestClassifier())

237

])

238

239

# Train the pipeline

240

clf.fit(X_train, y_train)

241

accuracy = clf.score(X_test, y_test)

242

```

243

244

### Target Transformation

245

246

```python

247

from sklearn.compose import TransformedTargetRegressor

248

from sklearn.preprocessing import QuantileTransformer

249

from sklearn.linear_model import LinearRegression

250

import numpy as np

251

252

# Apply log transformation to target variable

253

regressor = TransformedTargetRegressor(

254

regressor=LinearRegression(),

255

func=np.log,

256

inverse_func=np.exp

257

)

258

259

# Or use a transformer

260

regressor = TransformedTargetRegressor(

261

regressor=LinearRegression(),

262

transformer=QuantileTransformer()

263

)

264

265

regressor.fit(X_train, y_train)

266

y_pred = regressor.predict(X_test)

267

```

268

269

### Pipeline with Memory Caching

270

271

```python

272

from sklearn.pipeline import Pipeline

273

from sklearn.preprocessing import StandardScaler

274

from sklearn.decomposition import PCA

275

from sklearn.linear_model import LogisticRegression

276

from tempfile import mkdtemp

277

278

# Cache intermediate results for faster re-fitting

279

cachedir = mkdtemp()

280

pipe = Pipeline([

281

('scale', StandardScaler()),

282

('reduce_dim', PCA()),

283

('classify', LogisticRegression())

284

], memory=cachedir)

285

286

# First fit will cache intermediate results

287

pipe.fit(X_train, y_train)

288

289

# Subsequent fits with same early steps will use cache

290

pipe.set_params(classify__C=0.1)

291

pipe.fit(X_train, y_train) # Only refits the classifier

292

```

293

294

## Pipeline Properties and Methods

295

296

### Accessing Pipeline Steps

297

298

```python

299

# Access steps by name

300

pipe['scaler'] # Returns the scaler step

301

pipe[0] # Returns first step

302

pipe[:-1] # Returns all steps except the last

303

304

# Get step names

305

pipe.named_steps.keys()

306

307

# Set parameters for specific steps

308

pipe.set_params(scaler__with_mean=False)

309

```

310

311

### Feature Names and Selection

312

313

```python

314

# Get feature names from transformers

315

preprocessor.get_feature_names_out()

316

317

# Get transformed feature names

318

pipe[:-1].get_feature_names_out()

319

320

# Feature selection with pipelines

321

from sklearn.feature_selection import SelectKBest
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline

323

324

pipe = Pipeline([

325

('select', SelectKBest(k=10)),

326

('classify', LogisticRegression())

327

])

328

```