A comprehensive machine learning library providing supervised and unsupervised learning algorithms with consistent APIs and extensive tools for data preprocessing, model evaluation, and deployment.
87
This document covers all data preprocessing, feature engineering, and feature selection capabilities in scikit-learn.
from sklearn.preprocessing import StandardScaler
StandardScaler(
copy: bool = True,
with_mean: bool = True,
with_std: bool = True
)
Standardize features by removing the mean and scaling to unit variance.
from sklearn.preprocessing import MinMaxScaler
MinMaxScaler(
feature_range: tuple[float, float] = (0, 1),
copy: bool = True,
clip: bool = False
)Transform features by scaling each feature to a given range.
from sklearn.preprocessing import MaxAbsScaler
MaxAbsScaler(
copy: bool = True
)Scale each feature by its maximum absolute value.
from sklearn.preprocessing import RobustScaler
RobustScaler(
quantile_range: tuple[float, float] = (25.0, 75.0),
copy: bool = True,
unit_variance: bool = False
)Scale features using statistics that are robust to outliers.
from sklearn.preprocessing import Normalizer
Normalizer(
norm: str = "l2",
copy: bool = True
)Normalize samples individually to unit norm.
from sklearn.preprocessing import QuantileTransformer
QuantileTransformer(
n_quantiles: int = 1000,
output_distribution: str = "uniform",
ignore_implicit_zeros: bool = False,
subsample: int = 100000,
random_state: int | RandomState | None = None,
copy: bool = True
)Transform features to follow a uniform or a normal distribution.
from sklearn.preprocessing import PowerTransformer
PowerTransformer(
method: str = "yeo-johnson",
standardize: bool = True,
copy: bool = True
)Apply a power transform featurewise to make data more Gaussian-like.
from sklearn.preprocessing import LabelEncoder
LabelEncoder()
Encode target labels with value between 0 and n_classes-1.
from sklearn.preprocessing import LabelBinarizer
LabelBinarizer(
neg_label: int = 0,
pos_label: int = 1,
sparse_output: bool = False
)Binarize labels in a one-vs-all fashion.
from sklearn.preprocessing import MultiLabelBinarizer
MultiLabelBinarizer(
classes: ArrayLike | None = None,
sparse_output: bool = False
)Transform between iterable of iterables and a multilabel format.
from sklearn.preprocessing import OneHotEncoder
OneHotEncoder(
categories: str | list[ArrayLike] = "auto",
drop: str | ArrayLike | None = None,
sparse_output: bool = True,
dtype: type = ...,
handle_unknown: str = "error",
min_frequency: int | float | None = None,
max_categories: int | None = None,
feature_name_combiner: str | Callable = "concat"
)Encode categorical features as a one-hot numeric array.
from sklearn.preprocessing import OrdinalEncoder
OrdinalEncoder(
categories: str | list[ArrayLike] = "auto",
dtype: type = ...,
handle_unknown: str = "error",
unknown_value: int | float | None = None,
encoded_missing_value: int | float = ...,
min_frequency: int | float | None = None,
max_categories: int | None = None
)Encode categorical features as an integer array.
from sklearn.preprocessing import TargetEncoder
TargetEncoder(
categories: str | list[ArrayLike] = "auto",
target_type: str = "auto",
smooth: str | float = "auto",
cv: int | BaseCrossValidator | Iterable = 5,
shuffle: bool = True,
random_state: int | RandomState | None = None
)Target Encoder for regression and classification targets.
from sklearn.preprocessing import KBinsDiscretizer
KBinsDiscretizer(
n_bins: int | ArrayLike = 5,
encode: str = "onehot",
strategy: str = "quantile",
dtype: type | None = None,
subsample: int | None = 200000,
random_state: int | RandomState | None = None
)Bin continuous data into intervals.
from sklearn.preprocessing import Binarizer
Binarizer(
threshold: float = 0.0,
copy: bool = True
)Binarize data (set feature values to 0 or 1) according to a threshold.
from sklearn.preprocessing import PolynomialFeatures
PolynomialFeatures(
degree: int = 2,
interaction_only: bool = False,
include_bias: bool = True,
order: str = "C"
)Generate polynomial and interaction features.
from sklearn.preprocessing import SplineTransformer
SplineTransformer(
n_knots: int = 5,
degree: int = 3,
knots: str | ArrayLike = "uniform",
extrapolation: str = "constant",
include_bias: bool = True,
order: str = "C",
sparse_output: bool = False
)Generate univariate B-spline bases for features.
from sklearn.preprocessing import FunctionTransformer
FunctionTransformer(
func: Callable | None = None,
inverse_func: Callable | None = None,
validate: bool = False,
accept_sparse: bool = False,
check_inverse: bool = True,
feature_names_out: str | Callable | None = None,
kw_args: dict | None = None,
inv_kw_args: dict | None = None
)Constructs a transformer from an arbitrary callable.
from sklearn.preprocessing import KernelCenterer
KernelCenterer()
Center a kernel matrix.
from sklearn.feature_selection import SelectKBest
SelectKBest(
score_func: Callable = ...,
k: int | str = 10
)Select features according to the k highest scores.
from sklearn.feature_selection import SelectPercentile
SelectPercentile(
score_func: Callable = ...,
percentile: int = 10
)Select features according to a percentile of the highest scores.
from sklearn.feature_selection import SelectFpr
SelectFpr(
score_func: Callable = ...,
alpha: float = 0.05
)Filter: Select the pvalues below alpha based on a FPR test.
from sklearn.feature_selection import SelectFdr
SelectFdr(
score_func: Callable = ...,
alpha: float = 0.05
)Filter: Select the p-values for an estimated false discovery rate.
from sklearn.feature_selection import SelectFwe
SelectFwe(
score_func: Callable = ...,
alpha: float = 0.05
)Filter: Select the p-values corresponding to Family-wise error rate.
from sklearn.feature_selection import GenericUnivariateSelect
GenericUnivariateSelect(
score_func: Callable = ...,
mode: str = "percentile",
param: int | float = 1e-05
)Univariate feature selector with configurable strategy.
from sklearn.feature_selection import SelectFromModel
SelectFromModel(
estimator: BaseEstimator,
threshold: str | float | None = None,
prefit: bool = False,
norm_order: int = 1,
max_features: int | Callable | None = None,
importance_getter: str | Callable = "auto"
)Meta-transformer for selecting features based on importance weights.
from sklearn.feature_selection import RFE
RFE(
estimator: BaseEstimator,
n_features_to_select: int | float | None = None,
step: int | float = 1,
verbose: int = 0,
importance_getter: str | Callable = "auto"
)Feature ranking with recursive feature elimination.
from sklearn.feature_selection import RFECV
RFECV(
estimator: BaseEstimator,
step: int | float = 1,
min_features_to_select: int = 1,
cv: int | BaseCrossValidator | Iterable | None = None,
scoring: str | Callable | None = None,
verbose: int = 0,
n_jobs: int | None = None,
importance_getter: str | Callable = "auto"
)Recursive feature elimination with cross-validation.
from sklearn.feature_selection import SequentialFeatureSelector
SequentialFeatureSelector(
estimator: BaseEstimator,
n_features_to_select: int | float | str = "auto",
tol: float | None = None,
direction: str = "forward",
scoring: str | Callable | None = None,
cv: int | BaseCrossValidator | Iterable = 5,
n_jobs: int | None = None
)Sequential Feature Selector.
from sklearn.feature_selection import VarianceThreshold
VarianceThreshold(
threshold: float = 0.0
)Feature selector that removes all low-variance features.
from sklearn.feature_selection import SelectorMixin
SelectorMixin()
Transformer mixin that performs feature selection given a support mask.
from sklearn.feature_selection import chi2
chi2(
X: ArrayLike,
y: ArrayLike
) -> tuple[ArrayLike, ArrayLike]
Compute chi-squared stats between each non-negative feature and class.
from sklearn.feature_selection import f_classif
f_classif(
X: ArrayLike,
y: ArrayLike
) -> tuple[ArrayLike, ArrayLike]Compute the ANOVA F-value for the provided sample.
from sklearn.feature_selection import f_oneway
f_oneway(
*samples: ArrayLike
) -> tuple[ArrayLike, ArrayLike]Test for equal means in two or more samples from the normal distribution.
from sklearn.feature_selection import f_regression
f_regression(
X: ArrayLike,
y: ArrayLike,
center: bool = True
) -> tuple[ArrayLike, ArrayLike]Univariate linear regression tests returning F-statistic and p-values.
from sklearn.feature_selection import r_regression
r_regression(
X: ArrayLike,
y: ArrayLike,
center: bool = True,
force_finite: bool = True
) -> tuple[ArrayLike, ArrayLike]Compute Pearson's r for each feature with the target.
from sklearn.feature_selection import mutual_info_classif
mutual_info_classif(
X: ArrayLike,
y: ArrayLike,
discrete_features: str | bool | ArrayLike = "auto",
n_neighbors: int = 3,
copy: bool = True,
random_state: int | RandomState | None = None
) -> ArrayLikeEstimate mutual information for a discrete target variable.
from sklearn.feature_selection import mutual_info_regression
mutual_info_regression(
X: ArrayLike,
y: ArrayLike,
discrete_features: str | bool | ArrayLike = "auto",
n_neighbors: int = 3,
copy: bool = True,
random_state: int | RandomState | None = None
) -> ArrayLikeEstimate mutual information for a continuous target variable.
from sklearn.preprocessing import scale
scale(
X: ArrayLike,
axis: int = 0,
with_mean: bool = True,
with_std: bool = True,
copy: bool = True
) -> ArrayLikeStandardize a dataset along any axis.
from sklearn.preprocessing import minmax_scale
minmax_scale(
X: ArrayLike,
feature_range: tuple[float, float] = (0, 1),
axis: int = 0,
copy: bool = True
) -> ArrayLikeTransform features by scaling each feature to a given range.
from sklearn.preprocessing import maxabs_scale
maxabs_scale(
X: ArrayLike,
axis: int = 0,
copy: bool = True
) -> ArrayLikeScale each feature to the [-1, 1] range without breaking sparsity.
from sklearn.preprocessing import robust_scale
robust_scale(
X: ArrayLike,
axis: int = 0,
quantile_range: tuple[float, float] = (25.0, 75.0),
copy: bool = True,
unit_variance: bool = False
) -> ArrayLikeStandardize a dataset along any axis.
from sklearn.preprocessing import normalize
normalize(
X: ArrayLike,
norm: str = "l2",
axis: int = 1,
copy: bool = True,
return_norm: bool = False
) -> ArrayLike | tuple[ArrayLike, ArrayLike]Scale input vectors individually to unit norm (vector length).
from sklearn.preprocessing import quantile_transform
quantile_transform(
X: ArrayLike,
axis: int = 0,
n_quantiles: int = 1000,
output_distribution: str = "uniform",
ignore_implicit_zeros: bool = False,
subsample: int = 100000,
random_state: int | RandomState | None = None,
copy: bool = True
) -> ArrayLikeTransform features to follow a uniform or a normal distribution.
from sklearn.preprocessing import power_transform
power_transform(
X: ArrayLike,
method: str = "yeo-johnson",
standardize: bool = True,
copy: bool = True
) -> ArrayLikeApply a power transform featurewise to make data more Gaussian-like.
from sklearn.preprocessing import label_binarize
label_binarize(
y: ArrayLike,
classes: ArrayLike,
neg_label: int = 0,
pos_label: int = 1,
sparse_output: bool = False
) -> ArrayLikeBinarize labels in a one-vs-all fashion.
from sklearn.preprocessing import binarize
binarize(
X: ArrayLike,
threshold: float = 0.0,
copy: bool = True
) -> ArrayLikeBoolean thresholding of array-like or scipy.sparse matrix.
from sklearn.preprocessing import add_dummy_feature
add_dummy_feature(
X: ArrayLike,
value: float = 1.0
) -> ArrayLikeAugment dataset with an additional dummy feature.
from sklearn.feature_extraction import DictVectorizer
DictVectorizer(
dtype: type = ...,
separator: str = "=",
sparse: bool = True,
sort: bool = True
)Transforms lists of feature-value mappings to vectors.
from sklearn.feature_extraction import FeatureHasher
FeatureHasher(
n_features: int = 1048576,
input_type: str = "dict",
dtype: type = ...,
alternate_sign: bool = True
)Implements feature hashing, aka the hashing trick.
from sklearn.feature_extraction import img_to_graph
img_to_graph(
img: ArrayLike,
mask: ArrayLike | None = None,
return_as: type = ...,
dtype: type | None = None
) -> ArrayLikeGraph of the pixel-to-pixel gradient connections.
from sklearn.feature_extraction import grid_to_graph
grid_to_graph(
n_x: int,
n_y: int,
n_z: int | None = None,
mask: ArrayLike | None = None,
return_as: type = ...,
dtype: type = ...,
**kwargs
) -> ArrayLikeGraph of the pixel-to-pixel gradient connections.
from sklearn.impute import SimpleImputer
SimpleImputer(
missing_values: int | float | str | None = ...,
strategy: str = "mean",
fill_value: str | int | float | None = None,
copy: bool = True,
add_indicator: bool = False,
keep_empty_features: bool = False
)Imputation transformer for completing missing values.
from sklearn.impute import KNNImputer
KNNImputer(
missing_values: int | float | str | None = ...,
n_neighbors: int = 5,
weights: str | Callable = "uniform",
metric: str | Callable = "nan_euclidean",
copy: bool = True,
add_indicator: bool = False,
keep_empty_features: bool = False
)Imputation for completing missing values using k-Nearest Neighbors.
from sklearn.impute import MissingIndicator
MissingIndicator(
missing_values: int | float | str | None = ...,
features: str = "missing-only",
sparse: bool | str = "auto",
error_on_new: bool = True
)Binary indicators for missing values.
from sklearn.kernel_approximation import RBFSampler
RBFSampler(
gamma: float = 1.0,
n_components: int = 100,
random_state: int | RandomState | None = None
)Approximate a RBF kernel feature map using random Fourier features.
from sklearn.kernel_approximation import Nystroem
Nystroem(
kernel: str | Callable = "rbf",
gamma: float | None = None,
coef0: float | None = None,
degree: float | None = None,
kernel_params: dict | None = None,
n_components: int = 100,
random_state: int | RandomState | None = None,
n_jobs: int | None = None
)Approximate a kernel map using a subset of the training data.
from sklearn.kernel_approximation import AdditiveChi2Sampler
AdditiveChi2Sampler(
sample_steps: int = 2,
sample_interval: float | None = None
)Approximate feature map for additive chi2 kernel.
from sklearn.kernel_approximation import SkewedChi2Sampler
SkewedChi2Sampler(
skewedness: float = 1.0,
n_components: int = 100,
random_state: int | RandomState | None = None
)Approximate feature map for "skewed chi-squared" kernel.
from sklearn.kernel_approximation import PolynomialCountSketch
PolynomialCountSketch(
gamma: float = 1.0,
degree: int = 2,
coef0: int = 0,
n_components: int = 100,
random_state: int | RandomState | None = None
)Polynomial kernel approximation via Tensor Sketch.
from sklearn.random_projection import GaussianRandomProjection
GaussianRandomProjection(
n_components: int | str = "auto",
eps: float = 0.1,
random_state: int | RandomState | None = None,
compute_inverse_components: bool = False
)Reduce dimensionality through Gaussian random projection.
from sklearn.random_projection import SparseRandomProjection
SparseRandomProjection(
n_components: int | str = "auto",
density: float | str = "auto",
eps: float = 0.1,
dense_output: bool = False,
random_state: int | RandomState | None = None,
compute_inverse_components: bool = False
)Reduce dimensionality through sparse random projection.
from sklearn.random_projection import johnson_lindenstrauss_min_dim
johnson_lindenstrauss_min_dim(
n_samples: int,
eps: float | ArrayLike = 0.1
) -> int | ArrayLike
Find a 'safe' number of components to randomly project to.
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
# Create preprocessing pipeline
numeric_features = ['age', 'income', 'score']
categorical_features = ['city', 'gender']
numeric_transformer = Pipeline(steps=[
('imputer', SimpleImputer(strategy='median')),
('scaler', StandardScaler())
])
categorical_transformer = Pipeline(steps=[
('imputer', SimpleImputer(strategy='most_frequent')),
('onehot', OneHotEncoder(handle_unknown='ignore'))
])
preprocessor = ColumnTransformer(
transformers=[
('num', numeric_transformer, numeric_features),
('cat', categorical_transformer, categorical_features)
]
)

from sklearn.feature_selection import SelectKBest, f_classif, RFE
from sklearn.ensemble import RandomForestClassifier
# Univariate feature selection
selector = SelectKBest(score_func=f_classif, k=10)
# Model-based feature selection
rfe = RFE(estimator=RandomForestClassifier(n_estimators=100), n_features_to_select=10)
# Complete pipeline
pipeline = Pipeline([
('scaler', StandardScaler()),
('selector', selector),
('classifier', RandomForestClassifier())
])

Install with Tessl CLI:
npx tessl i tessl/pypi-scikit-learndocs
evals
scenario-1
scenario-2
scenario-3
scenario-4
scenario-5
scenario-6
scenario-7
scenario-8
scenario-9
scenario-10