CtrlK
Blog · Docs · Log in · Get started
Tessl Logo

tessl/pypi-scikit-learn

A comprehensive machine learning library providing supervised and unsupervised learning algorithms with consistent APIs and extensive tools for data preprocessing, model evaluation, and deployment.

87

0.98x
Overview
Eval results
Files

docs/preprocessing.md

Data Preprocessing and Feature Engineering

This document covers all data preprocessing, feature engineering, and feature selection capabilities in scikit-learn.

Scaling and Normalization

StandardScaler { .api }

from sklearn.preprocessing import StandardScaler

StandardScaler(
    copy: bool = True,
    with_mean: bool = True,
    with_std: bool = True
)

Standardize features by removing the mean and scaling to unit variance.

MinMaxScaler { .api }

from sklearn.preprocessing import MinMaxScaler

MinMaxScaler(
    feature_range: tuple[float, float] = (0, 1),
    copy: bool = True,
    clip: bool = False
)

Transform features by scaling each feature to a given range.

MaxAbsScaler { .api }

from sklearn.preprocessing import MaxAbsScaler

MaxAbsScaler(
    copy: bool = True
)

Scale each feature by its maximum absolute value.

RobustScaler { .api }

from sklearn.preprocessing import RobustScaler

RobustScaler(
    quantile_range: tuple[float, float] = (25.0, 75.0),
    copy: bool = True,
    unit_variance: bool = False
)

Scale features using statistics that are robust to outliers.

Normalizer { .api }

from sklearn.preprocessing import Normalizer

Normalizer(
    norm: str = "l2",
    copy: bool = True
)

Normalize samples individually to unit norm.

QuantileTransformer { .api }

from sklearn.preprocessing import QuantileTransformer

QuantileTransformer(
    n_quantiles: int = 1000,
    output_distribution: str = "uniform",
    ignore_implicit_zeros: bool = False,
    subsample: int = 100000,
    random_state: int | RandomState | None = None,
    copy: bool = True
)

Transform features to follow a uniform or a normal distribution.

PowerTransformer { .api }

from sklearn.preprocessing import PowerTransformer

PowerTransformer(
    method: str = "yeo-johnson",
    standardize: bool = True,
    copy: bool = True
)

Apply a power transform featurewise to make data more Gaussian-like.

Encoding

LabelEncoder { .api }

from sklearn.preprocessing import LabelEncoder

LabelEncoder()

Encode target labels with value between 0 and n_classes-1.

LabelBinarizer { .api }

from sklearn.preprocessing import LabelBinarizer

LabelBinarizer(
    neg_label: int = 0,
    pos_label: int = 1,
    sparse_output: bool = False
)

Binarize labels in a one-vs-all fashion.

MultiLabelBinarizer { .api }

from sklearn.preprocessing import MultiLabelBinarizer

MultiLabelBinarizer(
    classes: ArrayLike | None = None,
    sparse_output: bool = False
)

Transform between iterable of iterables and a multilabel format.

OneHotEncoder { .api }

from sklearn.preprocessing import OneHotEncoder

OneHotEncoder(
    categories: str | list[ArrayLike] = "auto",
    drop: str | ArrayLike | None = None,
    sparse_output: bool = True,
    dtype: type = ...,
    handle_unknown: str = "error",
    min_frequency: int | float | None = None,
    max_categories: int | None = None,
    feature_name_combiner: str | Callable = "concat"
)

Encode categorical features as a one-hot numeric array.

OrdinalEncoder { .api }

from sklearn.preprocessing import OrdinalEncoder

OrdinalEncoder(
    categories: str | list[ArrayLike] = "auto",
    dtype: type = ...,
    handle_unknown: str = "error",
    unknown_value: int | float | None = None,
    encoded_missing_value: int | float = ...,
    min_frequency: int | float | None = None,
    max_categories: int | None = None
)

Encode categorical features as an integer array.

TargetEncoder { .api }

from sklearn.preprocessing import TargetEncoder

TargetEncoder(
    categories: str | list[ArrayLike] = "auto",
    target_type: str = "auto",
    smooth: str | float = "auto",
    cv: int | BaseCrossValidator | Iterable = 5,
    shuffle: bool = True,
    random_state: int | RandomState | None = None
)

Target Encoder for regression and classification targets.

KBinsDiscretizer { .api }

from sklearn.preprocessing import KBinsDiscretizer

KBinsDiscretizer(
    n_bins: int | ArrayLike = 5,
    encode: str = "onehot",
    strategy: str = "quantile",
    dtype: type | None = None,
    subsample: int | None = 200000,
    random_state: int | RandomState | None = None
)

Bin continuous data into intervals.

Binarizer { .api }

from sklearn.preprocessing import Binarizer

Binarizer(
    threshold: float = 0.0,
    copy: bool = True
)

Binarize data (set feature values to 0 or 1) according to a threshold.

Feature Engineering

PolynomialFeatures { .api }

from sklearn.preprocessing import PolynomialFeatures

PolynomialFeatures(
    degree: int = 2,
    interaction_only: bool = False,
    include_bias: bool = True,
    order: str = "C"
)

Generate polynomial and interaction features.

SplineTransformer { .api }

from sklearn.preprocessing import SplineTransformer

SplineTransformer(
    n_knots: int = 5,
    degree: int = 3,
    knots: str | ArrayLike = "uniform",
    extrapolation: str = "constant",
    include_bias: bool = True,
    order: str = "C",
    sparse_output: bool = False
)

Generate univariate B-spline bases for features.

FunctionTransformer { .api }

from sklearn.preprocessing import FunctionTransformer

FunctionTransformer(
    func: Callable | None = None,
    inverse_func: Callable | None = None,
    validate: bool = False,
    accept_sparse: bool = False,
    check_inverse: bool = True,
    feature_names_out: str | Callable | None = None,
    kw_args: dict | None = None,
    inv_kw_args: dict | None = None
)

Constructs a transformer from an arbitrary callable.

KernelCenterer { .api }

from sklearn.preprocessing import KernelCenterer

KernelCenterer()

Center a kernel matrix.

Feature Selection

Univariate Selection

SelectKBest { .api }

from sklearn.feature_selection import SelectKBest

SelectKBest(
    score_func: Callable = ...,
    k: int | str = 10
)

Select features according to the k highest scores.

SelectPercentile { .api }

from sklearn.feature_selection import SelectPercentile

SelectPercentile(
    score_func: Callable = ...,
    percentile: int = 10
)

Select features according to a percentile of the highest scores.

SelectFpr { .api }

from sklearn.feature_selection import SelectFpr

SelectFpr(
    score_func: Callable = ...,
    alpha: float = 0.05
)

Filter: Select the p-values below alpha based on an FPR test.

SelectFdr { .api }

from sklearn.feature_selection import SelectFdr

SelectFdr(
    score_func: Callable = ...,
    alpha: float = 0.05
)

Filter: Select the p-values for an estimated false discovery rate.

SelectFwe { .api }

from sklearn.feature_selection import SelectFwe

SelectFwe(
    score_func: Callable = ...,
    alpha: float = 0.05
)

Filter: Select the p-values corresponding to Family-wise error rate.

GenericUnivariateSelect { .api }

from sklearn.feature_selection import GenericUnivariateSelect

GenericUnivariateSelect(
    score_func: Callable = ...,
    mode: str = "percentile",
    param: int | float = 1e-05
)

Univariate feature selector with configurable strategy.

Model-based Selection

SelectFromModel { .api }

from sklearn.feature_selection import SelectFromModel

SelectFromModel(
    estimator: BaseEstimator,
    threshold: str | float | None = None,
    prefit: bool = False,
    norm_order: int = 1,
    max_features: int | Callable | None = None,
    importance_getter: str | Callable = "auto"
)

Meta-transformer for selecting features based on importance weights.

Recursive Feature Elimination

RFE { .api }

from sklearn.feature_selection import RFE

RFE(
    estimator: BaseEstimator,
    n_features_to_select: int | float | None = None,
    step: int | float = 1,
    verbose: int = 0,
    importance_getter: str | Callable = "auto"
)

Feature ranking with recursive feature elimination.

RFECV { .api }

from sklearn.feature_selection import RFECV

RFECV(
    estimator: BaseEstimator,
    step: int | float = 1,
    min_features_to_select: int = 1,
    cv: int | BaseCrossValidator | Iterable | None = None,
    scoring: str | Callable | None = None,
    verbose: int = 0,
    n_jobs: int | None = None,
    importance_getter: str | Callable = "auto"
)

Recursive feature elimination with cross-validation.

Sequential Feature Selection

SequentialFeatureSelector { .api }

from sklearn.feature_selection import SequentialFeatureSelector

SequentialFeatureSelector(
    estimator: BaseEstimator,
    n_features_to_select: int | float | str = "auto",
    tol: float | None = None,
    direction: str = "forward",
    scoring: str | Callable | None = None,
    cv: int | BaseCrossValidator | Iterable = 5,
    n_jobs: int | None = None
)

Sequential Feature Selector.

Variance-based Selection

VarianceThreshold { .api }

from sklearn.feature_selection import VarianceThreshold

VarianceThreshold(
    threshold: float = 0.0
)

Feature selector that removes all low-variance features.

Base Classes

SelectorMixin { .api }

from sklearn.feature_selection import SelectorMixin

SelectorMixin()

Transformer mixin that performs feature selection given a support mask.

Feature Selection Functions

Statistical Tests

chi2 { .api }

from sklearn.feature_selection import chi2

chi2(
    X: ArrayLike,
    y: ArrayLike
) -> tuple[ArrayLike, ArrayLike]

Compute chi-squared stats between each non-negative feature and class.

f_classif { .api }

from sklearn.feature_selection import f_classif

f_classif(
    X: ArrayLike,
    y: ArrayLike
) -> tuple[ArrayLike, ArrayLike]

Compute the ANOVA F-value for the provided sample.

f_oneway { .api }

from sklearn.feature_selection import f_oneway

f_oneway(
    *samples: ArrayLike
) -> tuple[ArrayLike, ArrayLike]

Test for equal means in two or more samples from the normal distribution.

f_regression { .api }

from sklearn.feature_selection import f_regression

f_regression(
    X: ArrayLike,
    y: ArrayLike,
    center: bool = True
) -> tuple[ArrayLike, ArrayLike]

Univariate linear regression tests returning F-statistic and p-values.

r_regression { .api }

from sklearn.feature_selection import r_regression

r_regression(
    X: ArrayLike,
    y: ArrayLike,
    center: bool = True,
    force_finite: bool = True
) -> tuple[ArrayLike, ArrayLike]

Compute Pearson's r for each feature with the target.

Mutual Information

mutual_info_classif { .api }

from sklearn.feature_selection import mutual_info_classif

mutual_info_classif(
    X: ArrayLike,
    y: ArrayLike,
    discrete_features: str | bool | ArrayLike = "auto",
    n_neighbors: int = 3,
    copy: bool = True,
    random_state: int | RandomState | None = None
) -> ArrayLike

Estimate mutual information for a discrete target variable.

mutual_info_regression { .api }

from sklearn.feature_selection import mutual_info_regression

mutual_info_regression(
    X: ArrayLike,
    y: ArrayLike,
    discrete_features: str | bool | ArrayLike = "auto",
    n_neighbors: int = 3,
    copy: bool = True,
    random_state: int | RandomState | None = None
) -> ArrayLike

Estimate mutual information for a continuous target variable.

Preprocessing Functions

Scaling Functions

scale { .api }

from sklearn.preprocessing import scale

scale(
    X: ArrayLike,
    axis: int = 0,
    with_mean: bool = True,
    with_std: bool = True,
    copy: bool = True
) -> ArrayLike

Standardize a dataset along any axis.

minmax_scale { .api }

from sklearn.preprocessing import minmax_scale

minmax_scale(
    X: ArrayLike,
    feature_range: tuple[float, float] = (0, 1),
    axis: int = 0,
    copy: bool = True
) -> ArrayLike

Transform features by scaling each feature to a given range.

maxabs_scale { .api }

from sklearn.preprocessing import maxabs_scale

maxabs_scale(
    X: ArrayLike,
    axis: int = 0,
    copy: bool = True
) -> ArrayLike

Scale each feature to the [-1, 1] range without breaking sparsity.

robust_scale { .api }

from sklearn.preprocessing import robust_scale

robust_scale(
    X: ArrayLike,
    axis: int = 0,
    quantile_range: tuple[float, float] = (25.0, 75.0),
    copy: bool = True,
    unit_variance: bool = False
) -> ArrayLike

Standardize a dataset along any axis.

normalize { .api }

from sklearn.preprocessing import normalize

normalize(
    X: ArrayLike,
    norm: str = "l2",
    axis: int = 1,
    copy: bool = True,
    return_norm: bool = False
) -> ArrayLike | tuple[ArrayLike, ArrayLike]

Scale input vectors individually to unit norm (vector length).

quantile_transform { .api }

from sklearn.preprocessing import quantile_transform

quantile_transform(
    X: ArrayLike,
    axis: int = 0,
    n_quantiles: int = 1000,
    output_distribution: str = "uniform",
    ignore_implicit_zeros: bool = False,
    subsample: int = 100000,
    random_state: int | RandomState | None = None,
    copy: bool = True
) -> ArrayLike

Transform features to follow a uniform or a normal distribution.

power_transform { .api }

from sklearn.preprocessing import power_transform

power_transform(
    X: ArrayLike,
    method: str = "yeo-johnson",
    standardize: bool = True,
    copy: bool = True
) -> ArrayLike

Apply a power transform featurewise to make data more Gaussian-like.

Encoding Functions

label_binarize { .api }

from sklearn.preprocessing import label_binarize

label_binarize(
    y: ArrayLike,
    classes: ArrayLike,
    neg_label: int = 0,
    pos_label: int = 1,
    sparse_output: bool = False
) -> ArrayLike

Binarize labels in a one-vs-all fashion.

binarize { .api }

from sklearn.preprocessing import binarize

binarize(
    X: ArrayLike,
    threshold: float = 0.0,
    copy: bool = True
) -> ArrayLike

Boolean thresholding of array-like or scipy.sparse matrix.

add_dummy_feature { .api }

from sklearn.preprocessing import add_dummy_feature

add_dummy_feature(
    X: ArrayLike,
    value: float = 1.0
) -> ArrayLike

Augment dataset with an additional dummy feature.

Feature Extraction

Text Feature Extraction

DictVectorizer { .api }

from sklearn.feature_extraction import DictVectorizer

DictVectorizer(
    dtype: type = ...,
    separator: str = "=",
    sparse: bool = True,
    sort: bool = True
)

Transforms lists of feature-value mappings to vectors.

FeatureHasher { .api }

from sklearn.feature_extraction import FeatureHasher

FeatureHasher(
    n_features: int = 1048576,
    input_type: str = "dict",
    dtype: type = ...,
    alternate_sign: bool = True
)

Implements feature hashing, aka the hashing trick.

Image Feature Extraction

img_to_graph { .api }

from sklearn.feature_extraction import img_to_graph

img_to_graph(
    img: ArrayLike,
    mask: ArrayLike | None = None,
    return_as: type = ...,
    dtype: type | None = None
) -> ArrayLike

Graph of the pixel-to-pixel gradient connections.

grid_to_graph { .api }

from sklearn.feature_extraction import grid_to_graph

grid_to_graph(
    n_x: int,
    n_y: int,
    n_z: int | None = None,
    mask: ArrayLike | None = None,
    return_as: type = ...,
    dtype: type = ...,
    **kwargs
) -> ArrayLike

Graph of the pixel-to-pixel gradient connections.

Imputation

Simple Imputation

SimpleImputer { .api }

from sklearn.impute import SimpleImputer

SimpleImputer(
    missing_values: int | float | str | None = ...,
    strategy: str = "mean",
    fill_value: str | int | float | None = None,
    copy: bool = True,
    add_indicator: bool = False,
    keep_empty_features: bool = False
)

Imputation transformer for completing missing values.

Advanced Imputation

KNNImputer { .api }

from sklearn.impute import KNNImputer

KNNImputer(
    missing_values: int | float | str | None = ...,
    n_neighbors: int = 5,
    weights: str | Callable = "uniform",
    metric: str | Callable = "nan_euclidean",
    copy: bool = True,
    add_indicator: bool = False,
    keep_empty_features: bool = False
)

Imputation for completing missing values using k-Nearest Neighbors.

Missing Value Indicators

MissingIndicator { .api }

from sklearn.impute import MissingIndicator

MissingIndicator(
    missing_values: int | float | str | None = ...,
    features: str = "missing-only",
    sparse: bool | str = "auto",
    error_on_new: bool = True
)

Binary indicators for missing values.

Kernel Approximation

RBF Kernel Approximation

RBFSampler { .api }

from sklearn.kernel_approximation import RBFSampler

RBFSampler(
    gamma: float = 1.0,
    n_components: int = 100,
    random_state: int | RandomState | None = None
)

Approximate a RBF kernel feature map using random Fourier features.

Nystroem { .api }

from sklearn.kernel_approximation import Nystroem

Nystroem(
    kernel: str | Callable = "rbf",
    gamma: float | None = None,
    coef0: float | None = None,
    degree: float | None = None,
    kernel_params: dict | None = None,
    n_components: int = 100,
    random_state: int | RandomState | None = None,
    n_jobs: int | None = None
)

Approximate a kernel map using a subset of the training data.

Chi-squared Kernel Approximation

AdditiveChi2Sampler { .api }

from sklearn.kernel_approximation import AdditiveChi2Sampler

AdditiveChi2Sampler(
    sample_steps: int = 2,
    sample_interval: float | None = None
)

Approximate feature map for additive chi2 kernel.

SkewedChi2Sampler { .api }

from sklearn.kernel_approximation import SkewedChi2Sampler

SkewedChi2Sampler(
    skewedness: float = 1.0,
    n_components: int = 100,
    random_state: int | RandomState | None = None
)

Approximate feature map for "skewed chi-squared" kernel.

Polynomial Kernel Approximation

PolynomialCountSketch { .api }

from sklearn.kernel_approximation import PolynomialCountSketch

PolynomialCountSketch(
    gamma: float = 1.0,
    degree: int = 2,
    coef0: int = 0,
    n_components: int = 100,
    random_state: int | RandomState | None = None
)

Polynomial kernel approximation via Tensor Sketch.

Random Projection

GaussianRandomProjection { .api }

from sklearn.random_projection import GaussianRandomProjection

GaussianRandomProjection(
    n_components: int | str = "auto",
    eps: float = 0.1,
    random_state: int | RandomState | None = None,
    compute_inverse_components: bool = False
)

Reduce dimensionality through Gaussian random projection.

SparseRandomProjection { .api }

from sklearn.random_projection import SparseRandomProjection

SparseRandomProjection(
    n_components: int | str = "auto",
    density: float | str = "auto",
    eps: float = 0.1,
    dense_output: bool = False,
    random_state: int | RandomState | None = None,
    compute_inverse_components: bool = False
)

Reduce dimensionality through sparse random projection.

Random Projection Functions

johnson_lindenstrauss_min_dim { .api }

from sklearn.random_projection import johnson_lindenstrauss_min_dim

johnson_lindenstrauss_min_dim(
    n_samples: int,
    eps: float | ArrayLike = 0.1
) -> int | ArrayLike

Find a 'safe' number of components to randomly project to.

Examples

Basic Preprocessing Pipeline

from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

# Create preprocessing pipeline
numeric_features = ['age', 'income', 'score']
categorical_features = ['city', 'gender']

numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ]
)

Feature Selection Pipeline

from sklearn.feature_selection import SelectKBest, f_classif, RFE
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

# Univariate feature selection
selector = SelectKBest(score_func=f_classif, k=10)

# Model-based feature selection
rfe = RFE(estimator=RandomForestClassifier(n_estimators=100), n_features_to_select=10)

# Complete pipeline
pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('selector', selector),
    ('classifier', RandomForestClassifier())
])

Install with Tessl CLI

npx tessl i tessl/pypi-scikit-learn

docs

datasets.md

feature-extraction.md

index.md

metrics.md

model-selection.md

neighbors.md

pipelines.md

preprocessing.md

supervised-learning.md

unsupervised-learning.md

utilities.md

tile.json