A comprehensive machine learning library providing supervised and unsupervised learning algorithms with consistent APIs and extensive tools for data preprocessing, model evaluation, and deployment.
87
This document covers all data preprocessing, feature engineering, and feature selection capabilities in scikit-learn.
from sklearn.preprocessing import StandardScaler
StandardScaler(
copy: bool = True,
with_mean: bool = True,
with_std: bool = True
)
Standardize features by removing the mean and scaling to unit variance.
from sklearn.preprocessing import MinMaxScaler
MinMaxScaler(
feature_range: tuple[float, float] = (0, 1),
copy: bool = True,
clip: bool = False
)Transform features by scaling each feature to a given range.
from sklearn.preprocessing import MaxAbsScaler
MaxAbsScaler(
copy: bool = True
)Scale each feature by its maximum absolute value.
from sklearn.preprocessing import RobustScaler
RobustScaler(
quantile_range: tuple[float, float] = (25.0, 75.0),
copy: bool = True,
unit_variance: bool = False
)Scale features using statistics that are robust to outliers.
from sklearn.preprocessing import Normalizer
Normalizer(
norm: str = "l2",
copy: bool = True
)Normalize samples individually to unit norm.
from sklearn.preprocessing import QuantileTransformer
QuantileTransformer(
n_quantiles: int = 1000,
output_distribution: str = "uniform",
ignore_implicit_zeros: bool = False,
subsample: int = 100000,
random_state: int | RandomState | None = None,
copy: bool = True
)Transform features to follow a uniform or a normal distribution.
from sklearn.preprocessing import PowerTransformer
PowerTransformer(
method: str = "yeo-johnson",
standardize: bool = True,
copy: bool = True
)Apply a power transform featurewise to make data more Gaussian-like.
from sklearn.preprocessing import LabelEncoder
LabelEncoder()
Encode target labels with value between 0 and n_classes-1.
from sklearn.preprocessing import LabelBinarizer
LabelBinarizer(
neg_label: int = 0,
pos_label: int = 1,
sparse_output: bool = False
)Binarize labels in a one-vs-all fashion.
from sklearn.preprocessing import MultiLabelBinarizer
MultiLabelBinarizer(
classes: ArrayLike | None = None,
sparse_output: bool = False
)Transform between iterable of iterables and a multilabel format.
from sklearn.preprocessing import OneHotEncoder
OneHotEncoder(
categories: str | list[ArrayLike] = "auto",
drop: str | ArrayLike | None = None,
sparse_output: bool = True,
dtype: type = ...,
handle_unknown: str = "error",
min_frequency: int | float | None = None,
max_categories: int | None = None,
feature_name_combiner: str | Callable = "concat"
)Encode categorical features as a one-hot numeric array.
from sklearn.preprocessing import OrdinalEncoder
OrdinalEncoder(
categories: str | list[ArrayLike] = "auto",
dtype: type = ...,
handle_unknown: str = "error",
unknown_value: int | float | None = None,
encoded_missing_value: int | float = ...,
min_frequency: int | float | None = None,
max_categories: int | None = None
)Encode categorical features as an integer array.
from sklearn.preprocessing import TargetEncoder
TargetEncoder(
categories: str | list[ArrayLike] = "auto",
target_type: str = "auto",
smooth: str | float = "auto",
cv: int | BaseCrossValidator | Iterable = 5,
shuffle: bool = True,
random_state: int | RandomState | None = None
)Target Encoder for regression and classification targets.
from sklearn.preprocessing import KBinsDiscretizer
KBinsDiscretizer(
n_bins: int | ArrayLike = 5,
encode: str = "onehot",
strategy: str = "quantile",
dtype: type | None = None,
subsample: int | None = 200000,
random_state: int | RandomState | None = None
)Bin continuous data into intervals.
from sklearn.preprocessing import Binarizer
Binarizer(
threshold: float = 0.0,
copy: bool = True
)Binarize data (set feature values to 0 or 1) according to a threshold.
from sklearn.preprocessing import PolynomialFeatures
PolynomialFeatures(
degree: int = 2,
interaction_only: bool = False,
include_bias: bool = True,
order: str = "C"
)Generate polynomial and interaction features.
from sklearn.preprocessing import SplineTransformer
SplineTransformer(
n_knots: int = 5,
degree: int = 3,
knots: str | ArrayLike = "uniform",
extrapolation: str = "constant",
include_bias: bool = True,
order: str = "C",
sparse_output: bool = False
)Generate univariate B-spline bases for features.
from sklearn.preprocessing import FunctionTransformer
FunctionTransformer(
func: Callable | None = None,
inverse_func: Callable | None = None,
validate: bool = False,
accept_sparse: bool = False,
check_inverse: bool = True,
feature_names_out: str | Callable | None = None,
kw_args: dict | None = None,
inv_kw_args: dict | None = None
)Constructs a transformer from an arbitrary callable.
from sklearn.preprocessing import KernelCenterer
KernelCenterer()
Center a kernel matrix.
from sklearn.feature_selection import SelectKBest
SelectKBest(
score_func: Callable = ...,
k: int | str = 10
)Select features according to the k highest scores.
from sklearn.feature_selection import SelectPercentile
SelectPercentile(
score_func: Callable = ...,
percentile: int = 10
)Select features according to a percentile of the highest scores.
from sklearn.feature_selection import SelectFpr
SelectFpr(
score_func: Callable = ...,
alpha: float = 0.05
)Filter: Select the pvalues below alpha based on a FPR test.
from sklearn.feature_selection import SelectFdr
SelectFdr(
score_func: Callable = ...,
alpha: float = 0.05
)Filter: Select the p-values for an estimated false discovery rate.
from sklearn.feature_selection import SelectFwe
SelectFwe(
score_func: Callable = ...,
alpha: float = 0.05
)Filter: Select the p-values corresponding to Family-wise error rate.
from sklearn.feature_selection import GenericUnivariateSelect
GenericUnivariateSelect(
score_func: Callable = ...,
mode: str = "percentile",
param: int | float = 1e-05
)Univariate feature selector with configurable strategy.
from sklearn.feature_selection import SelectFromModel
SelectFromModel(
estimator: BaseEstimator,
threshold: str | float | None = None,
prefit: bool = False,
norm_order: int = 1,
max_features: int | Callable | None = None,
importance_getter: str | Callable = "auto"
)Meta-transformer for selecting features based on importance weights.
from sklearn.feature_selection import RFE
RFE(
estimator: BaseEstimator,
n_features_to_select: int | float | None = None,
step: int | float = 1,
verbose: int = 0,
importance_getter: str | Callable = "auto"
)Feature ranking with recursive feature elimination.
from sklearn.feature_selection import RFECV
RFECV(
estimator: BaseEstimator,
step: int | float = 1,
min_features_to_select: int = 1,
cv: int | BaseCrossValidator | Iterable | None = None,
scoring: str | Callable | None = None,
verbose: int = 0,
n_jobs: int | None = None,
importance_getter: str | Callable = "auto"
)Recursive feature elimination with cross-validation.
from sklearn.feature_selection import SequentialFeatureSelector
SequentialFeatureSelector(
estimator: BaseEstimator,
n_features_to_select: int | float | str = "auto",
tol: float | None = None,
direction: str = "forward",
scoring: str | Callable | None = None,
cv: int | BaseCrossValidator | Iterable = 5,
n_jobs: int | None = None
)Sequential Feature Selector.
from sklearn.feature_selection import VarianceThreshold
VarianceThreshold(
threshold: float = 0.0
)Feature selector that removes all low-variance features.
from sklearn.feature_selection import SelectorMixin
SelectorMixin()
Transformer mixin that performs feature selection given a support mask.
from sklearn.feature_selection import chi2
chi2(
X: ArrayLike,
y: ArrayLike
) -> tuple[ArrayLike, ArrayLike]
Compute chi-squared stats between each non-negative feature and class.
from sklearn.feature_selection import f_classif
f_classif(
X: ArrayLike,
y: ArrayLike
) -> tuple[ArrayLike, ArrayLike]Compute the ANOVA F-value for the provided sample.
from sklearn.feature_selection import f_oneway
f_oneway(
*samples: ArrayLike
) -> tuple[ArrayLike, ArrayLike]Test for equal means in two or more samples from the normal distribution.
from sklearn.feature_selection import f_regression
f_regression(
X: ArrayLike,
y: ArrayLike,
center: bool = True
) -> tuple[ArrayLike, ArrayLike]Univariate linear regression tests returning F-statistic and p-values.
from sklearn.feature_selection import r_regression
r_regression(
X: ArrayLike,
y: ArrayLike,
center: bool = True,
force_finite: bool = True
) -> tuple[ArrayLike, ArrayLike]Compute Pearson's r for each feature with the target.
from sklearn.feature_selection import mutual_info_classif
mutual_info_classif(
X: ArrayLike,
y: ArrayLike,
discrete_features: str | bool | ArrayLike = "auto",
n_neighbors: int = 3,
copy: bool = True,
random_state: int | RandomState | None = None
) -> ArrayLikeEstimate mutual information for a discrete target variable.
from sklearn.feature_selection import mutual_info_regression
mutual_info_regression(
X: ArrayLike,
y: ArrayLike,
discrete_features: str | bool | ArrayLike = "auto",
n_neighbors: int = 3,
copy: bool = True,
random_state: int | RandomState | None = None
) -> ArrayLikeEstimate mutual information for a continuous target variable.
from sklearn.preprocessing import scale
scale(
X: ArrayLike,
axis: int = 0,
with_mean: bool = True,
with_std: bool = True,
copy: bool = True
) -> ArrayLikeStandardize a dataset along any axis.
from sklearn.preprocessing import minmax_scale
minmax_scale(
X: ArrayLike,
feature_range: tuple[float, float] = (0, 1),
axis: int = 0,
copy: bool = True
) -> ArrayLikeTransform features by scaling each feature to a given range.
from sklearn.preprocessing import maxabs_scale
maxabs_scale(
X: ArrayLike,
axis: int = 0,
copy: bool = True
) -> ArrayLikeScale each feature to the [-1, 1] range without breaking sparsity.
from sklearn.preprocessing import robust_scale
robust_scale(
X: ArrayLike,
axis: int = 0,
quantile_range: tuple[float, float] = (25.0, 75.0),
copy: bool = True,
unit_variance: bool = False
) -> ArrayLikeStandardize a dataset along any axis.
from sklearn.preprocessing import normalize
normalize(
X: ArrayLike,
norm: str = "l2",
axis: int = 1,
copy: bool = True,
return_norm: bool = False
) -> ArrayLike | tuple[ArrayLike, ArrayLike]Scale input vectors individually to unit norm (vector length).
from sklearn.preprocessing import quantile_transform
quantile_transform(
X: ArrayLike,
axis: int = 0,
n_quantiles: int = 1000,
output_distribution: str = "uniform",
ignore_implicit_zeros: bool = False,
subsample: int = 100000,
random_state: int | RandomState | None = None,
copy: bool = True
) -> ArrayLikeTransform features to follow a uniform or a normal distribution.
from sklearn.preprocessing import power_transform
power_transform(
X: ArrayLike,
method: str = "yeo-johnson",
standardize: bool = True,
copy: bool = True
) -> ArrayLikeApply a power transform featurewise to make data more Gaussian-like.
from sklearn.preprocessing import label_binarize
label_binarize(
y: ArrayLike,
classes: ArrayLike,
neg_label: int = 0,
pos_label: int = 1,
sparse_output: bool = False
) -> ArrayLikeBinarize labels in a one-vs-all fashion.
from sklearn.preprocessing import binarize
binarize(
X: ArrayLike,
threshold: float = 0.0,
copy: bool = True
) -> ArrayLikeBoolean thresholding of array-like or scipy.sparse matrix.
from sklearn.preprocessing import add_dummy_feature
add_dummy_feature(
X: ArrayLike,
value: float = 1.0
) -> ArrayLikeAugment dataset with an additional dummy feature.
from sklearn.feature_extraction import DictVectorizer
DictVectorizer(
dtype: type = ...,
separator: str = "=",
sparse: bool = True,
sort: bool = True
)Transforms lists of feature-value mappings to vectors.
from sklearn.feature_extraction import FeatureHasher
FeatureHasher(
n_features: int = 1048576,
input_type: str = "dict",
dtype: type = ...,
alternate_sign: bool = True
)Implements feature hashing, aka the hashing trick.
from sklearn.feature_extraction import img_to_graph
img_to_graph(
img: ArrayLike,
mask: ArrayLike | None = None,
return_as: type = ...,
dtype: type | None = None
) -> ArrayLikeGraph of the pixel-to-pixel gradient connections.
from sklearn.feature_extraction import grid_to_graph
grid_to_graph(
n_x: int,
n_y: int,
n_z: int | None = None,
mask: ArrayLike | None = None,
return_as: type = ...,
dtype: type = ...,
**kwargs
) -> ArrayLikeGraph of the pixel-to-pixel gradient connections.
from sklearn.impute import SimpleImputer
SimpleImputer(
missing_values: int | float | str | None = ...,
strategy: str = "mean",
fill_value: str | int | float | None = None,
copy: bool = True,
add_indicator: bool = False,
keep_empty_features: bool = False
)Imputation transformer for completing missing values.
from sklearn.impute import KNNImputer
KNNImputer(
missing_values: int | float | str | None = ...,
n_neighbors: int = 5,
weights: str | Callable = "uniform",
metric: str | Callable = "nan_euclidean",
copy: bool = True,
add_indicator: bool = False,
keep_empty_features: bool = False
)Imputation for completing missing values using k-Nearest Neighbors.
from sklearn.impute import MissingIndicator
MissingIndicator(
missing_values: int | float | str | None = ...,
features: str = "missing-only",
sparse: bool | str = "auto",
error_on_new: bool = True
)Binary indicators for missing values.
from sklearn.kernel_approximation import RBFSampler
RBFSampler(
gamma: float = 1.0,
n_components: int = 100,
random_state: int | RandomState | None = None
)Approximate a RBF kernel feature map using random Fourier features.
from sklearn.kernel_approximation import Nystroem
Nystroem(
kernel: str | Callable = "rbf",
gamma: float | None = None,
coef0: float | None = None,
degree: float | None = None,
kernel_params: dict | None = None,
n_components: int = 100,
random_state: int | RandomState | None = None,
n_jobs: int | None = None
)Approximate a kernel map using a subset of the training data.
from sklearn.kernel_approximation import AdditiveChi2Sampler
AdditiveChi2Sampler(
sample_steps: int = 2,
sample_interval: float | None = None
)Approximate feature map for additive chi2 kernel.
from sklearn.kernel_approximation import SkewedChi2Sampler
SkewedChi2Sampler(
skewedness: float = 1.0,
n_components: int = 100,
random_state: int | RandomState | None = None
)Approximate feature map for "skewed chi-squared" kernel.
from sklearn.kernel_approximation import PolynomialCountSketch
PolynomialCountSketch(
gamma: float = 1.0,
degree: int = 2,
coef0: int = 0,
n_components: int = 100,
random_state: int | RandomState | None = None
)Polynomial kernel approximation via Tensor Sketch.
from sklearn.random_projection import GaussianRandomProjection
GaussianRandomProjection(
n_components: int | str = "auto",
eps: float = 0.1,
random_state: int | RandomState | None = None,
compute_inverse_components: bool = False
)Reduce dimensionality through Gaussian random projection.
from sklearn.random_projection import SparseRandomProjection
SparseRandomProjection(
n_components: int | str = "auto",
density: float | str = "auto",
eps: float = 0.1,
dense_output: bool = False,
random_state: int | RandomState | None = None,
compute_inverse_components: bool = False
)Reduce dimensionality through sparse random projection.
from sklearn.random_projection import johnson_lindenstrauss_min_dim
johnson_lindenstrauss_min_dim(
n_samples: int,
eps: float | ArrayLike = 0.1
) -> int | ArrayLike
Find a 'safe' number of components to randomly project to.
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
# Create preprocessing pipeline
numeric_features = ['age', 'income', 'score']
categorical_features = ['city', 'gender']
numeric_transformer = Pipeline(steps=[
('imputer', SimpleImputer(strategy='median')),
('scaler', StandardScaler())
])
categorical_transformer = Pipeline(steps=[
('imputer', SimpleImputer(strategy='most_frequent')),
('onehot', OneHotEncoder(handle_unknown='ignore'))
])
preprocessor = ColumnTransformer(
transformers=[
('num', numeric_transformer, numeric_features),
('cat', categorical_transformer, categorical_features)
]
)

from sklearn.feature_selection import SelectKBest, f_classif, RFE
from sklearn.ensemble import RandomForestClassifier
# Univariate feature selection
selector = SelectKBest(score_func=f_classif, k=10)
# Model-based feature selection
rfe = RFE(estimator=RandomForestClassifier(n_estimators=100), n_features_to_select=10)
# Complete pipeline
pipeline = Pipeline([
('scaler', StandardScaler()),
('selector', selector),
('classifier', RandomForestClassifier())
])

Install with Tessl CLI:
npx tessl i tessl/pypi-scikit-learndocs
evals
scenario-1
scenario-2
scenario-3
scenario-4
scenario-5
scenario-6
scenario-7
scenario-8
scenario-9
scenario-10