A comprehensive machine learning library providing supervised and unsupervised learning algorithms with consistent APIs and extensive tools for data preprocessing, model evaluation, and deployment.
87
This document covers core utilities, configuration functions, pipelines, composition tools, and other utility functions in scikit-learn.
from sklearn.base import clone
clone(
estimator: BaseEstimator,
safe: bool = True
) -> BaseEstimator
Construct a new unfitted estimator with the same parameters.
from sklearn import get_config
get_config() -> dict
Retrieve current scikit-learn configuration.
from sklearn import set_config
set_config(
assume_finite: bool | None = None,
working_memory: int | None = None,
print_changed_only: bool | None = None,
display: str | None = None,
pairwise_distances_chunk_size: int | None = None,
enable_cython_pairwise_dist: bool | None = None,
array_api_dispatch: bool | None = None,
transform_output: str | None = None,
enable_metadata_routing: bool | None = None,
skip_parameter_validation: bool | None = None
) -> dict
Set global scikit-learn configuration.
from sklearn import config_context
config_context(**new_config) -> ContextManager
Temporarily change global configuration.
from sklearn import show_versions
show_versions() -> None
Print system and dependency version information.
import sklearn
sklearn.__version__  # "1.7.1"
Current scikit-learn version string.
from sklearn.pipeline import Pipeline
Pipeline(
steps: list[tuple[str, BaseEstimator]],
memory: str | object | None = None,
verbose: bool = False
)
Pipeline of transforms with a final estimator.
from sklearn.pipeline import FeatureUnion
FeatureUnion(
transformer_list: list[tuple[str, BaseTransformer]],
n_jobs: int | None = None,
transformer_weights: dict | None = None,
verbose: bool = False,
verbose_feature_names_out: bool = True
)
Concatenates results of multiple transformer objects.
from sklearn.pipeline import make_pipeline
make_pipeline(
*steps: BaseEstimator,
memory: str | object | None = None,
verbose: bool = False
) -> Pipeline
Construct a Pipeline from the given estimators.
from sklearn.pipeline import make_union
make_union(
*transformers: BaseTransformer,
n_jobs: int | None = None,
verbose: bool = False
) -> FeatureUnion
Construct a FeatureUnion from the given transformers.
from sklearn.compose import ColumnTransformer
ColumnTransformer(
transformers: list[tuple[str, BaseTransformer, ArrayLike | str | Callable]],
remainder: str | BaseTransformer = "drop",
sparse_threshold: float = 0.3,
n_jobs: int | None = None,
transformer_weights: dict | None = None,
verbose: bool = False,
verbose_feature_names_out: bool = True,
force_int_remainder_cols: bool = True
)
Applies transformers to columns of an array or pandas DataFrame.
from sklearn.compose import TransformedTargetRegressor
TransformedTargetRegressor(
regressor: BaseRegressor | None = None,
transformer: BaseTransformer | None = None,
func: Callable | None = None,
inverse_func: Callable | None = None,
check_inverse: bool = True
)
Meta-estimator to regress on a transformed target.
from sklearn.compose import make_column_transformer
make_column_transformer(
*transformers: tuple[BaseTransformer, ArrayLike | str | Callable],
remainder: str | BaseTransformer = "drop",
sparse_threshold: float = 0.3,
n_jobs: int | None = None,
verbose: bool = False,
verbose_feature_names_out: bool = True,
force_int_remainder_cols: bool = True
) -> ColumnTransformer
Construct a ColumnTransformer from the given transformers.
from sklearn.compose import make_column_selector
make_column_selector(
pattern: str | None = None,
dtype_include: type | str | list | None = None,
dtype_exclude: type | str | list | None = None
) -> Callable
Create a callable to select columns to be used with ColumnTransformer.
from sklearn.inspection import partial_dependence
partial_dependence(
estimator: BaseEstimator,
X: ArrayLike,
features: int | str | ArrayLike | list,
response_method: str = "auto",
percentiles: tuple[float, float] = (0.05, 0.95),
grid_resolution: int = 100,
method: str = "auto",
kind: str = "average",
subsample: int | float | None = 1000,
n_jobs: int | None = None,
verbose: int = 0,
feature_names: ArrayLike | None = None,
categorical_features: ArrayLike | None = None
) -> dict
Partial dependence of features.
from sklearn.inspection import permutation_importance
permutation_importance(
estimator: BaseEstimator,
X: ArrayLike,
y: ArrayLike,
scoring: str | Callable | list | tuple | dict | None = None,
n_repeats: int = 5,
n_jobs: int | None = None,
random_state: int | RandomState | None = None,
sample_weight: ArrayLike | None = None,
max_samples: int | float = 1.0
) -> dict
Permutation importance for feature evaluation.
from sklearn.inspection import PartialDependenceDisplay
PartialDependenceDisplay(
pd_results: list[dict],
features: list,
feature_names: ArrayLike | None = None,
target_idx: int | None = None,
deciles: dict | None = None
)
Partial Dependence Plot (PDP) visualization.
from sklearn.inspection import DecisionBoundaryDisplay
DecisionBoundaryDisplay(
xx0: ArrayLike,
xx1: ArrayLike,
response: ArrayLike
)
Visualization of decision boundaries of a classifier.
from sklearn.isotonic import check_increasing
check_increasing(
x: ArrayLike,
y: ArrayLike
) -> bool
Determine whether y is monotonically correlated with x.
from sklearn.isotonic import isotonic_regression
isotonic_regression(
y: ArrayLike,
sample_weight: ArrayLike | None = None,
y_min: float | None = None,
y_max: float | None = None,
increasing: bool = True
) -> ArrayLike
Solve the isotonic regression model.
from sklearn.neighbors import kneighbors_graph
kneighbors_graph(
X: ArrayLike,
n_neighbors: int,
mode: str = "connectivity",
metric: str | Callable = "minkowski",
p: int = 2,
metric_params: dict | None = None,
include_self: bool | str = "auto",
n_jobs: int | None = None
) -> ArrayLike
Compute the (weighted) graph of k-Neighbors for points in X.
from sklearn.neighbors import radius_neighbors_graph
radius_neighbors_graph(
X: ArrayLike,
radius: float,
mode: str = "connectivity",
metric: str | Callable = "minkowski",
p: int = 2,
metric_params: dict | None = None,
include_self: bool | str = "auto",
n_jobs: int | None = None
) -> ArrayLike
Compute the (weighted) graph of Neighbors for points in X.
from sklearn.neighbors import sort_graph_by_row_values
sort_graph_by_row_values(
graph: ArrayLike,
copy: bool = True,
warn_when_not_sorted: bool = True
) -> ArrayLike
Sort a sparse graph such that each row has its data sorted by value.
from sklearn.neighbors import BallTree
BallTree(
X: ArrayLike,
leaf_size: int = 40,
metric: str | DistanceMetric = "minkowski",
**kwargs
)
BallTree for fast generalized N-point problems.
from sklearn.neighbors import KDTree
KDTree(
X: ArrayLike,
leaf_size: int = 40,
metric: str = "minkowski",
**kwargs
)
KDTree for fast generalized N-point problems.
from sklearn.neighbors import KernelDensity
KernelDensity(
bandwidth: float | str = 1.0,
algorithm: str = "auto",
kernel: str = "gaussian",
metric: str = "euclidean",
atol: float = 0,
rtol: float = 0,
breadth_first: bool = True,
leaf_size: int = 40,
metric_params: dict | None = None
)
Kernel Density Estimation.
from sklearn.neighbors import NearestNeighbors
NearestNeighbors(
n_neighbors: int = 5,
radius: float = 1.0,
algorithm: str = "auto",
leaf_size: int = 30,
metric: str | Callable = "minkowski",
p: int = 2,
metric_params: dict | None = None,
n_jobs: int | None = None
)
Unsupervised learner for implementing neighbor searches.
from sklearn.neighbors import KNeighborsTransformer
KNeighborsTransformer(
mode: str = "distance",
n_neighbors: int = 5,
algorithm: str = "auto",
leaf_size: int = 30,
metric: str | Callable = "minkowski",
p: int = 2,
metric_params: dict | None = None,
n_jobs: int | None = None
)
Transform X into a (weighted) graph of k nearest neighbors.
from sklearn.neighbors import RadiusNeighborsTransformer
RadiusNeighborsTransformer(
mode: str = "distance",
radius: float = 1.0,
algorithm: str = "auto",
leaf_size: int = 30,
metric: str | Callable = "minkowski",
p: int = 2,
metric_params: dict | None = None,
n_jobs: int | None = None
)
Transform X into a (weighted) graph of neighbors nearer than a radius.
from sklearn.neighbors import NeighborhoodComponentsAnalysis
NeighborhoodComponentsAnalysis(
n_components: int | None = None,
init: str | ArrayLike = "auto",
warm_start: bool = False,
max_iter: int = 50,
tol: float = 1e-05,
callback: Callable | None = None,
verbose: int = 0,
random_state: int | RandomState | None = None
)
Neighborhood Components Analysis.
from sklearn.neighbors import VALID_METRICS
# Dictionary mapping algorithm names to valid metrics
VALID_METRICS: dict[str, list[str]]
Valid metrics for neighbor algorithms.
from sklearn.neighbors import VALID_METRICS_SPARSE
# Dictionary mapping algorithm names to valid metrics for sparse matrices
VALID_METRICS_SPARSE: dict[str, list[str]]
Valid metrics for neighbor algorithms with sparse matrices.
from sklearn.exceptions import NotFittedError
class NotFittedError(ValueError, AttributeError):
"""Exception class to raise if estimator is used before fitting."""
pass
Exception class to raise if estimator is used before fitting.
from sklearn.exceptions import ConvergenceWarning
class ConvergenceWarning(UserWarning):
"""Custom warning to capture convergence problems."""
pass
Custom warning to capture convergence problems.
from sklearn.exceptions import DataConversionWarning
class DataConversionWarning(UserWarning):
"""Warning used to notify implicit data conversions happening in the code."""
pass
Warning used to notify implicit data conversions happening in the code.
from sklearn.exceptions import DataDimensionalityWarning
class DataDimensionalityWarning(UserWarning):
"""Custom warning to capture data dimensionality problems."""
pass
Custom warning to capture data dimensionality problems.
from sklearn.exceptions import EfficiencyWarning
class EfficiencyWarning(UserWarning):
"""Warning used to notify the user of inefficient computation."""
pass
Warning used to notify the user of inefficient computation.
from sklearn.exceptions import EstimatorCheckFailedWarning
class EstimatorCheckFailedWarning(UserWarning):
"""Warning used when an estimator check fails."""
pass
Warning used when an estimator check fails.
from sklearn.exceptions import FitFailedWarning
class FitFailedWarning(RuntimeWarning):
"""Warning class used if there is an error while fitting the estimator."""
pass
Warning class used if there is an error while fitting the estimator.
from sklearn.exceptions import PositiveSpectrumWarning
class PositiveSpectrumWarning(UserWarning):
"""Warning raised when the eigenvalues of a PSD matrix have issues."""
pass
Warning raised when the eigenvalues of a PSD matrix have issues.
from sklearn.exceptions import SkipTestWarning
class SkipTestWarning(UserWarning):
"""Warning class used to notify the user of a test that was skipped."""
pass
Warning class used to notify the user of a test that was skipped.
from sklearn.exceptions import UndefinedMetricWarning
class UndefinedMetricWarning(UserWarning):
"""Warning used when the metric is invalid."""
pass
Warning used when the metric is invalid.
from sklearn.exceptions import UnsetMetadataPassedError
class UnsetMetadataPassedError(ValueError):
"""Exception when metadata is passed which is not explicitly requested."""
pass
Exception when metadata is passed which is not explicitly requested.
from sklearn.frozen import FrozenEstimator
FrozenEstimator(
estimator: BaseEstimator
)
Estimator wrapper that prevents re-fitting, so a pre-fitted estimator can be used inside pipelines and meta-estimators.
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.datasets import load_iris
# Load data
X, y = load_iris(return_X_y=True)
# Method 1: Using Pipeline class
pipeline = Pipeline([
('scaler', StandardScaler()),
('classifier', LogisticRegression())
])
# Method 2: Using make_pipeline function
pipeline = make_pipeline(
StandardScaler(),
LogisticRegression()
)
# Fit and predict
pipeline.fit(X, y)
predictions = pipeline.predict(X)
from sklearn.compose import ColumnTransformer, make_column_transformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
import pandas as pd
# Example with mixed data types
data = pd.DataFrame({
'age': [25, 30, 35],
'income': [50000, 60000, 70000],
'city': ['NYC', 'LA', 'Chicago'],
'gender': ['M', 'F', 'M']
})
# Method 1: Using ColumnTransformer class
preprocessor = ColumnTransformer([
('num', StandardScaler(), ['age', 'income']),
('cat', OneHotEncoder(), ['city', 'gender'])
])
# Method 2: Using make_column_transformer function
preprocessor = make_column_transformer(
(StandardScaler(), ['age', 'income']),
(OneHotEncoder(), ['city', 'gender'])
)
# Transform data
transformed = preprocessor.fit_transform(data)
from sklearn.pipeline import FeatureUnion, make_union
from sklearn.decomposition import PCA
from sklearn.feature_selection import SelectKBest
# Combine PCA and feature selection
feature_union = FeatureUnion([
('pca', PCA(n_components=2)),
('select_k_best', SelectKBest(k=2))
])
# Or using make_union
feature_union = make_union(
PCA(n_components=2),
SelectKBest(k=2)
)
# Transform features
X_combined = feature_union.fit_transform(X, y)
from sklearn import set_config, get_config, config_context
from sklearn.linear_model import LinearRegression
# Get current config
current_config = get_config()
print(current_config)
# Set global configuration
set_config(display='diagram', print_changed_only=True)
# Use configuration context
with config_context(assume_finite=True):
# Operations within this block use assume_finite=True
model = LinearRegression()
model.fit(X, y)
# Configuration reverts to previous state outside the context
from sklearn.inspection import partial_dependence, PartialDependenceDisplay
from sklearn.ensemble import RandomForestRegressor
import matplotlib.pyplot as plt
# Train model
model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X, y)
# Compute partial dependence
pd_result = partial_dependence(
model, X, features=[0, 1],
grid_resolution=20
)
# Create display
display = PartialDependenceDisplay.from_estimator(
model, X, features=[0, 1]
)
display.plot()
plt.show()
from sklearn.inspection import permutation_importance
# Calculate permutation importance
result = permutation_importance(
model, X, y, n_repeats=10, random_state=42
)
# Get importance scores
importance_scores = result.importances_mean
importance_std = result.importances_std
# Print results
for i, (score, std) in enumerate(zip(importance_scores, importance_std)):
print(f"Feature {i}: {score:.3f} +/- {std:.3f}")
Install with Tessl CLI
npx tessl i tessl/pypi-scikit-learndocs
evals
scenario-1
scenario-2
scenario-3
scenario-4
scenario-5
scenario-6
scenario-7
scenario-8
scenario-9
scenario-10