A comprehensive machine learning library providing supervised and unsupervised learning algorithms with consistent APIs and extensive tools for data preprocessing, model evaluation, and deployment.
This document covers all model selection, cross-validation, hyperparameter tuning, and evaluation capabilities in scikit-learn.
from sklearn.model_selection import KFold
KFold(
n_splits: int = 5,
shuffle: bool = False,
random_state: int | RandomState | None = None
)
K-Folds cross-validator.
from sklearn.model_selection import StratifiedKFold
StratifiedKFold(
n_splits: int = 5,
shuffle: bool = False,
random_state: int | RandomState | None = None
)
Stratified K-Folds cross-validator.
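A minimal sketch (the iris dataset and numpy are used purely for illustration) showing that each stratified test fold preserves the overall class proportions:
from sklearn.model_selection import StratifiedKFold
from sklearn.datasets import load_iris
import numpy as np
X, y = load_iris(return_X_y=True)
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)
for fold, (train_idx, test_idx) in enumerate(skf.split(X, y)):
    # Each test fold contains roughly equal counts of every class
    print(fold, np.bincount(y[test_idx]))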
from sklearn.model_selection import GroupKFold
GroupKFold(
n_splits: int = 5
)
K-fold iterator variant with non-overlapping groups.
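A hedged sketch with synthetic data (array values chosen arbitrarily) demonstrating that no group appears on both sides of a split:
from sklearn.model_selection import GroupKFold
import numpy as np
X = np.arange(12).reshape(6, 2)
y = np.array([0, 0, 1, 1, 0, 1])
groups = np.array([1, 1, 2, 2, 3, 3])  # e.g. one group per subject
gkf = GroupKFold(n_splits=3)
for train_idx, test_idx in gkf.split(X, y, groups=groups):
    # Train and test groups never overlap
    print("train groups:", set(groups[train_idx]), "test groups:", set(groups[test_idx]))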
from sklearn.model_selection import StratifiedGroupKFold
StratifiedGroupKFold(
n_splits: int = 5,
shuffle: bool = False,
random_state: int | RandomState | None = None
)
Stratified K-Folds iterator variant with non-overlapping groups.
from sklearn.model_selection import TimeSeriesSplit
TimeSeriesSplit(
n_splits: int = 5,
max_train_size: int | None = None,
test_size: int | None = None,
gap: int = 0
)
Time Series cross-validator.
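A minimal sketch on toy data showing the expanding-window behavior; with gap=1, one sample between train and test is excluded from both:
from sklearn.model_selection import TimeSeriesSplit
import numpy as np
X = np.arange(10).reshape(-1, 1)
tscv = TimeSeriesSplit(n_splits=3, gap=1)
for train_idx, test_idx in tscv.split(X):
    # Training indices always precede test indices
    print("train:", train_idx, "test:", test_idx)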
from sklearn.model_selection import LeaveOneOut
LeaveOneOut()
Leave-One-Out cross-validator.
from sklearn.model_selection import LeavePOut
LeavePOut(
p: int
)
Leave-P-Out cross-validator.
from sklearn.model_selection import LeaveOneGroupOut
LeaveOneGroupOut()
Leave One Group Out cross-validator.
from sklearn.model_selection import LeavePGroupsOut
LeavePGroupsOut(
n_groups: int
)
Leave P Group(s) Out cross-validator.
from sklearn.model_selection import ShuffleSplit
ShuffleSplit(
n_splits: int = 10,
test_size: float | int | None = None,
train_size: float | int | None = None,
random_state: int | RandomState | None = None
)
Random permutation cross-validator.
from sklearn.model_selection import StratifiedShuffleSplit
StratifiedShuffleSplit(
n_splits: int = 10,
test_size: float | int | None = None,
train_size: float | int | None = None,
random_state: int | RandomState | None = None
)
Stratified ShuffleSplit cross-validator.
from sklearn.model_selection import GroupShuffleSplit
GroupShuffleSplit(
n_splits: int = 5,
test_size: float | int | None = None,
train_size: float | int | None = None,
random_state: int | RandomState | None = None
)
Shuffle-Group(s)-Out cross-validation iterator.
from sklearn.model_selection import PredefinedSplit
PredefinedSplit(
test_fold: ArrayLike
)
Predefined split cross-validator.
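The test_fold semantics are easiest to see on a toy array: each entry gives that sample's test-fold index, and -1 keeps a sample in training for every split. A minimal sketch:
from sklearn.model_selection import PredefinedSplit
import numpy as np
test_fold = np.array([0, 0, 1, 1, -1, -1])
ps = PredefinedSplit(test_fold)
for train_idx, test_idx in ps.split():
    print("train:", train_idx, "test:", test_idx)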
from sklearn.model_selection import RepeatedKFold
RepeatedKFold(
n_splits: int = 5,
n_repeats: int = 10,
random_state: int | RandomState | None = None
)
Repeated K-Fold cross-validator.
from sklearn.model_selection import RepeatedStratifiedKFold
RepeatedStratifiedKFold(
n_splits: int = 5,
n_repeats: int = 10,
random_state: int | RandomState | None = None
)
Repeated Stratified K-Fold cross-validator.
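A hedged sketch (iris and LogisticRegression are arbitrary choices) showing how repetition yields n_splits * n_repeats scores, smoothing out split-to-split variance:
from sklearn.model_selection import RepeatedStratifiedKFold, cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.datasets import load_iris
X, y = load_iris(return_X_y=True)
cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=3, random_state=0)
scores = cross_val_score(LogisticRegression(max_iter=1000), X, y, cv=cv)
print(len(scores), scores.mean(), scores.std())  # 15 scores in total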
from sklearn.model_selection import BaseCrossValidator
BaseCrossValidator()
Base class for all cross-validators.
from sklearn.model_selection import BaseShuffleSplit
BaseShuffleSplit(
n_splits: int = 10,
test_size: float | int | None = None,
train_size: float | int | None = None,
random_state: int | RandomState | None = None
)
Base class for ShuffleSplit cross-validators.
from sklearn.model_selection import GridSearchCV
GridSearchCV(
estimator: BaseEstimator,
param_grid: dict | list[dict],
scoring: str | Callable | list | tuple | dict | None = None,
n_jobs: int | None = None,
refit: bool | str | Callable = True,
cv: int | BaseCrossValidator | Iterable | None = None,
verbose: int = 0,
pre_dispatch: int | str = "2*n_jobs",
error_score: float | str = ...,
return_train_score: bool = False
)
Exhaustive search over specified parameter values for an estimator.
from sklearn.model_selection import RandomizedSearchCV
RandomizedSearchCV(
estimator: BaseEstimator,
param_distributions: dict | list[dict],
n_iter: int = 10,
scoring: str | Callable | list | tuple | dict | None = None,
n_jobs: int | None = None,
refit: bool | str | Callable = True,
cv: int | BaseCrossValidator | Iterable | None = None,
verbose: int = 0,
pre_dispatch: int | str = "2*n_jobs",
random_state: int | RandomState | None = None,
error_score: float | str = ...,
return_train_score: bool = False
)
Randomized search over hyperparameters.
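A sketch, assuming scipy is available to supply the sampling distributions (the digits dataset and parameter choices are illustrative only):
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import load_digits
from scipy.stats import randint
X, y = load_digits(return_X_y=True)
param_distributions = {
    'n_estimators': randint(50, 300),  # sampled from a distribution
    'max_depth': [3, 5, 7, None]       # lists are sampled uniformly
}
search = RandomizedSearchCV(
    RandomForestClassifier(random_state=42),
    param_distributions=param_distributions,
    n_iter=20, cv=5, random_state=42, n_jobs=-1
)
search.fit(X, y)
print(search.best_params_, f"{search.best_score_:.3f}")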
from sklearn.model_selection import ParameterGrid
ParameterGrid(
param_grid: dict | list[dict]
)
Grid of parameters with a discrete number of values for each.
from sklearn.model_selection import ParameterSampler
ParameterSampler(
param_distributions: dict,
n_iter: int,
random_state: int | RandomState | None = None
)
Generator on parameters sampled from given distributions.
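Both helpers are plain iterables of parameter dicts; a minimal sketch (parameter values arbitrary, scipy assumed for the distribution):
from sklearn.model_selection import ParameterGrid, ParameterSampler
from scipy.stats import uniform
# Every combination, in a deterministic order
for params in ParameterGrid({'C': [0.1, 1], 'kernel': ['linear', 'rbf']}):
    print(params)
# n_iter random draws from the given distributions
for params in ParameterSampler({'C': uniform(0.1, 10)}, n_iter=3, random_state=0):
    print(params)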
from sklearn.model_selection import TunedThresholdClassifierCV
TunedThresholdClassifierCV(
estimator: BaseClassifier,
scoring: str | Callable = "balanced_accuracy",
response_method: str = "auto",
thresholds: int | ArrayLike = 100,
cv: int | BaseCrossValidator | Iterable | None = None,
refit: bool = True,
n_jobs: int | None = None,
verbose: int = 0,
random_state: int | RandomState | None = None,
store_cv_results: bool = False
)
Classifier that post-tunes the decision threshold using cross-validation.
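Available in recent scikit-learn releases (1.5+); a hedged sketch on a binary task, with the dataset and base estimator chosen only for illustration:
from sklearn.model_selection import TunedThresholdClassifierCV, train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.datasets import load_breast_cancer
X, y = load_breast_cancer(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
# Pick the decision threshold that maximizes balanced accuracy under CV
tuned = TunedThresholdClassifierCV(
    LogisticRegression(max_iter=5000), scoring='balanced_accuracy', cv=5
)
tuned.fit(X_train, y_train)
print(f"chosen threshold: {tuned.best_threshold_:.3f}")
print(f"test score: {tuned.score(X_test, y_test):.3f}")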
from sklearn.model_selection import FixedThresholdClassifier
FixedThresholdClassifier(
estimator: BaseClassifier,
threshold: float | str = 0.5,
response_method: str = "auto"
)
Binary classifier that manually sets the decision threshold.
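Also a 1.5+ addition; a minimal sketch that lowers the positive-class cutoff to trade precision for recall (the threshold value is arbitrary):
from sklearn.model_selection import FixedThresholdClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.datasets import load_breast_cancer
X, y = load_breast_cancer(return_X_y=True)
# Predict the positive class whenever its probability exceeds 0.3
clf = FixedThresholdClassifier(LogisticRegression(max_iter=5000), threshold=0.3)
clf.fit(X, y)
print(clf.predict(X[:5]))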
from sklearn.model_selection import cross_val_score
cross_val_score(
estimator: BaseEstimator,
X: ArrayLike,
y: ArrayLike | None = None,
groups: ArrayLike | None = None,
scoring: str | Callable | None = None,
cv: int | BaseCrossValidator | Iterable | None = None,
n_jobs: int | None = None,
verbose: int = 0,
fit_params: dict | None = None,
pre_dispatch: int | str = "2*n_jobs",
error_score: float | str = ...,
params: dict | None = None
) -> ArrayLike
Evaluate a score by cross-validation.
from sklearn.model_selection import cross_validate
cross_validate(
estimator: BaseEstimator,
X: ArrayLike,
y: ArrayLike | None = None,
groups: ArrayLike | None = None,
scoring: str | Callable | list | tuple | dict | None = None,
cv: int | BaseCrossValidator | Iterable | None = None,
n_jobs: int | None = None,
verbose: int = 0,
fit_params: dict | None = None,
pre_dispatch: int | str = "2*n_jobs",
return_train_score: bool = False,
return_estimator: bool = False,
return_indices: bool = False,
error_score: float | str = ...,
params: dict | None = None
) -> dict[str, ArrayLike]
Evaluate metric(s) by cross-validation and also record fit/score times.
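Unlike cross_val_score, cross_validate returns a dict and accepts several metrics in one pass; a minimal sketch (metric and dataset choices illustrative):
from sklearn.model_selection import cross_validate
from sklearn.linear_model import LogisticRegression
from sklearn.datasets import load_iris
X, y = load_iris(return_X_y=True)
results = cross_validate(
    LogisticRegression(max_iter=1000), X, y, cv=5,
    scoring=['accuracy', 'f1_macro'], return_train_score=True
)
print(results['test_accuracy'].mean(), results['test_f1_macro'].mean())
print(results['fit_time'].sum())  # timing is recorded per fold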
from sklearn.model_selection import cross_val_predict
cross_val_predict(
estimator: BaseEstimator,
X: ArrayLike,
y: ArrayLike | None = None,
groups: ArrayLike | None = None,
cv: int | BaseCrossValidator | Iterable | None = None,
n_jobs: int | None = None,
verbose: int = 0,
fit_params: dict | None = None,
pre_dispatch: int | str = "2*n_jobs",
method: str = "predict",
params: dict | None = None
) -> ArrayLike
Generate cross-validated estimates for each input data point.
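A sketch of the common pattern of building a confusion matrix from out-of-fold predictions, where every prediction comes from a model that never saw that sample:
from sklearn.model_selection import cross_val_predict
from sklearn.linear_model import LogisticRegression
from sklearn.datasets import load_iris
from sklearn.metrics import confusion_matrix
X, y = load_iris(return_X_y=True)
y_pred = cross_val_predict(LogisticRegression(max_iter=1000), X, y, cv=5)
print(confusion_matrix(y, y_pred))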
from sklearn.model_selection import train_test_split
train_test_split(
*arrays: ArrayLike,
test_size: float | int | None = None,
train_size: float | int | None = None,
random_state: int | RandomState | None = None,
shuffle: bool = True,
stratify: ArrayLike | None = None
) -> list[ArrayLike]
Split arrays or matrices into random train and test subsets.
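A minimal sketch (dataset choice arbitrary); stratify=y keeps class proportions identical across the two subsets:
from sklearn.model_selection import train_test_split
from sklearn.datasets import load_iris
import numpy as np
X, y = load_iris(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, stratify=y, random_state=0
)
print(np.bincount(y_train), np.bincount(y_test))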
from sklearn.model_selection import validation_curve
validation_curve(
estimator: BaseEstimator,
X: ArrayLike,
y: ArrayLike,
param_name: str,
param_range: ArrayLike,
groups: ArrayLike | None = None,
cv: int | BaseCrossValidator | Iterable | None = None,
scoring: str | Callable | None = None,
n_jobs: int | None = None,
pre_dispatch: int | str = "all",
verbose: int = 0,
error_score: float | str = ...,
fit_params: dict | None = None,
params: dict | None = None
) -> tuple[ArrayLike, ArrayLike]
Validation curve: compute train and test scores for varying values of a single parameter.
from sklearn.model_selection import learning_curve
learning_curve(
estimator: BaseEstimator,
X: ArrayLike,
y: ArrayLike,
groups: ArrayLike | None = None,
train_sizes: ArrayLike = ...,
cv: int | BaseCrossValidator | Iterable | None = None,
scoring: str | Callable | None = None,
exploit_incremental_learning: bool = False,
n_jobs: int | None = None,
pre_dispatch: int | str = "all",
verbose: int = 0,
shuffle: bool = False,
random_state: int | RandomState | None = None,
error_score: float | str = ...,
return_times: bool = False,
fit_params: dict | None = None,
params: dict | None = None
) -> tuple[ArrayLike, ArrayLike, ArrayLike] | tuple[ArrayLike, ArrayLike, ArrayLike, ArrayLike, ArrayLike]
Learning curve: compute train and test scores for a range of training-set sizes.
from sklearn.model_selection import permutation_test_score
permutation_test_score(
estimator: BaseEstimator,
X: ArrayLike,
y: ArrayLike,
groups: ArrayLike | None = None,
cv: int | BaseCrossValidator | Iterable | None = None,
n_permutations: int = 100,
n_jobs: int | None = None,
random_state: int | RandomState | None = None,
verbose: int = 0,
scoring: str | Callable | None = None,
fit_params: dict | None = None
) -> tuple[float, ArrayLike, float]
Evaluate the significance of a cross-validated score with permutations.
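A hedged sketch (estimator and dataset arbitrary): the returned p-value estimates how often shuffled labels score at least as well as the real ones:
from sklearn.model_selection import permutation_test_score
from sklearn.linear_model import LogisticRegression
from sklearn.datasets import load_iris
X, y = load_iris(return_X_y=True)
score, perm_scores, pvalue = permutation_test_score(
    LogisticRegression(max_iter=1000), X, y,
    cv=5, n_permutations=100, random_state=0
)
print(f"score={score:.3f} p={pvalue:.4f}")  # small p: unlikely under chance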
from sklearn.model_selection import check_cv
check_cv(
cv: int | BaseCrossValidator | Iterable | None = 5,
y: ArrayLike | None = None,
classifier: bool = False
) -> BaseCrossValidator
Input checker utility for building a cross-validator.
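A minimal sketch of the dispatch behavior: with classifier=True and a classification target, an integer cv is turned into a StratifiedKFold:
from sklearn.model_selection import check_cv
import numpy as np
y = np.array([0, 0, 1, 1, 0, 1])
cv = check_cv(cv=3, y=y, classifier=True)
print(type(cv).__name__)  # StratifiedKFold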
from sklearn.model_selection import LearningCurveDisplay
LearningCurveDisplay(
train_sizes: ArrayLike,
train_scores: ArrayLike,
test_scores: ArrayLike,
train_scores_std: ArrayLike | None = None,
test_scores_std: ArrayLike | None = None
)
Learning Curve visualization.
from sklearn.model_selection import ValidationCurveDisplay
ValidationCurveDisplay(
param_name: str,
param_range: ArrayLike,
train_scores: ArrayLike,
test_scores: ArrayLike,
train_scores_std: ArrayLike | None = None,
test_scores_std: ArrayLike | None = None
)
Validation Curve visualization.
from sklearn.calibration import CalibratedClassifierCV
CalibratedClassifierCV(
estimator: BaseClassifier | None = None,
method: str = "sigmoid",
cv: int | BaseCrossValidator | Iterable | str | None = None,
n_jobs: int | None = None,
ensemble: bool = True,
base_estimator: BaseClassifier = "deprecated"
)
Probability calibration with isotonic regression or logistic regression.
from sklearn.calibration import calibration_curve
calibration_curve(
y_true: ArrayLike,
y_prob: ArrayLike,
pos_label: int | str | None = None,
normalize: bool = "deprecated",
n_bins: int = 5,
strategy: str = "uniform"
) -> tuple[ArrayLike, ArrayLike]
Compute true and predicted probabilities for a calibration curve.
from sklearn.calibration import CalibrationDisplay
CalibrationDisplay(
prob_true: ArrayLike,
prob_pred: ArrayLike,
y_prob: ArrayLike,
estimator_name: str | None = None,
pos_label: int | str | None = None
)
Calibration curve visualization.
from sklearn.model_selection import cross_val_score, KFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import load_iris
# Load data
X, y = load_iris(return_X_y=True)
# Create model
model = RandomForestClassifier(n_estimators=100, random_state=42)
# Cross-validation with different strategies
kfold = KFold(n_splits=5, shuffle=True, random_state=42)
scores = cross_val_score(model, X, y, cv=kfold, scoring='accuracy')
print(f"CV Accuracy: {scores.mean():.3f} (+/- {scores.std() * 2:.3f})")from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import load_digits
# Load data
X, y = load_digits(return_X_y=True)
# Define parameter grid
param_grid = {
'n_estimators': [50, 100, 200],
'max_depth': [3, 5, 7, None],
'min_samples_split': [2, 5, 10]
}
# Grid search
grid_search = GridSearchCV(
estimator=RandomForestClassifier(random_state=42),
param_grid=param_grid,
cv=5,
scoring='accuracy',
n_jobs=-1,
verbose=1
)
# Fit and get results
grid_search.fit(X, y)
print(f"Best parameters: {grid_search.best_params_}")
print(f"Best cross-validation score: {grid_search.best_score_:.3f}")from sklearn.model_selection import learning_curve, LearningCurveDisplay
from sklearn.ensemble import RandomForestClassifier
import matplotlib.pyplot as plt
# Generate learning curve
train_sizes, train_scores, test_scores = learning_curve(
RandomForestClassifier(n_estimators=100, random_state=42),
X, y, cv=5, n_jobs=-1,
train_sizes=np.linspace(0.1, 1.0, 10)
)
# Plot learning curve
display = LearningCurveDisplay(
train_sizes=train_sizes,
train_scores=train_scores,
test_scores=test_scores
)
display.plot()
plt.show()

from sklearn.model_selection import validation_curve, ValidationCurveDisplay
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import load_digits
import matplotlib.pyplot as plt
# Load data
X, y = load_digits(return_X_y=True)
# Generate validation curve for max_depth parameter
param_range = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
train_scores, test_scores = validation_curve(
RandomForestClassifier(n_estimators=100, random_state=42),
X, y, param_name='max_depth', param_range=param_range,
cv=5, scoring='accuracy', n_jobs=-1
)
# Plot validation curve
display = ValidationCurveDisplay(
param_name='max_depth',
param_range=param_range,
train_scores=train_scores,
test_scores=test_scores
)
display.plot()
plt.show()

from sklearn.calibration import CalibratedClassifierCV, calibration_curve
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.datasets import load_breast_cancer
# Calibration curves need a binary target, so load a binary dataset
X, y = load_breast_cancer(return_X_y=True)
# Split data
X_train, X_test, y_train, y_test = train_test_split(
X, y, test_size=0.2, random_state=42
)
# Base classifier (CalibratedClassifierCV clones and refits it on each CV fold)
clf = RandomForestClassifier(n_estimators=100, random_state=42)
# Calibrate with isotonic regression via 3-fold cross-validation
calibrated_clf = CalibratedClassifierCV(clf, method='isotonic', cv=3)
calibrated_clf.fit(X_train, y_train)
# Get calibrated probabilities
y_prob = calibrated_clf.predict_proba(X_test)[:, 1]
# Evaluate calibration
fraction_of_positives, mean_predicted_value = calibration_curve(
y_test, y_prob, n_bins=10
)

Install with Tessl CLI
npx tessl i tessl/pypi-scikit-learn