A comprehensive machine learning library providing supervised and unsupervised learning algorithms with consistent APIs and extensive tools for data preprocessing, model evaluation, and deployment.
87
This document covers all dataset loading, fetching, generation, and utility functions in scikit-learn.
from sklearn.datasets import load_iris
load_iris(
return_X_y: bool = False,
as_frame: bool = False
) -> Bunch | tuple[ArrayLike, ArrayLike]
Load and return the iris dataset (classification).
from sklearn.datasets import load_digits
load_digits(
n_class: int = 10,
return_X_y: bool = False,
as_frame: bool = False
) -> Bunch | tuple[ArrayLike, ArrayLike]
Load and return the digits dataset (classification).
from sklearn.datasets import load_wine
load_wine(
return_X_y: bool = False,
as_frame: bool = False
) -> Bunch | tuple[ArrayLike, ArrayLike]
Load and return the wine dataset (classification).
from sklearn.datasets import load_breast_cancer
load_breast_cancer(
return_X_y: bool = False,
as_frame: bool = False
) -> Bunch | tuple[ArrayLike, ArrayLike]
Load and return the breast cancer wisconsin dataset (classification).
from sklearn.datasets import load_diabetes
load_diabetes(
return_X_y: bool = False,
as_frame: bool = False,
scaled: bool = True
) -> Bunch | tuple[ArrayLike, ArrayLike]
Load and return the diabetes dataset (regression).
from sklearn.datasets import load_linnerud
load_linnerud(
return_X_y: bool = False,
as_frame: bool = False
) -> Bunch | tuple[ArrayLike, ArrayLike]
Load and return the linnerud dataset (multivariate regression).
from sklearn.datasets import load_files
load_files(
container_path: str,
description: str | None = None,
categories: list[str] | None = None,
load_content: bool = True,
shuffle: bool = True,
encoding: str | None = None,
decode_error: str = "strict",
random_state: int | RandomState | None = 0
) -> Bunch
Load text files with categories as subfolder names.
from sklearn.datasets import load_sample_images
load_sample_images() -> Bunch
Load sample images for image manipulation.
from sklearn.datasets import load_sample_image
load_sample_image(
image_name: str
) -> ArrayLike
Load the numpy array of a single sample image.
from sklearn.datasets import fetch_20newsgroups
fetch_20newsgroups(
data_home: str | None = None,
subset: str = "train",
categories: list[str] | None = None,
shuffle: bool = True,
random_state: int | RandomState | None = 42,
remove: tuple | None = (),
download_if_missing: bool = True,
return_X_y: bool = False
) -> Bunch | tuple[list[str], ArrayLike]
Load the filenames and data from the 20 newsgroups dataset.
from sklearn.datasets import fetch_20newsgroups_vectorized
fetch_20newsgroups_vectorized(
subset: str = "train",
remove: tuple = (),
data_home: str | None = None,
download_if_missing: bool = True,
return_X_y: bool = False,
normalize: bool = True,
as_frame: bool = False
) -> Bunch | tuple[ArrayLike, ArrayLike]
Load the 20 newsgroups dataset and vectorize it.
from sklearn.datasets import fetch_rcv1
fetch_rcv1(
data_home: str | None = None,
subset: str = "all",
download_if_missing: bool = True,
random_state: int | RandomState | None = None,
shuffle: bool = False,
return_X_y: bool = False
) -> Bunch | tuple[ArrayLike, ArrayLike]
Load the RCV1 multilabel dataset.
from sklearn.datasets import fetch_lfw_people
fetch_lfw_people(
data_home: str | None = None,
funneled: bool = True,
resize: float = 0.5,
min_faces_per_person: int = 0,
color: bool = False,
slice_: tuple | None = (slice(70, 195), slice(78, 172)),
download_if_missing: bool = True,
return_X_y: bool = False
) -> Bunch | tuple[ArrayLike, ArrayLike]
Load the Labeled Faces in the Wild (LFW) people dataset.
from sklearn.datasets import fetch_lfw_pairs
fetch_lfw_pairs(
subset: str = "train",
data_home: str | None = None,
funneled: bool = True,
resize: float = 0.5,
color: bool = False,
slice_: tuple | None = (slice(70, 195), slice(78, 172)),
download_if_missing: bool = True
) -> Bunch
Load the Labeled Faces in the Wild (LFW) pairs dataset.
from sklearn.datasets import fetch_olivetti_faces
fetch_olivetti_faces(
data_home: str | None = None,
shuffle: bool = False,
random_state: int | RandomState | None = 0,
download_if_missing: bool = True,
return_X_y: bool = False
) -> Bunch | tuple[ArrayLike, ArrayLike]
Load the Olivetti faces dataset.
from sklearn.datasets import fetch_california_housing
fetch_california_housing(
data_home: str | None = None,
download_if_missing: bool = True,
return_X_y: bool = False,
as_frame: bool = False
) -> Bunch | tuple[ArrayLike, ArrayLike]
Load the California housing dataset.
from sklearn.datasets import fetch_kddcup99
fetch_kddcup99(
subset: str | None = None,
data_home: str | None = None,
shuffle: bool = False,
random_state: int | RandomState | None = None,
percent10: bool = True,
download_if_missing: bool = True,
return_X_y: bool = False,
as_frame: bool = False
) -> Bunch | tuple[ArrayLike, ArrayLike]
Load the kddcup99 dataset.
from sklearn.datasets import fetch_covtype
fetch_covtype(
data_home: str | None = None,
download_if_missing: bool = True,
random_state: int | RandomState | None = None,
shuffle: bool = False,
return_X_y: bool = False,
as_frame: bool = False
) -> Bunch | tuple[ArrayLike, ArrayLike]
Load the covertype dataset.
from sklearn.datasets import fetch_species_distributions
fetch_species_distributions(
data_home: str | None = None,
download_if_missing: bool = True
) -> Bunch
Loader for species distribution dataset.
from sklearn.datasets import fetch_openml
fetch_openml(
name: str | int | None = None,
version: int | str = "active",
data_id: int | None = None,
data_home: str | None = None,
target_column: str | list | None = "default-target",
cache: bool = True,
return_X_y: bool = False,
as_frame: bool | str = "auto",
n_retries: int = 3,
delay: float = 1.0,
parser: str = "auto",
read_csv_kwargs: dict | None = None
) -> Bunch | tuple[ArrayLike, ArrayLike]
Fetch dataset from openml by name or dataset id.
from sklearn.datasets import fetch_file
fetch_file(
url: str,
data_home: str | None = None,
cache_subdir: str = "",
hash_: str | None = None,
hash_algorithm: str = "auto",
extract: bool = False,
force_extract: bool = False,
quiet: bool = False,
local_folder: str | None = None
) -> str
Load a file from the Web.
from sklearn.datasets import make_classification
make_classification(
n_samples: int = 100,
n_features: int = 20,
n_informative: int = 2,
n_redundant: int = 2,
n_repeated: int = 0,
n_classes: int = 2,
n_clusters_per_class: int = 2,
weights: ArrayLike | None = None,
flip_y: float = 0.01,
class_sep: float = 1.0,
hypercube: bool = True,
shift: float | ArrayLike | None = 0.0,
scale: float | ArrayLike | None = 1.0,
shuffle: bool = True,
random_state: int | RandomState | None = None
) -> tuple[ArrayLike, ArrayLike] | tuple[ArrayLike, ArrayLike, ArrayLike]
Generate a random n-class classification problem.
from sklearn.datasets import make_multilabel_classification
make_multilabel_classification(
n_samples: int = 100,
n_features: int = 20,
n_classes: int = 5,
n_labels: int = 2,
length: int = 50,
allow_unlabeled: bool = True,
sparse: bool = False,
return_indicator: str = "dense",
return_distributions: bool = False,
random_state: int | RandomState | None = None
) -> tuple[ArrayLike, ArrayLike] | tuple[ArrayLike, ArrayLike, ArrayLike, ArrayLike]
Generate a random multilabel classification problem.
from sklearn.datasets import make_hastie_10_2
make_hastie_10_2(
n_samples: int = 12000,
random_state: int | RandomState | None = None
) -> tuple[ArrayLike, ArrayLike]
Generate data for binary classification used in Hastie et al. 2009.
from sklearn.datasets import make_gaussian_quantiles
make_gaussian_quantiles(
mean: ArrayLike | None = None,
cov: float = 1.0,
n_samples: int = 100,
n_features: int = 2,
n_classes: int = 3,
shuffle: bool = True,
random_state: int | RandomState | None = None
) -> tuple[ArrayLike, ArrayLike]
Generate isotropic Gaussian and label samples by quantile.
from sklearn.datasets import make_regression
make_regression(
n_samples: int = 100,
n_features: int = 100,
n_informative: int = 10,
n_targets: int = 1,
bias: float = 0.0,
effective_rank: int | None = None,
tail_strength: float = 0.5,
noise: float = 0.0,
shuffle: bool = True,
coef: bool = False,
random_state: int | RandomState | None = None
) -> tuple[ArrayLike, ArrayLike] | tuple[ArrayLike, ArrayLike, ArrayLike]
Generate a random regression problem.
from sklearn.datasets import make_friedman1
make_friedman1(
n_samples: int = 100,
n_features: int = 10,
noise: float = 0.0,
random_state: int | RandomState | None = None
) -> tuple[ArrayLike, ArrayLike]
Generate the "Friedman #1" regression problem.
from sklearn.datasets import make_friedman2
make_friedman2(
n_samples: int = 100,
noise: float = 0.0,
random_state: int | RandomState | None = None
) -> tuple[ArrayLike, ArrayLike]
Generate the "Friedman #2" regression problem.
from sklearn.datasets import make_friedman3
make_friedman3(
n_samples: int = 100,
noise: float = 0.0,
random_state: int | RandomState | None = None
) -> tuple[ArrayLike, ArrayLike]
Generate the "Friedman #3" regression problem.
from sklearn.datasets import make_sparse_uncorrelated
make_sparse_uncorrelated(
n_samples: int = 100,
n_features: int = 10,
random_state: int | RandomState | None = None
) -> tuple[ArrayLike, ArrayLike]
Generate a random regression problem with sparse uncorrelated design.
from sklearn.datasets import make_blobs
make_blobs(
n_samples: int | ArrayLike = 100,
n_features: int = 2,
centers: int | ArrayLike | None = None,
cluster_std: float | ArrayLike = 1.0,
center_box: tuple[float, float] = (-10.0, 10.0),
shuffle: bool = True,
random_state: int | RandomState | None = None,
return_centers: bool = False
) -> tuple[ArrayLike, ArrayLike] | tuple[ArrayLike, ArrayLike, ArrayLike]
Generate isotropic Gaussian blobs for clustering.
from sklearn.datasets import make_circles
make_circles(
n_samples: int | tuple[int, int] = 100,
shuffle: bool = True,
noise: float | None = None,
random_state: int | RandomState | None = None,
factor: float = 0.8
) -> tuple[ArrayLike, ArrayLike]
Make a large circle containing a smaller circle in 2d.
from sklearn.datasets import make_moons
make_moons(
n_samples: int | tuple[int, int] = 100,
shuffle: bool = True,
noise: float | None = None,
random_state: int | RandomState | None = None
) -> tuple[ArrayLike, ArrayLike]
Make two interleaving half circles.
from sklearn.datasets import make_swiss_roll
make_swiss_roll(
n_samples: int = 100,
noise: float = 0.0,
random_state: int | RandomState | None = None,
hole: bool = False
) -> tuple[ArrayLike, ArrayLike]
Generate a swiss roll dataset.
from sklearn.datasets import make_s_curve
make_s_curve(
n_samples: int = 100,
noise: float = 0.0,
random_state: int | RandomState | None = None
) -> tuple[ArrayLike, ArrayLike]
Generate an S curve dataset.
from sklearn.datasets import make_biclusters
make_biclusters(
shape: tuple[int, int],
n_clusters: int,
noise: float = 0.0,
minval: int = 10,
maxval: int = 100,
shuffle: bool = True,
random_state: int | RandomState | None = None
) -> tuple[ArrayLike, ArrayLike, ArrayLike]
Generate an array with constant block diagonal structure.
from sklearn.datasets import make_checkerboard
make_checkerboard(
shape: tuple[int, int],
n_clusters: int | tuple[int, int],
noise: float = 0.0,
minval: int = 10,
maxval: int = 100,
shuffle: bool = True,
random_state: int | RandomState | None = None
) -> tuple[ArrayLike, ArrayLike, ArrayLike]
Generate an array with block checkerboard structure.
from sklearn.datasets import make_low_rank_matrix
make_low_rank_matrix(
n_samples: int = 100,
n_features: int = 100,
effective_rank: int = 10,
tail_strength: float = 0.5,
random_state: int | RandomState | None = None
) -> ArrayLike
Generate a mostly low rank matrix with bell-shaped singular values.
from sklearn.datasets import make_sparse_coded_signal
make_sparse_coded_signal(
n_samples: int,
n_components: int,
n_features: int,
n_nonzero_coefs: int,
random_state: int | RandomState | None = None
) -> tuple[ArrayLike, ArrayLike, ArrayLike]
Generate a signal as a sparse combination of dictionary elements.
from sklearn.datasets import make_spd_matrix
make_spd_matrix(
n_dim: int,
random_state: int | RandomState | None = None
) -> ArrayLike
Generate a random symmetric, positive-definite matrix.
from sklearn.datasets import make_sparse_spd_matrix
make_sparse_spd_matrix(
dim: int = 1,
alpha: float = 0.95,
norm_diag: bool = False,
smallest_coef: float = 0.1,
largest_coef: float = 0.9,
random_state: int | RandomState | None = None
) -> ArrayLike
Generate a sparse symmetric definite positive matrix.
from sklearn.datasets import load_svmlight_file
load_svmlight_file(
f: str | IO,
n_features: int | None = None,
dtype: type = ...,
multilabel: bool = False,
zero_based: bool | str = "auto",
query_id: bool = False,
offset: int = 0,
length: int = -1
) -> tuple[ArrayLike, ArrayLike] | tuple[ArrayLike, ArrayLike, ArrayLike]
Load datasets in the svmlight / libsvm format into sparse CSR matrix.
from sklearn.datasets import load_svmlight_files
load_svmlight_files(
files: list[str | IO],
n_features: int | None = None,
dtype: type = ...,
multilabel: bool = False,
zero_based: bool | str = "auto",
query_id: bool = False,
offset: int = 0,
length: int = -1
) -> list[tuple[ArrayLike, ArrayLike]] | list[tuple[ArrayLike, ArrayLike, ArrayLike]]
Load dataset from multiple files in SVMlight format.
from sklearn.datasets import dump_svmlight_file
dump_svmlight_file(
X: ArrayLike,
y: ArrayLike,
f: str | IO,
zero_based: bool = True,
comment: str | bytes | None = None,
query_id: ArrayLike | None = None,
multilabel: bool = False
) -> None
Dump the dataset in svmlight / libsvm file format.
from sklearn.datasets import get_data_home
get_data_home(
data_home: str | None = None
) -> str
Return the path to scikit-learn data dir.
from sklearn.datasets import clear_data_home
clear_data_home(
data_home: str | None = None
) -> None
Delete all the content in the data home cache.
from sklearn.datasets import load_iris, load_digits, load_wine
# Each load_* helper returns a Bunch: a dict-like container exposing the
# arrays as attributes (.data, .target, .feature_names, .target_names, ...).
# Load iris dataset (classification)
iris = load_iris()
X_iris, y_iris = iris.data, iris.target
print(f"Iris dataset: {X_iris.shape}, classes: {len(iris.target_names)}")
# Load digits dataset; n_class limits how many digit classes are included
digits = load_digits(n_class=10)
X_digits, y_digits = digits.data, digits.target
print(f"Digits dataset: {X_digits.shape}")
# return_X_y=True bypasses the Bunch and returns the (data, target) tuple
X_wine, y_wine = load_wine(return_X_y=True)
print(f"Wine dataset: {X_wine.shape}")
# as_frame=True additionally exposes the data as a pandas DataFrame
# on the Bunch's .frame attribute
wine_frame = load_wine(as_frame=True)
df = wine_frame.frame
print(df.head())
from sklearn.datasets import fetch_california_housing, fetch_20newsgroups
# fetch_* helpers download the dataset on first use and cache it locally
# (under the scikit-learn data home), so repeated calls are fast.
# Fetch California housing dataset (regression)
housing = fetch_california_housing()
X_housing, y_housing = housing.data, housing.target
print(f"Housing dataset: {X_housing.shape}")
print(f"Features: {housing.feature_names}")
# Fetch text data (20 newsgroups); restrict to two categories to keep
# the download/corpus small
newsgroups = fetch_20newsgroups(
    subset='train',
    categories=['alt.atheism', 'sci.space']
)
# .data is a list of raw document strings, one per posting
print(f"Newsgroups: {len(newsgroups.data)} documents")
print(f"Categories: {newsgroups.target_names}")
from sklearn.datasets import (
make_classification, make_regression, make_blobs,
make_circles, make_moons
)
# Synthetic classification data: 3 classes, 10 informative features,
# 5 redundant (linear combinations of the informative ones)
X_clf, y_clf = make_classification(
    n_samples=1000, n_features=20, n_informative=10,
    n_redundant=5, n_classes=3, random_state=42
)
print(f"Classification data: {X_clf.shape}")
# Synthetic regression data with Gaussian noise on the target
X_reg, y_reg = make_regression(
    n_samples=1000, n_features=20, n_informative=10,
    noise=0.1, random_state=42
)
print(f"Regression data: {X_reg.shape}")
# Clustering data: 4 isotropic Gaussian blobs in 2-D
X_blobs, y_blobs = make_blobs(
    n_samples=300, centers=4, n_features=2,
    random_state=42, cluster_std=0.8
)
# Non-linearly separable clustering data; factor sets the inner/outer
# circle radius ratio, noise adds Gaussian jitter
X_circles, y_circles = make_circles(
    n_samples=300, noise=0.05, factor=0.6, random_state=42
)
X_moons, y_moons = make_moons(
    n_samples=300, noise=0.1, random_state=42
)
print(f"Blobs: {X_blobs.shape}, Circles: {X_circles.shape}, Moons: {X_moons.shape}")
from sklearn.datasets import make_swiss_roll, make_s_curve
# Generate swiss roll manifold; the second return value is the sample
# position along the manifold (useful for coloring embeddings)
X_swiss, t_swiss = make_swiss_roll(n_samples=1000, noise=0.1, random_state=42)
print(f"Swiss roll: {X_swiss.shape}")
# Generate S-curve manifold
X_s_curve, t_s_curve = make_s_curve(n_samples=1000, noise=0.1, random_state=42)
print(f"S-curve: {X_s_curve.shape}")
from sklearn.datasets import dump_svmlight_file, load_svmlight_file
import tempfile
import os
# Create sample data to round-trip through the svmlight text format
X, y = make_classification(n_samples=100, n_features=10, random_state=42)
# Save to SVMLight format; delete=False so the file survives the with-block
# and can be re-read by path afterwards
with tempfile.NamedTemporaryFile(mode='w', delete=False, suffix='.svmlight') as f:
    dump_svmlight_file(X, y, f.name)
    filename = f.name
# Load from SVMLight format (features come back as a sparse CSR matrix)
X_loaded, y_loaded = load_svmlight_file(filename)
print(f"Original: {X.shape}, Loaded: {X_loaded.shape}")
# Clean up the temporary file (required because delete=False above)
os.unlink(filename)
os.unlink(filename)
import numpy as np
from sklearn.utils import Bunch
def create_custom_dataset(n_samples=100):
    """Create a custom synthetic binary-classification dataset.

    Parameters
    ----------
    n_samples : int, default=100
        Number of samples (rows) to generate.

    Returns
    -------
    Bunch
        Dict-like container mirroring sklearn's built-in loaders, with
        ``data`` of shape (n_samples, 5), ``target`` of shape (n_samples,),
        plus ``feature_names``, ``target_names`` and ``DESCR``.
    """
    # Use a local RandomState instead of np.random.seed(42): seeding the
    # module-global RNG is a side effect that silently affects every other
    # user of np.random. RandomState(42) produces the exact same values.
    rng = np.random.RandomState(42)
    # Generate 5 standard-normal features
    X = rng.randn(n_samples, 5)
    # Label is 1 exactly when the first two features sum to a positive value
    y = (X[:, 0] + X[:, 1] > 0).astype(int)
    # Package as a Bunch so callers can use the familiar sklearn access style
    return Bunch(
        data=X,
        target=y,
        feature_names=[f'feature_{i}' for i in range(5)],
        target_names=['class_0', 'class_1'],
        DESCR='Custom synthetic dataset'
    )
# Use the custom dataset exactly like a built-in sklearn Bunch
custom_data = create_custom_dataset(500)
print(f"Custom dataset: {custom_data.data.shape}")
print(f"Features: {custom_data.feature_names}")
Install with Tessl CLI
npx tessl i tessl/pypi-scikit-learndocs
evals
scenario-1
scenario-2
scenario-3
scenario-4
scenario-5
scenario-6
scenario-7
scenario-8
scenario-9
scenario-10