CtrlK
Blog · Docs · Log in · Get started
Tessl Logo

tessl/pypi-scikit-learn

A comprehensive machine learning library providing supervised and unsupervised learning algorithms with consistent APIs and extensive tools for data preprocessing, model evaluation, and deployment.

87

0.98x
Overview
Eval results
Files

docs/datasets.md

Datasets and Data Generation

This document covers all dataset loading, fetching, generation, and utility functions in scikit-learn.

Built-in Toy Datasets

Classification Datasets

load_iris { .api }

from sklearn.datasets import load_iris

load_iris(
    return_X_y: bool = False,
    as_frame: bool = False
) -> Bunch | tuple[ArrayLike, ArrayLike]

Load and return the iris dataset (classification).

load_digits { .api }

from sklearn.datasets import load_digits

load_digits(
    n_class: int = 10,
    return_X_y: bool = False,
    as_frame: bool = False
) -> Bunch | tuple[ArrayLike, ArrayLike]

Load and return the digits dataset (classification).

load_wine { .api }

from sklearn.datasets import load_wine

load_wine(
    return_X_y: bool = False,
    as_frame: bool = False
) -> Bunch | tuple[ArrayLike, ArrayLike]

Load and return the wine dataset (classification).

load_breast_cancer { .api }

from sklearn.datasets import load_breast_cancer

load_breast_cancer(
    return_X_y: bool = False,
    as_frame: bool = False
) -> Bunch | tuple[ArrayLike, ArrayLike]

Load and return the breast cancer wisconsin dataset (classification).

Regression Datasets

load_diabetes { .api }

from sklearn.datasets import load_diabetes

load_diabetes(
    return_X_y: bool = False,
    as_frame: bool = False,
    scaled: bool = True
) -> Bunch | tuple[ArrayLike, ArrayLike]

Load and return the diabetes dataset (regression).

load_linnerud { .api }

from sklearn.datasets import load_linnerud

load_linnerud(
    return_X_y: bool = False,
    as_frame: bool = False
) -> Bunch | tuple[ArrayLike, ArrayLike]

Load and return the linnerud dataset (multivariate regression).

General Data Loading

load_files { .api }

from sklearn.datasets import load_files

load_files(
    container_path: str,
    description: str | None = None,
    categories: list[str] | None = None,
    load_content: bool = True,
    shuffle: bool = True,
    encoding: str | None = None,
    decode_error: str = "strict",
    random_state: int | RandomState | None = 0
) -> Bunch

Load text files with categories as subfolder names.

Sample Images

load_sample_images { .api }

from sklearn.datasets import load_sample_images

load_sample_images() -> Bunch

Load sample images for image manipulation.

load_sample_image { .api }

from sklearn.datasets import load_sample_image

load_sample_image(
    image_name: str
) -> ArrayLike

Load the numpy array of a single sample image.

Real-World Datasets (Fetch Functions)

Text Datasets

fetch_20newsgroups { .api }

from sklearn.datasets import fetch_20newsgroups

fetch_20newsgroups(
    data_home: str | None = None,
    subset: str = "train",
    categories: list[str] | None = None,
    shuffle: bool = True,
    random_state: int | RandomState | None = 42,
    remove: tuple = (),
    download_if_missing: bool = True,
    return_X_y: bool = False
) -> Bunch | tuple[list[str], ArrayLike]

Load the filenames and data from the 20 newsgroups dataset.

fetch_20newsgroups_vectorized { .api }

from sklearn.datasets import fetch_20newsgroups_vectorized

fetch_20newsgroups_vectorized(
    subset: str = "train",
    remove: tuple = (),
    data_home: str | None = None,
    download_if_missing: bool = True,
    return_X_y: bool = False,
    normalize: bool = True,
    as_frame: bool = False
) -> Bunch | tuple[ArrayLike, ArrayLike]

Load the 20 newsgroups dataset and vectorize it.

fetch_rcv1 { .api }

from sklearn.datasets import fetch_rcv1

fetch_rcv1(
    data_home: str | None = None,
    subset: str = "all",
    download_if_missing: bool = True,
    random_state: int | RandomState | None = None,
    shuffle: bool = False,
    return_X_y: bool = False
) -> Bunch | tuple[ArrayLike, ArrayLike]

Load the RCV1 multilabel dataset.

Computer Vision Datasets

fetch_lfw_people { .api }

from sklearn.datasets import fetch_lfw_people

fetch_lfw_people(
    data_home: str | None = None,
    funneled: bool = True,
    resize: float = 0.5,
    min_faces_per_person: int = 0,
    color: bool = False,
    slice_: tuple | None = (slice(70, 195), slice(78, 172)),
    download_if_missing: bool = True,
    return_X_y: bool = False
) -> Bunch | tuple[ArrayLike, ArrayLike]

Load the Labeled Faces in the Wild (LFW) people dataset.

fetch_lfw_pairs { .api }

from sklearn.datasets import fetch_lfw_pairs

fetch_lfw_pairs(
    subset: str = "train",
    data_home: str | None = None,
    funneled: bool = True,
    resize: float = 0.5,
    color: bool = False,
    slice_: tuple | None = (slice(70, 195), slice(78, 172)),
    download_if_missing: bool = True
) -> Bunch

Load the Labeled Faces in the Wild (LFW) pairs dataset.

fetch_olivetti_faces { .api }

from sklearn.datasets import fetch_olivetti_faces

fetch_olivetti_faces(
    data_home: str | None = None,
    shuffle: bool = False,
    random_state: int | RandomState | None = 0,
    download_if_missing: bool = True,
    return_X_y: bool = False
) -> Bunch | tuple[ArrayLike, ArrayLike]

Load the Olivetti faces dataset.

Real Estate and Regression Datasets

fetch_california_housing { .api }

from sklearn.datasets import fetch_california_housing

fetch_california_housing(
    data_home: str | None = None,
    download_if_missing: bool = True,
    return_X_y: bool = False,
    as_frame: bool = False
) -> Bunch | tuple[ArrayLike, ArrayLike]

Load the California housing dataset.

Network Security Datasets

fetch_kddcup99 { .api }

from sklearn.datasets import fetch_kddcup99

fetch_kddcup99(
    subset: str | None = None,
    data_home: str | None = None,
    shuffle: bool = False,
    random_state: int | RandomState | None = None,
    percent10: bool = True,
    download_if_missing: bool = True,
    return_X_y: bool = False,
    as_frame: bool = False
) -> Bunch | tuple[ArrayLike, ArrayLike]

Load the kddcup99 dataset.

Environmental Datasets

fetch_covtype { .api }

from sklearn.datasets import fetch_covtype

fetch_covtype(
    data_home: str | None = None,
    download_if_missing: bool = True,
    random_state: int | RandomState | None = None,
    shuffle: bool = False,
    return_X_y: bool = False,
    as_frame: bool = False
) -> Bunch | tuple[ArrayLike, ArrayLike]

Load the covertype dataset.

fetch_species_distributions { .api }

from sklearn.datasets import fetch_species_distributions

fetch_species_distributions(
    data_home: str | None = None,
    download_if_missing: bool = True
) -> Bunch

Loader for species distribution dataset.

OpenML Integration

fetch_openml { .api }

from sklearn.datasets import fetch_openml

fetch_openml(
    name: str | int | None = None,
    version: int | str = "active",
    data_id: int | None = None,
    data_home: str | None = None,
    target_column: str | list | None = "default-target",
    cache: bool = True,
    return_X_y: bool = False,
    as_frame: bool | str = "auto",
    n_retries: int = 3,
    delay: float = 1.0,
    parser: str = "auto",
    read_csv_kwargs: dict | None = None
) -> Bunch | tuple[ArrayLike, ArrayLike]

Fetch dataset from openml by name or dataset id.

General File Fetching

fetch_file { .api }

from sklearn.datasets import fetch_file

fetch_file(
    url: str,
    data_home: str | None = None,
    cache_subdir: str = "",
    hash_: str | None = None,
    hash_algorithm: str = "auto",
    extract: bool = False,
    force_extract: bool = False,
    quiet: bool = False,
    local_folder: str | None = None
) -> str

Load a file from the Web.

Synthetic Data Generation

Classification Data Generation

make_classification { .api }

from sklearn.datasets import make_classification

make_classification(
    n_samples: int = 100,
    n_features: int = 20,
    n_informative: int = 2,
    n_redundant: int = 2,
    n_repeated: int = 0,
    n_classes: int = 2,
    n_clusters_per_class: int = 2,
    weights: ArrayLike | None = None,
    flip_y: float = 0.01,
    class_sep: float = 1.0,
    hypercube: bool = True,
    shift: float | ArrayLike | None = 0.0,
    scale: float | ArrayLike | None = 1.0,
    shuffle: bool = True,
    random_state: int | RandomState | None = None
) -> tuple[ArrayLike, ArrayLike] | tuple[ArrayLike, ArrayLike, ArrayLike]

Generate a random n-class classification problem.

make_multilabel_classification { .api }

from sklearn.datasets import make_multilabel_classification

make_multilabel_classification(
    n_samples: int = 100,
    n_features: int = 20,
    n_classes: int = 5,
    n_labels: int = 2,
    length: int = 50,
    allow_unlabeled: bool = True,
    sparse: bool = False,
    return_indicator: str = "dense",
    return_distributions: bool = False,
    random_state: int | RandomState | None = None
) -> tuple[ArrayLike, ArrayLike] | tuple[ArrayLike, ArrayLike, ArrayLike, ArrayLike]

Generate a random multilabel classification problem.

make_hastie_10_2 { .api }

from sklearn.datasets import make_hastie_10_2

make_hastie_10_2(
    n_samples: int = 12000,
    random_state: int | RandomState | None = None
) -> tuple[ArrayLike, ArrayLike]

Generate data for binary classification used in Hastie et al. 2009.

make_gaussian_quantiles { .api }

from sklearn.datasets import make_gaussian_quantiles

make_gaussian_quantiles(
    mean: ArrayLike | None = None,
    cov: float = 1.0,
    n_samples: int = 100,
    n_features: int = 2,
    n_classes: int = 3,
    shuffle: bool = True,
    random_state: int | RandomState | None = None
) -> tuple[ArrayLike, ArrayLike]

Generate isotropic Gaussian and label samples by quantile.

Regression Data Generation

make_regression { .api }

from sklearn.datasets import make_regression

make_regression(
    n_samples: int = 100,
    n_features: int = 100,
    n_informative: int = 10,
    n_targets: int = 1,
    bias: float = 0.0,
    effective_rank: int | None = None,
    tail_strength: float = 0.5,
    noise: float = 0.0,
    shuffle: bool = True,
    coef: bool = False,
    random_state: int | RandomState | None = None
) -> tuple[ArrayLike, ArrayLike] | tuple[ArrayLike, ArrayLike, ArrayLike]

Generate a random regression problem.

make_friedman1 { .api }

from sklearn.datasets import make_friedman1

make_friedman1(
    n_samples: int = 100,
    n_features: int = 10,
    noise: float = 0.0,
    random_state: int | RandomState | None = None
) -> tuple[ArrayLike, ArrayLike]

Generate the "Friedman #1" regression problem.

make_friedman2 { .api }

from sklearn.datasets import make_friedman2

make_friedman2(
    n_samples: int = 100,
    noise: float = 0.0,
    random_state: int | RandomState | None = None
) -> tuple[ArrayLike, ArrayLike]

Generate the "Friedman #2" regression problem.

make_friedman3 { .api }

from sklearn.datasets import make_friedman3

make_friedman3(
    n_samples: int = 100,
    noise: float = 0.0,
    random_state: int | RandomState | None = None
) -> tuple[ArrayLike, ArrayLike]

Generate the "Friedman #3" regression problem.

make_sparse_uncorrelated { .api }

from sklearn.datasets import make_sparse_uncorrelated

make_sparse_uncorrelated(
    n_samples: int = 100,
    n_features: int = 10,
    random_state: int | RandomState | None = None
) -> tuple[ArrayLike, ArrayLike]

Generate a random regression problem with sparse uncorrelated design.

Clustering Data Generation

make_blobs { .api }

from sklearn.datasets import make_blobs

make_blobs(
    n_samples: int | ArrayLike = 100,
    n_features: int = 2,
    centers: int | ArrayLike | None = None,
    cluster_std: float | ArrayLike = 1.0,
    center_box: tuple[float, float] = (-10.0, 10.0),
    shuffle: bool = True,
    random_state: int | RandomState | None = None,
    return_centers: bool = False
) -> tuple[ArrayLike, ArrayLike] | tuple[ArrayLike, ArrayLike, ArrayLike]

Generate isotropic Gaussian blobs for clustering.

make_circles { .api }

from sklearn.datasets import make_circles

make_circles(
    n_samples: int | tuple[int, int] = 100,
    shuffle: bool = True,
    noise: float | None = None,
    random_state: int | RandomState | None = None,
    factor: float = 0.8
) -> tuple[ArrayLike, ArrayLike]

Make a large circle containing a smaller circle in 2d.

make_moons { .api }

from sklearn.datasets import make_moons

make_moons(
    n_samples: int | tuple[int, int] = 100,
    shuffle: bool = True,
    noise: float | None = None,
    random_state: int | RandomState | None = None
) -> tuple[ArrayLike, ArrayLike]

Make two interleaving half circles.

Manifold Data Generation

make_swiss_roll { .api }

from sklearn.datasets import make_swiss_roll

make_swiss_roll(
    n_samples: int = 100,
    noise: float = 0.0,
    random_state: int | RandomState | None = None,
    hole: bool = False
) -> tuple[ArrayLike, ArrayLike]

Generate a swiss roll dataset.

make_s_curve { .api }

from sklearn.datasets import make_s_curve

make_s_curve(
    n_samples: int = 100,
    noise: float = 0.0,
    random_state: int | RandomState | None = None
) -> tuple[ArrayLike, ArrayLike]

Generate an S curve dataset.

Biclustering Data Generation

make_biclusters { .api }

from sklearn.datasets import make_biclusters

make_biclusters(
    shape: tuple[int, int],
    n_clusters: int,
    noise: float = 0.0,
    minval: int = 10,
    maxval: int = 100,
    shuffle: bool = True,
    random_state: int | RandomState | None = None
) -> tuple[ArrayLike, ArrayLike, ArrayLike]

Generate an array with constant block diagonal structure.

make_checkerboard { .api }

from sklearn.datasets import make_checkerboard

make_checkerboard(
    shape: tuple[int, int],
    n_clusters: int | tuple[int, int],
    noise: float = 0.0,
    minval: int = 10,
    maxval: int = 100,
    shuffle: bool = True,
    random_state: int | RandomState | None = None
) -> tuple[ArrayLike, ArrayLike, ArrayLike]

Generate an array with block checkerboard structure.

Matrix Generation

make_low_rank_matrix { .api }

from sklearn.datasets import make_low_rank_matrix

make_low_rank_matrix(
    n_samples: int = 100,
    n_features: int = 100,
    effective_rank: int = 10,
    tail_strength: float = 0.5,
    random_state: int | RandomState | None = None
) -> ArrayLike

Generate a mostly low rank matrix with bell-shaped singular values.

make_sparse_coded_signal { .api }

from sklearn.datasets import make_sparse_coded_signal

make_sparse_coded_signal(
    n_samples: int,
    n_components: int,
    n_features: int,
    n_nonzero_coefs: int,
    random_state: int | RandomState | None = None
) -> tuple[ArrayLike, ArrayLike, ArrayLike]

Generate a signal as a sparse combination of dictionary elements.

make_spd_matrix { .api }

from sklearn.datasets import make_spd_matrix

make_spd_matrix(
    n_dim: int,
    random_state: int | RandomState | None = None
) -> ArrayLike

Generate a random symmetric, positive-definite matrix.

make_sparse_spd_matrix { .api }

from sklearn.datasets import make_sparse_spd_matrix

make_sparse_spd_matrix(
    dim: int = 1,
    alpha: float = 0.95,
    norm_diag: bool = False,
    smallest_coef: float = 0.1,
    largest_coef: float = 0.9,
    random_state: int | RandomState | None = None
) -> ArrayLike

Generate a sparse symmetric definite positive matrix.

File I/O Utilities

SVMLight Format

load_svmlight_file { .api }

from sklearn.datasets import load_svmlight_file

load_svmlight_file(
    f: str | IO,
    n_features: int | None = None,
    dtype: type = ...,
    multilabel: bool = False,
    zero_based: bool | str = "auto",
    query_id: bool = False,
    offset: int = 0,
    length: int = -1
) -> tuple[ArrayLike, ArrayLike] | tuple[ArrayLike, ArrayLike, ArrayLike]

Load datasets in the svmlight / libsvm format into sparse CSR matrix.

load_svmlight_files { .api }

from sklearn.datasets import load_svmlight_files

load_svmlight_files(
    files: list[str | IO],
    n_features: int | None = None,
    dtype: type = ...,
    multilabel: bool = False,
    zero_based: bool | str = "auto",
    query_id: bool = False,
    offset: int = 0,
    length: int = -1
) -> list[tuple[ArrayLike, ArrayLike]] | list[tuple[ArrayLike, ArrayLike, ArrayLike]]

Load dataset from multiple files in SVMlight format.

dump_svmlight_file { .api }

from sklearn.datasets import dump_svmlight_file

dump_svmlight_file(
    X: ArrayLike,
    y: ArrayLike,
    f: str | IO,
    zero_based: bool = True,
    comment: str | bytes | None = None,
    query_id: ArrayLike | None = None,
    multilabel: bool = False
) -> None

Dump the dataset in svmlight / libsvm file format.

Data Directory Management

get_data_home { .api }

from sklearn.datasets import get_data_home

get_data_home(
    data_home: str | None = None
) -> str

Return the path to scikit-learn data dir.

clear_data_home { .api }

from sklearn.datasets import clear_data_home

clear_data_home(
    data_home: str | None = None
) -> None

Delete all the content in the data home cache.

Examples

Loading Built-in Datasets

from sklearn.datasets import load_iris, load_digits, load_wine

# Load iris dataset
iris = load_iris()
X_iris, y_iris = iris.data, iris.target
print(f"Iris dataset: {X_iris.shape}, classes: {len(iris.target_names)}")

# Load digits dataset  
digits = load_digits(n_class=10)
X_digits, y_digits = digits.data, digits.target
print(f"Digits dataset: {X_digits.shape}")

# Load wine dataset as tuple
X_wine, y_wine = load_wine(return_X_y=True)
print(f"Wine dataset: {X_wine.shape}")

# Load as pandas DataFrame
wine_frame = load_wine(as_frame=True)
df = wine_frame.frame
print(df.head())

Fetching Real-World Datasets

from sklearn.datasets import fetch_california_housing, fetch_20newsgroups

# Fetch California housing dataset
housing = fetch_california_housing()
X_housing, y_housing = housing.data, housing.target
print(f"Housing dataset: {X_housing.shape}")
print(f"Features: {housing.feature_names}")

# Fetch text data (20 newsgroups)
newsgroups = fetch_20newsgroups(
    subset='train', 
    categories=['alt.atheism', 'sci.space']
)
print(f"Newsgroups: {len(newsgroups.data)} documents")
print(f"Categories: {newsgroups.target_names}")

Generating Synthetic Data

from sklearn.datasets import (
    make_classification, make_regression, make_blobs, 
    make_circles, make_moons
)

# Classification data
X_clf, y_clf = make_classification(
    n_samples=1000, n_features=20, n_informative=10,
    n_redundant=5, n_classes=3, random_state=42
)
print(f"Classification data: {X_clf.shape}")

# Regression data
X_reg, y_reg = make_regression(
    n_samples=1000, n_features=20, n_informative=10,
    noise=0.1, random_state=42
)
print(f"Regression data: {X_reg.shape}")

# Clustering data - blobs
X_blobs, y_blobs = make_blobs(
    n_samples=300, centers=4, n_features=2,
    random_state=42, cluster_std=0.8
)

# Non-linear clustering data
X_circles, y_circles = make_circles(
    n_samples=300, noise=0.05, factor=0.6, random_state=42
)

X_moons, y_moons = make_moons(
    n_samples=300, noise=0.1, random_state=42
)

print(f"Blobs: {X_blobs.shape}, Circles: {X_circles.shape}, Moons: {X_moons.shape}")

Manifold Learning Data

from sklearn.datasets import make_swiss_roll, make_s_curve

# Generate swiss roll manifold
X_swiss, t_swiss = make_swiss_roll(n_samples=1000, noise=0.1, random_state=42)
print(f"Swiss roll: {X_swiss.shape}")

# Generate S-curve manifold
X_s_curve, t_s_curve = make_s_curve(n_samples=1000, noise=0.1, random_state=42)
print(f"S-curve: {X_s_curve.shape}")

Working with SVMLight Format

from sklearn.datasets import dump_svmlight_file, load_svmlight_file, make_classification
import tempfile
import os

# Create sample data (make_classification must be imported for the snippet
# to be self-contained)
X, y = make_classification(n_samples=100, n_features=10, random_state=42)

# Save to SVMLight format
with tempfile.NamedTemporaryFile(mode='w', delete=False, suffix='.svmlight') as f:
    dump_svmlight_file(X, y, f.name)
    filename = f.name

# Load from SVMLight format (returns a sparse CSR matrix plus labels)
X_loaded, y_loaded = load_svmlight_file(filename)
print(f"Original: {X.shape}, Loaded: {X_loaded.shape}")

# Clean up the temporary file
os.unlink(filename)

Custom Dataset Creation

import numpy as np
from sklearn.utils import Bunch

def create_custom_dataset(n_samples=100):
    """Create a custom dataset with specific characteristics.

    Draws standard-normal features and labels each sample 1 when the sum of
    its first two features is positive, then packages everything in a Bunch
    mirroring the layout of scikit-learn's built-in loaders.
    """
    # Fixed seed so repeated calls produce the identical dataset.
    np.random.seed(42)

    features = np.random.randn(n_samples, 5)
    labels = (features[:, 0] + features[:, 1] > 0).astype(int)

    return Bunch(
        data=features,
        target=labels,
        feature_names=[f'feature_{i}' for i in range(5)],
        target_names=['class_0', 'class_1'],
        DESCR='Custom synthetic dataset',
    )

# Use custom dataset
custom_data = create_custom_dataset(500)
print(f"Custom dataset: {custom_data.data.shape}")
print(f"Features: {custom_data.feature_names}")

Install with Tessl CLI

npx tessl i tessl/pypi-scikit-learn

docs

datasets.md

feature-extraction.md

index.md

metrics.md

model-selection.md

neighbors.md

pipelines.md

preprocessing.md

supervised-learning.md

unsupervised-learning.md

utilities.md

tile.json