CtrlK
Blog · Docs · Log in · Get started
Tessl Logo

tessl/pypi-scikit-learn

A comprehensive machine learning library providing supervised and unsupervised learning algorithms with consistent APIs and extensive tools for data preprocessing, model evaluation, and deployment.

87

0.98x
Overview
Eval results
Files

docs/datasets.md

Datasets and Data Generation

This document covers all dataset loading, fetching, generation, and utility functions in scikit-learn.

Built-in Toy Datasets

Classification Datasets

load_iris { .api }

from sklearn.datasets import load_iris

load_iris(
    return_X_y: bool = False,
    as_frame: bool = False
) -> Bunch | tuple[ArrayLike, ArrayLike]

Load and return the iris dataset (classification).

load_digits { .api }

from sklearn.datasets import load_digits

load_digits(
    n_class: int = 10,
    return_X_y: bool = False,
    as_frame: bool = False
) -> Bunch | tuple[ArrayLike, ArrayLike]

Load and return the digits dataset (classification).

load_wine { .api }

from sklearn.datasets import load_wine

load_wine(
    return_X_y: bool = False,
    as_frame: bool = False
) -> Bunch | tuple[ArrayLike, ArrayLike]

Load and return the wine dataset (classification).

load_breast_cancer { .api }

from sklearn.datasets import load_breast_cancer

load_breast_cancer(
    return_X_y: bool = False,
    as_frame: bool = False
) -> Bunch | tuple[ArrayLike, ArrayLike]

Load and return the breast cancer wisconsin dataset (classification).

Regression Datasets

load_diabetes { .api }

from sklearn.datasets import load_diabetes

load_diabetes(
    return_X_y: bool = False,
    as_frame: bool = False,
    scaled: bool = True
) -> Bunch | tuple[ArrayLike, ArrayLike]

Load and return the diabetes dataset (regression).

load_linnerud { .api }

from sklearn.datasets import load_linnerud

load_linnerud(
    return_X_y: bool = False,
    as_frame: bool = False
) -> Bunch | tuple[ArrayLike, ArrayLike]

Load and return the linnerud dataset (multivariate regression).

General Data Loading

load_files { .api }

from sklearn.datasets import load_files

load_files(
    container_path: str,
    description: str | None = None,
    categories: list[str] | None = None,
    load_content: bool = True,
    shuffle: bool = True,
    encoding: str | None = None,
    decode_error: str = "strict",
    random_state: int | RandomState | None = 0
) -> Bunch

Load text files with categories as subfolder names.

Sample Images

load_sample_images { .api }

from sklearn.datasets import load_sample_images

load_sample_images() -> Bunch

Load sample images for image manipulation.

load_sample_image { .api }

from sklearn.datasets import load_sample_image

load_sample_image(
    image_name: str
) -> ArrayLike

Load the numpy array of a single sample image.

Real-World Datasets (Fetch Functions)

Text Datasets

fetch_20newsgroups { .api }

from sklearn.datasets import fetch_20newsgroups

fetch_20newsgroups(
    data_home: str | None = None,
    subset: str = "train",
    categories: list[str] | None = None,
    shuffle: bool = True,
    random_state: int | RandomState | None = 42,
    remove: tuple = (),
    download_if_missing: bool = True,
    return_X_y: bool = False
) -> Bunch | tuple[list[str], ArrayLike]

Load the filenames and data from the 20 newsgroups dataset.

fetch_20newsgroups_vectorized { .api }

from sklearn.datasets import fetch_20newsgroups_vectorized

fetch_20newsgroups_vectorized(
    subset: str = "train",
    remove: tuple = (),
    data_home: str | None = None,
    download_if_missing: bool = True,
    return_X_y: bool = False,
    normalize: bool = True,
    as_frame: bool = False
) -> Bunch | tuple[ArrayLike, ArrayLike]

Load the 20 newsgroups dataset and vectorize it.

fetch_rcv1 { .api }

from sklearn.datasets import fetch_rcv1

fetch_rcv1(
    data_home: str | None = None,
    subset: str = "all",
    download_if_missing: bool = True,
    random_state: int | RandomState | None = None,
    shuffle: bool = False,
    return_X_y: bool = False
) -> Bunch | tuple[ArrayLike, ArrayLike]

Load the RCV1 multilabel dataset.

Computer Vision Datasets

fetch_lfw_people { .api }

from sklearn.datasets import fetch_lfw_people

fetch_lfw_people(
    data_home: str | None = None,
    funneled: bool = True,
    resize: float = 0.5,
    min_faces_per_person: int = 0,
    color: bool = False,
    slice_: tuple | None = (slice(70, 195), slice(78, 172)),
    download_if_missing: bool = True,
    return_X_y: bool = False
) -> Bunch | tuple[ArrayLike, ArrayLike]

Load the Labeled Faces in the Wild (LFW) people dataset.

fetch_lfw_pairs { .api }

from sklearn.datasets import fetch_lfw_pairs

fetch_lfw_pairs(
    subset: str = "train",
    data_home: str | None = None,
    funneled: bool = True,
    resize: float = 0.5,
    color: bool = False,
    slice_: tuple | None = (slice(70, 195), slice(78, 172)),
    download_if_missing: bool = True
) -> Bunch

Load the Labeled Faces in the Wild (LFW) pairs dataset.

fetch_olivetti_faces { .api }

from sklearn.datasets import fetch_olivetti_faces

fetch_olivetti_faces(
    data_home: str | None = None,
    shuffle: bool = False,
    random_state: int | RandomState | None = 0,
    download_if_missing: bool = True,
    return_X_y: bool = False
) -> Bunch | tuple[ArrayLike, ArrayLike]

Load the Olivetti faces dataset.

Real Estate and Regression Datasets

fetch_california_housing { .api }

from sklearn.datasets import fetch_california_housing

fetch_california_housing(
    data_home: str | None = None,
    download_if_missing: bool = True,
    return_X_y: bool = False,
    as_frame: bool = False
) -> Bunch | tuple[ArrayLike, ArrayLike]

Load the California housing dataset.

Network Security Datasets

fetch_kddcup99 { .api }

from sklearn.datasets import fetch_kddcup99

fetch_kddcup99(
    subset: str | None = None,
    data_home: str | None = None,
    shuffle: bool = False,
    random_state: int | RandomState | None = None,
    percent10: bool = True,
    download_if_missing: bool = True,
    return_X_y: bool = False,
    as_frame: bool = False
) -> Bunch | tuple[ArrayLike, ArrayLike]

Load the kddcup99 dataset.

Environmental Datasets

fetch_covtype { .api }

from sklearn.datasets import fetch_covtype

fetch_covtype(
    data_home: str | None = None,
    download_if_missing: bool = True,
    random_state: int | RandomState | None = None,
    shuffle: bool = False,
    return_X_y: bool = False,
    as_frame: bool = False
) -> Bunch | tuple[ArrayLike, ArrayLike]

Load the covertype dataset.

fetch_species_distributions { .api }

from sklearn.datasets import fetch_species_distributions

fetch_species_distributions(
    data_home: str | None = None,
    download_if_missing: bool = True
) -> Bunch

Loader for species distribution dataset.

OpenML Integration

fetch_openml { .api }

from sklearn.datasets import fetch_openml

fetch_openml(
    name: str | int | None = None,
    version: int | str = "active",
    data_id: int | None = None,
    data_home: str | None = None,
    target_column: str | list | None = "default-target",
    cache: bool = True,
    return_X_y: bool = False,
    as_frame: bool | str = "auto",
    n_retries: int = 3,
    delay: float = 1.0,
    parser: str = "auto",
    read_csv_kwargs: dict | None = None
) -> Bunch | tuple[ArrayLike, ArrayLike]

Fetch dataset from openml by name or dataset id.

General File Fetching

fetch_file { .api }

from sklearn.datasets import fetch_file

fetch_file(
    url: str,
    data_home: str | None = None,
    cache_subdir: str = "",
    hash_: str | None = None,
    hash_algorithm: str = "auto",
    extract: bool = False,
    force_extract: bool = False,
    quiet: bool = False,
    local_folder: str | None = None
) -> str

Load a file from the Web.

Synthetic Data Generation

Classification Data Generation

make_classification { .api }

from sklearn.datasets import make_classification

make_classification(
    n_samples: int = 100,
    n_features: int = 20,
    n_informative: int = 2,
    n_redundant: int = 2,
    n_repeated: int = 0,
    n_classes: int = 2,
    n_clusters_per_class: int = 2,
    weights: ArrayLike | None = None,
    flip_y: float = 0.01,
    class_sep: float = 1.0,
    hypercube: bool = True,
    shift: float | ArrayLike | None = 0.0,
    scale: float | ArrayLike | None = 1.0,
    shuffle: bool = True,
    random_state: int | RandomState | None = None
) -> tuple[ArrayLike, ArrayLike] | tuple[ArrayLike, ArrayLike, ArrayLike]

Generate a random n-class classification problem.

make_multilabel_classification { .api }

from sklearn.datasets import make_multilabel_classification

make_multilabel_classification(
    n_samples: int = 100,
    n_features: int = 20,
    n_classes: int = 5,
    n_labels: int = 2,
    length: int = 50,
    allow_unlabeled: bool = True,
    sparse: bool = False,
    return_indicator: str = "dense",
    return_distributions: bool = False,
    random_state: int | RandomState | None = None
) -> tuple[ArrayLike, ArrayLike] | tuple[ArrayLike, ArrayLike, ArrayLike, ArrayLike]

Generate a random multilabel classification problem.

make_hastie_10_2 { .api }

from sklearn.datasets import make_hastie_10_2

make_hastie_10_2(
    n_samples: int = 12000,
    random_state: int | RandomState | None = None
) -> tuple[ArrayLike, ArrayLike]

Generate data for binary classification used in Hastie et al. 2009.

make_gaussian_quantiles { .api }

from sklearn.datasets import make_gaussian_quantiles

make_gaussian_quantiles(
    mean: ArrayLike | None = None,
    cov: float = 1.0,
    n_samples: int = 100,
    n_features: int = 2,
    n_classes: int = 3,
    shuffle: bool = True,
    random_state: int | RandomState | None = None
) -> tuple[ArrayLike, ArrayLike]

Generate isotropic Gaussian and label samples by quantile.

Regression Data Generation

make_regression { .api }

from sklearn.datasets import make_regression

make_regression(
    n_samples: int = 100,
    n_features: int = 100,
    n_informative: int = 10,
    n_targets: int = 1,
    bias: float = 0.0,
    effective_rank: int | None = None,
    tail_strength: float = 0.5,
    noise: float = 0.0,
    shuffle: bool = True,
    coef: bool = False,
    random_state: int | RandomState | None = None
) -> tuple[ArrayLike, ArrayLike] | tuple[ArrayLike, ArrayLike, ArrayLike]

Generate a random regression problem.

make_friedman1 { .api }

from sklearn.datasets import make_friedman1

make_friedman1(
    n_samples: int = 100,
    n_features: int = 10,
    noise: float = 0.0,
    random_state: int | RandomState | None = None
) -> tuple[ArrayLike, ArrayLike]

Generate the "Friedman #1" regression problem.

make_friedman2 { .api }

from sklearn.datasets import make_friedman2

make_friedman2(
    n_samples: int = 100,
    noise: float = 0.0,
    random_state: int | RandomState | None = None
) -> tuple[ArrayLike, ArrayLike]

Generate the "Friedman #2" regression problem.

make_friedman3 { .api }

from sklearn.datasets import make_friedman3

make_friedman3(
    n_samples: int = 100,
    noise: float = 0.0,
    random_state: int | RandomState | None = None
) -> tuple[ArrayLike, ArrayLike]

Generate the "Friedman #3" regression problem.

make_sparse_uncorrelated { .api }

from sklearn.datasets import make_sparse_uncorrelated

make_sparse_uncorrelated(
    n_samples: int = 100,
    n_features: int = 10,
    random_state: int | RandomState | None = None
) -> tuple[ArrayLike, ArrayLike]

Generate a random regression problem with sparse uncorrelated design.

Clustering Data Generation

make_blobs { .api }

from sklearn.datasets import make_blobs

make_blobs(
    n_samples: int | ArrayLike = 100,
    n_features: int = 2,
    centers: int | ArrayLike | None = None,
    cluster_std: float | ArrayLike = 1.0,
    center_box: tuple[float, float] = (-10.0, 10.0),
    shuffle: bool = True,
    random_state: int | RandomState | None = None,
    return_centers: bool = False
) -> tuple[ArrayLike, ArrayLike] | tuple[ArrayLike, ArrayLike, ArrayLike]

Generate isotropic Gaussian blobs for clustering.

make_circles { .api }

from sklearn.datasets import make_circles

make_circles(
    n_samples: int | tuple[int, int] = 100,
    shuffle: bool = True,
    noise: float | None = None,
    random_state: int | RandomState | None = None,
    factor: float = 0.8
) -> tuple[ArrayLike, ArrayLike]

Make a large circle containing a smaller circle in 2d.

make_moons { .api }

from sklearn.datasets import make_moons

make_moons(
    n_samples: int | tuple[int, int] = 100,
    shuffle: bool = True,
    noise: float | None = None,
    random_state: int | RandomState | None = None
) -> tuple[ArrayLike, ArrayLike]

Make two interleaving half circles.

Manifold Data Generation

make_swiss_roll { .api }

from sklearn.datasets import make_swiss_roll

make_swiss_roll(
    n_samples: int = 100,
    noise: float = 0.0,
    random_state: int | RandomState | None = None,
    hole: bool = False
) -> tuple[ArrayLike, ArrayLike]

Generate a swiss roll dataset.

make_s_curve { .api }

from sklearn.datasets import make_s_curve

make_s_curve(
    n_samples: int = 100,
    noise: float = 0.0,
    random_state: int | RandomState | None = None
) -> tuple[ArrayLike, ArrayLike]

Generate an S curve dataset.

Biclustering Data Generation

make_biclusters { .api }

from sklearn.datasets import make_biclusters

make_biclusters(
    shape: tuple[int, int],
    n_clusters: int,
    noise: float = 0.0,
    minval: int = 10,
    maxval: int = 100,
    shuffle: bool = True,
    random_state: int | RandomState | None = None
) -> tuple[ArrayLike, ArrayLike, ArrayLike]

Generate an array with constant block diagonal structure.

make_checkerboard { .api }

from sklearn.datasets import make_checkerboard

make_checkerboard(
    shape: tuple[int, int],
    n_clusters: int | tuple[int, int],
    noise: float = 0.0,
    minval: int = 10,
    maxval: int = 100,
    shuffle: bool = True,
    random_state: int | RandomState | None = None
) -> tuple[ArrayLike, ArrayLike, ArrayLike]

Generate an array with block checkerboard structure.

Matrix Generation

make_low_rank_matrix { .api }

from sklearn.datasets import make_low_rank_matrix

make_low_rank_matrix(
    n_samples: int = 100,
    n_features: int = 100,
    effective_rank: int = 10,
    tail_strength: float = 0.5,
    random_state: int | RandomState | None = None
) -> ArrayLike

Generate a mostly low rank matrix with bell-shaped singular values.

make_sparse_coded_signal { .api }

from sklearn.datasets import make_sparse_coded_signal

make_sparse_coded_signal(
    n_samples: int,
    n_components: int,
    n_features: int,
    n_nonzero_coefs: int,
    random_state: int | RandomState | None = None
) -> tuple[ArrayLike, ArrayLike, ArrayLike]

Generate a signal as a sparse combination of dictionary elements.

make_spd_matrix { .api }

from sklearn.datasets import make_spd_matrix

make_spd_matrix(
    n_dim: int,
    random_state: int | RandomState | None = None
) -> ArrayLike

Generate a random symmetric, positive-definite matrix.

make_sparse_spd_matrix { .api }

from sklearn.datasets import make_sparse_spd_matrix

make_sparse_spd_matrix(
    dim: int = 1,
    alpha: float = 0.95,
    norm_diag: bool = False,
    smallest_coef: float = 0.1,
    largest_coef: float = 0.9,
    random_state: int | RandomState | None = None
) -> ArrayLike

Generate a sparse symmetric definite positive matrix.

File I/O Utilities

SVMLight Format

load_svmlight_file { .api }

from sklearn.datasets import load_svmlight_file

load_svmlight_file(
    f: str | IO,
    n_features: int | None = None,
    dtype: type = ...,
    multilabel: bool = False,
    zero_based: bool | str = "auto",
    query_id: bool = False,
    offset: int = 0,
    length: int = -1
) -> tuple[ArrayLike, ArrayLike] | tuple[ArrayLike, ArrayLike, ArrayLike]

Load datasets in the svmlight / libsvm format into sparse CSR matrix.

load_svmlight_files { .api }

from sklearn.datasets import load_svmlight_files

load_svmlight_files(
    files: list[str | IO],
    n_features: int | None = None,
    dtype: type = ...,
    multilabel: bool = False,
    zero_based: bool | str = "auto",
    query_id: bool = False,
    offset: int = 0,
    length: int = -1
) -> list[tuple[ArrayLike, ArrayLike]] | list[tuple[ArrayLike, ArrayLike, ArrayLike]]

Load dataset from multiple files in SVMlight format.

dump_svmlight_file { .api }

from sklearn.datasets import dump_svmlight_file

dump_svmlight_file(
    X: ArrayLike,
    y: ArrayLike,
    f: str | IO,
    zero_based: bool = True,
    comment: str | bytes | None = None,
    query_id: ArrayLike | None = None,
    multilabel: bool = False
) -> None

Dump the dataset in svmlight / libsvm file format.

Data Directory Management

get_data_home { .api }

from sklearn.datasets import get_data_home

get_data_home(
    data_home: str | None = None
) -> str

Return the path to scikit-learn data dir.

clear_data_home { .api }

from sklearn.datasets import clear_data_home

clear_data_home(
    data_home: str | None = None
) -> None

Delete all the content in the data home cache.

Examples

Loading Built-in Datasets

from sklearn.datasets import load_iris, load_digits, load_wine

# Load iris dataset
iris = load_iris()
X_iris, y_iris = iris.data, iris.target
print(f"Iris dataset: {X_iris.shape}, classes: {len(iris.target_names)}")

# Load digits dataset  
digits = load_digits(n_class=10)
X_digits, y_digits = digits.data, digits.target
print(f"Digits dataset: {X_digits.shape}")

# Load wine dataset as tuple
X_wine, y_wine = load_wine(return_X_y=True)
print(f"Wine dataset: {X_wine.shape}")

# Load as pandas DataFrame
wine_frame = load_wine(as_frame=True)
df = wine_frame.frame
print(df.head())

Fetching Real-World Datasets

from sklearn.datasets import fetch_california_housing, fetch_20newsgroups

# Fetch California housing dataset
housing = fetch_california_housing()
X_housing, y_housing = housing.data, housing.target
print(f"Housing dataset: {X_housing.shape}")
print(f"Features: {housing.feature_names}")

# Fetch text data (20 newsgroups)
newsgroups = fetch_20newsgroups(
    subset='train', 
    categories=['alt.atheism', 'sci.space']
)
print(f"Newsgroups: {len(newsgroups.data)} documents")
print(f"Categories: {newsgroups.target_names}")

Generating Synthetic Data

from sklearn.datasets import (
    make_classification, make_regression, make_blobs, 
    make_circles, make_moons
)

# Classification data
X_clf, y_clf = make_classification(
    n_samples=1000, n_features=20, n_informative=10,
    n_redundant=5, n_classes=3, random_state=42
)
print(f"Classification data: {X_clf.shape}")

# Regression data
X_reg, y_reg = make_regression(
    n_samples=1000, n_features=20, n_informative=10,
    noise=0.1, random_state=42
)
print(f"Regression data: {X_reg.shape}")

# Clustering data - blobs
X_blobs, y_blobs = make_blobs(
    n_samples=300, centers=4, n_features=2,
    random_state=42, cluster_std=0.8
)

# Non-linear clustering data
X_circles, y_circles = make_circles(
    n_samples=300, noise=0.05, factor=0.6, random_state=42
)

X_moons, y_moons = make_moons(
    n_samples=300, noise=0.1, random_state=42
)

print(f"Blobs: {X_blobs.shape}, Circles: {X_circles.shape}, Moons: {X_moons.shape}")

Manifold Learning Data

from sklearn.datasets import make_swiss_roll, make_s_curve

# Generate swiss roll manifold
X_swiss, t_swiss = make_swiss_roll(n_samples=1000, noise=0.1, random_state=42)
print(f"Swiss roll: {X_swiss.shape}")

# Generate S-curve manifold
X_s_curve, t_s_curve = make_s_curve(n_samples=1000, noise=0.1, random_state=42)
print(f"S-curve: {X_s_curve.shape}")

Working with SVMLight Format

from sklearn.datasets import dump_svmlight_file, load_svmlight_file, make_classification
import tempfile
import os

# Create sample data (make_classification must be imported for the snippet
# to be self-contained)
X, y = make_classification(n_samples=100, n_features=10, random_state=42)

# Save to SVMLight format
with tempfile.NamedTemporaryFile(mode='w', delete=False, suffix='.svmlight') as f:
    dump_svmlight_file(X, y, f.name)
    filename = f.name

# Load from SVMLight format (returns a sparse CSR matrix plus labels)
X_loaded, y_loaded = load_svmlight_file(filename)
print(f"Original: {X.shape}, Loaded: {X_loaded.shape}")

# Clean up the temporary file
os.unlink(filename)

Custom Dataset Creation

import numpy as np
from sklearn.utils import Bunch

def create_custom_dataset(n_samples=100):
    """Create a custom dataset with specific characteristics.

    Draws standard-normal features and labels each sample 1 when the sum of
    its first two features is positive, then packages everything in a Bunch
    mirroring the layout of scikit-learn's built-in loaders.
    """
    # Fixed seed so repeated calls produce the identical dataset.
    np.random.seed(42)

    features = np.random.randn(n_samples, 5)
    labels = (features[:, 0] + features[:, 1] > 0).astype(int)

    return Bunch(
        data=features,
        target=labels,
        feature_names=[f'feature_{i}' for i in range(5)],
        target_names=['class_0', 'class_1'],
        DESCR='Custom synthetic dataset',
    )

# Use custom dataset
custom_data = create_custom_dataset(500)
print(f"Custom dataset: {custom_data.data.shape}")
print(f"Features: {custom_data.feature_names}")

Install with Tessl CLI

npx tessl i tessl/pypi-scikit-learn

docs

datasets.md

feature-extraction.md

index.md

metrics.md

model-selection.md

neighbors.md

pipelines.md

preprocessing.md

supervised-learning.md

unsupervised-learning.md

utilities.md

tile.json