HuggingFace's community-driven open-source library for machine learning datasets, with one-line dataloaders, efficient preprocessing, and multi-framework support.
```
npx @tessl/cli install tessl/pypi-datasets@4.0.0
```

A comprehensive dataset management library that enables developers to easily load, process, and work with machine learning datasets. It offers one-line dataloaders for thousands of public datasets from the HuggingFace Datasets Hub, efficient data pre-processing with memory-mapped storage backed by Apache Arrow for handling large datasets without RAM limitations, and built-in interoperability with major ML frameworks including NumPy, PyTorch, TensorFlow, JAX, and Pandas.
```
pip install datasets
```

```python
import datasets
```

Common patterns for loading and working with datasets:
```python
from datasets import load_dataset, Dataset, DatasetDict
from datasets import concatenate_datasets, interleave_datasets
```

```python
from datasets import load_dataset

# Load a dataset from the Hub
dataset = load_dataset("squad", split="train")
# Access dataset features and data
print(dataset.features)
print(len(dataset))
print(dataset[0])
# Apply transformations
def preprocess(example):
    example["question_length"] = len(example["question"])
    return example
dataset = dataset.map(preprocess)
# Convert to different formats
torch_dataset = dataset.with_format("torch")
pandas_df = dataset.to_pandas()
# Save to disk
dataset.save_to_disk("./my_dataset")
```
The datasets library is built around these key components:

- `Dataset` for map-style access and `IterableDataset` for streaming large datasets (see the streaming sketch below)
- the `load_dataset()` function with automatic discovery of dataset builders

This design enables efficient processing of datasets ranging from small research datasets to massive production corpora, with seamless integration into popular ML frameworks and automatic optimization through caching and memory mapping.
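For corpora too large to download up front, the same entry point can stream records lazily. A minimal sketch, assuming network access to the Hub:

```python
from datasets import load_dataset

# streaming=True returns an IterableDataset: examples are fetched on the
# fly instead of being downloaded and cached first.
stream = load_dataset("squad", split="train", streaming=True)

# Only the requested examples are actually transferred.
for example in stream.take(3):
    print(example["question"])
```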
The primary interface for loading datasets from the HuggingFace Hub, local files, or custom data sources. Supports automatic format detection, streaming for large datasets, and flexible data splitting.
```python
def load_dataset(
    path: str,
    name: Optional[str] = None,
    data_dir: Optional[str] = None,
    data_files: Optional[Union[str, Sequence[str], Mapping[str, Union[str, Sequence[str]]]]] = None,
    split: Optional[Union[str, Split]] = None,
    cache_dir: Optional[str] = None,
    features: Optional[Features] = None,
    download_config: Optional[DownloadConfig] = None,
    download_mode: Optional[Union[DownloadMode, str]] = None,
    verification_mode: Optional[Union[VerificationMode, str]] = None,
    keep_in_memory: Optional[bool] = None,
    save_infos: bool = False,
    revision: Optional[Union[str, Version]] = None,
    token: Optional[Union[bool, str]] = None,
    streaming: bool = False,
    num_proc: Optional[int] = None,
    storage_options: Optional[Dict] = None,
    trust_remote_code: Optional[bool] = None,
    **config_kwargs,
) -> Union[Dataset, DatasetDict, IterableDataset, IterableDatasetDict]:
    """Load a dataset from the HuggingFace Hub, local files, or custom sources."""
```
```python
def load_dataset_builder(
    path: str,
    name: Optional[str] = None,
    data_dir: Optional[str] = None,
    data_files: Optional[Union[str, Sequence[str], Mapping[str, Union[str, Sequence[str]]]]] = None,
    cache_dir: Optional[str] = None,
    features: Optional[Features] = None,
    download_config: Optional[DownloadConfig] = None,
    download_mode: Optional[Union[DownloadMode, str]] = None,
    revision: Optional[Union[str, Version]] = None,
    token: Optional[Union[bool, str]] = None,
    storage_options: Optional[Dict] = None,
    trust_remote_code: Optional[bool] = None,
    **config_kwargs,
) -> DatasetBuilder:
    """Load a dataset builder without building the dataset."""
```
```python
def load_from_disk(
    dataset_path: str,
    keep_in_memory: Optional[bool] = None,
    storage_options: Optional[dict] = None,
) -> Union[Dataset, DatasetDict]:
    """Load a dataset that was previously saved using save_to_disk."""
```
The fundamental dataset classes that provide different access patterns and capabilities for working with dataset collections.

```python
class Dataset:
    """Map-style dataset backed by Apache Arrow for efficient random access."""
    def __getitem__(self, key): ...
    def __len__(self) -> int: ...
    def map(self, function, **kwargs) -> "Dataset": ...
    def filter(self, function, **kwargs) -> "Dataset": ...
    def select(self, indices) -> "Dataset": ...
    def with_format(self, type: Optional[str] = None, **kwargs) -> "Dataset": ...
    def to_pandas(self) -> "pandas.DataFrame": ...
    def save_to_disk(self, dataset_path: str) -> None: ...

class DatasetDict(dict):
    """Dictionary of Dataset objects, typically for train/validation/test splits."""
    def map(self, function, **kwargs) -> "DatasetDict": ...
    def filter(self, function, **kwargs) -> "DatasetDict": ...
    def with_format(self, type: Optional[str] = None, **kwargs) -> "DatasetDict": ...
    def save_to_disk(self, dataset_dict_path: str) -> None: ...

class IterableDataset:
    """Iterable-style dataset for streaming large datasets without loading into memory."""
    def __iter__(self): ...
    def map(self, function, **kwargs) -> "IterableDataset": ...
    def filter(self, function, **kwargs) -> "IterableDataset": ...
    def take(self, n: int) -> "IterableDataset": ...
    def skip(self, n: int) -> "IterableDataset": ...
```
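`map()` and `filter()` accept the same callables on both classes: on `Dataset` they run eagerly and cache their results, while on `IterableDataset` they apply lazily during iteration. A sketch of batched mapping, where the function receives a dict of column lists per call:

```python
from datasets import Dataset

ds = Dataset.from_dict({"text": ["short", "a much longer sentence"]})

# With batched=True the function gets {"text": [...]} and must return
# columns of the same batch length.
def add_length(batch):
    batch["length"] = [len(t) for t in batch["text"]]
    return batch

ds = ds.map(add_length, batched=True)
ds = ds.filter(lambda example: example["length"] > 5)
print(ds[0])
```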
Functions for combining, transforming, and manipulating datasets, including concatenation, interleaving, and caching control.

```python
def concatenate_datasets(
    dsets: List[Dataset],
    info: Optional[DatasetInfo] = None,
    split: Optional[NamedSplit] = None,
    axis: int = 0,
) -> Dataset:
    """Concatenate multiple Dataset objects."""

def interleave_datasets(
    datasets: List[Union[Dataset, IterableDataset]],
    probabilities: Optional[List[float]] = None,
    seed: Optional[int] = None,
    info: Optional[DatasetInfo] = None,
    split: Optional[NamedSplit] = None,
    stopping_strategy: str = "first_exhausted",
) -> Union[Dataset, IterableDataset]:
    """Interleave multiple datasets."""

def enable_caching() -> None:
    """Enable caching of dataset operations."""

def disable_caching() -> None:
    """Disable caching of dataset operations."""

def is_caching_enabled() -> bool:
    """Check if caching is currently enabled."""
```
Comprehensive type system for defining and validating dataset schemas, supporting primitive types, complex nested structures, and multimedia data.

```python
class Features(dict):
    """Dictionary-like container for dataset features with type validation."""
    def encode_example(self, example: dict) -> dict: ...
    def decode_example(self, example: dict) -> dict: ...

class Value:
    """Feature for primitive data types (int32, float64, string, bool, etc.)."""
    def __init__(self, dtype: str, id: Optional[str] = None): ...

class ClassLabel:
    """Feature for categorical/classification labels."""
    def __init__(
        self,
        num_classes: Optional[int] = None,
        names: Optional[List[str]] = None,
        names_file: Optional[str] = None,
        id: Optional[str] = None,
    ): ...

class Audio:
    """Feature for audio data with automatic format handling."""
    def __init__(self, sampling_rate: Optional[int] = None, mono: bool = True, decode: bool = True): ...

class Image:
    """Feature for image data with automatic format handling."""
    def __init__(self, decode: bool = True, id: Optional[str] = None): ...
```
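A sketch defining an explicit schema: `ClassLabel` stores integer ids internally and maps them to the given names via its `int2str` helper (a method not listed above):

```python
from datasets import ClassLabel, Dataset, Features, Value

features = Features(
    {
        "text": Value("string"),
        "label": ClassLabel(names=["negative", "positive"]),
    }
)
ds = Dataset.from_dict(
    {"text": ["good", "bad"], "label": [1, 0]}, features=features
)
print(ds.features["label"].int2str(1))  # -> "positive"
```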
Classes and utilities for creating custom dataset builders and configurations for new datasets.

```python
class DatasetBuilder(ABC):
    """Abstract base class for dataset builders."""
    def download_and_prepare(self, download_config: Optional[DownloadConfig] = None, **kwargs) -> None: ...
    def as_dataset(self, split: Optional[Split] = None, **kwargs) -> Union[Dataset, DatasetDict]: ...

class GeneratorBasedBuilder(DatasetBuilder):
    """Dataset builder for datasets generated from Python generators."""
    def _generate_examples(self, **kwargs): ...

class BuilderConfig:
    """Configuration class for dataset builders."""
    def __init__(
        self,
        name: str = "default",
        version: Optional[Union[str, Version]] = "0.0.0",
        data_dir: Optional[str] = None,
        data_files: Optional[DataFilesDict] = None,
        description: Optional[str] = None,
    ): ...
```
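A minimal sketch of a custom builder subclass; the class name and fields here are hypothetical, and real builders usually download or locate files through `dl_manager` in `_split_generators`:

```python
import datasets

class MyToyDataset(datasets.GeneratorBasedBuilder):
    """Hypothetical builder yielding inline text/label pairs."""
    VERSION = datasets.Version("1.0.0")

    def _info(self):
        return datasets.DatasetInfo(
            description="A toy dataset of text/label pairs.",
            features=datasets.Features(
                {
                    "text": datasets.Value("string"),
                    "label": datasets.ClassLabel(names=["neg", "pos"]),
                }
            ),
        )

    def _split_generators(self, dl_manager):
        return [
            datasets.SplitGenerator(
                name=datasets.Split.TRAIN,
                gen_kwargs={"rows": [("great movie", "pos"), ("awful", "neg")]},
            )
        ]

    def _generate_examples(self, rows):
        # Yields (key, example) pairs; keys must be unique within a split.
        for idx, (text, label) in enumerate(rows):
            yield idx, {"text": text, "label": label}
```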
Functions and classes for inspecting dataset metadata, configurations, and available splits.

```python
class DatasetInfo:
    """Container for dataset metadata and information."""
    description: str
    features: Optional[Features]
    dataset_size: Optional[int]  # total size of the generated dataset in bytes
    splits: Optional[SplitDict]
    supervised_keys: Optional[SupervisedKeysData]
    version: Optional[Version]
    license: Optional[str]
    citation: Optional[str]

def get_dataset_config_names(path: str, **kwargs) -> List[str]:
    """Get available configuration names for a dataset."""

def get_dataset_split_names(path: str, config_name: Optional[str] = None, **kwargs) -> List[str]:
    """Get available split names for a dataset."""

def get_dataset_infos(path: str, **kwargs) -> Dict[str, DatasetInfo]:
    """Get information about all configurations of a dataset."""
```
"""Standard dataset splits."""
TRAIN: str = "train"
TEST: str = "test"
VALIDATION: str = "validation"
class DownloadMode:
"""Download behavior modes."""
REUSE_DATASET_IF_EXISTS: str = "reuse_dataset_if_exists"
REUSE_CACHE_IF_EXISTS: str = "reuse_cache_if_exists"
FORCE_REDOWNLOAD: str = "force_redownload"
class VerificationMode:
"""Dataset verification modes."""
BASIC_CHECKS: str = "basic_checks"
ALL_CHECKS: str = "all_checks"
NO_CHECKS: str = "no_checks"
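These string-valued modes can be passed directly to `load_dataset()`. A sketch, assuming Hub access:

```python
from datasets import DownloadMode, VerificationMode, load_dataset

ds = load_dataset(
    "squad",
    split="train",
    download_mode=DownloadMode.REUSE_CACHE_IF_EXISTS,
    verification_mode=VerificationMode.BASIC_CHECKS,
)
print(len(ds))
```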