A HuggingFace community-driven, open-source library of datasets for machine learning, offering one-line dataloaders, efficient preprocessing, and multi-framework support.
—
Functions and classes for inspecting dataset metadata, configurations, and available splits. This module provides comprehensive capabilities for dataset discovery, split management, and metadata handling without requiring full dataset downloads.
Central metadata container documenting all known information about a dataset including structure, licensing, and statistics.
class DatasetInfo:
"""Information about a dataset."""
# Static dataset information (set by dataset builders)
description: str
citation: str
homepage: str
license: str
features: Optional[Features]
post_processed: Optional[PostProcessedInfo]
supervised_keys: Optional[SupervisedKeysData]
# Dynamic dataset information (set by builder/processing)
builder_name: Optional[str]
dataset_name: Optional[str]
config_name: Optional[str]
version: Optional[Union[str, Version]]
splits: Optional[dict]
download_checksums: Optional[dict]
download_size: Optional[int]
post_processing_size: Optional[int]
dataset_size: Optional[int]
size_in_bytes: Optional[int]
# File I/O operations
def write_to_directory(
self,
dataset_info_dir: str,
pretty_print: bool = False,
storage_options: Optional[dict] = None,
) -> None: ...
@classmethod
def from_directory(
cls,
dataset_info_dir: str,
storage_options: Optional[dict] = None,
) -> "DatasetInfo": ...
@classmethod
def from_dict(cls, dataset_info_dict: dict) -> "DatasetInfo": ...
# Manipulation operations
def update(self, other_dataset_info: "DatasetInfo", ignore_none: bool = True) -> None: ...
def copy(self) -> "DatasetInfo": ...
@classmethod
def from_merge(cls, dataset_infos: List["DatasetInfo"]) -> "DatasetInfo": ...Usage Examples:
from datasets import load_dataset, DatasetInfo
# Access dataset info from loaded dataset
dataset = load_dataset("squad", split="train")
info = dataset.info
print(f"Description: {info.description}")
print(f"Dataset size: {info.dataset_size} bytes")
print(f"Number of examples: {info.splits['train'].num_examples}")
print(f"Features: {info.features}")
# Save dataset info to disk
info.write_to_directory("./squad_info/")
# Load dataset info from disk
loaded_info = DatasetInfo.from_directory("./squad_info/")
# Create custom dataset info
custom_info = DatasetInfo(
description="My custom dataset",
features=Features({
"text": Value("string"),
"label": ClassLabel(names=["positive", "negative"])
}),
license="MIT",
citation="@misc{my_dataset, ...}"
)Functions for discovering available dataset configurations, splits, and metadata without downloading the full dataset.
def get_dataset_config_names(
    path: str,
    revision: Optional[Union[str, Version]] = None,
    download_config: Optional[DownloadConfig] = None,
    download_mode: Optional[Union[DownloadMode, str]] = None,
    data_files: Optional[Union[dict, list, str]] = None,
    **download_kwargs,
) -> List[str]:
    """Get the list of available config names for a dataset.

    Parameters:
        path (str): Path or name of the dataset.
        revision (str or Version, optional): Version of the dataset script to load.
        download_config (DownloadConfig, optional): Specific download configuration parameters.
        download_mode (DownloadMode or str, optional): Select the download/generation mode.
        data_files (dict, list, or str, optional): Path(s) to source data file(s).
        **download_kwargs: Additional download arguments.

    Returns:
        List[str]: List of available configuration names.
    """
def get_dataset_default_config_name(
    path: str,
    revision: Optional[Union[str, Version]] = None,
    download_config: Optional[DownloadConfig] = None,
    download_mode: Optional[Union[DownloadMode, str]] = None,
    data_files: Optional[Union[dict, list, str]] = None,
    **download_kwargs,
) -> Optional[str]:
    """Get the default config name for a dataset.

    Parameters:
        path (str): Path or name of the dataset.
        revision (str or Version, optional): Version of the dataset script to load.
        download_config (DownloadConfig, optional): Specific download configuration parameters.
        download_mode (DownloadMode or str, optional): Select the download/generation mode.
        data_files (dict, list, or str, optional): Path(s) to source data file(s).
        **download_kwargs: Additional download arguments.

    Returns:
        Optional[str]: Default configuration name, or None if no default.
    """
def get_dataset_split_names(
path: str,
config_name: Optional[str] = None,
data_files: Optional[Union[str, Sequence[str], Mapping[str, Union[str, Sequence[str]]]]] = None,
download_config: Optional[DownloadConfig] = None,
download_mode: Optional[Union[DownloadMode, str]] = None,
revision: Optional[Union[str, Version]] = None,
token: Optional[Union[bool, str]] = None,
**config_kwargs,
) -> List[str]:
"""
Get the list of available splits for a particular config and dataset.
Parameters:
- path (str): Path or name of the dataset
- config_name (str, optional): Configuration name
- data_files (str, Sequence[str], Mapping, optional): Path(s) to source data file(s)
- download_config (DownloadConfig, optional): Specific download configuration parameters
- download_mode (DownloadMode or str, optional): Select the download/generation mode
- revision (str, Version, optional): Version of the dataset script to load
- token (bool or str, optional): Optional string or boolean to use as Bearer token
- **config_kwargs: Keyword arguments to be passed to the BuilderConfig
Returns:
- List[str]: List of available split names
"""Usage Examples:
from datasets import get_dataset_config_names, get_dataset_default_config_name, get_dataset_split_names
# Discover available configurations
configs = get_dataset_config_names("nyu-mll/glue")
print(f"Available configs: {configs}")
# ['cola', 'sst2', 'mrpc', 'qqp', 'stsb', 'mnli', ...]
# Get default configuration
default = get_dataset_default_config_name("squad")
print(f"Default config: {default}") # 'plain_text'
# Get available splits for a configuration
splits = get_dataset_split_names("nyu-mll/glue", config_name="cola")
print(f"Available splits: {splits}") # ['train', 'validation', 'test']
# Check splits for default configuration
splits = get_dataset_split_names("squad")
print(f"SQuAD splits: {splits}") # ['train', 'validation']Functions for retrieving detailed metadata about datasets and their configurations.
def get_dataset_infos(
    path: str,
    data_files: Optional[Union[dict, list, str]] = None,
    download_config: Optional[DownloadConfig] = None,
    download_mode: Optional[Union[DownloadMode, str]] = None,
    revision: Optional[Union[str, Version]] = None,
    token: Optional[Union[bool, str]] = None,
    **config_kwargs,
) -> Dict[str, DatasetInfo]:
    """Get meta information about a dataset, as a dict mapping config name to DatasetInfo.

    Parameters:
        path (str): Path or name of the dataset.
        data_files (dict, list, or str, optional): Path(s) to source data file(s).
        download_config (DownloadConfig, optional): Specific download configuration parameters.
        download_mode (DownloadMode or str, optional): Select the download/generation mode.
        revision (str or Version, optional): Version of the dataset script to load.
        token (bool or str, optional): Optional string or boolean to use as Bearer token.
        **config_kwargs: Keyword arguments to be passed to the BuilderConfig.

    Returns:
        Dict[str, DatasetInfo]: Dictionary mapping configuration names to DatasetInfo objects.
    """
def get_dataset_config_info(
path: str,
config_name: Optional[str] = None,
data_files: Optional[Union[str, Sequence[str], Mapping[str, Union[str, Sequence[str]]]]] = None,
download_config: Optional[DownloadConfig] = None,
download_mode: Optional[Union[DownloadMode, str]] = None,
revision: Optional[Union[str, Version]] = None,
token: Optional[Union[bool, str]] = None,
**config_kwargs,
) -> DatasetInfo:
"""
Get the meta information (DatasetInfo) about a dataset for a particular config.
Parameters:
- path (str): Path or name of the dataset
- config_name (str, optional): Configuration name
- data_files (str, Sequence[str], Mapping, optional): Path(s) to source data file(s)
- download_config (DownloadConfig, optional): Specific download configuration parameters
- download_mode (DownloadMode or str, optional): Select the download/generation mode
- revision (str, Version, optional): Version of the dataset script to load
- token (bool or str, optional): Optional string or boolean to use as Bearer token
- **config_kwargs: Keyword arguments to be passed to the BuilderConfig
Returns:
- DatasetInfo: Dataset information for the specified configuration
"""Usage Examples:
from datasets import get_dataset_infos, get_dataset_config_info
# Get info for all configurations
all_infos = get_dataset_infos("nyu-mll/glue")
for config, info in all_infos.items():
print(f"Config: {config}")
print(f" Description: {info.description[:100]}...")
print(f" Features: {list(info.features.keys())}")
print(f" Splits: {list(info.splits.keys())}")
# Get info for specific configuration
cola_info = get_dataset_config_info("nyu-mll/glue", config_name="cola")
print(f"CoLA dataset size: {cola_info.dataset_size} bytes")
print(f"CoLA train examples: {cola_info.splits['train'].num_examples}")Classes for managing dataset splits, subsplits, and split composition operations.
class SplitBase:
    """Abstract base class for Split compositionality."""

    def get_read_instruction(self, split_dict): ...

    # Merging: Split.TRAIN + Split.TEST
    def __add__(self, other): ...

    def subsplit(self, arg=None, k=None, percent=None, weighted=None): ...
class NamedSplit(SplitBase):
    """Descriptor corresponding to a named split (train, test, ...)."""

    def __init__(self, name: str): ...

    # Supports comparisons like Split.TRAIN == 'train'
    def __eq__(self, other) -> bool: ...

    def __str__(self) -> str: ...

    def __repr__(self) -> str: ...

    def get_read_instruction(self, split_dict): ...
class NamedSplitAll(NamedSplit):
    """Split corresponding to the union of all defined dataset splits."""

    def get_read_instruction(self, split_dict): ...
class Split:
    """Enum-like holder for dataset splits with predefined constants."""

    TRAIN = NamedSplit("train")
    TEST = NamedSplit("test")
    VALIDATION = NamedSplit("validation")
    ALL = NamedSplitAll()

    # Create a custom split, e.g. Split("custom_name")
    def __new__(cls, name: str): ...
class SplitInfo:
    """Information about a single dataset split."""

    def __init__(
        self,
        name: str = "",
        num_bytes: int = 0,
        num_examples: int = 0,
        shard_lengths: Optional[List[int]] = None,
        dataset_name: Optional[str] = None,
    ): ...

    @property
    def file_instructions(self) -> List[dict]: ...
class SplitDict(dict):
    """Split info object - dictionary of split names to SplitInfo objects."""

    def __init__(self, *args, dataset_name=None, **kwargs): ...

    # Supports subsplit instructions in addition to plain string keys
    def __getitem__(self, key: Union[SplitBase, str]): ...

    def add(self, split_info: SplitInfo): ...

    @property
    def total_num_examples(self) -> int: ...

    @classmethod
    def from_split_dict(
        cls,
        split_infos: Union[list, dict],
        dataset_name: Optional[str] = None,
    ) -> "SplitDict": ...
class SplitGenerator:
"""Defines split information for the generator in DatasetBuilder._split_generators."""
def __init__(self, name: str, gen_kwargs: dict = None): ...Usage Examples:
from datasets import Split, SplitInfo, SplitDict, percent
# Use predefined splits
train_split = Split.TRAIN
test_split = Split.TEST
validation_split = Split.VALIDATION
# Create custom split
custom_split = Split("custom_name")
# Combine splits
combined = Split.TRAIN + Split.TEST
# Create subsplits using percentages
train_subset = Split.TRAIN.subsplit(percent[:80]) # First 80%
val_subset = Split.TRAIN.subsplit(percent[80:]) # Last 20%
# Multiple subsplits with weights
train, val, test = Split.TRAIN.subsplit(weighted=[70, 15, 15])
# Create split information
split_info = SplitInfo(
name="train",
num_examples=1000,
num_bytes=1024000
)
# Create split dictionary
split_dict = SplitDict({
"train": SplitInfo(name="train", num_examples=800),
"test": SplitInfo(name="test", num_examples=200),
})
print(f"Total examples: {split_dict.total_num_examples}") # 1000Classes for specifying input/output relationships in supervised learning scenarios.
class SupervisedKeysData:
    """Specifies input and output columns for supervised learning."""

    def __init__(self, input: str = "", output: str = ""): ...
class PostProcessedInfo:
"""Information about post-processed resources (e.g., indices)."""
def __init__(
self,
features: Optional[Features] = None,
resources_checksums: Optional[dict] = None,
): ...Usage Examples:
from datasets import DatasetInfo, SupervisedKeysData, Features, Value, ClassLabel

# Define supervised learning keys
supervised_keys = SupervisedKeysData(input="text", output="label")

# Create dataset info with supervised keys
info = DatasetInfo(
    description="Text classification dataset",
    features=Features({
        "text": Value("string"),
        "label": ClassLabel(names=["positive", "negative"]),
    }),
    supervised_keys=supervised_keys,
    license="MIT",
)

# Access supervised keys
print(f"Input column: {info.supervised_keys.input}")  # text
print(f"Output column: {info.supervised_keys.output}")  # label

from datasets import get_dataset_config_names, get_dataset_split_names, get_dataset_infos, get_dataset_config_info
def explore_dataset(dataset_name):
    """Comprehensively explore a dataset's structure.

    Prints the configurations, splits, features, and per-split statistics
    for the given dataset without downloading its data.
    Requires: get_dataset_config_names, get_dataset_split_names,
    get_dataset_config_info from the datasets package.
    """
    print(f"Exploring dataset: {dataset_name}")

    # Get all configurations
    configs = get_dataset_config_names(dataset_name)
    print(f"Available configurations: {configs}")

    # Explore each configuration
    for config in configs:
        print(f"\nConfiguration: {config}")

        # Get splits for this config
        splits = get_dataset_split_names(dataset_name, config_name=config)
        print(f"  Splits: {splits}")

        # Get detailed info
        info = get_dataset_config_info(dataset_name, config_name=config)
        print(f"  Description: {info.description[:100]}...")
        print(f"  Features: {list(info.features.keys())}")

        # Show split statistics
        for split_name in splits:
            split_info = info.splits[split_name]
            print(f"    {split_name}: {split_info.num_examples} examples, {split_info.num_bytes} bytes")

# Usage
explore_dataset("nyu-mll/glue")

from datasets import DatasetInfo, Features, Value, ClassLabel, SplitDict, SplitInfo, SupervisedKeysData
def create_custom_dataset_info():
    """Create comprehensive dataset metadata.

    Builds a DatasetInfo with features, split statistics, supervised keys,
    and provenance fields. Requires DatasetInfo, Features, Value, ClassLabel,
    SplitDict, SplitInfo, and SupervisedKeysData from the datasets package.
    """
    # Define features
    features = Features({
        "id": Value("string"),
        "text": Value("string"),
        "label": ClassLabel(names=["positive", "negative", "neutral"]),
        "confidence": Value("float32"),
        "metadata": {
            "source": Value("string"),
            "timestamp": Value("timestamp[ms]"),
        },
    })

    # Define split information
    splits = SplitDict({
        "train": SplitInfo(name="train", num_examples=10000, num_bytes=50000000),
        "validation": SplitInfo(name="validation", num_examples=1000, num_bytes=5000000),
        "test": SplitInfo(name="test", num_examples=2000, num_bytes=10000000),
    })

    # Create comprehensive dataset info
    info = DatasetInfo(
        description="A comprehensive sentiment analysis dataset with confidence scores",
        citation="@misc{my_dataset_2024, title={My Dataset}, author={Author}, year={2024}}",
        homepage="https://example.com/dataset",
        license="Apache-2.0",
        features=features,
        splits=splits,
        supervised_keys=SupervisedKeysData(input="text", output="label"),
        version="1.0.0",
        download_size=65000000,
        dataset_size=65000000,
    )
    return info

# Save and load metadata
info = create_custom_dataset_info()
info.write_to_directory("./my_dataset_info/")
loaded_info = DatasetInfo.from_directory("./my_dataset_info/")

from datasets import Split, percent, load_dataset
def create_complex_splits():
"""Demonstrate advanced split composition."""
# Load dataset
dataset = load_dataset("imdb", split=Split.TRAIN)
# Create multiple subsplits
train_80, val_10, test_10 = dataset.train_test_split(
test_size=0.2,
train_size=0.8
)["train"].train_test_split(
test_size=0.125 # 10% of original (0.1/0.8)
).values()
# Alternative using split composition
train_subset = Split.TRAIN.subsplit(percent[:80])
val_subset = Split.TRAIN.subsplit(percent[80:90])
test_subset = Split.TRAIN.subsplit(percent[90:])
# Use weighted subsplits
splits = Split.TRAIN.subsplit(weighted=[8, 1, 1]) # 80%, 10%, 10%
return {
"train": splits[0],
"validation": splits[1],
"test": splits[2]
}This comprehensive dataset information system provides powerful tools for dataset discovery, metadata management, and split composition, enabling efficient exploration and manipulation of dataset structures without requiring full downloads.
Install with Tessl CLI
npx tessl i tessl/pypi-datasets