A HuggingFace community-driven, open-source library of datasets for machine learning, offering one-line dataloaders, efficient preprocessing, and multi-framework support.
—
Classes and utilities for creating custom dataset builders and configurations for new datasets. The dataset building system provides a robust framework for defining how datasets are downloaded, processed, and structured, with support for both generator-based and Arrow-based processing patterns.
Abstract base class for all datasets, providing the core infrastructure for dataset download, preparation, and access.
class DatasetBuilder(ABC):
    """Abstract base class for all datasets.

    Subclasses must implement ``_info``, ``_split_generators`` and
    ``_prepare_split``; ``download_and_prepare`` and ``as_dataset`` are the
    main public entry points.
    """

    # Class attributes (set in subclass)
    VERSION: Optional[str] = None                              # dataset version string, e.g. "1.0.0"
    BUILDER_CONFIG_CLASS: Type[BuilderConfig] = BuilderConfig  # config class instantiated for this builder
    BUILDER_CONFIGS: List[BuilderConfig] = []                  # predefined named configurations, if any
    DEFAULT_CONFIG_NAME: Optional[str] = None                  # config chosen when the caller names none
    DEFAULT_WRITER_BATCH_SIZE: Optional[int] = 1000            # batch size used when writing examples

    # Core abstract methods (must be implemented)
    @abc.abstractmethod
    def _info(self) -> DatasetInfo:
        """Construct the DatasetInfo object with dataset metadata."""

    @abc.abstractmethod
    def _split_generators(
        self, dl_manager: Union[DownloadManager, StreamingDownloadManager]
    ) -> List[SplitGenerator]:
        """Return list of SplitGenerators defining how to generate data and splits."""

    @abc.abstractmethod
    def _prepare_split(self, split_generator: SplitGenerator, **kwargs):
        """Generate examples and record them on disk."""

    # Main public methods
    def download_and_prepare(
        self,
        output_dir: Optional[str] = None,
        download_config: Optional[DownloadConfig] = None,
        download_mode: Optional[Union[DownloadMode, str]] = None,
        verification_mode: Optional[Union[VerificationMode, str]] = None,
        dl_manager: Optional[DownloadManager] = None,
        base_path: Optional[str] = None,
        file_format: str = "arrow",
        max_shard_size: Optional[Union[int, str]] = None,
        num_proc: Optional[int] = None,
        storage_options: Optional[dict] = None,
        **kwargs,
    ) -> None:
        """Download the source data and build the dataset on disk."""
        ...

    def as_dataset(
        self,
        split: Optional[Union[str, Split, List[str], List[Split]]] = None,
        run_post_process: bool = True,
        verification_mode: Optional[Union[VerificationMode, str]] = None,
        in_memory: bool = False,
    ) -> Union[Dataset, DatasetDict]:
        """Return the prepared dataset (one split, or a DatasetDict of all splits)."""
        ...

    # Properties
    @property
    def cache_dir(self) -> str: ...

    @property
    def manual_download_instructions(self) -> Optional[str]: ...

    @classproperty
    @classmethod
    def builder_configs(cls) -> Dict[str, BuilderConfig]: ...

# Dataset builder for datasets generated from Python generators yielding dictionaries. Best for custom data processing and complex transformations.
class GeneratorBasedBuilder(DatasetBuilder):
    """Base class for datasets with data generation based on dict generators."""

    @abc.abstractmethod
    def _generate_examples(self, **kwargs):
        """
        Default function generating examples for each SplitGenerator.

        Args:
            **kwargs: Arguments forwarded from the SplitGenerator.gen_kwargs

        Yields:
            key: Union[str, int] - A unique deterministic example identification key
            example: Dict[str, Any] - A feature dictionary ready to be encoded
        """

# Usage Example:
from datasets import GeneratorBasedBuilder, DatasetInfo, Features, Value, ClassLabel, Split, SplitGenerator
import json

class MyTextClassificationBuilder(GeneratorBasedBuilder):
    """Example builder for a JSONL text-classification dataset."""

    def _info(self) -> DatasetInfo:
        return DatasetInfo(
            description="A custom text classification dataset",
            features=Features({
                "text": Value("string"),
                "label": ClassLabel(names=["positive", "negative", "neutral"]),
                "confidence": Value("float32"),
            }),
            citation="Custom dataset citation",
            license="MIT",
        )

    def _split_generators(self, dl_manager):
        # Download files using the download manager
        train_file = dl_manager.download("https://example.com/train.jsonl")
        test_file = dl_manager.download("https://example.com/test.jsonl")
        return [
            SplitGenerator(
                name=Split.TRAIN,
                gen_kwargs={"filepath": train_file, "split": "train"},
            ),
            SplitGenerator(
                name=Split.TEST,
                gen_kwargs={"filepath": test_file, "split": "test"},
            ),
        ]

    def _generate_examples(self, filepath, split):
        """Generate examples from the downloaded files."""
        with open(filepath, 'r', encoding='utf-8') as f:
            for idx, line in enumerate(f):
                data = json.loads(line.strip())
                # Key is prefixed with the split name so keys stay unique across splits.
                yield f"{split}_{idx}", {
                    "text": data["text"],
                    "label": data["label"],
                    "confidence": data.get("confidence", 1.0),
                }

# Dataset builder for datasets generated from Arrow tables. More efficient for large datasets and standard formats (CSV, JSON, Parquet).
class ArrowBasedBuilder(DatasetBuilder):
    """Base class for datasets with data generation based on Arrow loading functions."""

    @abc.abstractmethod
    def _generate_tables(self, **kwargs):
        """
        Default function generating tables for each SplitGenerator.

        Args:
            **kwargs: Arguments forwarded from the SplitGenerator.gen_kwargs

        Yields:
            key: Union[str, int] - A unique deterministic example identification key
            table: pyarrow.Table - A feature table ready to be written to disk
        """

# Usage Example:
import pyarrow as pa
import pandas as pd
from datasets import ArrowBasedBuilder, DatasetInfo, Features, Value, Split, SplitGenerator

class MyCSVDatasetBuilder(ArrowBasedBuilder):
    """Example Arrow-based builder that loads CSV files as Arrow tables."""

    def _info(self) -> DatasetInfo:
        return DatasetInfo(
            description="A dataset built from CSV files",
            features=Features({
                "id": Value("int64"),
                "text": Value("string"),
                "score": Value("float64"),
                "category": Value("string"),
            })
        )

    def _split_generators(self, dl_manager):
        # Download multiple CSV files
        urls = {
            "train": ["https://example.com/train1.csv", "https://example.com/train2.csv"],
            "test": ["https://example.com/test.csv"],
        }
        downloaded_files = {
            split: [dl_manager.download(url) for url in split_urls]
            for split, split_urls in urls.items()
        }
        return [
            SplitGenerator(name=Split.TRAIN, gen_kwargs={"files": downloaded_files["train"]}),
            SplitGenerator(name=Split.TEST, gen_kwargs={"files": downloaded_files["test"]}),
        ]

    def _generate_tables(self, files):
        """Generate Arrow tables from CSV files."""
        for idx, filepath in enumerate(files):
            # Read CSV into pandas, then convert to an Arrow table.
            df = pd.read_csv(filepath)
            table = pa.Table.from_pandas(df)
            yield idx, table

# Configuration class for dataset builders that defines named configurations and their parameters.
class BuilderConfig:
    """Base class for DatasetBuilder data configuration.

    A named bundle of parameters (data location, version, description)
    selecting one concrete variant of a dataset.
    """

    def __init__(
        self,
        name: str = "default",
        version: Optional[Union[str, Version]] = "0.0.0",
        data_dir: Optional[str] = None,
        data_files: Optional[Union[DataFilesDict, DataFilesPatternsDict]] = None,
        description: Optional[str] = None,
    ): ...

    def create_config_id(
        self,
        config_kwargs: dict,
        custom_features: Optional[Features] = None,
    ) -> str:
        """Derive a unique cache id for this config from its parameters."""
        ...

# Usage Example:
from datasets import BuilderConfig
class MyBuilderConfig(BuilderConfig):
"""Custom configuration with additional parameters."""
def __init__(
self,
name: str = "default",
version: Optional[Union[str, Version]] = "1.0.0",
data_dir: Optional[str] = None,
data_files: Optional[Union[DataFilesDict, DataFilesPatternsDict]] = None,
description: Optional[str] = None,
# Custom parameters
language: str = "en",
preprocessing: str = "standard",
**kwargs,
):
super().__init__(
name=name,
version=version,
data_dir=data_dir,
data_files=data_files,
description=description,
**kwargs,
)
self.language = language
self.preprocessing = preprocessing
class MyConfigurableBuilder(GeneratorBasedBuilder):
BUILDER_CONFIG_CLASS = MyBuilderConfig
BUILDER_CONFIGS = [
MyBuilderConfig(
name="en_standard",
description="English dataset with standard preprocessing",
language="en",
preprocessing="standard",
),
MyBuilderConfig(
name="en_minimal",
description="English dataset with minimal preprocessing",
language="en",
preprocessing="minimal",
),
MyBuilderConfig(
name="es_standard",
description="Spanish dataset with standard preprocessing",
language="es",
preprocessing="standard",
),
]
DEFAULT_CONFIG_NAME = "en_standard"
def _info(self) -> DatasetInfo:
return DatasetInfo(
description=f"Dataset in {self.config.language} with {self.config.preprocessing} preprocessing",
features=Features({
"text": Value("string"),
"label": ClassLabel(names=["pos", "neg"]),
"language": Value("string"),
})
)
def _split_generators(self, dl_manager):
# Use config parameters to determine data sources
url = f"https://example.com/{self.config.language}/data.jsonl"
filepath = dl_manager.download(url)
return [
SplitGenerator(
name=Split.TRAIN,
gen_kwargs={"filepath": filepath, "preprocessing": self.config.preprocessing}
)
]
def _generate_examples(self, filepath, preprocessing):
# Use preprocessing parameter to determine processing logic
with open(filepath, 'r') as f:
for idx, line in enumerate(f):
data = json.loads(line)
text = data["text"]
if preprocessing == "standard":
text = text.lower().strip()
elif preprocessing == "minimal":
text = text.strip()
yield idx, {
"text": text,
"label": data["label"],
"language": self.config.language,
}class MultiFormatBuilder(GeneratorBasedBuilder):
"""Builder that can handle multiple input formats."""
def _split_generators(self, dl_manager):
# Handle different file types
files = {
"csv_files": [dl_manager.download(url) for url in self.config.csv_urls],
"json_files": [dl_manager.download(url) for url in self.config.json_urls],
"txt_files": [dl_manager.download(url) for url in self.config.txt_urls],
}
return [
SplitGenerator(name=Split.TRAIN, gen_kwargs=files)
]
def _generate_examples(self, csv_files, json_files, txt_files):
example_id = 0
# Process CSV files
for filepath in csv_files:
df = pd.read_csv(filepath)
for _, row in df.iterrows():
yield example_id, {"text": row["text"], "source": "csv"}
example_id += 1
# Process JSON files
for filepath in json_files:
with open(filepath) as f:
data = json.load(f)
for item in data:
yield example_id, {"text": item["text"], "source": "json"}
example_id += 1
# Process text files
for filepath in txt_files:
with open(filepath) as f:
for line in f:
yield example_id, {"text": line.strip(), "source": "txt"}
example_id += 1class ManualDownloadBuilder(GeneratorBasedBuilder):
"""Builder for datasets requiring manual download."""
MANUAL_DOWNLOAD_INSTRUCTIONS = """
Please download the dataset files manually from: https://example.com/dataset
Extract the files and place them in: {manual_dir}
The expected files are:
- train.jsonl
- test.jsonl
- metadata.json
"""
def _split_generators(self, dl_manager):
# dl_manager.manual_dir points to the manually downloaded files
manual_dir = dl_manager.manual_dir
return [
SplitGenerator(
name=Split.TRAIN,
gen_kwargs={"filepath": os.path.join(manual_dir, "train.jsonl")}
),
SplitGenerator(
name=Split.TEST,
gen_kwargs={"filepath": os.path.join(manual_dir, "test.jsonl")}
),
]class OptimizedBuilder(GeneratorBasedBuilder):
"""Builder with performance optimizations."""
# Optimize batch size for writing
DEFAULT_WRITER_BATCH_SIZE = 10000
def download_and_prepare(self, **kwargs):
# Use multiprocessing for faster preparation
kwargs.setdefault("num_proc", 4)
# Use larger shard size for fewer files
kwargs.setdefault("max_shard_size", "1GB")
super().download_and_prepare(**kwargs)
def _generate_examples(self, filepath):
# Use efficient file reading
with open(filepath, 'rb') as f:
for idx, line in enumerate(f):
# Process line efficiently
data = orjson.loads(line) # orjson is faster than json
yield idx, self._process_example(data)
def _process_example(self, data):
# Efficient data processing
return {
"text": data["text"],
"label": data["label"],
"features": np.array(data["features"], dtype=np.float32),
}def _generate_examples(self, filepath):
"""Generate examples with proper error handling."""
try:
with open(filepath, 'r', encoding='utf-8') as f:
for idx, line in enumerate(f):
try:
data = json.loads(line.strip())
# Validate required fields
if "text" not in data or "label" not in data:
logger.warning(f"Skipping incomplete example at line {idx}")
continue
yield idx, {
"text": str(data["text"]),
"label": str(data["label"]),
}
except json.JSONDecodeError as e:
logger.warning(f"Failed to parse JSON at line {idx}: {e}")
continue
except FileNotFoundError:
raise FileNotFoundError(f"Data file not found: {filepath}")def test_builder():
"""Test the custom dataset builder."""
from datasets import load_dataset_builder
# Test builder instantiation
builder = MyTextClassificationBuilder()
# Test info generation
info = builder._info()
assert "text" in info.features
assert "label" in info.features
# Test dataset building
builder.download_and_prepare()
dataset = builder.as_dataset()
# Validate dataset
assert len(dataset["train"]) > 0
assert all(key in dataset["train"].features for key in ["text", "label"])This comprehensive dataset building system provides flexible, efficient tools for creating custom datasets that integrate seamlessly with the Hugging Face datasets ecosystem, supporting features like multiprocessing, caching, streaming, and various output formats.
Install with the Tessl CLI:
npx tessl i tessl/pypi-datasets