Dataset Building

Classes and utilities for creating custom dataset builders and configurations for new datasets. The dataset building system provides a robust framework for defining how datasets are downloaded, processed, and structured, with support for both generator-based and Arrow-based processing patterns.

Capabilities

Dataset Builder Base Class

Abstract base class for all datasets, providing the core infrastructure for dataset download, preparation, and access.

class DatasetBuilder(ABC):
    """Abstract base class for all datasets."""
    
    # Class attributes (set in subclass)
    VERSION: Optional[str] = None
    BUILDER_CONFIG_CLASS: Type[BuilderConfig] = BuilderConfig
    BUILDER_CONFIGS: List[BuilderConfig] = []
    DEFAULT_CONFIG_NAME: Optional[str] = None
    DEFAULT_WRITER_BATCH_SIZE: Optional[int] = 1000
    
    # Core abstract methods (must be implemented)
    @abc.abstractmethod
    def _info(self) -> DatasetInfo:
        """Construct the DatasetInfo object with dataset metadata."""
    
    @abc.abstractmethod  
    def _split_generators(self, dl_manager: Union[DownloadManager, StreamingDownloadManager]) -> List[SplitGenerator]:
        """Return list of SplitGenerators defining how to generate data and splits."""
    
    @abc.abstractmethod
    def _prepare_split(self, split_generator: SplitGenerator, **kwargs):
        """Generate examples and record them on disk."""
    
    # Main public methods
    def download_and_prepare(
        self,
        output_dir: Optional[str] = None,
        download_config: Optional[DownloadConfig] = None,
        download_mode: Optional[Union[DownloadMode, str]] = None,
        verification_mode: Optional[Union[VerificationMode, str]] = None,
        dl_manager: Optional[DownloadManager] = None,
        base_path: Optional[str] = None,
        file_format: str = "arrow",
        max_shard_size: Optional[Union[int, str]] = None,
        num_proc: Optional[int] = None,
        storage_options: Optional[dict] = None,
        **kwargs,
    ) -> None: ...
    
    def as_dataset(
        self,
        split: Optional[Union[str, Split, List[str], List[Split]]] = None,
        run_post_process: bool = True,
        verification_mode: Optional[Union[VerificationMode, str]] = None,
        in_memory: bool = False,
    ) -> Union[Dataset, DatasetDict]: ...
    
    # Properties
    @property
    def cache_dir(self) -> str: ...
    
    @property
    def manual_download_instructions(self) -> Optional[str]: ...
    
    @classproperty
    @classmethod  
    def builder_configs(cls) -> Dict[str, BuilderConfig]: ...

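In practice, these two public methods are the main entry points. A minimal usage sketch, mirroring the library's documented pattern (the dataset name is just an illustrative public dataset):

from datasets import load_dataset_builder

builder = load_dataset_builder("rotten_tomatoes")

# Download source data and write prepared Arrow files into the cache
builder.download_and_prepare()

# Materialize a split for use
ds = builder.as_dataset(split="train")

# Alternatively, export directly to a directory as Parquet shards:
# builder.download_and_prepare("./output_dir", file_format="parquet")
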
Generator-Based Builder

Dataset builder for datasets generated from Python generators yielding dictionaries. Best for custom data processing and complex transformations.

class GeneratorBasedBuilder(DatasetBuilder):
    """Base class for datasets with data generation based on dict generators."""
    
    @abc.abstractmethod
    def _generate_examples(self, **kwargs):
        """
        Default function generating examples for each SplitGenerator.
        
        Args:
            **kwargs: Arguments forwarded from the SplitGenerator.gen_kwargs
            
        Yields:
            key: Union[str, int] - A unique deterministic example identification key
            example: Dict[str, Any] - A feature dictionary ready to be encoded
        """

Usage Example:

import json

from datasets import GeneratorBasedBuilder, DatasetInfo, Features, Value, ClassLabel, Split, SplitGenerator

class MyTextClassificationBuilder(GeneratorBasedBuilder):
    
    def _info(self) -> DatasetInfo:
        return DatasetInfo(
            description="A custom text classification dataset",
            features=Features({
                "text": Value("string"),
                "label": ClassLabel(names=["positive", "negative", "neutral"]),
                "confidence": Value("float32"),
            }),
            citation="Custom dataset citation",
            license="MIT",
        )
    
    def _split_generators(self, dl_manager):
        # Download files using the download manager
        train_file = dl_manager.download("https://example.com/train.jsonl")
        test_file = dl_manager.download("https://example.com/test.jsonl")
        
        return [
            SplitGenerator(
                name=Split.TRAIN,
                gen_kwargs={"filepath": train_file, "split": "train"}
            ),
            SplitGenerator(
                name=Split.TEST,
                gen_kwargs={"filepath": test_file, "split": "test"}
            ),
        ]
    
    def _generate_examples(self, filepath, split):
        """Generate examples from the downloaded files."""
        with open(filepath, 'r', encoding='utf-8') as f:
            for idx, line in enumerate(f):
                data = json.loads(line.strip())
                yield f"{split}_{idx}", {
                    "text": data["text"],
                    "label": data["label"],
                    "confidence": data.get("confidence", 1.0),
                }

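During development it can help to drive the example generator by hand, without writing anything to disk. A quick sketch (the file path is hypothetical):

builder = MyTextClassificationBuilder()

# Call the generator with the same gen_kwargs a SplitGenerator would pass
for key, example in builder._generate_examples(filepath="train.jsonl", split="train"):
    print(key, example)
    break
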
Arrow-Based Builder

Dataset builder for datasets generated from Arrow tables. More efficient for large datasets and standard formats (CSV, JSON, Parquet).

class ArrowBasedBuilder(DatasetBuilder):
    """Base class for datasets with data generation based on Arrow loading functions."""
    
    @abc.abstractmethod
    def _generate_tables(self, **kwargs):
        """
        Default function generating tables for each SplitGenerator.
        
        Args:
            **kwargs: Arguments forwarded from the SplitGenerator.gen_kwargs
            
        Yields:
            key: Union[str, int] - A unique deterministic example identification key
            table: pyarrow.Table - A feature table ready to be written to disk
        """

Usage Example:

import pyarrow as pa
import pandas as pd
from datasets import ArrowBasedBuilder, DatasetInfo, Features, Value, Split, SplitGenerator

class MyCSVDatasetBuilder(ArrowBasedBuilder):
    
    def _info(self) -> DatasetInfo:
        return DatasetInfo(
            description="A dataset built from CSV files",
            features=Features({
                "id": Value("int64"),
                "text": Value("string"),
                "score": Value("float64"),
                "category": Value("string"),
            })
        )
    
    def _split_generators(self, dl_manager):
        # Download multiple CSV files
        urls = {
            "train": ["https://example.com/train1.csv", "https://example.com/train2.csv"],
            "test": ["https://example.com/test.csv"]
        }
        
        downloaded_files = {}
        for split, split_urls in urls.items():
            downloaded_files[split] = [dl_manager.download(url) for url in split_urls]
        
        return [
            SplitGenerator(name=Split.TRAIN, gen_kwargs={"files": downloaded_files["train"]}),
            SplitGenerator(name=Split.TEST, gen_kwargs={"files": downloaded_files["test"]}),
        ]
    
    def _generate_tables(self, files):
        """Generate Arrow tables from CSV files."""
        for idx, filepath in enumerate(files):
            # Read CSV into pandas DataFrame
            df = pd.read_csv(filepath)
            
            # Convert to Arrow table
            table = pa.Table.from_pandas(df)
            
            yield idx, table

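For plain CSV, the pandas round-trip above is optional: pyarrow ships a native CSV reader that produces an Arrow table directly. A minimal variant of _generate_tables using it (the class name is an assumption of this sketch):

import pyarrow.csv as pa_csv

class MyFastCSVBuilder(ArrowBasedBuilder):
    # _info and _split_generators as in the example above

    def _generate_tables(self, files):
        for idx, filepath in enumerate(files):
            # read_csv parses straight into a pyarrow.Table, skipping pandas
            yield idx, pa_csv.read_csv(filepath)
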
Builder Configuration

Configuration class for dataset builders that defines named configurations and their parameters.

class BuilderConfig:
    """Base class for DatasetBuilder data configuration."""
    
    def __init__(
        self,
        name: str = "default",
        version: Optional[Union[str, Version]] = "0.0.0",
        data_dir: Optional[str] = None,
        data_files: Optional[Union[DataFilesDict, DataFilesPatternsDict]] = None,
        description: Optional[str] = None,
    ): ...
    
    def create_config_id(
        self,
        config_kwargs: dict,
        custom_features: Optional[Features] = None,
    ) -> str: ...

Usage Example:

import json

from datasets import (
    BuilderConfig,
    ClassLabel,
    DatasetInfo,
    Features,
    GeneratorBasedBuilder,
    Split,
    SplitGenerator,
    Value,
)

class MyBuilderConfig(BuilderConfig):
    """Custom configuration with additional parameters."""
    
    def __init__(
        self,
        # Custom parameters
        language: str = "en",
        preprocessing: str = "standard",
        **kwargs,
    ):
        # Standard options (name, version, data_dir, data_files, description)
        # are forwarded unchanged to the base BuilderConfig
        super().__init__(**kwargs)
        self.language = language
        self.preprocessing = preprocessing

class MyConfigurableBuilder(GeneratorBasedBuilder):
    
    BUILDER_CONFIG_CLASS = MyBuilderConfig
    BUILDER_CONFIGS = [
        MyBuilderConfig(
            name="en_standard",
            description="English dataset with standard preprocessing",
            language="en",
            preprocessing="standard",
        ),
        MyBuilderConfig(
            name="en_minimal",
            description="English dataset with minimal preprocessing",
            language="en", 
            preprocessing="minimal",
        ),
        MyBuilderConfig(
            name="es_standard",
            description="Spanish dataset with standard preprocessing",
            language="es",
            preprocessing="standard",
        ),
    ]
    DEFAULT_CONFIG_NAME = "en_standard"
    
    def _info(self) -> DatasetInfo:
        return DatasetInfo(
            description=f"Dataset in {self.config.language} with {self.config.preprocessing} preprocessing",
            features=Features({
                "text": Value("string"),
                "label": ClassLabel(names=["pos", "neg"]),
                "language": Value("string"),
            })
        )
    
    def _split_generators(self, dl_manager):
        # Use config parameters to determine data sources
        url = f"https://example.com/{self.config.language}/data.jsonl"
        filepath = dl_manager.download(url)
        
        return [
            SplitGenerator(
                name=Split.TRAIN,
                gen_kwargs={"filepath": filepath, "preprocessing": self.config.preprocessing}
            )
        ]
    
    def _generate_examples(self, filepath, preprocessing):
        # Use preprocessing parameter to determine processing logic
        with open(filepath, 'r') as f:
            for idx, line in enumerate(f):
                data = json.loads(line)
                
                text = data["text"]
                if preprocessing == "standard":
                    text = text.lower().strip()
                elif preprocessing == "minimal":
                    text = text.strip()
                
                yield idx, {
                    "text": text,
                    "label": data["label"],
                    "language": self.config.language,
                }

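Named configurations are usually selected at load time. A short sketch (the script path is hypothetical; the config_name keyword reflects recent versions of datasets):

from datasets import load_dataset

# The configuration name is passed as the second positional argument
ds = load_dataset("path/to/my_dataset_script.py", "es_standard", split="train")

# Or instantiate the builder directly with a configuration name
builder = MyConfigurableBuilder(config_name="en_minimal")
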
Advanced Dataset Building Patterns

Multi-Format Dataset Builder

import json

import pandas as pd

class MultiFormatBuilder(GeneratorBasedBuilder):
    """Builder that can handle multiple input formats."""
    
    def _split_generators(self, dl_manager):
        # Handle different file types; the URL lists come from a custom
        # BuilderConfig (one possible sketch follows this example)
        files = {
            "csv_files": [dl_manager.download(url) for url in self.config.csv_urls],
            "json_files": [dl_manager.download(url) for url in self.config.json_urls],
            "txt_files": [dl_manager.download(url) for url in self.config.txt_urls],
        }
        
        return [
            SplitGenerator(name=Split.TRAIN, gen_kwargs=files)
        ]
    
    def _generate_examples(self, csv_files, json_files, txt_files):
        example_id = 0
        
        # Process CSV files
        for filepath in csv_files:
            df = pd.read_csv(filepath)
            for _, row in df.iterrows():
                yield example_id, {"text": row["text"], "source": "csv"}
                example_id += 1
        
        # Process JSON files
        for filepath in json_files:
            with open(filepath) as f:
                data = json.load(f)
                for item in data:
                    yield example_id, {"text": item["text"], "source": "json"}
                    example_id += 1
        
        # Process text files
        for filepath in txt_files:
            with open(filepath) as f:
                for line in f:
                    yield example_id, {"text": line.strip(), "source": "txt"}
                    example_id += 1

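The example above assumes a custom configuration carrying the three URL lists. One possible sketch of such a config (the attribute names are assumptions of this example):

from datasets import BuilderConfig

class MultiFormatConfig(BuilderConfig):
    """Hypothetical config holding per-format source URLs."""

    def __init__(self, csv_urls=(), json_urls=(), txt_urls=(), **kwargs):
        super().__init__(**kwargs)
        self.csv_urls = list(csv_urls)
        self.json_urls = list(json_urls)
        self.txt_urls = list(txt_urls)

MultiFormatBuilder would then set BUILDER_CONFIG_CLASS = MultiFormatConfig.
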
Dataset with Manual Download

import os

class ManualDownloadBuilder(GeneratorBasedBuilder):
    """Builder for datasets requiring manual download."""
    
    MANUAL_DOWNLOAD_INSTRUCTIONS = """
    Please download the dataset files manually from: https://example.com/dataset
    Extract the files and place them in: {manual_dir}
    The expected files are:
    - train.jsonl
    - test.jsonl
    - metadata.json
    """
    
    @property
    def manual_download_instructions(self) -> str:
        # The base-class property returns None by default; overriding it is
        # what lets the library surface these instructions to users
        return self.MANUAL_DOWNLOAD_INSTRUCTIONS
    
    def _split_generators(self, dl_manager):
        # dl_manager.manual_dir points to the manually downloaded files
        manual_dir = dl_manager.manual_dir
        
        return [
            SplitGenerator(
                name=Split.TRAIN,
                gen_kwargs={"filepath": os.path.join(manual_dir, "train.jsonl")}
            ),
            SplitGenerator(
                name=Split.TEST,
                gen_kwargs={"filepath": os.path.join(manual_dir, "test.jsonl")}
            ),
        ]

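Users then point the loader at their local copy. With load_dataset, the data_dir argument is what surfaces inside the builder as dl_manager.manual_dir (paths here are illustrative):

from datasets import load_dataset

# data_dir becomes dl_manager.manual_dir inside the builder
ds = load_dataset("path/to/manual_download_script.py", data_dir="/data/my_dataset")
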
Performance Optimization

import numpy as np
import orjson  # third-party JSON parser, faster than the stdlib json module

class OptimizedBuilder(GeneratorBasedBuilder):
    """Builder with performance optimizations."""
    
    # Optimize batch size for writing
    DEFAULT_WRITER_BATCH_SIZE = 10000
    
    def download_and_prepare(self, **kwargs):
        # Use multiprocessing for faster preparation
        kwargs.setdefault("num_proc", 4)
        
        # Use larger shard size for fewer files
        kwargs.setdefault("max_shard_size", "1GB")
        
        super().download_and_prepare(**kwargs)
    
    def _generate_examples(self, filepath):
        # Use efficient file reading
        with open(filepath, 'rb') as f:
            for idx, line in enumerate(f):
                # Process line efficiently
                data = orjson.loads(line)  # orjson is faster than json
                yield idx, self._process_example(data)
    
    def _process_example(self, data):
        # Efficient data processing
        return {
            "text": data["text"],
            "label": data["label"],
            "features": np.array(data["features"], dtype=np.float32),
        }

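One caveat: num_proc only takes effect when a split can actually be sharded. The library parallelizes preparation by splitting list-valued gen_kwargs across workers, so expose one entry per shardable unit, typically per file (the file names below are illustrative):

# A list-valued gen_kwargs entry lets datasets assign a subset of
# files to each of the num_proc worker processes
SplitGenerator(
    name=Split.TRAIN,
    gen_kwargs={"files": [f"part-{i:04d}.jsonl" for i in range(8)]},
)
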
Best Practices

Error Handling and Validation

import json
import logging

logger = logging.getLogger(__name__)

def _generate_examples(self, filepath):
    """Generate examples with proper error handling."""
    try:
        with open(filepath, 'r', encoding='utf-8') as f:
            for idx, line in enumerate(f):
                try:
                    data = json.loads(line.strip())
                    
                    # Validate required fields
                    if "text" not in data or "label" not in data:
                        logger.warning(f"Skipping incomplete example at line {idx}")
                        continue
                    
                    yield idx, {
                        "text": str(data["text"]),
                        "label": str(data["label"]),
                    }
                    
                except json.JSONDecodeError as e:
                    logger.warning(f"Failed to parse JSON at line {idx}: {e}")
                    continue
                    
    except FileNotFoundError:
        raise FileNotFoundError(f"Data file not found: {filepath}")

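Beyond checking field presence, the declared Features can validate examples during development: encode_example raises when an example does not match the schema. A short sketch reusing the feature names from the text classification example above:

from datasets import ClassLabel, Features, Value

features = Features({
    "text": Value("string"),
    "label": ClassLabel(names=["positive", "negative", "neutral"]),
})

# Raises if the example cannot be encoded against the declared schema
features.encode_example({"text": "great movie", "label": "positive"})
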
Testing Dataset Builders

def test_builder():
    """Test the custom dataset builder."""
    # Test builder instantiation
    builder = MyTextClassificationBuilder()
    
    # Test info generation
    info = builder._info()
    assert "text" in info.features
    assert "label" in info.features
    
    # Test dataset building
    builder.download_and_prepare()
    dataset = builder.as_dataset()
    
    # Validate dataset
    assert len(dataset["train"]) > 0
    assert all(key in dataset["train"].features for key in ["text", "label"])

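For isolation, tests can also point the builder at a throwaway cache. A pytest-style sketch using the builder's cache_dir keyword:

def test_builder_with_tmp_cache(tmp_path):
    """Build into a temporary cache so tests never touch the real one."""
    builder = MyTextClassificationBuilder(cache_dir=str(tmp_path))
    builder.download_and_prepare()
    ds = builder.as_dataset(split="train")
    assert len(ds) > 0
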
This comprehensive dataset building system provides flexible, efficient tools for creating custom datasets that integrate seamlessly with the Hugging Face datasets ecosystem, supporting features like multiprocessing, caching, streaming, and various output formats.
