CtrlK
BlogDocsLog inGet started
Tessl Logo

tessl/pypi-datasets

HuggingFace's community-driven, open-source library of machine-learning datasets, offering one-line dataloaders, efficient preprocessing, and multi-framework support

Pending
Overview
Eval results
Files

core-dataset-classes.mddocs/

Core Dataset Classes

The fundamental dataset classes that provide different access patterns and capabilities for working with dataset collections. These classes form the core of the datasets library, offering both in-memory and streaming approaches to dataset processing.

Capabilities

Dataset - Map-style Dataset

The main dataset class backed by Apache Arrow for efficient random access. Provides comprehensive data transformation, filtering, and export capabilities.

class Dataset:
    """Map-style dataset backed by Apache Arrow for efficient random access.

    Supports construction from in-memory Python objects and on-disk files,
    lazy/cached transformations (map/filter/select/sort/shuffle), column and
    type operations, output formatting, export, and split/shard utilities.
    All transformation methods return a new Dataset; the original is unchanged.
    """
    
    # Core properties
    features: Features                 # column name -> feature type schema
    info: DatasetInfo                  # metadata (description, citation, ...)
    split: Optional[NamedSplit]        # split this dataset came from, if any
    num_rows: int
    num_columns: int
    column_names: List[str]
    shape: Tuple[int, int]             # (num_rows, num_columns)
    
    # Dataset creation (class methods)
    @classmethod
    def from_pandas(
        cls,
        df: "pandas.DataFrame",
        features: Optional[Features] = None,
        info: Optional[DatasetInfo] = None,
        split: Optional[NamedSplit] = None,
        preserve_index: Optional[bool] = None,
    ) -> "Dataset": ...
    
    @classmethod
    def from_dict(
        cls,
        mapping: dict,
        features: Optional[Features] = None,
        info: Optional[DatasetInfo] = None,
        split: Optional[NamedSplit] = None,
    ) -> "Dataset": ...
    
    @classmethod
    def from_list(
        cls,
        mapping: List[dict],
        features: Optional[Features] = None,
        info: Optional[DatasetInfo] = None,
        split: Optional[NamedSplit] = None,
    ) -> "Dataset": ...
    
    # Dataset creation (static methods)
    @staticmethod
    def from_csv(
        path_or_paths: Union[PathLike, List[PathLike]],
        split: Optional[NamedSplit] = None,
        features: Optional[Features] = None,
        cache_dir: Optional[str] = None,
        keep_in_memory: bool = False,
        num_proc: Optional[int] = None,
        **kwargs,
    ) -> "Dataset": ...
    
    @staticmethod
    def from_json(
        path_or_paths: Union[PathLike, List[PathLike]],
        split: Optional[NamedSplit] = None,
        features: Optional[Features] = None,
        cache_dir: Optional[str] = None,
        keep_in_memory: bool = False,
        field: Optional[str] = None,   # key to read when the JSON file is an object, not a list
        num_proc: Optional[int] = None,
        **kwargs,
    ) -> "Dataset": ...
    
    @staticmethod
    def from_parquet(
        path_or_paths: Union[PathLike, List[PathLike]],
        split: Optional[NamedSplit] = None,
        features: Optional[Features] = None,
        cache_dir: Optional[str] = None,
        keep_in_memory: bool = False,
        columns: Optional[List[str]] = None,  # restrict loading to these columns
        num_proc: Optional[int] = None,
        **kwargs,
    ) -> "Dataset": ...
    
    @staticmethod
    def from_text(
        path_or_paths: Union[PathLike, List[PathLike]],
        split: Optional[NamedSplit] = None,
        features: Optional[Features] = None,
        cache_dir: Optional[str] = None,
        keep_in_memory: bool = False,
        num_proc: Optional[int] = None,
        **kwargs,
    ) -> "Dataset": ...
    
    @staticmethod
    def from_generator(
        generator: Callable,
        features: Optional[Features] = None,
        cache_dir: Optional[str] = None,
        keep_in_memory: bool = False,
        gen_kwargs: Optional[dict] = None,
        num_proc: Optional[int] = None,
        # A plain split name string is accepted as well as a NamedSplit,
        # so the annotation is widened to match the "train" default.
        split: Union[str, NamedSplit] = "train",
        **kwargs,
    ) -> "Dataset": ...
    
    # Data access
    def __getitem__(self, key): ...    # int -> row dict, str -> column, slice -> batch dict
    def __len__(self) -> int: ...      # number of rows
    
    # Data transformation
    def map(
        self,
        function=None,
        with_indices: bool = False,
        with_rank: bool = False,
        input_columns: Optional[Union[str, List[str]]] = None,
        batched: bool = False,
        batch_size: int = 1000,
        drop_last_batch: bool = False,
        remove_columns: Optional[Union[str, List[str]]] = None,
        keep_in_memory: bool = False,
        load_from_cache_file: Optional[bool] = None,
        cache_file_name: Optional[str] = None,
        writer_batch_size: int = 1000,
        features: Optional[Features] = None,
        disable_nullable: bool = False,
        fn_kwargs: Optional[dict] = None,
        num_proc: Optional[int] = None,
        desc: Optional[str] = None,
        **kwargs
    ) -> "Dataset": ...
    
    def filter(
        self,
        function=None,
        with_indices: bool = False,
        with_rank: bool = False,
        input_columns: Optional[Union[str, List[str]]] = None,
        batched: bool = False,
        batch_size: int = 1000,
        keep_in_memory: bool = False,
        load_from_cache_file: Optional[bool] = None,
        cache_file_name: Optional[str] = None,
        writer_batch_size: int = 1000,
        fn_kwargs: Optional[dict] = None,
        num_proc: Optional[int] = None,
        desc: Optional[str] = None,
        **kwargs
    ) -> "Dataset": ...
    
    def select(
        self,
        indices: Union[int, List[int], Iterable[int]],
        keep_in_memory: bool = False,
        indices_cache_file_name: Optional[str] = None,
        writer_batch_size: int = 1000,
        **kwargs
    ) -> "Dataset": ...
    
    def sort(
        self,
        column_names: Union[str, List[str]],
        reverse: Union[bool, List[bool]] = False,
        null_placement: str = "at_end",  # "at_start" or "at_end"
        keep_in_memory: bool = False,
        load_from_cache_file: Optional[bool] = None,
        **kwargs
    ) -> "Dataset": ...
    
    def shuffle(
        self,
        seed: Optional[int] = None,
        # Fixed: bare `Optional` is not a valid annotation; the upstream API
        # takes a NumPy random Generator here.
        generator: Optional[np.random.Generator] = None,
        keep_in_memory: bool = False,
        load_from_cache_file: Optional[bool] = None,
        **kwargs
    ) -> "Dataset": ...
    
    # Column operations
    def remove_columns(self, column_names: Union[str, List[str]], **kwargs) -> "Dataset": ...
    def rename_column(self, original_column_name: str, new_column_name: str, **kwargs) -> "Dataset": ...
    def rename_columns(self, column_mapping: Dict[str, str], **kwargs) -> "Dataset": ...
    def select_columns(self, column_names: Union[str, List[str]], **kwargs) -> "Dataset": ...
    # Fixed: the array *type* is `np.ndarray`; `np.array` is the factory function.
    def add_column(self, name: str, column: Union[list, np.ndarray], **kwargs) -> "Dataset": ...
    
    # Type casting
    def cast(self, features: Features, **kwargs) -> "Dataset": ...
    def cast_column(self, column: str, feature, **kwargs) -> "Dataset": ...
    
    # Data formatting
    def with_format(
        self,
        type: Optional[str] = None,    # e.g. "numpy", "torch", "pandas"; None for plain Python
        columns: Optional[List] = None,
        output_all_columns: bool = False,
        **format_kwargs
    ) -> "Dataset": ...               # returns a new formatted Dataset
    
    def set_format(
        self,
        type: Optional[str] = None,
        columns: Optional[List] = None,
        output_all_columns: bool = False,
        **format_kwargs
    ) -> None: ...                    # mutates this Dataset in place
    
    def reset_format(self) -> None: ...
    
    # Data export
    def to_dict(self, batch_size: Optional[int] = None, batched: bool = False) -> dict: ...
    def to_pandas(
        self, 
        batch_size: Optional[int] = None, 
        batched: bool = False
    ) -> Union["pandas.DataFrame", Iterator["pandas.DataFrame"]]: ...
    def save_to_disk(
        self,
        dataset_path: PathLike,
        max_shard_size: Optional[Union[str, int]] = None,  # e.g. "500MB" or byte count
        num_shards: Optional[int] = None,
        num_proc: Optional[int] = None,
        storage_options: Optional[dict] = None,
    ) -> None: ...
    
    # Dataset splitting
    def train_test_split(
        self,
        test_size: Optional[Union[float, int]] = None,   # fraction (float) or row count (int)
        train_size: Optional[Union[float, int]] = None,
        shuffle: bool = True,
        seed: Optional[int] = None,
        **kwargs
    ) -> "DatasetDict": ...
    
    def shard(
        self,
        num_shards: int,
        index: int,                   # which shard to return, 0 <= index < num_shards
        contiguous: bool = True,
        **kwargs
    ) -> "Dataset": ...

Usage Examples:

from datasets import Dataset

# Create dataset from dictionary (columns are keyed by name)
data = {"text": ["Hello", "World"], "label": [0, 1]}
dataset = Dataset.from_dict(data)

# Transform data
def uppercase(example):
    # Receives one row as a dict; returns the updated row.
    example["text"] = example["text"].upper()
    return example

dataset = dataset.map(uppercase)

# Filter data (keep rows whose predicate returns True)
dataset = dataset.filter(lambda x: len(x["text"]) > 3)

# Export to different formats
dataset.set_format("torch")
pandas_df = dataset.to_pandas()

DatasetDict - Multiple Dataset Container

Dictionary-like container that holds multiple Dataset objects, typically representing different splits (train, validation, test).

class DatasetDict(dict):
    """Dictionary of Dataset objects, typically for train/validation/test splits.

    Keys are split names (e.g. "train", "test"); values are Dataset objects.
    Transformation methods apply the same operation to every split and return
    a new DatasetDict.
    """
    
    # Properties — each maps split name -> the per-split value
    num_columns: Dict[str, int]
    num_rows: Dict[str, int]
    column_names: Dict[str, List[str]]
    shape: Dict[str, Tuple[int, int]]
    
    # Data transformation (applied to all splits)
    # **kwargs are forwarded to the corresponding Dataset method on each split.
    def map(self, function=None, **kwargs) -> "DatasetDict": ...
    def filter(self, function=None, **kwargs) -> "DatasetDict": ...
    def sort(self, column_names: Union[str, List[str]], **kwargs) -> "DatasetDict": ...
    def shuffle(self, **kwargs) -> "DatasetDict": ...
    
    # Column operations (applied to all splits)
    def remove_columns(self, column_names: Union[str, List[str]]) -> "DatasetDict": ...
    def rename_column(self, original_column_name: str, new_column_name: str) -> "DatasetDict": ...
    def rename_columns(self, column_mapping: Dict[str, str]) -> "DatasetDict": ...
    def select_columns(self, column_names: Union[str, List[str]]) -> "DatasetDict": ...
    
    # Type operations (applied to all splits)  
    def cast(self, features: Features) -> "DatasetDict": ...
    def cast_column(self, column: str, feature) -> "DatasetDict": ...
    
    # Formatting (applied to all splits)
    def with_format(
        self,
        type: Optional[str] = None,
        columns: Optional[List] = None,
        output_all_columns: bool = False,
        **format_kwargs
    ) -> "DatasetDict": ...           # returns a new formatted DatasetDict
    
    def set_format(
        self,
        type: Optional[str] = None,
        columns: Optional[List] = None,
        output_all_columns: bool = False,
        **format_kwargs
    ) -> None: ...                    # mutates every split in place
    
    def reset_format(self) -> None: ...
    
    # Data export
    def save_to_disk(self, dataset_dict_path: str, **kwargs) -> None: ...
    
    # Utilities
    def flatten(self, max_depth: int = 16) -> "DatasetDict": ...
    def unique(self, column: str) -> Dict[str, List]: ...   # split name -> unique values
    def cleanup_cache_files(self) -> Dict[str, int]: ...    # split name -> files removed

Usage Examples:

from datasets import DatasetDict, Dataset

# Create DatasetDict from separate datasets (keys are split names)
dataset_dict = DatasetDict({
    "train": Dataset.from_dict({"text": ["train1", "train2"], "label": [0, 1]}),
    "test": Dataset.from_dict({"text": ["test1"], "label": [0]})
})

# Apply operations to all splits at once
dataset_dict = dataset_dict.map(lambda x: {"length": len(x["text"])})
dataset_dict = dataset_dict.filter(lambda x: x["length"] > 3)

# Access individual splits by key (each is a Dataset)
train_data = dataset_dict["train"]
test_data = dataset_dict["test"]

IterableDataset - Streaming Dataset

Iterable-style dataset for streaming large datasets without loading everything into memory. Processes data on-the-fly with sequential access only.

class IterableDataset:
    """Iterable-style dataset for streaming large datasets without loading into memory.

    Data is processed lazily, one example (or batch) at a time, with sequential
    access only — there is no random access or __len__. All transformation
    methods are lazy and return a new IterableDataset.
    """
    
    # Properties — Optional because the schema may be unknown until iteration
    features: Optional[Features]
    info: DatasetInfo
    split: Optional[NamedSplit]
    num_columns: Optional[int]
    column_names: Optional[List[str]]
    
    # Iteration
    def __iter__(self): ...                                        # yields example dicts
    def iter(self, batch_size: int, drop_last_batch: bool = False): ...  # yields batch dicts
    
    # Iteration control
    def take(self, n: int) -> "IterableDataset": ...   # first n examples
    def skip(self, n: int) -> "IterableDataset": ...   # everything after the first n
    
    # Data transformation (streaming) — applied lazily while iterating
    def map(
        self,
        function=None,
        with_indices: bool = False,
        input_columns: Optional[Union[str, List[str]]] = None,
        batched: bool = False,
        batch_size: int = 1000,
        drop_last_batch: bool = False,
        remove_columns: Optional[Union[str, List[str]]] = None,
        features: Optional[Features] = None,
        fn_kwargs: Optional[dict] = None,
    ) -> "IterableDataset": ...
    
    def filter(
        self,
        function=None,
        with_indices: bool = False,
        input_columns: Optional[Union[str, List[str]]] = None,
        batched: bool = False,
        batch_size: int = 1000,
        fn_kwargs: Optional[dict] = None,
    ) -> "IterableDataset": ...
    
    def shuffle(
        self,
        seed: Optional[int] = None,
        # Fixed: bare `Optional` is not a valid annotation; the upstream API
        # takes a NumPy random Generator here.
        generator: Optional[np.random.Generator] = None,
        buffer_size: int = 1000,     # approximate shuffling via a fixed-size buffer
    ) -> "IterableDataset": ...
    
    # Column operations (streaming)
    def remove_columns(self, column_names: Union[str, List[str]]) -> "IterableDataset": ...
    def rename_column(self, original_column_name: str, new_column_name: str) -> "IterableDataset": ...
    def rename_columns(self, column_mapping: Dict[str, str]) -> "IterableDataset": ...
    def select_columns(self, column_names: Union[str, List[str]]) -> "IterableDataset": ...
    
    # Type operations (streaming)
    def cast(self, features: Features) -> "IterableDataset": ...
    def cast_column(self, column: str, feature) -> "IterableDataset": ...
    
    # Formatting (streaming)
    def with_format(self, type: Optional[str] = None) -> "IterableDataset": ...

Usage Examples:

from datasets import load_dataset

# Create streaming dataset (streaming=True avoids downloading the full dataset)
streaming_dataset = load_dataset("oscar", "unshuffled_deduplicated_en", streaming=True)

# Take first 1000 examples (lazy — nothing is fetched yet)
small_dataset = streaming_dataset.take(1000)

# Apply transformations on-the-fly
def preprocess(example):
    # Adds a derived column to each streamed example.
    example["length"] = len(example["text"])
    return example

processed = small_dataset.map(preprocess)

# Iterate through examples (data is fetched and transformed here)
for example in processed:
    print(example["length"])
    break

IterableDatasetDict - Streaming Dataset Container

Dictionary-like container for multiple IterableDataset objects representing different splits for streaming workflows.

class IterableDatasetDict(dict):
    """Dictionary of IterableDataset objects for streaming workflows.

    Keys are split names; values are IterableDataset objects. Transformation
    methods apply lazily to every split and return a new IterableDatasetDict.
    """
    
    # Properties — Optional because the schema may be unknown until iteration
    num_columns: Optional[Dict[str, int]]
    column_names: Optional[Dict[str, List[str]]]
    
    # Data transformation (applied to all streaming splits)
    # **kwargs are forwarded to the corresponding IterableDataset method.
    def map(self, function=None, **kwargs) -> "IterableDatasetDict": ...
    def filter(self, function=None, **kwargs) -> "IterableDatasetDict": ...
    def shuffle(self, **kwargs) -> "IterableDatasetDict": ...
    
    # Column operations (applied to all streaming splits)
    def remove_columns(self, column_names: Union[str, List[str]]) -> "IterableDatasetDict": ...
    def rename_column(self, original_column_name: str, new_column_name: str) -> "IterableDatasetDict": ...
    def rename_columns(self, column_mapping: Dict[str, str]) -> "IterableDatasetDict": ...
    def select_columns(self, column_names: Union[str, List[str]]) -> "IterableDatasetDict": ...
    
    # Type operations (applied to all streaming splits)
    def cast(self, features: Features) -> "IterableDatasetDict": ...
    def cast_column(self, column: str, feature) -> "IterableDatasetDict": ...
    
    # Formatting (applied to all streaming splits)  
    def with_format(self, type: Optional[str] = None) -> "IterableDatasetDict": ...

Usage Examples:

# Load streaming dataset with multiple splits (returns an IterableDatasetDict)
streaming_dict = load_dataset("squad", streaming=True)

# Apply operations to all streaming splits (lazy)
streaming_dict = streaming_dict.map(lambda x: {"question_length": len(x["question"])})

# Access individual streaming splits by key (each is an IterableDataset)
train_stream = streaming_dict["train"]
validation_stream = streaming_dict["validation"]

# Take samples from each split (data is fetched here)
for example in train_stream.take(5):
    print(f"Question length: {example['question_length']}")
    break

Types

Path Types

from os import PathLike

Column Types

class Column:
    """Iterable for accessing specific columns of a dataset."""
    
    # table: the underlying (Arrow-backed) data; info: optional dataset metadata
    def __init__(self, table, info: Optional[DatasetInfo] = None): ...
    def __iter__(self): ...            # yields the column's values in row order
    def __len__(self) -> int: ...      # number of values (rows)

class IterableColumn:
    """Iterable column access for IterableDataset."""
    
    # dataset: the source IterableDataset; key: the column name to extract
    def __init__(self, dataset, key: str): ...
    def __iter__(self): ...            # lazily yields the column's values

Performance Considerations

  • Dataset/DatasetDict: Best for smaller datasets that fit in memory, supports random access and complex operations
  • IterableDataset/IterableDatasetDict: Best for large datasets, memory efficient streaming, sequential access only
  • Caching: Dataset operations are cached by default for reproducibility
  • Multiprocessing: Many operations support num_proc parameter for parallel processing
  • Apache Arrow: Underlying storage format provides efficient columnar operations

Install with Tessl CLI

npx tessl i tessl/pypi-datasets

docs

core-dataset-classes.md

data-loading.md

dataset-building.md

dataset-information.md

dataset-operations.md

features-and-types.md

index.md

tile.json