CtrlK
BlogDocsLog inGet started
Tessl Logo

tessl/pypi-datasets

HuggingFace's community-driven, open-source library of machine-learning datasets, offering one-line dataloaders, efficient preprocessing, and multi-framework support

Pending
Overview
Eval results
Files

core-dataset-classes.mddocs/

Core Dataset Classes

The fundamental dataset classes that provide different access patterns and capabilities for working with dataset collections. These classes form the core of the datasets library, offering both in-memory and streaming approaches to dataset processing.

Capabilities

Dataset - Map-style Dataset

The main dataset class backed by Apache Arrow for efficient random access. Provides comprehensive data transformation, filtering, and export capabilities.

class Dataset:
    """Map-style dataset backed by Apache Arrow for efficient random access.

    Supports construction from in-memory Python objects and on-disk files,
    lazy/cached transformations (map/filter/select/sort/shuffle), column and
    type operations, output formatting, export, and split/shard utilities.
    All transformation methods return a new Dataset; the original is unchanged.
    """
    
    # Core properties
    features: Features                 # column name -> feature type schema
    info: DatasetInfo                  # metadata (description, citation, ...)
    split: Optional[NamedSplit]        # split this dataset came from, if any
    num_rows: int
    num_columns: int
    column_names: List[str]
    shape: Tuple[int, int]             # (num_rows, num_columns)
    
    # Dataset creation (class methods)
    @classmethod
    def from_pandas(
        cls,
        df: "pandas.DataFrame",
        features: Optional[Features] = None,
        info: Optional[DatasetInfo] = None,
        split: Optional[NamedSplit] = None,
        preserve_index: Optional[bool] = None,
    ) -> "Dataset": ...
    
    @classmethod
    def from_dict(
        cls,
        mapping: dict,
        features: Optional[Features] = None,
        info: Optional[DatasetInfo] = None,
        split: Optional[NamedSplit] = None,
    ) -> "Dataset": ...
    
    @classmethod
    def from_list(
        cls,
        mapping: List[dict],
        features: Optional[Features] = None,
        info: Optional[DatasetInfo] = None,
        split: Optional[NamedSplit] = None,
    ) -> "Dataset": ...
    
    # Dataset creation (static methods)
    @staticmethod
    def from_csv(
        path_or_paths: Union[PathLike, List[PathLike]],
        split: Optional[NamedSplit] = None,
        features: Optional[Features] = None,
        cache_dir: Optional[str] = None,
        keep_in_memory: bool = False,
        num_proc: Optional[int] = None,
        **kwargs,
    ) -> "Dataset": ...
    
    @staticmethod
    def from_json(
        path_or_paths: Union[PathLike, List[PathLike]],
        split: Optional[NamedSplit] = None,
        features: Optional[Features] = None,
        cache_dir: Optional[str] = None,
        keep_in_memory: bool = False,
        field: Optional[str] = None,   # key to read when the JSON file is an object, not a list
        num_proc: Optional[int] = None,
        **kwargs,
    ) -> "Dataset": ...
    
    @staticmethod
    def from_parquet(
        path_or_paths: Union[PathLike, List[PathLike]],
        split: Optional[NamedSplit] = None,
        features: Optional[Features] = None,
        cache_dir: Optional[str] = None,
        keep_in_memory: bool = False,
        columns: Optional[List[str]] = None,  # restrict loading to these columns
        num_proc: Optional[int] = None,
        **kwargs,
    ) -> "Dataset": ...
    
    @staticmethod
    def from_text(
        path_or_paths: Union[PathLike, List[PathLike]],
        split: Optional[NamedSplit] = None,
        features: Optional[Features] = None,
        cache_dir: Optional[str] = None,
        keep_in_memory: bool = False,
        num_proc: Optional[int] = None,
        **kwargs,
    ) -> "Dataset": ...
    
    @staticmethod
    def from_generator(
        generator: Callable,
        features: Optional[Features] = None,
        cache_dir: Optional[str] = None,
        keep_in_memory: bool = False,
        gen_kwargs: Optional[dict] = None,
        num_proc: Optional[int] = None,
        # A plain split name string is accepted as well as a NamedSplit,
        # so the annotation is widened to match the "train" default.
        split: Union[str, NamedSplit] = "train",
        **kwargs,
    ) -> "Dataset": ...
    
    # Data access
    def __getitem__(self, key): ...    # int -> row dict, str -> column, slice -> batch dict
    def __len__(self) -> int: ...      # number of rows
    
    # Data transformation
    def map(
        self,
        function=None,
        with_indices: bool = False,
        with_rank: bool = False,
        input_columns: Optional[Union[str, List[str]]] = None,
        batched: bool = False,
        batch_size: int = 1000,
        drop_last_batch: bool = False,
        remove_columns: Optional[Union[str, List[str]]] = None,
        keep_in_memory: bool = False,
        load_from_cache_file: Optional[bool] = None,
        cache_file_name: Optional[str] = None,
        writer_batch_size: int = 1000,
        features: Optional[Features] = None,
        disable_nullable: bool = False,
        fn_kwargs: Optional[dict] = None,
        num_proc: Optional[int] = None,
        desc: Optional[str] = None,
        **kwargs
    ) -> "Dataset": ...
    
    def filter(
        self,
        function=None,
        with_indices: bool = False,
        with_rank: bool = False,
        input_columns: Optional[Union[str, List[str]]] = None,
        batched: bool = False,
        batch_size: int = 1000,
        keep_in_memory: bool = False,
        load_from_cache_file: Optional[bool] = None,
        cache_file_name: Optional[str] = None,
        writer_batch_size: int = 1000,
        fn_kwargs: Optional[dict] = None,
        num_proc: Optional[int] = None,
        desc: Optional[str] = None,
        **kwargs
    ) -> "Dataset": ...
    
    def select(
        self,
        indices: Union[int, List[int], Iterable[int]],
        keep_in_memory: bool = False,
        indices_cache_file_name: Optional[str] = None,
        writer_batch_size: int = 1000,
        **kwargs
    ) -> "Dataset": ...
    
    def sort(
        self,
        column_names: Union[str, List[str]],
        reverse: Union[bool, List[bool]] = False,
        null_placement: str = "at_end",  # "at_start" or "at_end"
        keep_in_memory: bool = False,
        load_from_cache_file: Optional[bool] = None,
        **kwargs
    ) -> "Dataset": ...
    
    def shuffle(
        self,
        seed: Optional[int] = None,
        # Fixed: bare `Optional` is not a valid annotation; the upstream API
        # takes a NumPy random Generator here.
        generator: Optional[np.random.Generator] = None,
        keep_in_memory: bool = False,
        load_from_cache_file: Optional[bool] = None,
        **kwargs
    ) -> "Dataset": ...
    
    # Column operations
    def remove_columns(self, column_names: Union[str, List[str]], **kwargs) -> "Dataset": ...
    def rename_column(self, original_column_name: str, new_column_name: str, **kwargs) -> "Dataset": ...
    def rename_columns(self, column_mapping: Dict[str, str], **kwargs) -> "Dataset": ...
    def select_columns(self, column_names: Union[str, List[str]], **kwargs) -> "Dataset": ...
    # Fixed: the array *type* is `np.ndarray`; `np.array` is the factory function.
    def add_column(self, name: str, column: Union[list, np.ndarray], **kwargs) -> "Dataset": ...
    
    # Type casting
    def cast(self, features: Features, **kwargs) -> "Dataset": ...
    def cast_column(self, column: str, feature, **kwargs) -> "Dataset": ...
    
    # Data formatting
    def with_format(
        self,
        type: Optional[str] = None,    # e.g. "numpy", "torch", "pandas"; None for plain Python
        columns: Optional[List] = None,
        output_all_columns: bool = False,
        **format_kwargs
    ) -> "Dataset": ...               # returns a new formatted Dataset
    
    def set_format(
        self,
        type: Optional[str] = None,
        columns: Optional[List] = None,
        output_all_columns: bool = False,
        **format_kwargs
    ) -> None: ...                    # mutates this Dataset in place
    
    def reset_format(self) -> None: ...
    
    # Data export
    def to_dict(self, batch_size: Optional[int] = None, batched: bool = False) -> dict: ...
    def to_pandas(
        self, 
        batch_size: Optional[int] = None, 
        batched: bool = False
    ) -> Union["pandas.DataFrame", Iterator["pandas.DataFrame"]]: ...
    def save_to_disk(
        self,
        dataset_path: PathLike,
        max_shard_size: Optional[Union[str, int]] = None,  # e.g. "500MB" or byte count
        num_shards: Optional[int] = None,
        num_proc: Optional[int] = None,
        storage_options: Optional[dict] = None,
    ) -> None: ...
    
    # Dataset splitting
    def train_test_split(
        self,
        test_size: Optional[Union[float, int]] = None,   # fraction (float) or row count (int)
        train_size: Optional[Union[float, int]] = None,
        shuffle: bool = True,
        seed: Optional[int] = None,
        **kwargs
    ) -> "DatasetDict": ...
    
    def shard(
        self,
        num_shards: int,
        index: int,                   # which shard to return, 0 <= index < num_shards
        contiguous: bool = True,
        **kwargs
    ) -> "Dataset": ...

Usage Examples:

from datasets import Dataset

# Create dataset from dictionary (columns are keyed by name)
data = {"text": ["Hello", "World"], "label": [0, 1]}
dataset = Dataset.from_dict(data)

# Transform data
def uppercase(example):
    # Receives one row as a dict; returns the updated row.
    example["text"] = example["text"].upper()
    return example

dataset = dataset.map(uppercase)

# Filter data (keep rows whose predicate returns True)
dataset = dataset.filter(lambda x: len(x["text"]) > 3)

# Export to different formats
dataset.set_format("torch")
pandas_df = dataset.to_pandas()

DatasetDict - Multiple Dataset Container

Dictionary-like container that holds multiple Dataset objects, typically representing different splits (train, validation, test).

class DatasetDict(dict):
    """Dictionary of Dataset objects, typically for train/validation/test splits.

    Keys are split names (e.g. "train", "test"); values are Dataset objects.
    Transformation methods apply the same operation to every split and return
    a new DatasetDict.
    """
    
    # Properties — each maps split name -> the per-split value
    num_columns: Dict[str, int]
    num_rows: Dict[str, int]
    column_names: Dict[str, List[str]]
    shape: Dict[str, Tuple[int, int]]
    
    # Data transformation (applied to all splits)
    # **kwargs are forwarded to the corresponding Dataset method on each split.
    def map(self, function=None, **kwargs) -> "DatasetDict": ...
    def filter(self, function=None, **kwargs) -> "DatasetDict": ...
    def sort(self, column_names: Union[str, List[str]], **kwargs) -> "DatasetDict": ...
    def shuffle(self, **kwargs) -> "DatasetDict": ...
    
    # Column operations (applied to all splits)
    def remove_columns(self, column_names: Union[str, List[str]]) -> "DatasetDict": ...
    def rename_column(self, original_column_name: str, new_column_name: str) -> "DatasetDict": ...
    def rename_columns(self, column_mapping: Dict[str, str]) -> "DatasetDict": ...
    def select_columns(self, column_names: Union[str, List[str]]) -> "DatasetDict": ...
    
    # Type operations (applied to all splits)  
    def cast(self, features: Features) -> "DatasetDict": ...
    def cast_column(self, column: str, feature) -> "DatasetDict": ...
    
    # Formatting (applied to all splits)
    def with_format(
        self,
        type: Optional[str] = None,
        columns: Optional[List] = None,
        output_all_columns: bool = False,
        **format_kwargs
    ) -> "DatasetDict": ...           # returns a new formatted DatasetDict
    
    def set_format(
        self,
        type: Optional[str] = None,
        columns: Optional[List] = None,
        output_all_columns: bool = False,
        **format_kwargs
    ) -> None: ...                    # mutates every split in place
    
    def reset_format(self) -> None: ...
    
    # Data export
    def save_to_disk(self, dataset_dict_path: str, **kwargs) -> None: ...
    
    # Utilities
    def flatten(self, max_depth: int = 16) -> "DatasetDict": ...
    def unique(self, column: str) -> Dict[str, List]: ...   # split name -> unique values
    def cleanup_cache_files(self) -> Dict[str, int]: ...    # split name -> files removed

Usage Examples:

from datasets import DatasetDict, Dataset

# Create DatasetDict from separate datasets (keys are split names)
dataset_dict = DatasetDict({
    "train": Dataset.from_dict({"text": ["train1", "train2"], "label": [0, 1]}),
    "test": Dataset.from_dict({"text": ["test1"], "label": [0]})
})

# Apply operations to all splits at once
dataset_dict = dataset_dict.map(lambda x: {"length": len(x["text"])})
dataset_dict = dataset_dict.filter(lambda x: x["length"] > 3)

# Access individual splits by key (each is a Dataset)
train_data = dataset_dict["train"]
test_data = dataset_dict["test"]

IterableDataset - Streaming Dataset

Iterable-style dataset for streaming large datasets without loading everything into memory. Processes data on-the-fly with sequential access only.

class IterableDataset:
    """Iterable-style dataset for streaming large datasets without loading into memory.

    Data is processed lazily, one example (or batch) at a time, with sequential
    access only — there is no random access or __len__. All transformation
    methods are lazy and return a new IterableDataset.
    """
    
    # Properties — Optional because the schema may be unknown until iteration
    features: Optional[Features]
    info: DatasetInfo
    split: Optional[NamedSplit]
    num_columns: Optional[int]
    column_names: Optional[List[str]]
    
    # Iteration
    def __iter__(self): ...                                        # yields example dicts
    def iter(self, batch_size: int, drop_last_batch: bool = False): ...  # yields batch dicts
    
    # Iteration control
    def take(self, n: int) -> "IterableDataset": ...   # first n examples
    def skip(self, n: int) -> "IterableDataset": ...   # everything after the first n
    
    # Data transformation (streaming) — applied lazily while iterating
    def map(
        self,
        function=None,
        with_indices: bool = False,
        input_columns: Optional[Union[str, List[str]]] = None,
        batched: bool = False,
        batch_size: int = 1000,
        drop_last_batch: bool = False,
        remove_columns: Optional[Union[str, List[str]]] = None,
        features: Optional[Features] = None,
        fn_kwargs: Optional[dict] = None,
    ) -> "IterableDataset": ...
    
    def filter(
        self,
        function=None,
        with_indices: bool = False,
        input_columns: Optional[Union[str, List[str]]] = None,
        batched: bool = False,
        batch_size: int = 1000,
        fn_kwargs: Optional[dict] = None,
    ) -> "IterableDataset": ...
    
    def shuffle(
        self,
        seed: Optional[int] = None,
        # Fixed: bare `Optional` is not a valid annotation; the upstream API
        # takes a NumPy random Generator here.
        generator: Optional[np.random.Generator] = None,
        buffer_size: int = 1000,     # approximate shuffling via a fixed-size buffer
    ) -> "IterableDataset": ...
    
    # Column operations (streaming)
    def remove_columns(self, column_names: Union[str, List[str]]) -> "IterableDataset": ...
    def rename_column(self, original_column_name: str, new_column_name: str) -> "IterableDataset": ...
    def rename_columns(self, column_mapping: Dict[str, str]) -> "IterableDataset": ...
    def select_columns(self, column_names: Union[str, List[str]]) -> "IterableDataset": ...
    
    # Type operations (streaming)
    def cast(self, features: Features) -> "IterableDataset": ...
    def cast_column(self, column: str, feature) -> "IterableDataset": ...
    
    # Formatting (streaming)
    def with_format(self, type: Optional[str] = None) -> "IterableDataset": ...

Usage Examples:

from datasets import load_dataset

# Create streaming dataset (streaming=True avoids downloading the full dataset)
streaming_dataset = load_dataset("oscar", "unshuffled_deduplicated_en", streaming=True)

# Take first 1000 examples (lazy — nothing is fetched yet)
small_dataset = streaming_dataset.take(1000)

# Apply transformations on-the-fly
def preprocess(example):
    # Adds a derived column to each streamed example.
    example["length"] = len(example["text"])
    return example

processed = small_dataset.map(preprocess)

# Iterate through examples (data is fetched and transformed here)
for example in processed:
    print(example["length"])
    break

IterableDatasetDict - Streaming Dataset Container

Dictionary-like container for multiple IterableDataset objects representing different splits for streaming workflows.

class IterableDatasetDict(dict):
    """Dictionary of IterableDataset objects for streaming workflows.

    Keys are split names; values are IterableDataset objects. Transformation
    methods apply lazily to every split and return a new IterableDatasetDict.
    """
    
    # Properties — Optional because the schema may be unknown until iteration
    num_columns: Optional[Dict[str, int]]
    column_names: Optional[Dict[str, List[str]]]
    
    # Data transformation (applied to all streaming splits)
    # **kwargs are forwarded to the corresponding IterableDataset method.
    def map(self, function=None, **kwargs) -> "IterableDatasetDict": ...
    def filter(self, function=None, **kwargs) -> "IterableDatasetDict": ...
    def shuffle(self, **kwargs) -> "IterableDatasetDict": ...
    
    # Column operations (applied to all streaming splits)
    def remove_columns(self, column_names: Union[str, List[str]]) -> "IterableDatasetDict": ...
    def rename_column(self, original_column_name: str, new_column_name: str) -> "IterableDatasetDict": ...
    def rename_columns(self, column_mapping: Dict[str, str]) -> "IterableDatasetDict": ...
    def select_columns(self, column_names: Union[str, List[str]]) -> "IterableDatasetDict": ...
    
    # Type operations (applied to all streaming splits)
    def cast(self, features: Features) -> "IterableDatasetDict": ...
    def cast_column(self, column: str, feature) -> "IterableDatasetDict": ...
    
    # Formatting (applied to all streaming splits)  
    def with_format(self, type: Optional[str] = None) -> "IterableDatasetDict": ...

Usage Examples:

# Load streaming dataset with multiple splits (returns an IterableDatasetDict)
streaming_dict = load_dataset("squad", streaming=True)

# Apply operations to all streaming splits (lazy)
streaming_dict = streaming_dict.map(lambda x: {"question_length": len(x["question"])})

# Access individual streaming splits by key (each is an IterableDataset)
train_stream = streaming_dict["train"]
validation_stream = streaming_dict["validation"]

# Take samples from each split (data is fetched here)
for example in train_stream.take(5):
    print(f"Question length: {example['question_length']}")
    break

Types

Path Types

from os import PathLike

Column Types

class Column:
    """Iterable for accessing specific columns of a dataset."""
    
    # table: the underlying (Arrow-backed) data; info: optional dataset metadata
    def __init__(self, table, info: Optional[DatasetInfo] = None): ...
    def __iter__(self): ...            # yields the column's values in row order
    def __len__(self) -> int: ...      # number of values (rows)

class IterableColumn:
    """Iterable column access for IterableDataset."""
    
    # dataset: the source IterableDataset; key: the column name to extract
    def __init__(self, dataset, key: str): ...
    def __iter__(self): ...            # lazily yields the column's values

Performance Considerations

  • Dataset/DatasetDict: Best for smaller datasets that fit in memory, supports random access and complex operations
  • IterableDataset/IterableDatasetDict: Best for large datasets, memory efficient streaming, sequential access only
  • Caching: Dataset operations are cached by default for reproducibility
  • Multiprocessing: Many operations support num_proc parameter for parallel processing
  • Apache Arrow: Underlying storage format provides efficient columnar operations

Install with Tessl CLI

npx tessl i tessl/pypi-datasets

docs

core-dataset-classes.md

data-loading.md

dataset-building.md

dataset-information.md

dataset-operations.md

features-and-types.md

index.md

tile.json