Hugging Face's community-driven, open-source library of datasets for machine learning, with one-line dataloaders, efficient preprocessing, and multi-framework support
—
The fundamental dataset classes that provide different access patterns and capabilities for working with dataset collections. These classes form the core of the datasets library, offering both in-memory and streaming approaches to dataset processing.
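As a quick orientation, here is a minimal sketch contrasting the two access patterns; the streaming line assumes network access to the Hugging Face Hub (both APIs are detailed below):
from datasets import Dataset, load_dataset
# Map-style: materialized in Arrow, supports random access by index
ds = Dataset.from_dict({"text": ["a", "b", "c"]})
print(ds[0])  # {'text': 'a'}
# Iterable-style: examples are streamed on demand, sequential access only
stream = load_dataset("squad", split="train", streaming=True)
print(next(iter(stream)))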
The main dataset class backed by Apache Arrow for efficient random access. Provides comprehensive data transformation, filtering, and export capabilities.
class Dataset:
"""Map-style dataset backed by Apache Arrow for efficient random access."""
# Core properties
features: Features
info: DatasetInfo
split: Optional[NamedSplit]
num_rows: int
num_columns: int
column_names: List[str]
shape: Tuple[int, int]
# Dataset creation (class methods)
@classmethod
def from_pandas(
cls,
df: "pandas.DataFrame",
features: Optional[Features] = None,
info: Optional[DatasetInfo] = None,
split: Optional[NamedSplit] = None,
preserve_index: Optional[bool] = None,
) -> "Dataset": ...
@classmethod
def from_dict(
cls,
mapping: dict,
features: Optional[Features] = None,
info: Optional[DatasetInfo] = None,
split: Optional[NamedSplit] = None,
) -> "Dataset": ...
@classmethod
def from_list(
cls,
mapping: List[dict],
features: Optional[Features] = None,
info: Optional[DatasetInfo] = None,
split: Optional[NamedSplit] = None,
) -> "Dataset": ...
# Dataset creation (static methods)
@staticmethod
def from_csv(
path_or_paths: Union[PathLike, List[PathLike]],
split: Optional[NamedSplit] = None,
features: Optional[Features] = None,
cache_dir: Optional[str] = None,
keep_in_memory: bool = False,
num_proc: Optional[int] = None,
**kwargs,
) -> "Dataset": ...
@staticmethod
def from_json(
path_or_paths: Union[PathLike, List[PathLike]],
split: Optional[NamedSplit] = None,
features: Optional[Features] = None,
cache_dir: Optional[str] = None,
keep_in_memory: bool = False,
field: Optional[str] = None,
num_proc: Optional[int] = None,
**kwargs,
) -> "Dataset": ...
@staticmethod
def from_parquet(
path_or_paths: Union[PathLike, List[PathLike]],
split: Optional[NamedSplit] = None,
features: Optional[Features] = None,
cache_dir: Optional[str] = None,
keep_in_memory: bool = False,
columns: Optional[List[str]] = None,
num_proc: Optional[int] = None,
**kwargs,
) -> "Dataset": ...
@staticmethod
def from_text(
path_or_paths: Union[PathLike, List[PathLike]],
split: Optional[NamedSplit] = None,
features: Optional[Features] = None,
cache_dir: Optional[str] = None,
keep_in_memory: bool = False,
num_proc: Optional[int] = None,
**kwargs,
) -> "Dataset": ...
@staticmethod
def from_generator(
generator: Callable,
features: Optional[Features] = None,
cache_dir: Optional[str] = None,
keep_in_memory: bool = False,
gen_kwargs: Optional[dict] = None,
num_proc: Optional[int] = None,
split: NamedSplit = "train",
**kwargs,
) -> "Dataset": ...
# Data access
def __getitem__(self, key): ...
def __len__(self) -> int: ...
# Data transformation
def map(
self,
function=None,
with_indices: bool = False,
with_rank: bool = False,
input_columns: Optional[Union[str, List[str]]] = None,
batched: bool = False,
batch_size: int = 1000,
drop_last_batch: bool = False,
remove_columns: Optional[Union[str, List[str]]] = None,
keep_in_memory: bool = False,
load_from_cache_file: Optional[bool] = None,
cache_file_name: Optional[str] = None,
writer_batch_size: int = 1000,
features: Optional[Features] = None,
disable_nullable: bool = False,
fn_kwargs: Optional[dict] = None,
num_proc: Optional[int] = None,
desc: Optional[str] = None,
**kwargs
) -> "Dataset": ...
def filter(
self,
function=None,
with_indices: bool = False,
with_rank: bool = False,
input_columns: Optional[Union[str, List[str]]] = None,
batched: bool = False,
batch_size: int = 1000,
keep_in_memory: bool = False,
load_from_cache_file: Optional[bool] = None,
cache_file_name: Optional[str] = None,
writer_batch_size: int = 1000,
fn_kwargs: Optional[dict] = None,
num_proc: Optional[int] = None,
desc: Optional[str] = None,
**kwargs
) -> "Dataset": ...
def select(
self,
indices: Union[int, List[int], Iterable[int]],
keep_in_memory: bool = False,
indices_cache_file_name: Optional[str] = None,
writer_batch_size: int = 1000,
**kwargs
) -> "Dataset": ...
def sort(
self,
column_names: Union[str, List[str]],
reverse: Union[bool, List[bool]] = False,
null_placement: str = "at_end",
keep_in_memory: bool = False,
load_from_cache_file: Optional[bool] = None,
**kwargs
) -> "Dataset": ...
def shuffle(
self,
seed: Optional[int] = None,
generator: Optional[np.random.Generator] = None,
keep_in_memory: bool = False,
load_from_cache_file: Optional[bool] = None,
**kwargs
) -> "Dataset": ...
# Column operations
def remove_columns(self, column_names: Union[str, List[str]], **kwargs) -> "Dataset": ...
def rename_column(self, original_column_name: str, new_column_name: str, **kwargs) -> "Dataset": ...
def rename_columns(self, column_mapping: Dict[str, str], **kwargs) -> "Dataset": ...
def select_columns(self, column_names: Union[str, List[str]], **kwargs) -> "Dataset": ...
def add_column(self, name: str, column: Union[list, np.ndarray], **kwargs) -> "Dataset": ...
# Type casting
def cast(self, features: Features, **kwargs) -> "Dataset": ...
def cast_column(self, column: str, feature, **kwargs) -> "Dataset": ...
# Data formatting
def with_format(
self,
type: Optional[str] = None,
columns: Optional[List] = None,
output_all_columns: bool = False,
**format_kwargs
) -> "Dataset": ...
def set_format(
self,
type: Optional[str] = None,
columns: Optional[List] = None,
output_all_columns: bool = False,
**format_kwargs
) -> None: ...
def reset_format(self) -> None: ...
# Data export
def to_dict(self, batch_size: Optional[int] = None, batched: bool = False) -> dict: ...
def to_pandas(
self,
batch_size: Optional[int] = None,
batched: bool = False
) -> Union["pandas.DataFrame", Iterator["pandas.DataFrame"]]: ...
def save_to_disk(
self,
dataset_path: PathLike,
max_shard_size: Optional[Union[str, int]] = None,
num_shards: Optional[int] = None,
num_proc: Optional[int] = None,
storage_options: Optional[dict] = None,
) -> None: ...
# Dataset splitting
def train_test_split(
self,
test_size: Optional[Union[float, int]] = None,
train_size: Optional[Union[float, int]] = None,
shuffle: bool = True,
seed: Optional[int] = None,
**kwargs
) -> "DatasetDict": ...
def shard(
self,
num_shards: int,
index: int,
contiguous: bool = True,
**kwargs
) -> "Dataset": ...Usage Examples:
from datasets import Dataset
# Create dataset from dictionary
data = {"text": ["Hello", "World"], "label": [0, 1]}
dataset = Dataset.from_dict(data)
# Transform data
def uppercase(example):
    example["text"] = example["text"].upper()
    return example
dataset = dataset.map(uppercase)
# Filter data
dataset = dataset.filter(lambda x: len(x["text"]) > 3)
# Export to different formats
dataset.set_format("torch")
pandas_df = dataset.to_pandas()
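The file loaders and split utilities above combine naturally; a minimal sketch, assuming a local data.csv with text and label columns (the file name is hypothetical):
from datasets import Dataset
# Load a CSV file into an Arrow-backed dataset (hypothetical file)
dataset = Dataset.from_csv("data.csv")
# Split into train/test; returns a DatasetDict (described below)
splits = dataset.train_test_split(test_size=0.2, seed=42)
# Persist the train split as Arrow files on disk
splits["train"].save_to_disk("train_dataset")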
Dictionary-like container that holds multiple Dataset objects, typically representing different splits (train, validation, test).
class DatasetDict(dict):
"""Dictionary of Dataset objects, typically for train/validation/test splits."""
# Properties
num_columns: Dict[str, int]
num_rows: Dict[str, int]
column_names: Dict[str, List[str]]
shape: Dict[str, Tuple[int, int]]
# Data transformation (applied to all splits)
def map(self, function=None, **kwargs) -> "DatasetDict": ...
def filter(self, function=None, **kwargs) -> "DatasetDict": ...
def sort(self, column_names: Union[str, List[str]], **kwargs) -> "DatasetDict": ...
def shuffle(self, **kwargs) -> "DatasetDict": ...
# Column operations (applied to all splits)
def remove_columns(self, column_names: Union[str, List[str]]) -> "DatasetDict": ...
def rename_column(self, original_column_name: str, new_column_name: str) -> "DatasetDict": ...
def rename_columns(self, column_mapping: Dict[str, str]) -> "DatasetDict": ...
def select_columns(self, column_names: Union[str, List[str]]) -> "DatasetDict": ...
# Type operations (applied to all splits)
def cast(self, features: Features) -> "DatasetDict": ...
def cast_column(self, column: str, feature) -> "DatasetDict": ...
# Formatting (applied to all splits)
def with_format(
self,
type: Optional[str] = None,
columns: Optional[List] = None,
output_all_columns: bool = False,
**format_kwargs
) -> "DatasetDict": ...
def set_format(
self,
type: Optional[str] = None,
columns: Optional[List] = None,
output_all_columns: bool = False,
**format_kwargs
) -> None: ...
def reset_format(self) -> None: ...
# Data export
def save_to_disk(self, dataset_dict_path: str, **kwargs) -> None: ...
# Utilities
def flatten(self, max_depth: int = 16) -> "DatasetDict": ...
def unique(self, column: str) -> Dict[str, List]: ...
def cleanup_cache_files(self) -> Dict[str, int]: ...
Usage Examples:
from datasets import DatasetDict, Dataset
# Create DatasetDict from separate datasets
dataset_dict = DatasetDict({
    "train": Dataset.from_dict({"text": ["train1", "train2"], "label": [0, 1]}),
    "test": Dataset.from_dict({"text": ["test1"], "label": [0]}),
})
# Apply operations to all splits
dataset_dict = dataset_dict.map(lambda x: {"length": len(x["text"])})
dataset_dict = dataset_dict.filter(lambda x: x["length"] > 3)
# Access individual splits
train_data = dataset_dict["train"]
test_data = dataset_dict["test"]Iterable-style dataset for streaming large datasets without loading everything into memory. Processes data on-the-fly with sequential access only.
Iterable-style dataset for streaming large datasets without loading everything into memory. Processes data on-the-fly with sequential access only.
class IterableDataset:
"""Iterable-style dataset for streaming large datasets without loading into memory."""
# Properties
features: Optional[Features]
info: DatasetInfo
split: Optional[NamedSplit]
num_columns: Optional[int]
column_names: Optional[List[str]]
# Iteration
def __iter__(self): ...
def iter(self, batch_size: int, drop_last_batch: bool = False): ...
# Iteration control
def take(self, n: int) -> "IterableDataset": ...
def skip(self, n: int) -> "IterableDataset": ...
# Data transformation (streaming)
def map(
self,
function=None,
with_indices: bool = False,
input_columns: Optional[Union[str, List[str]]] = None,
batched: bool = False,
batch_size: int = 1000,
drop_last_batch: bool = False,
remove_columns: Optional[Union[str, List[str]]] = None,
features: Optional[Features] = None,
fn_kwargs: Optional[dict] = None,
) -> "IterableDataset": ...
def filter(
self,
function=None,
with_indices: bool = False,
input_columns: Optional[Union[str, List[str]]] = None,
batched: bool = False,
batch_size: int = 1000,
fn_kwargs: Optional[dict] = None,
) -> "IterableDataset": ...
def shuffle(
self,
seed: Optional[int] = None,
generator: Optional[np.random.Generator] = None,
buffer_size: int = 1000,
) -> "IterableDataset": ...
# Column operations (streaming)
def remove_columns(self, column_names: Union[str, List[str]]) -> "IterableDataset": ...
def rename_column(self, original_column_name: str, new_column_name: str) -> "IterableDataset": ...
def rename_columns(self, column_mapping: Dict[str, str]) -> "IterableDataset": ...
def select_columns(self, column_names: Union[str, List[str]]) -> "IterableDataset": ...
# Type operations (streaming)
def cast(self, features: Features) -> "IterableDataset": ...
def cast_column(self, column: str, feature) -> "IterableDataset": ...
# Formatting (streaming)
def with_format(self, type: Optional[str] = None) -> "IterableDataset": ...
Usage Examples:
from datasets import load_dataset
# Create a streaming dataset; split= is required here so that a single
# IterableDataset (not an IterableDatasetDict) is returned
streaming_dataset = load_dataset("oscar", "unshuffled_deduplicated_en", split="train", streaming=True)
# Take first 1000 examples
small_dataset = streaming_dataset.take(1000)
# Apply transformations on-the-fly
def preprocess(example):
    example["length"] = len(example["text"])
    return example
processed = small_dataset.map(preprocess)
# Iterate through examples
for example in processed:
print(example["length"])
breakDictionary-like container for multiple IterableDataset objects representing different splits for streaming workflows.
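Streaming shuffles draw from a finite buffer, and iter yields batches as dicts of lists; a minimal sketch, reusing small_dataset from above:
# Approximate shuffle: fill a 1000-example buffer and sample from it
shuffled = small_dataset.shuffle(seed=42, buffer_size=1000)
# Iterate in batches of 32; each batch maps column names to lists
for batch in shuffled.iter(batch_size=32):
    print(len(batch["text"]))
    break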
Dictionary-like container for multiple IterableDataset objects representing different splits for streaming workflows.
class IterableDatasetDict(dict):
"""Dictionary of IterableDataset objects for streaming workflows."""
# Properties
num_columns: Optional[Dict[str, int]]
column_names: Optional[Dict[str, List[str]]]
# Data transformation (applied to all streaming splits)
def map(self, function=None, **kwargs) -> "IterableDatasetDict": ...
def filter(self, function=None, **kwargs) -> "IterableDatasetDict": ...
def shuffle(self, **kwargs) -> "IterableDatasetDict": ...
# Column operations (applied to all streaming splits)
def remove_columns(self, column_names: Union[str, List[str]]) -> "IterableDatasetDict": ...
def rename_column(self, original_column_name: str, new_column_name: str) -> "IterableDatasetDict": ...
def rename_columns(self, column_mapping: Dict[str, str]) -> "IterableDatasetDict": ...
def select_columns(self, column_names: Union[str, List[str]]) -> "IterableDatasetDict": ...
# Type operations (applied to all streaming splits)
def cast(self, features: Features) -> "IterableDatasetDict": ...
def cast_column(self, column: str, feature) -> "IterableDatasetDict": ...
# Formatting (applied to all streaming splits)
def with_format(self, type: Optional[str] = None) -> "IterableDatasetDict": ...
Usage Examples:
from datasets import load_dataset
# Load streaming dataset with multiple splits
streaming_dict = load_dataset("squad", streaming=True)
# Apply operations to all streaming splits
streaming_dict = streaming_dict.map(lambda x: {"question_length": len(x["question"])})
# Access individual streaming splits
train_stream = streaming_dict["train"]
validation_stream = streaming_dict["validation"]
# Take samples from each split
for example in train_stream.take(5):
print(f"Question length: {example['question_length']}")
breakfrom os import PathLikeclass Column:
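with_format applies to every streaming split at once; a minimal sketch, reusing streaming_dict from above (assumes PyTorch is installed):
# Format all streaming splits for PyTorch; numeric fields are returned as tensors
torch_streams = streaming_dict.with_format("torch")
first = next(iter(torch_streams["train"]))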
"""Iterable for accessing specific columns of a dataset."""
def __init__(self, table, info: Optional[DatasetInfo] = None): ...
def __iter__(self): ...
def __len__(self) -> int: ...
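In recent library versions, indexing a Dataset with a column name yields such a column iterable; a minimal sketch (older versions return a plain list, so the exact type may vary):
ds = Dataset.from_dict({"text": ["a", "b"]})
# Iterate a single column without materializing full rows
for value in ds["text"]:
    print(value)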
class IterableColumn:
"""Iterable column access for IterableDataset."""
def __init__(self, dataset, key: str): ...
def __iter__(self): ...
Many of the creation and processing methods above accept a num_proc parameter to parallelize work across multiple processes.
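For illustration, a minimal sketch of parallel mapping, reusing dataset and uppercase from the first example (the worker count is arbitrary):
# Run the mapping function in 4 worker processes
dataset = dataset.map(uppercase, num_proc=4)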
Install with Tessl CLI
npx tessl i tessl/pypi-datasets