# Core Dataset Classes

The fundamental dataset classes that provide different access patterns and capabilities for working with dataset collections. These classes form the core of the datasets library, offering both in-memory and streaming approaches to dataset processing.

## Capabilities

### Dataset - Map-style Dataset

The main dataset class backed by Apache Arrow for efficient random access. Provides comprehensive data transformation, filtering, and export capabilities.

```python { .api }
class Dataset:
    """Map-style dataset backed by Apache Arrow for efficient random access."""

    # Core properties
    features: Features
    info: DatasetInfo
    split: Optional[NamedSplit]
    num_rows: int
    num_columns: int
    column_names: List[str]
    shape: Tuple[int, int]

    # Dataset creation (class methods)
    @classmethod
    def from_pandas(
        cls,
        df: "pandas.DataFrame",
        features: Optional[Features] = None,
        info: Optional[DatasetInfo] = None,
        split: Optional[NamedSplit] = None,
        preserve_index: Optional[bool] = None,
    ) -> "Dataset": ...

    @classmethod
    def from_dict(
        cls,
        mapping: dict,
        features: Optional[Features] = None,
        info: Optional[DatasetInfo] = None,
        split: Optional[NamedSplit] = None,
    ) -> "Dataset": ...

    @classmethod
    def from_list(
        cls,
        mapping: List[dict],
        features: Optional[Features] = None,
        info: Optional[DatasetInfo] = None,
        split: Optional[NamedSplit] = None,
    ) -> "Dataset": ...

    # Dataset creation (static methods)
    @staticmethod
    def from_csv(
        path_or_paths: Union[PathLike, List[PathLike]],
        split: Optional[NamedSplit] = None,
        features: Optional[Features] = None,
        cache_dir: Optional[str] = None,
        keep_in_memory: bool = False,
        num_proc: Optional[int] = None,
        **kwargs,
    ) -> "Dataset": ...

    @staticmethod
    def from_json(
        path_or_paths: Union[PathLike, List[PathLike]],
        split: Optional[NamedSplit] = None,
        features: Optional[Features] = None,
        cache_dir: Optional[str] = None,
        keep_in_memory: bool = False,
        field: Optional[str] = None,
        num_proc: Optional[int] = None,
        **kwargs,
    ) -> "Dataset": ...

    @staticmethod
    def from_parquet(
        path_or_paths: Union[PathLike, List[PathLike]],
        split: Optional[NamedSplit] = None,
        features: Optional[Features] = None,
        cache_dir: Optional[str] = None,
        keep_in_memory: bool = False,
        columns: Optional[List[str]] = None,
        num_proc: Optional[int] = None,
        **kwargs,
    ) -> "Dataset": ...

    @staticmethod
    def from_text(
        path_or_paths: Union[PathLike, List[PathLike]],
        split: Optional[NamedSplit] = None,
        features: Optional[Features] = None,
        cache_dir: Optional[str] = None,
        keep_in_memory: bool = False,
        num_proc: Optional[int] = None,
        **kwargs,
    ) -> "Dataset": ...

    @staticmethod
    def from_generator(
        generator: Callable,
        features: Optional[Features] = None,
        cache_dir: Optional[str] = None,
        keep_in_memory: bool = False,
        gen_kwargs: Optional[dict] = None,
        num_proc: Optional[int] = None,
        split: NamedSplit = "train",
        **kwargs,
    ) -> "Dataset": ...

    # Data access
    def __getitem__(self, key): ...
    def __len__(self) -> int: ...

    # Data transformation
    def map(
        self,
        function=None,
        with_indices: bool = False,
        with_rank: bool = False,
        input_columns: Optional[Union[str, List[str]]] = None,
        batched: bool = False,
        batch_size: int = 1000,
        drop_last_batch: bool = False,
        remove_columns: Optional[Union[str, List[str]]] = None,
        keep_in_memory: bool = False,
        load_from_cache_file: Optional[bool] = None,
        cache_file_name: Optional[str] = None,
        writer_batch_size: int = 1000,
        features: Optional[Features] = None,
        disable_nullable: bool = False,
        fn_kwargs: Optional[dict] = None,
        num_proc: Optional[int] = None,
        desc: Optional[str] = None,
        **kwargs
    ) -> "Dataset": ...

    def filter(
        self,
        function=None,
        with_indices: bool = False,
        with_rank: bool = False,
        input_columns: Optional[Union[str, List[str]]] = None,
        batched: bool = False,
        batch_size: int = 1000,
        keep_in_memory: bool = False,
        load_from_cache_file: Optional[bool] = None,
        cache_file_name: Optional[str] = None,
        writer_batch_size: int = 1000,
        fn_kwargs: Optional[dict] = None,
        num_proc: Optional[int] = None,
        desc: Optional[str] = None,
        **kwargs
    ) -> "Dataset": ...

    def select(
        self,
        indices: Union[int, List[int], Iterable[int]],
        keep_in_memory: bool = False,
        indices_cache_file_name: Optional[str] = None,
        writer_batch_size: int = 1000,
        **kwargs
    ) -> "Dataset": ...

    def sort(
        self,
        column_names: Union[str, List[str]],
        reverse: Union[bool, List[bool]] = False,
        null_placement: str = "at_end",
        keep_in_memory: bool = False,
        load_from_cache_file: Optional[bool] = None,
        **kwargs
    ) -> "Dataset": ...

    def shuffle(
        self,
        seed: Optional[int] = None,
        generator: Optional[np.random.Generator] = None,
        keep_in_memory: bool = False,
        load_from_cache_file: Optional[bool] = None,
        **kwargs
    ) -> "Dataset": ...

    # Column operations
    def remove_columns(self, column_names: Union[str, List[str]], **kwargs) -> "Dataset": ...
    def rename_column(self, original_column_name: str, new_column_name: str, **kwargs) -> "Dataset": ...
    def rename_columns(self, column_mapping: Dict[str, str], **kwargs) -> "Dataset": ...
    def select_columns(self, column_names: Union[str, List[str]], **kwargs) -> "Dataset": ...
    def add_column(self, name: str, column: Union[list, np.array], **kwargs) -> "Dataset": ...

    # Type casting
    def cast(self, features: Features, **kwargs) -> "Dataset": ...
    def cast_column(self, column: str, feature, **kwargs) -> "Dataset": ...

    # Data formatting
    def with_format(
        self,
        type: Optional[str] = None,
        columns: Optional[List] = None,
        output_all_columns: bool = False,
        **format_kwargs
    ) -> "Dataset": ...

    def set_format(
        self,
        type: Optional[str] = None,
        columns: Optional[List] = None,
        output_all_columns: bool = False,
        **format_kwargs
    ) -> None: ...

    def reset_format(self) -> None: ...

    # Data export
    def to_dict(self, batch_size: Optional[int] = None, batched: bool = False) -> dict: ...
    def to_pandas(
        self,
        batch_size: Optional[int] = None,
        batched: bool = False
    ) -> Union["pandas.DataFrame", Iterator["pandas.DataFrame"]]: ...
    def save_to_disk(
        self,
        dataset_path: PathLike,
        max_shard_size: Optional[Union[str, int]] = None,
        num_shards: Optional[int] = None,
        num_proc: Optional[int] = None,
        storage_options: Optional[dict] = None,
    ) -> None: ...

    # Dataset splitting
    def train_test_split(
        self,
        test_size: Optional[Union[float, int]] = None,
        train_size: Optional[Union[float, int]] = None,
        shuffle: bool = True,
        seed: Optional[int] = None,
        **kwargs
    ) -> "DatasetDict": ...

    def shard(
        self,
        num_shards: int,
        index: int,
        contiguous: bool = True,
        **kwargs
    ) -> "Dataset": ...
```

**Usage Examples:**

```python
from datasets import Dataset

# Create dataset from dictionary
data = {"text": ["Hello", "World"], "label": [0, 1]}
dataset = Dataset.from_dict(data)

# Transform data
def uppercase(example):
    example["text"] = example["text"].upper()
    return example

dataset = dataset.map(uppercase)

# Filter data
dataset = dataset.filter(lambda x: len(x["text"]) > 3)

# Set the output format used when indexing (e.g. PyTorch tensors)
dataset.set_format("torch")

# Export to a pandas DataFrame
pandas_df = dataset.to_pandas()
```
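
The splitting methods documented above return new objects rather than modifying the dataset in place. A minimal sketch of how `train_test_split` and `shard` fit together (the column names and sizes here are purely illustrative):

```python
from datasets import Dataset

dataset = Dataset.from_dict({
    "text": [f"example {i}" for i in range(100)],
    "label": [i % 2 for i in range(100)],
})

# train_test_split returns a DatasetDict with "train" and "test" keys
splits = dataset.train_test_split(test_size=0.2, seed=42)
print(splits["train"].num_rows, splits["test"].num_rows)  # 80 20

# shard slices one split into num_shards pieces and returns the piece at `index`
first_shard = splits["train"].shard(num_shards=4, index=0)
print(first_shard.num_rows)  # 20
```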

### DatasetDict - Multiple Dataset Container

Dictionary-like container that holds multiple Dataset objects, typically representing different splits (train, validation, test).

```python { .api }
class DatasetDict(dict):
    """Dictionary of Dataset objects, typically for train/validation/test splits."""

    # Properties
    num_columns: Dict[str, int]
    num_rows: Dict[str, int]
    column_names: Dict[str, List[str]]
    shape: Dict[str, Tuple[int, int]]

    # Data transformation (applied to all splits)
    def map(self, function=None, **kwargs) -> "DatasetDict": ...
    def filter(self, function=None, **kwargs) -> "DatasetDict": ...
    def sort(self, column_names: Union[str, List[str]], **kwargs) -> "DatasetDict": ...
    def shuffle(self, **kwargs) -> "DatasetDict": ...

    # Column operations (applied to all splits)
    def remove_columns(self, column_names: Union[str, List[str]]) -> "DatasetDict": ...
    def rename_column(self, original_column_name: str, new_column_name: str) -> "DatasetDict": ...
    def rename_columns(self, column_mapping: Dict[str, str]) -> "DatasetDict": ...
    def select_columns(self, column_names: Union[str, List[str]]) -> "DatasetDict": ...

    # Type operations (applied to all splits)
    def cast(self, features: Features) -> "DatasetDict": ...
    def cast_column(self, column: str, feature) -> "DatasetDict": ...

    # Formatting (applied to all splits)
    def with_format(
        self,
        type: Optional[str] = None,
        columns: Optional[List] = None,
        output_all_columns: bool = False,
        **format_kwargs
    ) -> "DatasetDict": ...

    def set_format(
        self,
        type: Optional[str] = None,
        columns: Optional[List] = None,
        output_all_columns: bool = False,
        **format_kwargs
    ) -> None: ...

    def reset_format(self) -> None: ...

    # Data export
    def save_to_disk(self, dataset_dict_path: str, **kwargs) -> None: ...

    # Utilities
    def flatten(self, max_depth: int = 16) -> "DatasetDict": ...
    def unique(self, column: str) -> Dict[str, List]: ...
    def cleanup_cache_files(self) -> Dict[str, int]: ...
```

**Usage Examples:**

```python
from datasets import DatasetDict, Dataset

# Create DatasetDict from separate datasets
dataset_dict = DatasetDict({
    "train": Dataset.from_dict({"text": ["train1", "train2"], "label": [0, 1]}),
    "test": Dataset.from_dict({"text": ["test1"], "label": [0]})
})

# Apply operations to all splits
dataset_dict = dataset_dict.map(lambda x: {"length": len(x["text"])})
dataset_dict = dataset_dict.filter(lambda x: x["length"] > 3)

# Access individual splits
train_data = dataset_dict["train"]
test_data = dataset_dict["test"]
```
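
`save_to_disk` writes every split under a single directory; reloading goes through `datasets.load_from_disk`. A minimal round-trip sketch, continuing from the example above (the path is illustrative):

```python
from datasets import load_from_disk

# Write all splits (Arrow data plus dataset metadata) to a local directory
dataset_dict.save_to_disk("./my_dataset")

# Reload later; the splits come back under the same keys
reloaded = load_from_disk("./my_dataset")
print(reloaded["train"].column_names)
```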

### IterableDataset - Streaming Dataset

Iterable-style dataset for streaming large datasets without loading everything into memory. Processes data on-the-fly with sequential access only.

```python { .api }
class IterableDataset:
    """Iterable-style dataset for streaming large datasets without loading into memory."""

    # Properties
    features: Optional[Features]
    info: DatasetInfo
    split: Optional[NamedSplit]
    num_columns: Optional[int]
    column_names: Optional[List[str]]

    # Iteration
    def __iter__(self): ...
    def iter(self, batch_size: int, drop_last_batch: bool = False): ...

    # Iteration control
    def take(self, n: int) -> "IterableDataset": ...
    def skip(self, n: int) -> "IterableDataset": ...

    # Data transformation (streaming)
    def map(
        self,
        function=None,
        with_indices: bool = False,
        input_columns: Optional[Union[str, List[str]]] = None,
        batched: bool = False,
        batch_size: int = 1000,
        drop_last_batch: bool = False,
        remove_columns: Optional[Union[str, List[str]]] = None,
        features: Optional[Features] = None,
        fn_kwargs: Optional[dict] = None,
    ) -> "IterableDataset": ...

    def filter(
        self,
        function=None,
        with_indices: bool = False,
        input_columns: Optional[Union[str, List[str]]] = None,
        batched: bool = False,
        batch_size: int = 1000,
        fn_kwargs: Optional[dict] = None,
    ) -> "IterableDataset": ...

    def shuffle(
        self,
        seed: Optional[int] = None,
        generator: Optional[np.random.Generator] = None,
        buffer_size: int = 1000,
    ) -> "IterableDataset": ...

    # Column operations (streaming)
    def remove_columns(self, column_names: Union[str, List[str]]) -> "IterableDataset": ...
    def rename_column(self, original_column_name: str, new_column_name: str) -> "IterableDataset": ...
    def rename_columns(self, column_mapping: Dict[str, str]) -> "IterableDataset": ...
    def select_columns(self, column_names: Union[str, List[str]]) -> "IterableDataset": ...

    # Type operations (streaming)
    def cast(self, features: Features) -> "IterableDataset": ...
    def cast_column(self, column: str, feature) -> "IterableDataset": ...

    # Formatting (streaming)
    def with_format(self, type: Optional[str] = None) -> "IterableDataset": ...
```

**Usage Examples:**

```python
from datasets import load_dataset

# Create a streaming dataset for a single split (an IterableDataset)
streaming_dataset = load_dataset("oscar", "unshuffled_deduplicated_en", split="train", streaming=True)

# Take first 1000 examples
small_dataset = streaming_dataset.take(1000)

# Apply transformations on-the-fly
def preprocess(example):
    example["length"] = len(example["text"])
    return example

processed = small_dataset.map(preprocess)

# Iterate through examples
for example in processed:
    print(example["length"])
    break
```
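
Because streaming datasets only allow sequential access, `skip` and `take` are the usual way to carve out a held-out subset, and `iter` yields fixed-size batches. A short sketch continuing from the streaming dataset above:

```python
# Reserve the first 100 examples for evaluation and stream the rest
eval_stream = streaming_dataset.take(100)
train_stream = streaming_dataset.skip(100)

# Iterate in batches of 32; each batch is a dict mapping column names to lists
for batch in train_stream.iter(batch_size=32):
    print(len(batch["text"]))
    break
```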

### IterableDatasetDict - Streaming Dataset Container

Dictionary-like container for multiple IterableDataset objects representing different splits for streaming workflows.

```python { .api }
class IterableDatasetDict(dict):
    """Dictionary of IterableDataset objects for streaming workflows."""

    # Properties
    num_columns: Optional[Dict[str, int]]
    column_names: Optional[Dict[str, List[str]]]

    # Data transformation (applied to all streaming splits)
    def map(self, function=None, **kwargs) -> "IterableDatasetDict": ...
    def filter(self, function=None, **kwargs) -> "IterableDatasetDict": ...
    def shuffle(self, **kwargs) -> "IterableDatasetDict": ...

    # Column operations (applied to all streaming splits)
    def remove_columns(self, column_names: Union[str, List[str]]) -> "IterableDatasetDict": ...
    def rename_column(self, original_column_name: str, new_column_name: str) -> "IterableDatasetDict": ...
    def rename_columns(self, column_mapping: Dict[str, str]) -> "IterableDatasetDict": ...
    def select_columns(self, column_names: Union[str, List[str]]) -> "IterableDatasetDict": ...

    # Type operations (applied to all streaming splits)
    def cast(self, features: Features) -> "IterableDatasetDict": ...
    def cast_column(self, column: str, feature) -> "IterableDatasetDict": ...

    # Formatting (applied to all streaming splits)
    def with_format(self, type: Optional[str] = None) -> "IterableDatasetDict": ...
```

**Usage Examples:**

```python
from datasets import load_dataset

# Load streaming dataset with multiple splits
streaming_dict = load_dataset("squad", streaming=True)

# Apply operations to all streaming splits
streaming_dict = streaming_dict.map(lambda x: {"question_length": len(x["question"])})

# Access individual streaming splits
train_stream = streaming_dict["train"]
validation_stream = streaming_dict["validation"]

# Take a few samples from the train split
for example in train_stream.take(5):
    print(f"Question length: {example['question_length']}")
```
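
`with_format` is the streaming counterpart of `set_format` and applies to every split in the container. A hedged sketch, assuming PyTorch is installed:

```python
# Yield examples as PyTorch tensors from all streaming splits
torch_dict = streaming_dict.with_format("torch")

for example in torch_dict["train"].take(1):
    # numeric fields such as question_length should come back as torch tensors
    print(type(example["question_length"]))
```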

## Types

### Path Types

```python { .api }
from os import PathLike
```

### Column Types

```python { .api }
class Column:
    """Iterable for accessing specific columns of a dataset."""

    def __init__(self, table, info: Optional[DatasetInfo] = None): ...
    def __iter__(self): ...
    def __len__(self) -> int: ...

class IterableColumn:
    """Iterable column access for IterableDataset."""

    def __init__(self, dataset, key: str): ...
    def __iter__(self): ...
```
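
`Column` and `IterableColumn` objects are normally obtained by indexing a dataset with a column name rather than being constructed directly. A small sketch (depending on the library version, indexing may return a plain list instead of a `Column`, but both iterate the same way):

```python
from datasets import Dataset

dataset = Dataset.from_dict({"text": ["Hello", "World"], "label": [0, 1]})

# Indexing with a column name gives an iterable over that column's values
for value in dataset["text"]:
    print(value)
```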

### Performance Considerations

- **Dataset/DatasetDict**: Best when the full dataset is available locally; memory-mapped Arrow files support random access and complex operations without loading everything into RAM
- **IterableDataset/IterableDatasetDict**: Best for very large datasets; memory-efficient streaming with sequential access only
- **Caching**: Dataset operations are cached by default for reproducibility and to avoid recomputation
- **Multiprocessing**: Many operations accept a `num_proc` parameter for parallel processing (see the sketch below)
- **Apache Arrow**: The underlying storage format provides efficient columnar operations and memory-mapped access
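
As a rough illustration of the multiprocessing and streaming points above, the sketch below applies the same transformation with a parallel, cached `map` on a map-style dataset and lazily on a streaming one (the word-count logic is purely illustrative):

```python
from datasets import Dataset, load_dataset

dataset = Dataset.from_dict({"text": ["hello world"] * 10_000})

# Batched, parallel, cached processing on an Arrow-backed dataset
counted = dataset.map(
    lambda batch: {"num_words": [len(t.split()) for t in batch["text"]]},
    batched=True,
    num_proc=4,
)

# The same transformation applied lazily, one example at a time, while streaming
streamed = load_dataset("oscar", "unshuffled_deduplicated_en", split="train", streaming=True)
streamed = streamed.map(lambda x: {"num_words": len(x["text"].split())})
```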