# Core Dataset Classes

The fundamental dataset classes that provide different access patterns and capabilities for working with dataset collections. These classes form the core of the datasets library, offering both in-memory and streaming approaches to dataset processing.

## Capabilities

### Dataset - Map-style Dataset

The main dataset class backed by Apache Arrow for efficient random access. Provides comprehensive data transformation, filtering, and export capabilities.

```python { .api }
class Dataset:
    """Map-style dataset backed by Apache Arrow for efficient random access."""

    # Core properties
    features: Features
    info: DatasetInfo
    split: Optional[NamedSplit]
    num_rows: int
    num_columns: int
    column_names: List[str]
    shape: Tuple[int, int]

    # Dataset creation (class methods)
    @classmethod
    def from_pandas(
        cls,
        df: "pandas.DataFrame",
        features: Optional[Features] = None,
        info: Optional[DatasetInfo] = None,
        split: Optional[NamedSplit] = None,
        preserve_index: Optional[bool] = None,
    ) -> "Dataset": ...

    @classmethod
    def from_dict(
        cls,
        mapping: dict,
        features: Optional[Features] = None,
        info: Optional[DatasetInfo] = None,
        split: Optional[NamedSplit] = None,
    ) -> "Dataset": ...

    @classmethod
    def from_list(
        cls,
        mapping: List[dict],
        features: Optional[Features] = None,
        info: Optional[DatasetInfo] = None,
        split: Optional[NamedSplit] = None,
    ) -> "Dataset": ...

    # Dataset creation (static methods)
    @staticmethod
    def from_csv(
        path_or_paths: Union[PathLike, List[PathLike]],
        split: Optional[NamedSplit] = None,
        features: Optional[Features] = None,
        cache_dir: Optional[str] = None,
        keep_in_memory: bool = False,
        num_proc: Optional[int] = None,
        **kwargs,
    ) -> "Dataset": ...

    @staticmethod
    def from_json(
        path_or_paths: Union[PathLike, List[PathLike]],
        split: Optional[NamedSplit] = None,
        features: Optional[Features] = None,
        cache_dir: Optional[str] = None,
        keep_in_memory: bool = False,
        field: Optional[str] = None,
        num_proc: Optional[int] = None,
        **kwargs,
    ) -> "Dataset": ...

    @staticmethod
    def from_parquet(
        path_or_paths: Union[PathLike, List[PathLike]],
        split: Optional[NamedSplit] = None,
        features: Optional[Features] = None,
        cache_dir: Optional[str] = None,
        keep_in_memory: bool = False,
        columns: Optional[List[str]] = None,
        num_proc: Optional[int] = None,
        **kwargs,
    ) -> "Dataset": ...

    @staticmethod
    def from_text(
        path_or_paths: Union[PathLike, List[PathLike]],
        split: Optional[NamedSplit] = None,
        features: Optional[Features] = None,
        cache_dir: Optional[str] = None,
        keep_in_memory: bool = False,
        num_proc: Optional[int] = None,
        **kwargs,
    ) -> "Dataset": ...

    @staticmethod
    def from_generator(
        generator: Callable,
        features: Optional[Features] = None,
        cache_dir: Optional[str] = None,
        keep_in_memory: bool = False,
        gen_kwargs: Optional[dict] = None,
        num_proc: Optional[int] = None,
        split: NamedSplit = "train",
        **kwargs,
    ) -> "Dataset": ...

    # Data access
    def __getitem__(self, key): ...
    def __len__(self) -> int: ...

    # Data transformation
    def map(
        self,
        function=None,
        with_indices: bool = False,
        with_rank: bool = False,
        input_columns: Optional[Union[str, List[str]]] = None,
        batched: bool = False,
        batch_size: int = 1000,
        drop_last_batch: bool = False,
        remove_columns: Optional[Union[str, List[str]]] = None,
        keep_in_memory: bool = False,
        load_from_cache_file: Optional[bool] = None,
        cache_file_name: Optional[str] = None,
        writer_batch_size: int = 1000,
        features: Optional[Features] = None,
        disable_nullable: bool = False,
        fn_kwargs: Optional[dict] = None,
        num_proc: Optional[int] = None,
        desc: Optional[str] = None,
        **kwargs
    ) -> "Dataset": ...

    def filter(
        self,
        function=None,
        with_indices: bool = False,
        with_rank: bool = False,
        input_columns: Optional[Union[str, List[str]]] = None,
        batched: bool = False,
        batch_size: int = 1000,
        keep_in_memory: bool = False,
        load_from_cache_file: Optional[bool] = None,
        cache_file_name: Optional[str] = None,
        writer_batch_size: int = 1000,
        fn_kwargs: Optional[dict] = None,
        num_proc: Optional[int] = None,
        desc: Optional[str] = None,
        **kwargs
    ) -> "Dataset": ...

    def select(
        self,
        indices: Union[int, List[int], Iterable[int]],
        keep_in_memory: bool = False,
        indices_cache_file_name: Optional[str] = None,
        writer_batch_size: int = 1000,
        **kwargs
    ) -> "Dataset": ...

    def sort(
        self,
        column_names: Union[str, List[str]],
        reverse: Union[bool, List[bool]] = False,
        null_placement: str = "at_end",
        keep_in_memory: bool = False,
        load_from_cache_file: Optional[bool] = None,
        **kwargs
    ) -> "Dataset": ...

    def shuffle(
        self,
        seed: Optional[int] = None,
        generator: Optional[np.random.Generator] = None,
        keep_in_memory: bool = False,
        load_from_cache_file: Optional[bool] = None,
        **kwargs
    ) -> "Dataset": ...

    # Column operations
    def remove_columns(self, column_names: Union[str, List[str]], **kwargs) -> "Dataset": ...
    def rename_column(self, original_column_name: str, new_column_name: str, **kwargs) -> "Dataset": ...
    def rename_columns(self, column_mapping: Dict[str, str], **kwargs) -> "Dataset": ...
    def select_columns(self, column_names: Union[str, List[str]], **kwargs) -> "Dataset": ...
    def add_column(self, name: str, column: Union[list, np.array], **kwargs) -> "Dataset": ...

    # Type casting
    def cast(self, features: Features, **kwargs) -> "Dataset": ...
    def cast_column(self, column: str, feature, **kwargs) -> "Dataset": ...

    # Data formatting
    def with_format(
        self,
        type: Optional[str] = None,
        columns: Optional[List] = None,
        output_all_columns: bool = False,
        **format_kwargs
    ) -> "Dataset": ...

    def set_format(
        self,
        type: Optional[str] = None,
        columns: Optional[List] = None,
        output_all_columns: bool = False,
        **format_kwargs
    ) -> None: ...

    def reset_format(self) -> None: ...

    # Data export
    def to_dict(self, batch_size: Optional[int] = None, batched: bool = False) -> dict: ...
    def to_pandas(
        self,
        batch_size: Optional[int] = None,
        batched: bool = False
    ) -> Union["pandas.DataFrame", Iterator["pandas.DataFrame"]]: ...
    def save_to_disk(
        self,
        dataset_path: PathLike,
        max_shard_size: Optional[Union[str, int]] = None,
        num_shards: Optional[int] = None,
        num_proc: Optional[int] = None,
        storage_options: Optional[dict] = None,
    ) -> None: ...

    # Dataset splitting
    def train_test_split(
        self,
        test_size: Optional[Union[float, int]] = None,
        train_size: Optional[Union[float, int]] = None,
        shuffle: bool = True,
        seed: Optional[int] = None,
        **kwargs
    ) -> "DatasetDict": ...

    def shard(
        self,
        num_shards: int,
        index: int,
        contiguous: bool = True,
        **kwargs
    ) -> "Dataset": ...
```

**Usage Examples:**

```python
from datasets import Dataset

# Create dataset from dictionary
data = {"text": ["Hello", "World"], "label": [0, 1]}
dataset = Dataset.from_dict(data)

# Transform data
def uppercase(example):
    example["text"] = example["text"].upper()
    return example

dataset = dataset.map(uppercase)

# Filter data
dataset = dataset.filter(lambda x: len(x["text"]) > 3)

# Set the output format used when indexing (e.g. PyTorch tensors)
dataset.set_format("torch")

# Export to a pandas DataFrame
pandas_df = dataset.to_pandas()
```
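
The splitting methods documented above return new objects rather than modifying the dataset in place. A minimal sketch of how `train_test_split` and `shard` fit together (the column names and sizes here are purely illustrative):

```python
from datasets import Dataset

dataset = Dataset.from_dict({
    "text": [f"example {i}" for i in range(100)],
    "label": [i % 2 for i in range(100)],
})

# train_test_split returns a DatasetDict with "train" and "test" keys
splits = dataset.train_test_split(test_size=0.2, seed=42)
print(splits["train"].num_rows, splits["test"].num_rows)  # 80 20

# shard slices one split into num_shards pieces and returns the piece at `index`
first_shard = splits["train"].shard(num_shards=4, index=0)
print(first_shard.num_rows)  # 20
```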

### DatasetDict - Multiple Dataset Container

Dictionary-like container that holds multiple Dataset objects, typically representing different splits (train, validation, test).

```python { .api }
class DatasetDict(dict):
    """Dictionary of Dataset objects, typically for train/validation/test splits."""

    # Properties
    num_columns: Dict[str, int]
    num_rows: Dict[str, int]
    column_names: Dict[str, List[str]]
    shape: Dict[str, Tuple[int, int]]

    # Data transformation (applied to all splits)
    def map(self, function=None, **kwargs) -> "DatasetDict": ...
    def filter(self, function=None, **kwargs) -> "DatasetDict": ...
    def sort(self, column_names: Union[str, List[str]], **kwargs) -> "DatasetDict": ...
    def shuffle(self, **kwargs) -> "DatasetDict": ...

    # Column operations (applied to all splits)
    def remove_columns(self, column_names: Union[str, List[str]]) -> "DatasetDict": ...
    def rename_column(self, original_column_name: str, new_column_name: str) -> "DatasetDict": ...
    def rename_columns(self, column_mapping: Dict[str, str]) -> "DatasetDict": ...
    def select_columns(self, column_names: Union[str, List[str]]) -> "DatasetDict": ...

    # Type operations (applied to all splits)
    def cast(self, features: Features) -> "DatasetDict": ...
    def cast_column(self, column: str, feature) -> "DatasetDict": ...

    # Formatting (applied to all splits)
    def with_format(
        self,
        type: Optional[str] = None,
        columns: Optional[List] = None,
        output_all_columns: bool = False,
        **format_kwargs
    ) -> "DatasetDict": ...

    def set_format(
        self,
        type: Optional[str] = None,
        columns: Optional[List] = None,
        output_all_columns: bool = False,
        **format_kwargs
    ) -> None: ...

    def reset_format(self) -> None: ...

    # Data export
    def save_to_disk(self, dataset_dict_path: str, **kwargs) -> None: ...

    # Utilities
    def flatten(self, max_depth: int = 16) -> "DatasetDict": ...
    def unique(self, column: str) -> Dict[str, List]: ...
    def cleanup_cache_files(self) -> Dict[str, int]: ...
```

**Usage Examples:**

```python
from datasets import DatasetDict, Dataset

# Create DatasetDict from separate datasets
dataset_dict = DatasetDict({
    "train": Dataset.from_dict({"text": ["train1", "train2"], "label": [0, 1]}),
    "test": Dataset.from_dict({"text": ["test1"], "label": [0]})
})

# Apply operations to all splits
dataset_dict = dataset_dict.map(lambda x: {"length": len(x["text"])})
dataset_dict = dataset_dict.filter(lambda x: x["length"] > 3)

# Access individual splits
train_data = dataset_dict["train"]
test_data = dataset_dict["test"]
```
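
`save_to_disk` writes every split under a single directory; reloading goes through `datasets.load_from_disk`. A minimal round-trip sketch, continuing from the example above (the path is illustrative):

```python
from datasets import load_from_disk

# Write all splits (Arrow data plus dataset metadata) to a local directory
dataset_dict.save_to_disk("./my_dataset")

# Reload later; the splits come back under the same keys
reloaded = load_from_disk("./my_dataset")
print(reloaded["train"].column_names)
```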

### IterableDataset - Streaming Dataset

Iterable-style dataset for streaming large datasets without loading everything into memory. Processes data on-the-fly with sequential access only.

```python { .api }
class IterableDataset:
    """Iterable-style dataset for streaming large datasets without loading into memory."""

    # Properties
    features: Optional[Features]
    info: DatasetInfo
    split: Optional[NamedSplit]
    num_columns: Optional[int]
    column_names: Optional[List[str]]

    # Iteration
    def __iter__(self): ...
    def iter(self, batch_size: int, drop_last_batch: bool = False): ...

    # Iteration control
    def take(self, n: int) -> "IterableDataset": ...
    def skip(self, n: int) -> "IterableDataset": ...

    # Data transformation (streaming)
    def map(
        self,
        function=None,
        with_indices: bool = False,
        input_columns: Optional[Union[str, List[str]]] = None,
        batched: bool = False,
        batch_size: int = 1000,
        drop_last_batch: bool = False,
        remove_columns: Optional[Union[str, List[str]]] = None,
        features: Optional[Features] = None,
        fn_kwargs: Optional[dict] = None,
    ) -> "IterableDataset": ...

    def filter(
        self,
        function=None,
        with_indices: bool = False,
        input_columns: Optional[Union[str, List[str]]] = None,
        batched: bool = False,
        batch_size: int = 1000,
        fn_kwargs: Optional[dict] = None,
    ) -> "IterableDataset": ...

    def shuffle(
        self,
        seed: Optional[int] = None,
        generator: Optional[np.random.Generator] = None,
        buffer_size: int = 1000,
    ) -> "IterableDataset": ...

    # Column operations (streaming)
    def remove_columns(self, column_names: Union[str, List[str]]) -> "IterableDataset": ...
    def rename_column(self, original_column_name: str, new_column_name: str) -> "IterableDataset": ...
    def rename_columns(self, column_mapping: Dict[str, str]) -> "IterableDataset": ...
    def select_columns(self, column_names: Union[str, List[str]]) -> "IterableDataset": ...

    # Type operations (streaming)
    def cast(self, features: Features) -> "IterableDataset": ...
    def cast_column(self, column: str, feature) -> "IterableDataset": ...

    # Formatting (streaming)
    def with_format(self, type: Optional[str] = None) -> "IterableDataset": ...
```

**Usage Examples:**

```python
from datasets import load_dataset

# Create a streaming dataset for a single split (an IterableDataset)
streaming_dataset = load_dataset("oscar", "unshuffled_deduplicated_en", split="train", streaming=True)

# Take first 1000 examples
small_dataset = streaming_dataset.take(1000)

# Apply transformations on-the-fly
def preprocess(example):
    example["length"] = len(example["text"])
    return example

processed = small_dataset.map(preprocess)

# Iterate through examples
for example in processed:
    print(example["length"])
    break
```
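
Because streaming datasets only allow sequential access, `skip` and `take` are the usual way to carve out a held-out subset, and `iter` yields fixed-size batches. A short sketch continuing from the streaming dataset above:

```python
# Reserve the first 100 examples for evaluation and stream the rest
eval_stream = streaming_dataset.take(100)
train_stream = streaming_dataset.skip(100)

# Iterate in batches of 32; each batch is a dict mapping column names to lists
for batch in train_stream.iter(batch_size=32):
    print(len(batch["text"]))
    break
```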

### IterableDatasetDict - Streaming Dataset Container

Dictionary-like container for multiple IterableDataset objects representing different splits for streaming workflows.

```python { .api }
class IterableDatasetDict(dict):
    """Dictionary of IterableDataset objects for streaming workflows."""

    # Properties
    num_columns: Optional[Dict[str, int]]
    column_names: Optional[Dict[str, List[str]]]

    # Data transformation (applied to all streaming splits)
    def map(self, function=None, **kwargs) -> "IterableDatasetDict": ...
    def filter(self, function=None, **kwargs) -> "IterableDatasetDict": ...
    def shuffle(self, **kwargs) -> "IterableDatasetDict": ...

    # Column operations (applied to all streaming splits)
    def remove_columns(self, column_names: Union[str, List[str]]) -> "IterableDatasetDict": ...
    def rename_column(self, original_column_name: str, new_column_name: str) -> "IterableDatasetDict": ...
    def rename_columns(self, column_mapping: Dict[str, str]) -> "IterableDatasetDict": ...
    def select_columns(self, column_names: Union[str, List[str]]) -> "IterableDatasetDict": ...

    # Type operations (applied to all streaming splits)
    def cast(self, features: Features) -> "IterableDatasetDict": ...
    def cast_column(self, column: str, feature) -> "IterableDatasetDict": ...

    # Formatting (applied to all streaming splits)
    def with_format(self, type: Optional[str] = None) -> "IterableDatasetDict": ...
```

**Usage Examples:**

```python
from datasets import load_dataset

# Load streaming dataset with multiple splits
streaming_dict = load_dataset("squad", streaming=True)

# Apply operations to all streaming splits
streaming_dict = streaming_dict.map(lambda x: {"question_length": len(x["question"])})

# Access individual streaming splits
train_stream = streaming_dict["train"]
validation_stream = streaming_dict["validation"]

# Take a few samples from the train split
for example in train_stream.take(5):
    print(f"Question length: {example['question_length']}")
```
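
`with_format` is the streaming counterpart of `set_format` and applies to every split in the container. A hedged sketch, assuming PyTorch is installed:

```python
# Yield examples as PyTorch tensors from all streaming splits
torch_dict = streaming_dict.with_format("torch")

for example in torch_dict["train"].take(1):
    # numeric fields such as question_length should come back as torch tensors
    print(type(example["question_length"]))
```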

## Types

### Path Types

```python { .api }
from os import PathLike
```

### Column Types

```python { .api }
class Column:
    """Iterable for accessing specific columns of a dataset."""

    def __init__(self, table, info: Optional[DatasetInfo] = None): ...
    def __iter__(self): ...
    def __len__(self) -> int: ...

class IterableColumn:
    """Iterable column access for IterableDataset."""

    def __init__(self, dataset, key: str): ...
    def __iter__(self): ...
```
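
`Column` and `IterableColumn` objects are normally obtained by indexing a dataset with a column name rather than being constructed directly. A small sketch (depending on the library version, indexing may return a plain list instead of a `Column`, but both iterate the same way):

```python
from datasets import Dataset

dataset = Dataset.from_dict({"text": ["Hello", "World"], "label": [0, 1]})

# Indexing with a column name gives an iterable over that column's values
for value in dataset["text"]:
    print(value)
```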

### Performance Considerations

- **Dataset/DatasetDict**: Best when the full dataset is available locally; memory-mapped Arrow files support random access and complex operations without loading everything into RAM
- **IterableDataset/IterableDatasetDict**: Best for very large datasets; memory-efficient streaming with sequential access only
- **Caching**: Dataset operations are cached by default for reproducibility and to avoid recomputation
- **Multiprocessing**: Many operations accept a `num_proc` parameter for parallel processing (see the sketch below)
- **Apache Arrow**: The underlying storage format provides efficient columnar operations and memory-mapped access
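
As a rough illustration of the multiprocessing and streaming points above, the sketch below applies the same transformation with a parallel, cached `map` on a map-style dataset and lazily on a streaming one (the word-count logic is purely illustrative):

```python
from datasets import Dataset, load_dataset

dataset = Dataset.from_dict({"text": ["hello world"] * 10_000})

# Batched, parallel, cached processing on an Arrow-backed dataset
counted = dataset.map(
    lambda batch: {"num_words": [len(t.split()) for t in batch["text"]]},
    batched=True,
    num_proc=4,
)

# The same transformation applied lazily, one example at a time, while streaming
streamed = load_dataset("oscar", "unshuffled_deduplicated_en", split="train", streaming=True)
streamed = streamed.map(lambda x: {"num_words": len(x["text"].split())})
```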