# Dataset Information and Inspection

Functions and classes for inspecting dataset metadata, configurations, and available splits. This module provides capabilities for dataset discovery, split management, and metadata handling without requiring full dataset downloads.

## Capabilities

### Dataset Information Container

Central metadata container documenting all known information about a dataset, including structure, licensing, and statistics.

```python { .api }
class DatasetInfo:
    """Information about a dataset."""

    # Static dataset information (set by dataset builders)
    description: str
    citation: str
    homepage: str
    license: str
    features: Optional[Features]
    post_processed: Optional[PostProcessedInfo]
    supervised_keys: Optional[SupervisedKeysData]

    # Dynamic dataset information (set by builder/processing)
    builder_name: Optional[str]
    dataset_name: Optional[str]
    config_name: Optional[str]
    version: Optional[Union[str, Version]]
    splits: Optional[dict]
    download_checksums: Optional[dict]
    download_size: Optional[int]
    post_processing_size: Optional[int]
    dataset_size: Optional[int]
    size_in_bytes: Optional[int]

    # File I/O operations
    def write_to_directory(
        self,
        dataset_info_dir: str,
        pretty_print: bool = False,
        storage_options: Optional[dict] = None,
    ) -> None: ...

    @classmethod
    def from_directory(
        cls,
        dataset_info_dir: str,
        storage_options: Optional[dict] = None,
    ) -> "DatasetInfo": ...

    @classmethod
    def from_dict(cls, dataset_info_dict: dict) -> "DatasetInfo": ...

    # Manipulation operations
    def update(self, other_dataset_info: "DatasetInfo", ignore_none: bool = True) -> None: ...
    def copy(self) -> "DatasetInfo": ...

    @classmethod
    def from_merge(cls, dataset_infos: List["DatasetInfo"]) -> "DatasetInfo": ...
```

**Usage Examples:**

```python
from datasets import ClassLabel, DatasetInfo, Features, Value, load_dataset

# Access dataset info from a loaded dataset
dataset = load_dataset("squad", split="train")
info = dataset.info

print(f"Description: {info.description}")
print(f"Dataset size: {info.dataset_size} bytes")
print(f"Number of examples: {info.splits['train'].num_examples}")
print(f"Features: {info.features}")

# Save dataset info to disk
info.write_to_directory("./squad_info/")

# Load dataset info from disk
loaded_info = DatasetInfo.from_directory("./squad_info/")

# Create custom dataset info
custom_info = DatasetInfo(
    description="My custom dataset",
    features=Features({
        "text": Value("string"),
        "label": ClassLabel(names=["positive", "negative"])
    }),
    license="MIT",
    citation="@misc{my_dataset, ...}"
)
```
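
The manipulation methods listed in the API block (`update`, `copy`, `from_merge`) combine metadata from several sources. A minimal sketch continuing the example above; the extra homepage value is illustrative only:

```python
# Work on a copy so the original metadata stays untouched
patched = custom_info.copy()

# Pull in fields from another DatasetInfo; with ignore_none=True,
# fields that are None on the other object do not overwrite existing values
patched.update(DatasetInfo(homepage="https://example.com/dataset"), ignore_none=True)

# Combine several DatasetInfo objects into one (e.g., when merging datasets)
merged = DatasetInfo.from_merge([custom_info, loaded_info])
```
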
### Dataset Configuration Discovery

Functions for discovering available dataset configurations, splits, and metadata without downloading the full dataset.

```python { .api }
def get_dataset_config_names(
    path: str,
    revision: Optional[Union[str, Version]] = None,
    download_config: Optional[DownloadConfig] = None,
    download_mode: Optional[Union[DownloadMode, str]] = None,
    data_files: Optional[Union[dict, list, str]] = None,
    **download_kwargs,
) -> List[str]:
    """
    Get the list of available config names for a dataset.

    Parameters:
    - path (str): Path or name of the dataset
    - revision (str, Version, optional): Version of the dataset script to load
    - download_config (DownloadConfig, optional): Specific download configuration parameters
    - download_mode (DownloadMode or str, optional): Select the download/generation mode
    - data_files (dict, list, str, optional): Path(s) to source data file(s)
    - **download_kwargs: Additional download arguments

    Returns:
    - List[str]: List of available configuration names
    """

def get_dataset_default_config_name(
    path: str,
    revision: Optional[Union[str, Version]] = None,
    download_config: Optional[DownloadConfig] = None,
    download_mode: Optional[Union[DownloadMode, str]] = None,
    data_files: Optional[Union[dict, list, str]] = None,
    **download_kwargs,
) -> Optional[str]:
    """
    Get the default config name for a dataset.

    Parameters:
    - path (str): Path or name of the dataset
    - revision (str, Version, optional): Version of the dataset script to load
    - download_config (DownloadConfig, optional): Specific download configuration parameters
    - download_mode (DownloadMode or str, optional): Select the download/generation mode
    - data_files (dict, list, str, optional): Path(s) to source data file(s)
    - **download_kwargs: Additional download arguments

    Returns:
    - Optional[str]: Default configuration name, or None if no default
    """

def get_dataset_split_names(
    path: str,
    config_name: Optional[str] = None,
    data_files: Optional[Union[str, Sequence[str], Mapping[str, Union[str, Sequence[str]]]]] = None,
    download_config: Optional[DownloadConfig] = None,
    download_mode: Optional[Union[DownloadMode, str]] = None,
    revision: Optional[Union[str, Version]] = None,
    token: Optional[Union[bool, str]] = None,
    **config_kwargs,
) -> List[str]:
    """
    Get the list of available splits for a particular config and dataset.

    Parameters:
    - path (str): Path or name of the dataset
    - config_name (str, optional): Configuration name
    - data_files (str, Sequence[str], Mapping, optional): Path(s) to source data file(s)
    - download_config (DownloadConfig, optional): Specific download configuration parameters
    - download_mode (DownloadMode or str, optional): Select the download/generation mode
    - revision (str, Version, optional): Version of the dataset script to load
    - token (bool or str, optional): Optional string or boolean to use as Bearer token
    - **config_kwargs: Keyword arguments to be passed to the BuilderConfig

    Returns:
    - List[str]: List of available split names
    """
```

**Usage Examples:**

```python
from datasets import get_dataset_config_names, get_dataset_default_config_name, get_dataset_split_names

# Discover available configurations
configs = get_dataset_config_names("nyu-mll/glue")
print(f"Available configs: {configs}")
# ['cola', 'sst2', 'mrpc', 'qqp', 'stsb', 'mnli', ...]

# Get default configuration
default = get_dataset_default_config_name("squad")
print(f"Default config: {default}")  # 'plain_text'

# Get available splits for a configuration
splits = get_dataset_split_names("nyu-mll/glue", config_name="cola")
print(f"Available splits: {splits}")  # ['train', 'validation', 'test']

# Check splits for default configuration
splits = get_dataset_split_names("squad")
print(f"SQuAD splits: {splits}")  # ['train', 'validation']
```
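
These functions also work with packaged builders (`csv`, `json`, ...) and local files via `data_files`; a small sketch, assuming `train.csv` and `test.csv` exist in the working directory:

```python
from datasets import get_dataset_split_names

# For packaged builders, split names are inferred from the data_files mapping,
# so no dataset script or Hub repository is required
splits = get_dataset_split_names(
    "csv",
    data_files={"train": "train.csv", "test": "test.csv"},
)
print(splits)  # ['train', 'test']
```
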
194
195
### Dataset Information Retrieval
196
197
Functions for retrieving detailed metadata about datasets and their configurations.
198
199
```python { .api }
200
def get_dataset_infos(
201
path: str,
202
data_files: Optional[Union[dict, list, str]] = None,
203
download_config: Optional[DownloadConfig] = None,
204
download_mode: Optional[Union[DownloadMode, str]] = None,
205
revision: Optional[Union[str, Version]] = None,
206
token: Optional[Union[bool, str]] = None,
207
**config_kwargs,
208
) -> Dict[str, DatasetInfo]:
209
"""
210
Get meta information about a dataset, returned as dict mapping config name to DatasetInfo.
211
212
Parameters:
213
- path (str): Path or name of the dataset
214
- data_files (dict, list, str, optional): Path(s) to source data file(s)
215
- download_config (DownloadConfig, optional): Specific download configuration parameters
216
- download_mode (DownloadMode or str, optional): Select the download/generation mode
217
- revision (str, Version, optional): Version of the dataset script to load
218
- token (bool or str, optional): Optional string or boolean to use as Bearer token
219
- **config_kwargs: Keyword arguments to be passed to the BuilderConfig
220
221
Returns:
222
- Dict[str, DatasetInfo]: Dictionary mapping configuration names to DatasetInfo objects
223
"""
224
225
def get_dataset_config_info(
226
path: str,
227
config_name: Optional[str] = None,
228
data_files: Optional[Union[str, Sequence[str], Mapping[str, Union[str, Sequence[str]]]]] = None,
229
download_config: Optional[DownloadConfig] = None,
230
download_mode: Optional[Union[DownloadMode, str]] = None,
231
revision: Optional[Union[str, Version]] = None,
232
token: Optional[Union[bool, str]] = None,
233
**config_kwargs,
234
) -> DatasetInfo:
235
"""
236
Get the meta information (DatasetInfo) about a dataset for a particular config.
237
238
Parameters:
239
- path (str): Path or name of the dataset
240
- config_name (str, optional): Configuration name
241
- data_files (str, Sequence[str], Mapping, optional): Path(s) to source data file(s)
242
- download_config (DownloadConfig, optional): Specific download configuration parameters
243
- download_mode (DownloadMode or str, optional): Select the download/generation mode
244
- revision (str, Version, optional): Version of the dataset script to load
245
- token (bool or str, optional): Optional string or boolean to use as Bearer token
246
- **config_kwargs: Keyword arguments to be passed to the BuilderConfig
247
248
Returns:
249
- DatasetInfo: Dataset information for the specified configuration
250
"""
251
```
252
253
**Usage Examples:**
254
255
```python
256
from datasets import get_dataset_infos, get_dataset_config_info
257
258
# Get info for all configurations
259
all_infos = get_dataset_infos("nyu-mll/glue")
260
for config, info in all_infos.items():
261
print(f"Config: {config}")
262
print(f" Description: {info.description[:100]}...")
263
print(f" Features: {list(info.features.keys())}")
264
print(f" Splits: {list(info.splits.keys())}")
265
266
# Get info for specific configuration
267
cola_info = get_dataset_config_info("nyu-mll/glue", config_name="cola")
268
print(f"CoLA dataset size: {cola_info.dataset_size} bytes")
269
print(f"CoLA train examples: {cola_info.splits['train'].num_examples}")
270
```
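
Because these calls return metadata only, they can be used to estimate storage cost before committing to a download. A sketch using the size fields documented on `DatasetInfo` above; the fields may be `None` for datasets without published metadata, hence the guard:

```python
from datasets import get_dataset_infos

# Rank configs by size without downloading any data
infos = get_dataset_infos("nyu-mll/glue")
for config, info in sorted(infos.items()):
    if info.download_size is not None and info.dataset_size is not None:
        print(f"{config}: download {info.download_size:,} B, "
              f"prepared {info.dataset_size:,} B")
```
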
### Split Information and Management

Classes for managing dataset splits, subsplits, and split composition operations.

```python { .api }
class SplitBase:
    """Abstract base class for Split compositionality."""

    def get_read_instruction(self, split_dict): ...
    def __add__(self, other): ...  # Merging: Split.TRAIN + Split.TEST
    def subsplit(self, arg=None, k=None, percent=None, weighted=None): ...

class NamedSplit(SplitBase):
    """Descriptor corresponding to a named split (train, test, ...)."""

    def __init__(self, name: str): ...
    def __eq__(self, other) -> bool: ...  # Supports Split.TRAIN == 'train'
    def __str__(self) -> str: ...
    def __repr__(self) -> str: ...
    def get_read_instruction(self, split_dict): ...

class NamedSplitAll(NamedSplit):
    """Split corresponding to the union of all defined dataset splits."""

    def get_read_instruction(self, split_dict): ...

class Split:
    """Enum for dataset splits with predefined constants."""
    TRAIN = NamedSplit("train")
    TEST = NamedSplit("test")
    VALIDATION = NamedSplit("validation")
    ALL = NamedSplitAll()

    def __new__(cls, name: str): ...  # Create custom split

class SplitInfo:
    """Information about a single dataset split."""

    def __init__(
        self,
        name: str = "",
        num_bytes: int = 0,
        num_examples: int = 0,
        shard_lengths: Optional[List[int]] = None,
        dataset_name: Optional[str] = None,
    ): ...

    @property
    def file_instructions(self) -> List[dict]: ...

class SplitDict(dict):
    """Split info object - dictionary of split names to SplitInfo objects."""

    def __init__(self, *args, dataset_name=None, **kwargs): ...
    def __getitem__(self, key: Union[SplitBase, str]): ...  # Supports subsplit instructions
    def add(self, split_info: SplitInfo): ...

    @property
    def total_num_examples(self) -> int: ...

    @classmethod
    def from_split_dict(
        cls,
        split_infos: Union[list, dict],
        dataset_name: Optional[str] = None,
    ) -> "SplitDict": ...

class SplitGenerator:
    """Defines split information for the generator in DatasetBuilder._split_generators."""

    def __init__(self, name: str, gen_kwargs: dict = None): ...
```

**Usage Examples:**

```python
from datasets import Split, SplitInfo, SplitDict, percent

# Use predefined splits
train_split = Split.TRAIN
test_split = Split.TEST
validation_split = Split.VALIDATION

# Create custom split
custom_split = Split("custom_name")

# Combine splits
combined = Split.TRAIN + Split.TEST

# Create subsplits using percentages
train_subset = Split.TRAIN.subsplit(percent[:80])  # First 80%
val_subset = Split.TRAIN.subsplit(percent[80:])    # Last 20%

# Multiple subsplits with weights
train, val, test = Split.TRAIN.subsplit(weighted=[70, 15, 15])

# Create split information
split_info = SplitInfo(
    name="train",
    num_examples=1000,
    num_bytes=1024000
)

# Create split dictionary
split_dict = SplitDict({
    "train": SplitInfo(name="train", num_examples=800),
    "test": SplitInfo(name="test", num_examples=200),
})

print(f"Total examples: {split_dict.total_num_examples}")  # 1000
```
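
`SplitDict.from_split_dict` also accepts the list-of-dicts form used in serialized metadata files; a small sketch of that round trip, with field names following the `SplitInfo` constructor above (exact serialization handling may vary across library versions):

```python
from datasets import SplitDict

# Rebuild a SplitDict from the serialized list-of-dicts form
restored = SplitDict.from_split_dict(
    [
        {"name": "train", "num_bytes": 1024000, "num_examples": 800},
        {"name": "test", "num_bytes": 256000, "num_examples": 200},
    ],
    dataset_name="my_dataset",
)
print(restored["train"].num_examples)  # 800
```
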
### Supervised Learning Configuration

Classes for specifying input/output relationships in supervised learning scenarios.

```python { .api }
class SupervisedKeysData:
    """Specifies input and output for supervised learning."""

    def __init__(self, input: str = "", output: str = ""): ...

class PostProcessedInfo:
    """Information about post-processed resources (e.g., indices)."""

    def __init__(
        self,
        features: Optional[Features] = None,
        resources_checksums: Optional[dict] = None,
    ): ...
```

**Usage Examples:**

```python
from datasets import DatasetInfo, SupervisedKeysData, Features, Value, ClassLabel

# Define supervised learning keys
supervised_keys = SupervisedKeysData(input="text", output="label")

# Create dataset info with supervised keys
info = DatasetInfo(
    description="Text classification dataset",
    features=Features({
        "text": Value("string"),
        "label": ClassLabel(names=["positive", "negative"])
    }),
    supervised_keys=supervised_keys,
    license="MIT"
)

# Access supervised keys
print(f"Input column: {info.supervised_keys.input}")    # text
print(f"Output column: {info.supervised_keys.output}")  # label
```
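
In training code, `supervised_keys` can drive column selection without hard-coding column names. A hedged sketch: `to_xy` is an illustrative helper, not a library function, and it assumes a loaded `Dataset` whose info actually defines supervised keys:

```python
def to_xy(dataset):
    """Return (inputs, targets) columns named by the dataset's supervised_keys."""
    keys = dataset.info.supervised_keys
    if keys is None:
        raise ValueError("Dataset does not define supervised_keys")
    # Column access on a Dataset returns a list of values
    return dataset[keys.input], dataset[keys.output]
```
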
## Advanced Dataset Information Patterns

### Programmatic Dataset Discovery

```python
from datasets import get_dataset_config_names, get_dataset_config_info, get_dataset_split_names

def explore_dataset(dataset_name):
    """Comprehensively explore a dataset's structure."""

    print(f"Exploring dataset: {dataset_name}")

    # Get all configurations
    configs = get_dataset_config_names(dataset_name)
    print(f"Available configurations: {configs}")

    # Explore each configuration
    for config in configs:
        print(f"\nConfiguration: {config}")

        # Get splits for this config
        splits = get_dataset_split_names(dataset_name, config_name=config)
        print(f"  Splits: {splits}")

        # Get detailed info
        info = get_dataset_config_info(dataset_name, config_name=config)
        print(f"  Description: {info.description[:100]}...")
        print(f"  Features: {list(info.features.keys())}")

        # Show split statistics
        for split_name in splits:
            split_info = info.splits[split_name]
            print(f"  {split_name}: {split_info.num_examples} examples, {split_info.num_bytes} bytes")

# Usage
explore_dataset("nyu-mll/glue")
```
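
Not every configuration is readable in practice (gated datasets, missing metadata, network failures), so discovery loops usually guard each call. A defensive variant of the loop above; the broad `except Exception` is a deliberate simplification:

```python
def explore_dataset_safe(dataset_name):
    """Like explore_dataset, but skips configs whose metadata cannot be read."""
    for config in get_dataset_config_names(dataset_name):
        try:
            info = get_dataset_config_info(dataset_name, config_name=config)
        except Exception as err:  # e.g., gated access or transient network errors
            print(f"Skipping {config}: {err}")
            continue
        splits = list((info.splits or {}).keys())
        print(f"{config}: splits={splits}")
```
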
### Custom Dataset Metadata Management

```python
from datasets import (
    ClassLabel,
    DatasetInfo,
    Features,
    SplitDict,
    SplitInfo,
    SupervisedKeysData,
    Value,
)

def create_custom_dataset_info():
    """Create comprehensive dataset metadata."""

    # Define features
    features = Features({
        "id": Value("string"),
        "text": Value("string"),
        "label": ClassLabel(names=["positive", "negative", "neutral"]),
        "confidence": Value("float32"),
        "metadata": {
            "source": Value("string"),
            "timestamp": Value("timestamp[ms]")
        }
    })

    # Define split information
    splits = SplitDict({
        "train": SplitInfo(name="train", num_examples=10000, num_bytes=50000000),
        "validation": SplitInfo(name="validation", num_examples=1000, num_bytes=5000000),
        "test": SplitInfo(name="test", num_examples=2000, num_bytes=10000000),
    })

    # Create comprehensive dataset info
    info = DatasetInfo(
        description="A comprehensive sentiment analysis dataset with confidence scores",
        citation="@misc{my_dataset_2024, title={My Dataset}, author={Author}, year={2024}}",
        homepage="https://example.com/dataset",
        license="Apache-2.0",
        features=features,
        splits=splits,
        supervised_keys=SupervisedKeysData(input="text", output="label"),
        version="1.0.0",
        download_size=65000000,
        dataset_size=65000000,
    )

    return info

# Save and load metadata
info = create_custom_dataset_info()
info.write_to_directory("./my_dataset_info/")
loaded_info = DatasetInfo.from_directory("./my_dataset_info/")
```
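
For metadata that will live in version control, the `pretty_print` flag on `write_to_directory` (see the API block at the top of this page) writes indented JSON that diffs cleanly; continuing the example above:

```python
# Write an indented dataset_info.json for readable diffs
info.write_to_directory("./my_dataset_info/", pretty_print=True)
```
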
### Split Composition and Subsetting

```python
from datasets import Split, percent, load_dataset

def create_complex_splits():
    """Demonstrate advanced split composition."""

    # Load dataset
    dataset = load_dataset("imdb", split=Split.TRAIN)

    # Create an 80/10/10 partition with two chained train_test_split calls:
    # first carve off 20%, then split that 20% evenly
    first = dataset.train_test_split(test_size=0.2, seed=42)
    train_80 = first["train"]
    second = first["test"].train_test_split(test_size=0.5, seed=42)
    val_10, test_10 = second["train"], second["test"]

    # Alternative using split composition
    train_subset = Split.TRAIN.subsplit(percent[:80])
    val_subset = Split.TRAIN.subsplit(percent[80:90])
    test_subset = Split.TRAIN.subsplit(percent[90:])

    # Use weighted subsplits
    splits = Split.TRAIN.subsplit(weighted=[8, 1, 1])  # 80%, 10%, 10%

    return {
        "train": splits[0],
        "validation": splits[1],
        "test": splits[2]
    }
```
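
In practice, most `datasets` workflows express subsplits as slice strings passed directly to `load_dataset`, which avoids the lower-level subsplit API entirely:

```python
from datasets import load_dataset

# Percent-slice syntax on the split argument
train = load_dataset("imdb", split="train[:80%]")
val = load_dataset("imdb", split="train[80%:90%]")
test = load_dataset("imdb", split="train[90%:]")
```
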
This dataset information system provides tools for dataset discovery, metadata management, and split composition, enabling efficient exploration and manipulation of dataset structures without requiring full downloads.