# Dataset Information and Inspection

Functions and classes for inspecting dataset metadata, configurations, and available splits. This module provides comprehensive capabilities for dataset discovery, split management, and metadata handling without requiring full dataset downloads.

## Capabilities

### Dataset Information Container

Central metadata container documenting all known information about a dataset, including structure, licensing, and statistics.

```python { .api }
class DatasetInfo:
    """Information about a dataset."""

    # Static dataset information (set by dataset builders)
    description: str
    citation: str
    homepage: str
    license: str
    features: Optional[Features]
    post_processed: Optional[PostProcessedInfo]
    supervised_keys: Optional[SupervisedKeysData]

    # Dynamic dataset information (set by builder/processing)
    builder_name: Optional[str]
    dataset_name: Optional[str]
    config_name: Optional[str]
    version: Optional[Union[str, Version]]
    splits: Optional[dict]
    download_checksums: Optional[dict]
    download_size: Optional[int]
    post_processing_size: Optional[int]
    dataset_size: Optional[int]
    size_in_bytes: Optional[int]

    # File I/O operations
    def write_to_directory(
        self,
        dataset_info_dir: str,
        pretty_print: bool = False,
        storage_options: Optional[dict] = None,
    ) -> None: ...

    @classmethod
    def from_directory(
        cls,
        dataset_info_dir: str,
        storage_options: Optional[dict] = None,
    ) -> "DatasetInfo": ...

    @classmethod
    def from_dict(cls, dataset_info_dict: dict) -> "DatasetInfo": ...

    # Manipulation operations
    def update(self, other_dataset_info: "DatasetInfo", ignore_none: bool = True) -> None: ...
    def copy(self) -> "DatasetInfo": ...

    @classmethod
    def from_merge(cls, dataset_infos: List["DatasetInfo"]) -> "DatasetInfo": ...
```

**Usage Examples:**

```python
from datasets import load_dataset, DatasetInfo, Features, Value, ClassLabel

# Access dataset info from a loaded dataset
dataset = load_dataset("squad", split="train")
info = dataset.info

print(f"Description: {info.description}")
print(f"Dataset size: {info.dataset_size} bytes")
print(f"Number of examples: {info.splits['train'].num_examples}")
print(f"Features: {info.features}")

# Save dataset info to disk
info.write_to_directory("./squad_info/")

# Load dataset info from disk
loaded_info = DatasetInfo.from_directory("./squad_info/")

# Create custom dataset info
custom_info = DatasetInfo(
    description="My custom dataset",
    features=Features({
        "text": Value("string"),
        "label": ClassLabel(names=["positive", "negative"])
    }),
    license="MIT",
    citation="@misc{my_dataset, ...}"
)
```
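
The manipulation operations (`update`, `copy`, `from_merge`) are not shown above. The following is a minimal sketch with illustrative field values; the exact merge semantics (e.g., how descriptions are combined) are left to the library:

```python
from datasets import DatasetInfo

base_info = DatasetInfo(description="Base dataset", license="MIT")

# copy() returns an independent DatasetInfo that is safe to modify
patched = base_info.copy()

# update() pulls fields from another DatasetInfo; with ignore_none=True
# (the default), fields that are None on the other info are skipped
patched.update(DatasetInfo(homepage="https://example.com/dataset"))

# from_merge() combines several infos, e.g. when concatenating datasets
merged = DatasetInfo.from_merge([base_info, patched])
```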

### Dataset Configuration Discovery

Functions for discovering available dataset configurations, splits, and metadata without downloading the full dataset.

```python { .api }
def get_dataset_config_names(
    path: str,
    revision: Optional[Union[str, Version]] = None,
    download_config: Optional[DownloadConfig] = None,
    download_mode: Optional[Union[DownloadMode, str]] = None,
    data_files: Optional[Union[dict, list, str]] = None,
    **download_kwargs,
) -> List[str]:
    """
    Get the list of available config names for a dataset.

    Parameters:
    - path (str): Path or name of the dataset
    - revision (str, Version, optional): Version of the dataset script to load
    - download_config (DownloadConfig, optional): Specific download configuration parameters
    - download_mode (DownloadMode or str, optional): Select the download/generation mode
    - data_files (dict, list, str, optional): Path(s) to source data file(s)
    - **download_kwargs: Additional download arguments

    Returns:
    - List[str]: List of available configuration names
    """

def get_dataset_default_config_name(
    path: str,
    revision: Optional[Union[str, Version]] = None,
    download_config: Optional[DownloadConfig] = None,
    download_mode: Optional[Union[DownloadMode, str]] = None,
    data_files: Optional[Union[dict, list, str]] = None,
    **download_kwargs,
) -> Optional[str]:
    """
    Get the default config name for a dataset.

    Parameters:
    - path (str): Path or name of the dataset
    - revision (str, Version, optional): Version of the dataset script to load
    - download_config (DownloadConfig, optional): Specific download configuration parameters
    - download_mode (DownloadMode or str, optional): Select the download/generation mode
    - data_files (dict, list, str, optional): Path(s) to source data file(s)
    - **download_kwargs: Additional download arguments

    Returns:
    - Optional[str]: Default configuration name, or None if no default
    """

def get_dataset_split_names(
    path: str,
    config_name: Optional[str] = None,
    data_files: Optional[Union[str, Sequence[str], Mapping[str, Union[str, Sequence[str]]]]] = None,
    download_config: Optional[DownloadConfig] = None,
    download_mode: Optional[Union[DownloadMode, str]] = None,
    revision: Optional[Union[str, Version]] = None,
    token: Optional[Union[bool, str]] = None,
    **config_kwargs,
) -> List[str]:
    """
    Get the list of available splits for a particular config and dataset.

    Parameters:
    - path (str): Path or name of the dataset
    - config_name (str, optional): Configuration name
    - data_files (str, Sequence[str], Mapping, optional): Path(s) to source data file(s)
    - download_config (DownloadConfig, optional): Specific download configuration parameters
    - download_mode (DownloadMode or str, optional): Select the download/generation mode
    - revision (str, Version, optional): Version of the dataset script to load
    - token (bool or str, optional): Optional string or boolean to use as Bearer token
    - **config_kwargs: Keyword arguments to be passed to the BuilderConfig

    Returns:
    - List[str]: List of available split names
    """
```

**Usage Examples:**

```python
from datasets import get_dataset_config_names, get_dataset_default_config_name, get_dataset_split_names

# Discover available configurations
configs = get_dataset_config_names("nyu-mll/glue")
print(f"Available configs: {configs}")
# ['cola', 'sst2', 'mrpc', 'qqp', 'stsb', 'mnli', ...]

# Get default configuration
default = get_dataset_default_config_name("squad")
print(f"Default config: {default}")  # 'plain_text'

# Get available splits for a configuration
splits = get_dataset_split_names("nyu-mll/glue", config_name="cola")
print(f"Available splits: {splits}")  # ['train', 'validation', 'test']

# Check splits for default configuration
splits = get_dataset_split_names("squad")
print(f"SQuAD splits: {splits}")  # ['train', 'validation']
```
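
Because these discovery calls are cheap relative to a full download, a common pattern is to validate user input against them before calling `load_dataset`. The helper below is an illustrative sketch, not a library API:

```python
from datasets import get_dataset_config_names, get_dataset_split_names, load_dataset

def load_split_checked(path, config_name, split):
    """Illustrative helper: fail fast with a clear message before downloading."""
    configs = get_dataset_config_names(path)
    if config_name not in configs:
        raise ValueError(f"Unknown config {config_name!r}; available: {configs}")
    splits = get_dataset_split_names(path, config_name=config_name)
    if split not in splits:
        raise ValueError(f"Unknown split {split!r}; available: {splits}")
    return load_dataset(path, config_name, split=split)

dataset = load_split_checked("nyu-mll/glue", "cola", "train")
```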

### Dataset Information Retrieval

Functions for retrieving detailed metadata about datasets and their configurations.

```python { .api }
def get_dataset_infos(
    path: str,
    data_files: Optional[Union[dict, list, str]] = None,
    download_config: Optional[DownloadConfig] = None,
    download_mode: Optional[Union[DownloadMode, str]] = None,
    revision: Optional[Union[str, Version]] = None,
    token: Optional[Union[bool, str]] = None,
    **config_kwargs,
) -> Dict[str, DatasetInfo]:
    """
    Get meta information about a dataset, returned as a dict mapping config name to DatasetInfo.

    Parameters:
    - path (str): Path or name of the dataset
    - data_files (dict, list, str, optional): Path(s) to source data file(s)
    - download_config (DownloadConfig, optional): Specific download configuration parameters
    - download_mode (DownloadMode or str, optional): Select the download/generation mode
    - revision (str, Version, optional): Version of the dataset script to load
    - token (bool or str, optional): Optional string or boolean to use as Bearer token
    - **config_kwargs: Keyword arguments to be passed to the BuilderConfig

    Returns:
    - Dict[str, DatasetInfo]: Dictionary mapping configuration names to DatasetInfo objects
    """

def get_dataset_config_info(
    path: str,
    config_name: Optional[str] = None,
    data_files: Optional[Union[str, Sequence[str], Mapping[str, Union[str, Sequence[str]]]]] = None,
    download_config: Optional[DownloadConfig] = None,
    download_mode: Optional[Union[DownloadMode, str]] = None,
    revision: Optional[Union[str, Version]] = None,
    token: Optional[Union[bool, str]] = None,
    **config_kwargs,
) -> DatasetInfo:
    """
    Get the meta information (DatasetInfo) about a dataset for a particular config.

    Parameters:
    - path (str): Path or name of the dataset
    - config_name (str, optional): Configuration name
    - data_files (str, Sequence[str], Mapping, optional): Path(s) to source data file(s)
    - download_config (DownloadConfig, optional): Specific download configuration parameters
    - download_mode (DownloadMode or str, optional): Select the download/generation mode
    - revision (str, Version, optional): Version of the dataset script to load
    - token (bool or str, optional): Optional string or boolean to use as Bearer token
    - **config_kwargs: Keyword arguments to be passed to the BuilderConfig

    Returns:
    - DatasetInfo: Dataset information for the specified configuration
    """
```

**Usage Examples:**

```python
from datasets import get_dataset_infos, get_dataset_config_info

# Get info for all configurations
all_infos = get_dataset_infos("nyu-mll/glue")
for config, info in all_infos.items():
    print(f"Config: {config}")
    print(f"  Description: {info.description[:100]}...")
    print(f"  Features: {list(info.features.keys())}")
    print(f"  Splits: {list(info.splits.keys())}")

# Get info for specific configuration
cola_info = get_dataset_config_info("nyu-mll/glue", config_name="cola")
print(f"CoLA dataset size: {cola_info.dataset_size} bytes")
print(f"CoLA train examples: {cola_info.splits['train'].num_examples}")
```
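
Since `get_dataset_infos` returns per-config metadata without downloading any data, it can be used to estimate disk usage up front. A short sketch (size fields may be `None` for some configs, hence the guards):

```python
from datasets import get_dataset_infos

infos = get_dataset_infos("nyu-mll/glue")

# Sum sizes across all configurations, treating missing values as 0
total_download = sum(info.download_size or 0 for info in infos.values())
total_on_disk = sum(info.dataset_size or 0 for info in infos.values())

print(f"Estimated download for all configs: {total_download / 1e6:.1f} MB")
print(f"Estimated size on disk: {total_on_disk / 1e6:.1f} MB")
```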

### Split Information and Management

Classes for managing dataset splits, subsplits, and split composition operations.

```python { .api }
class SplitBase:
    """Abstract base class for Split compositionality."""

    def get_read_instruction(self, split_dict): ...
    def __add__(self, other): ...  # Merging: Split.TRAIN + Split.TEST
    def subsplit(self, arg=None, k=None, percent=None, weighted=None): ...

class NamedSplit(SplitBase):
    """Descriptor corresponding to a named split (train, test, ...)."""

    def __init__(self, name: str): ...
    def __eq__(self, other) -> bool: ...  # Supports Split.TRAIN == 'train'
    def __str__(self) -> str: ...
    def __repr__(self) -> str: ...
    def get_read_instruction(self, split_dict): ...

class NamedSplitAll(NamedSplit):
    """Split corresponding to the union of all defined dataset splits."""

    def get_read_instruction(self, split_dict): ...

class Split:
    """Enum for dataset splits with predefined constants."""
    TRAIN = NamedSplit("train")
    TEST = NamedSplit("test")
    VALIDATION = NamedSplit("validation")
    ALL = NamedSplitAll()

    def __new__(cls, name: str): ...  # Create custom split

class SplitInfo:
    """Information about a single dataset split."""

    def __init__(
        self,
        name: str = "",
        num_bytes: int = 0,
        num_examples: int = 0,
        shard_lengths: Optional[List[int]] = None,
        dataset_name: Optional[str] = None,
    ): ...

    @property
    def file_instructions(self) -> List[dict]: ...

class SplitDict(dict):
    """Split info object - dictionary of split names to SplitInfo objects."""

    def __init__(self, *args, dataset_name=None, **kwargs): ...
    def __getitem__(self, key: Union[SplitBase, str]): ...  # Supports subsplit instructions
    def add(self, split_info: SplitInfo): ...

    @property
    def total_num_examples(self) -> int: ...

    @classmethod
    def from_split_dict(
        cls,
        split_infos: Union[list, dict],
        dataset_name: Optional[str] = None,
    ) -> "SplitDict": ...

class SplitGenerator:
    """Defines split information for the generator in DatasetBuilder._split_generators."""

    def __init__(self, name: str, gen_kwargs: dict = None): ...
```

**Usage Examples:**

```python
from datasets import Split, SplitInfo, SplitDict, percent

# Use predefined splits
train_split = Split.TRAIN
test_split = Split.TEST
validation_split = Split.VALIDATION

# Create custom split
custom_split = Split("custom_name")

# Combine splits
combined = Split.TRAIN + Split.TEST

# Create subsplits using percentages
train_subset = Split.TRAIN.subsplit(percent[:80])  # First 80%
val_subset = Split.TRAIN.subsplit(percent[80:])    # Last 20%

# Multiple subsplits with weights
train, val, test = Split.TRAIN.subsplit(weighted=[70, 15, 15])

# Create split information
split_info = SplitInfo(
    name="train",
    num_examples=1000,
    num_bytes=1024000
)

# Create split dictionary
split_dict = SplitDict({
    "train": SplitInfo(name="train", num_examples=800),
    "test": SplitInfo(name="test", num_examples=200),
})

print(f"Total examples: {split_dict.total_num_examples}")  # 1000
```
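
When passing splits to `load_dataset`, the same compositions can also be written with the string-based split syntax, which is often more convenient:

```python
from datasets import load_dataset

# Merge two splits into one dataset
combined = load_dataset("nyu-mll/glue", "cola", split="train+validation")

# Percent slicing of a split
first_80 = load_dataset("nyu-mll/glue", "cola", split="train[:80%]")
last_20 = load_dataset("nyu-mll/glue", "cola", split="train[80%:]")

# Absolute index slicing
head = load_dataset("nyu-mll/glue", "cola", split="train[:100]")
```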

### Supervised Learning Configuration

Classes for specifying input/output relationships in supervised learning scenarios.

```python { .api }
class SupervisedKeysData:
    """Specifies input and output for supervised learning."""

    def __init__(self, input: str = "", output: str = ""): ...

class PostProcessedInfo:
    """Information about post-processed resources (e.g., indices)."""

    def __init__(
        self,
        features: Optional[Features] = None,
        resources_checksums: Optional[dict] = None,
    ): ...
```

**Usage Examples:**

```python
from datasets import DatasetInfo, SupervisedKeysData, Features, Value, ClassLabel

# Define supervised learning keys
supervised_keys = SupervisedKeysData(input="text", output="label")

# Create dataset info with supervised keys
info = DatasetInfo(
    description="Text classification dataset",
    features=Features({
        "text": Value("string"),
        "label": ClassLabel(names=["positive", "negative"])
    }),
    supervised_keys=supervised_keys,
    license="MIT"
)

# Access supervised keys
print(f"Input column: {info.supervised_keys.input}")    # text
print(f"Output column: {info.supervised_keys.output}")  # label
```
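
Because `supervised_keys` travels with the dataset, generic training code can avoid hard-coding column names. The helper below is an illustrative sketch, not a library function:

```python
def to_xy(dataset):
    """Illustrative: pull the (inputs, targets) columns named by supervised_keys."""
    keys = dataset.info.supervised_keys
    if keys is None:
        raise ValueError("dataset.info.supervised_keys is not set")
    # Indexing a Dataset by column name returns that column's values
    return dataset[keys.input], dataset[keys.output]

# Example (assumes the loaded dataset declares supervised_keys):
# texts, labels = to_xy(load_dataset("some_dataset", split="train"))
```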

## Advanced Dataset Information Patterns

### Programmatic Dataset Discovery

```python
from datasets import (
    get_dataset_config_names,
    get_dataset_config_info,
    get_dataset_split_names,
)

def explore_dataset(dataset_name):
    """Comprehensively explore a dataset's structure."""

    print(f"Exploring dataset: {dataset_name}")

    # Get all configurations
    configs = get_dataset_config_names(dataset_name)
    print(f"Available configurations: {configs}")

    # Explore each configuration
    for config in configs:
        print(f"\nConfiguration: {config}")

        # Get splits for this config
        splits = get_dataset_split_names(dataset_name, config_name=config)
        print(f"  Splits: {splits}")

        # Get detailed info
        info = get_dataset_config_info(dataset_name, config_name=config)
        print(f"  Description: {info.description[:100]}...")
        print(f"  Features: {list(info.features.keys())}")

        # Show split statistics
        for split_name in splits:
            split_info = info.splits[split_name]
            print(f"    {split_name}: {split_info.num_examples} examples, {split_info.num_bytes} bytes")

# Usage
explore_dataset("nyu-mll/glue")
```

### Custom Dataset Metadata Management

```python
from datasets import (
    DatasetInfo,
    Features,
    Value,
    ClassLabel,
    SplitDict,
    SplitInfo,
    SupervisedKeysData,
)

def create_custom_dataset_info():
    """Create comprehensive dataset metadata."""

    # Define features
    features = Features({
        "id": Value("string"),
        "text": Value("string"),
        "label": ClassLabel(names=["positive", "negative", "neutral"]),
        "confidence": Value("float32"),
        "metadata": {
            "source": Value("string"),
            "timestamp": Value("timestamp[ms]")
        }
    })

    # Define split information
    splits = SplitDict({
        "train": SplitInfo(name="train", num_examples=10000, num_bytes=50000000),
        "validation": SplitInfo(name="validation", num_examples=1000, num_bytes=5000000),
        "test": SplitInfo(name="test", num_examples=2000, num_bytes=10000000),
    })

    # Create comprehensive dataset info
    info = DatasetInfo(
        description="A comprehensive sentiment analysis dataset with confidence scores",
        citation="@misc{my_dataset_2024, title={My Dataset}, author={Author}, year={2024}}",
        homepage="https://example.com/dataset",
        license="Apache-2.0",
        features=features,
        splits=splits,
        supervised_keys=SupervisedKeysData(input="text", output="label"),
        version="1.0.0",
        download_size=65000000,
        dataset_size=65000000,
    )

    return info

# Save and load metadata
info = create_custom_dataset_info()
info.write_to_directory("./my_dataset_info/")
loaded_info = DatasetInfo.from_directory("./my_dataset_info/")
```
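
On disk, `write_to_directory` serializes the metadata as JSON (a `dataset_info.json` file in the target directory), so it can also be inspected with ordinary tooling. A quick sketch, assuming the write above has run and the default file name:

```python
import json

# Read the serialized metadata directly, without going through DatasetInfo
with open("./my_dataset_info/dataset_info.json") as f:
    raw = json.load(f)

print(raw["description"])
print(list(raw["splits"].keys()))  # e.g. ['train', 'validation', 'test']
```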

### Split Composition and Subsetting

```python
from datasets import Split, percent, load_dataset

def create_complex_splits():
    """Demonstrate advanced split composition."""

    # Load dataset
    dataset = load_dataset("imdb", split=Split.TRAIN)

    # Create an 80/10/10 partition: first carve off 20%,
    # then split that 20% in half for validation and test
    first_split = dataset.train_test_split(test_size=0.2)
    train_80 = first_split["train"]
    second_split = first_split["test"].train_test_split(test_size=0.5)
    val_10, test_10 = second_split["train"], second_split["test"]

    # Alternative using split composition
    train_subset = Split.TRAIN.subsplit(percent[:80])
    val_subset = Split.TRAIN.subsplit(percent[80:90])
    test_subset = Split.TRAIN.subsplit(percent[90:])

    # Use weighted subsplits
    splits = Split.TRAIN.subsplit(weighted=[8, 1, 1])  # 80%, 10%, 10%

    return {
        "train": splits[0],
        "validation": splits[1],
        "test": splits[2]
    }
```

Together, these APIs support dataset discovery, metadata management, and split composition, enabling efficient exploration and manipulation of dataset structures without requiring full downloads.