
# Core Dataset Classes

The fundamental dataset classes provide different access patterns and capabilities for working with dataset collections. They form the core of the datasets library, offering both in-memory and streaming approaches to dataset processing.
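
As a quick orientation, here is a minimal sketch of the two access styles: loading a split fully into memory versus streaming it. The dataset name "imdb" is purely illustrative.

```python
from datasets import load_dataset

# In-memory, map-style: random access by index, full Dataset feature set
ds = load_dataset("imdb", split="train")
print(ds[0])  # random access works

# Streaming, iterable-style: examples are produced lazily, sequential access only
ids = load_dataset("imdb", split="train", streaming=True)
print(next(iter(ids)))  # only the first example is fetched
```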

## Capabilities


### Dataset - Map-style Dataset

The main dataset class, backed by Apache Arrow for efficient random access. It provides comprehensive data transformation, filtering, and export capabilities.

```python { .api }
class Dataset:
    """Map-style dataset backed by Apache Arrow for efficient random access."""

    # Core properties
    features: Features
    info: DatasetInfo
    split: Optional[NamedSplit]
    num_rows: int
    num_columns: int
    column_names: List[str]
    shape: Tuple[int, int]

    # Dataset creation (class methods)
    @classmethod
    def from_pandas(
        cls,
        df: "pandas.DataFrame",
        features: Optional[Features] = None,
        info: Optional[DatasetInfo] = None,
        split: Optional[NamedSplit] = None,
        preserve_index: Optional[bool] = None,
    ) -> "Dataset": ...

    @classmethod
    def from_dict(
        cls,
        mapping: dict,
        features: Optional[Features] = None,
        info: Optional[DatasetInfo] = None,
        split: Optional[NamedSplit] = None,
    ) -> "Dataset": ...

    @classmethod
    def from_list(
        cls,
        mapping: List[dict],
        features: Optional[Features] = None,
        info: Optional[DatasetInfo] = None,
        split: Optional[NamedSplit] = None,
    ) -> "Dataset": ...

    # Dataset creation (static methods)
    @staticmethod
    def from_csv(
        path_or_paths: Union[PathLike, List[PathLike]],
        split: Optional[NamedSplit] = None,
        features: Optional[Features] = None,
        cache_dir: Optional[str] = None,
        keep_in_memory: bool = False,
        num_proc: Optional[int] = None,
        **kwargs,
    ) -> "Dataset": ...

    @staticmethod
    def from_json(
        path_or_paths: Union[PathLike, List[PathLike]],
        split: Optional[NamedSplit] = None,
        features: Optional[Features] = None,
        cache_dir: Optional[str] = None,
        keep_in_memory: bool = False,
        field: Optional[str] = None,
        num_proc: Optional[int] = None,
        **kwargs,
    ) -> "Dataset": ...

    @staticmethod
    def from_parquet(
        path_or_paths: Union[PathLike, List[PathLike]],
        split: Optional[NamedSplit] = None,
        features: Optional[Features] = None,
        cache_dir: Optional[str] = None,
        keep_in_memory: bool = False,
        columns: Optional[List[str]] = None,
        num_proc: Optional[int] = None,
        **kwargs,
    ) -> "Dataset": ...

    @staticmethod
    def from_text(
        path_or_paths: Union[PathLike, List[PathLike]],
        split: Optional[NamedSplit] = None,
        features: Optional[Features] = None,
        cache_dir: Optional[str] = None,
        keep_in_memory: bool = False,
        num_proc: Optional[int] = None,
        **kwargs,
    ) -> "Dataset": ...

    @staticmethod
    def from_generator(
        generator: Callable,
        features: Optional[Features] = None,
        cache_dir: Optional[str] = None,
        keep_in_memory: bool = False,
        gen_kwargs: Optional[dict] = None,
        num_proc: Optional[int] = None,
        split: NamedSplit = "train",
        **kwargs,
    ) -> "Dataset": ...

    # Data access
    def __getitem__(self, key): ...
    def __len__(self) -> int: ...

    # Data transformation
    def map(
        self,
        function=None,
        with_indices: bool = False,
        with_rank: bool = False,
        input_columns: Optional[Union[str, List[str]]] = None,
        batched: bool = False,
        batch_size: int = 1000,
        drop_last_batch: bool = False,
        remove_columns: Optional[Union[str, List[str]]] = None,
        keep_in_memory: bool = False,
        load_from_cache_file: Optional[bool] = None,
        cache_file_name: Optional[str] = None,
        writer_batch_size: int = 1000,
        features: Optional[Features] = None,
        disable_nullable: bool = False,
        fn_kwargs: Optional[dict] = None,
        num_proc: Optional[int] = None,
        desc: Optional[str] = None,
        **kwargs
    ) -> "Dataset": ...

    def filter(
        self,
        function=None,
        with_indices: bool = False,
        with_rank: bool = False,
        input_columns: Optional[Union[str, List[str]]] = None,
        batched: bool = False,
        batch_size: int = 1000,
        keep_in_memory: bool = False,
        load_from_cache_file: Optional[bool] = None,
        cache_file_name: Optional[str] = None,
        writer_batch_size: int = 1000,
        fn_kwargs: Optional[dict] = None,
        num_proc: Optional[int] = None,
        desc: Optional[str] = None,
        **kwargs
    ) -> "Dataset": ...

    def select(
        self,
        indices: Union[int, List[int], Iterable[int]],
        keep_in_memory: bool = False,
        indices_cache_file_name: Optional[str] = None,
        writer_batch_size: int = 1000,
        **kwargs
    ) -> "Dataset": ...

    def sort(
        self,
        column_names: Union[str, List[str]],
        reverse: Union[bool, List[bool]] = False,
        null_placement: str = "at_end",
        keep_in_memory: bool = False,
        load_from_cache_file: Optional[bool] = None,
        **kwargs
    ) -> "Dataset": ...

    def shuffle(
        self,
        seed: Optional[int] = None,
        generator: Optional[np.random.Generator] = None,
        keep_in_memory: bool = False,
        load_from_cache_file: Optional[bool] = None,
        **kwargs
    ) -> "Dataset": ...

    # Column operations
    def remove_columns(self, column_names: Union[str, List[str]], **kwargs) -> "Dataset": ...
    def rename_column(self, original_column_name: str, new_column_name: str, **kwargs) -> "Dataset": ...
    def rename_columns(self, column_mapping: Dict[str, str], **kwargs) -> "Dataset": ...
    def select_columns(self, column_names: Union[str, List[str]], **kwargs) -> "Dataset": ...
    def add_column(self, name: str, column: Union[list, np.array], **kwargs) -> "Dataset": ...

    # Type casting
    def cast(self, features: Features, **kwargs) -> "Dataset": ...
    def cast_column(self, column: str, feature, **kwargs) -> "Dataset": ...

    # Data formatting
    def with_format(
        self,
        type: Optional[str] = None,
        columns: Optional[List] = None,
        output_all_columns: bool = False,
        **format_kwargs
    ) -> "Dataset": ...

    def set_format(
        self,
        type: Optional[str] = None,
        columns: Optional[List] = None,
        output_all_columns: bool = False,
        **format_kwargs
    ) -> None: ...

    def reset_format(self) -> None: ...

    # Data export
    def to_dict(self, batch_size: Optional[int] = None, batched: bool = False) -> dict: ...
    def to_pandas(
        self,
        batch_size: Optional[int] = None,
        batched: bool = False
    ) -> Union["pandas.DataFrame", Iterator["pandas.DataFrame"]]: ...
    def save_to_disk(
        self,
        dataset_path: PathLike,
        max_shard_size: Optional[Union[str, int]] = None,
        num_shards: Optional[int] = None,
        num_proc: Optional[int] = None,
        storage_options: Optional[dict] = None,
    ) -> None: ...

    # Dataset splitting
    def train_test_split(
        self,
        test_size: Optional[Union[float, int]] = None,
        train_size: Optional[Union[float, int]] = None,
        shuffle: bool = True,
        seed: Optional[int] = None,
        **kwargs
    ) -> "DatasetDict": ...

    def shard(
        self,
        num_shards: int,
        index: int,
        contiguous: bool = True,
        **kwargs
    ) -> "Dataset": ...
```

**Usage Examples:**

```python
from datasets import Dataset

# Create dataset from dictionary
data = {"text": ["Hello", "World"], "label": [0, 1]}
dataset = Dataset.from_dict(data)

# Transform data
def uppercase(example):
    example["text"] = example["text"].upper()
    return example

dataset = dataset.map(uppercase)

# Filter data
dataset = dataset.filter(lambda x: len(x["text"]) > 3)

# Export to different formats
dataset.set_format("torch")
pandas_df = dataset.to_pandas()
```
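
The file-based and generator-based constructors listed in the API block follow the same pattern; a brief sketch (the file paths are placeholders, not part of the original docs):

```python
from datasets import Dataset

# From local files (paths are placeholders)
csv_ds = Dataset.from_csv("data/train.csv")
json_ds = Dataset.from_json("data/train.jsonl")
parquet_ds = Dataset.from_parquet("data/train.parquet")

# From a Python generator, useful when data does not already live in files
def gen():
    for i in range(3):
        yield {"text": f"example {i}", "label": i % 2}

gen_ds = Dataset.from_generator(gen)
print(gen_ds.num_rows)  # 3
```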

### DatasetDict - Multiple Dataset Container

Dictionary-like container that holds multiple Dataset objects, typically representing different splits (train, validation, test).

```python { .api }
class DatasetDict(dict):
    """Dictionary of Dataset objects, typically for train/validation/test splits."""

    # Properties
    num_columns: Dict[str, int]
    num_rows: Dict[str, int]
    column_names: Dict[str, List[str]]
    shape: Dict[str, Tuple[int, int]]

    # Data transformation (applied to all splits)
    def map(self, function=None, **kwargs) -> "DatasetDict": ...
    def filter(self, function=None, **kwargs) -> "DatasetDict": ...
    def sort(self, column_names: Union[str, List[str]], **kwargs) -> "DatasetDict": ...
    def shuffle(self, **kwargs) -> "DatasetDict": ...

    # Column operations (applied to all splits)
    def remove_columns(self, column_names: Union[str, List[str]]) -> "DatasetDict": ...
    def rename_column(self, original_column_name: str, new_column_name: str) -> "DatasetDict": ...
    def rename_columns(self, column_mapping: Dict[str, str]) -> "DatasetDict": ...
    def select_columns(self, column_names: Union[str, List[str]]) -> "DatasetDict": ...

    # Type operations (applied to all splits)
    def cast(self, features: Features) -> "DatasetDict": ...
    def cast_column(self, column: str, feature) -> "DatasetDict": ...

    # Formatting (applied to all splits)
    def with_format(
        self,
        type: Optional[str] = None,
        columns: Optional[List] = None,
        output_all_columns: bool = False,
        **format_kwargs
    ) -> "DatasetDict": ...

    def set_format(
        self,
        type: Optional[str] = None,
        columns: Optional[List] = None,
        output_all_columns: bool = False,
        **format_kwargs
    ) -> None: ...

    def reset_format(self) -> None: ...

    # Data export
    def save_to_disk(self, dataset_dict_path: str, **kwargs) -> None: ...

    # Utilities
    def flatten(self, max_depth: int = 16) -> "DatasetDict": ...
    def unique(self, column: str) -> Dict[str, List]: ...
    def cleanup_cache_files(self) -> Dict[str, int]: ...
```

**Usage Examples:**

```python
from datasets import DatasetDict, Dataset

# Create DatasetDict from separate datasets
dataset_dict = DatasetDict({
    "train": Dataset.from_dict({"text": ["train1", "train2"], "label": [0, 1]}),
    "test": Dataset.from_dict({"text": ["test1"], "label": [0]})
})

# Apply operations to all splits
dataset_dict = dataset_dict.map(lambda x: {"length": len(x["text"])})
dataset_dict = dataset_dict.filter(lambda x: x["length"] > 3)

# Access individual splits
train_data = dataset_dict["train"]
test_data = dataset_dict["test"]
```
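
A DatasetDict is also what `Dataset.train_test_split` returns, so it is the natural container when carving a single in-memory dataset into splits; a minimal sketch (the save path is a placeholder):

```python
from datasets import Dataset

ds = Dataset.from_dict({"text": ["a", "b", "c", "d"], "label": [0, 1, 0, 1]})

# Split into a DatasetDict with "train" and "test" keys
splits = ds.train_test_split(test_size=0.25, seed=42)
print(splits["train"].num_rows, splits["test"].num_rows)  # 3 1

# The whole DatasetDict can be persisted and reloaded from disk
splits.save_to_disk("my_dataset")  # path is a placeholder
```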

### IterableDataset - Streaming Dataset

Iterable-style dataset for streaming large datasets without loading everything into memory. Processes data on-the-fly with sequential access only.

```python { .api }
class IterableDataset:
    """Iterable-style dataset for streaming large datasets without loading into memory."""

    # Properties
    features: Optional[Features]
    info: DatasetInfo
    split: Optional[NamedSplit]
    num_columns: Optional[int]
    column_names: Optional[List[str]]

    # Iteration
    def __iter__(self): ...
    def iter(self, batch_size: int, drop_last_batch: bool = False): ...

    # Iteration control
    def take(self, n: int) -> "IterableDataset": ...
    def skip(self, n: int) -> "IterableDataset": ...

    # Data transformation (streaming)
    def map(
        self,
        function=None,
        with_indices: bool = False,
        input_columns: Optional[Union[str, List[str]]] = None,
        batched: bool = False,
        batch_size: int = 1000,
        drop_last_batch: bool = False,
        remove_columns: Optional[Union[str, List[str]]] = None,
        features: Optional[Features] = None,
        fn_kwargs: Optional[dict] = None,
    ) -> "IterableDataset": ...

    def filter(
        self,
        function=None,
        with_indices: bool = False,
        input_columns: Optional[Union[str, List[str]]] = None,
        batched: bool = False,
        batch_size: int = 1000,
        fn_kwargs: Optional[dict] = None,
    ) -> "IterableDataset": ...

    def shuffle(
        self,
        seed: Optional[int] = None,
        generator: Optional[np.random.Generator] = None,
        buffer_size: int = 1000,
    ) -> "IterableDataset": ...

    # Column operations (streaming)
    def remove_columns(self, column_names: Union[str, List[str]]) -> "IterableDataset": ...
    def rename_column(self, original_column_name: str, new_column_name: str) -> "IterableDataset": ...
    def rename_columns(self, column_mapping: Dict[str, str]) -> "IterableDataset": ...
    def select_columns(self, column_names: Union[str, List[str]]) -> "IterableDataset": ...

    # Type operations (streaming)
    def cast(self, features: Features) -> "IterableDataset": ...
    def cast_column(self, column: str, feature) -> "IterableDataset": ...

    # Formatting (streaming)
    def with_format(self, type: Optional[str] = None) -> "IterableDataset": ...
```

**Usage Examples:**

```python
from datasets import load_dataset

# Create streaming dataset (requesting a single split yields an IterableDataset)
streaming_dataset = load_dataset("oscar", "unshuffled_deduplicated_en", split="train", streaming=True)

# Take first 1000 examples
small_dataset = streaming_dataset.take(1000)

# Apply transformations on-the-fly
def preprocess(example):
    example["length"] = len(example["text"])
    return example

processed = small_dataset.map(preprocess)

# Iterate through examples
for example in processed:
    print(example["length"])
    break
```
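
Because streaming access is sequential, shuffling works over a fixed-size buffer and batching happens during iteration; a short sketch building on the streaming example above (the dataset name and sizes are illustrative):

```python
from datasets import load_dataset

stream = load_dataset("imdb", split="train", streaming=True)

# Approximate shuffling: examples are drawn from a 10k-element buffer
shuffled = stream.shuffle(seed=42, buffer_size=10_000)

# skip/take carve out non-overlapping subsets without loading anything up front
eval_stream = shuffled.take(1_000)
train_stream = shuffled.skip(1_000)

# Batched iteration over the stream
for batch in train_stream.iter(batch_size=32):
    print(len(batch["text"]))  # 32
    break
```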

### IterableDatasetDict - Streaming Dataset Container

Dictionary-like container for multiple IterableDataset objects representing different splits for streaming workflows.

```python { .api }
class IterableDatasetDict(dict):
    """Dictionary of IterableDataset objects for streaming workflows."""

    # Properties
    num_columns: Optional[Dict[str, int]]
    column_names: Optional[Dict[str, List[str]]]

    # Data transformation (applied to all streaming splits)
    def map(self, function=None, **kwargs) -> "IterableDatasetDict": ...
    def filter(self, function=None, **kwargs) -> "IterableDatasetDict": ...
    def shuffle(self, **kwargs) -> "IterableDatasetDict": ...

    # Column operations (applied to all streaming splits)
    def remove_columns(self, column_names: Union[str, List[str]]) -> "IterableDatasetDict": ...
    def rename_column(self, original_column_name: str, new_column_name: str) -> "IterableDatasetDict": ...
    def rename_columns(self, column_mapping: Dict[str, str]) -> "IterableDatasetDict": ...
    def select_columns(self, column_names: Union[str, List[str]]) -> "IterableDatasetDict": ...

    # Type operations (applied to all streaming splits)
    def cast(self, features: Features) -> "IterableDatasetDict": ...
    def cast_column(self, column: str, feature) -> "IterableDatasetDict": ...

    # Formatting (applied to all streaming splits)
    def with_format(self, type: Optional[str] = None) -> "IterableDatasetDict": ...
```

**Usage Examples:**

```python
from datasets import load_dataset

# Load streaming dataset with multiple splits
streaming_dict = load_dataset("squad", streaming=True)

# Apply operations to all streaming splits
streaming_dict = streaming_dict.map(lambda x: {"question_length": len(x["question"])})

# Access individual streaming splits
train_stream = streaming_dict["train"]
validation_stream = streaming_dict["validation"]

# Take samples from each split
for example in train_stream.take(5):
    print(f"Question length: {example['question_length']}")
```
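
Formatting set on the container applies to every streaming split, which is convenient when the streams feed a training loop; a hedged sketch (assumes PyTorch is installed so the `"torch"` format is available):

```python
from datasets import load_dataset

streams = load_dataset("squad", streaming=True)

# Produce tensors instead of Python lists where the feature types allow it
# (requires torch to be installed)
streams = streams.with_format("torch")

for example in streams["train"].take(1):
    print(type(example))  # dict of formatted values
```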

## Types

### Path Types

```python { .api }
from os import PathLike
```

### Column Types

```python { .api }
class Column:
    """Iterable for accessing specific columns of a dataset."""

    def __init__(self, table, info: Optional[DatasetInfo] = None): ...
    def __iter__(self): ...
    def __len__(self) -> int: ...

class IterableColumn:
    """Iterable column access for IterableDataset."""

    def __init__(self, dataset, key: str): ...
    def __iter__(self): ...
```
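
As a hedged illustration (assuming a datasets version where indexing a dataset by column name yields these column iterables, as the types above suggest), column values can be consumed one at a time:

```python
from datasets import Dataset, load_dataset

ds = Dataset.from_dict({"text": ["a", "b", "c"], "label": [0, 1, 0]})

# Column access by name on a map-style dataset; values can be iterated directly
for value in ds["text"]:
    print(value)

# On a streaming dataset the column values are produced lazily
stream = load_dataset("imdb", split="train", streaming=True)
for value in stream["text"]:
    print(len(value))
    break
```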


### Performance Considerations

- **Dataset/DatasetDict**: Best for smaller datasets that fit in memory; supports random access and complex operations.
- **IterableDataset/IterableDatasetDict**: Best for large datasets; memory-efficient streaming with sequential access only.
- **Caching**: Dataset operations are cached by default for reproducibility.
- **Multiprocessing**: Many operations accept a `num_proc` parameter for parallel processing (see the sketch below).
- **Apache Arrow**: The underlying storage format provides efficient columnar operations.
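
A minimal sketch of how caching and multiprocessing show up in practice (the map function and worker count are illustrative):

```python
from datasets import Dataset

ds = Dataset.from_dict({"text": ["hello"] * 10_000})

def add_length(example):
    return {"length": len(example["text"])}

# Spread the work across 4 worker processes
ds = ds.map(add_length, num_proc=4)

# For cache-backed datasets, identical map calls are normally reused from the
# on-disk cache; load_from_cache_file=False forces recomputation instead.
ds = ds.map(add_length, load_from_cache_file=False)
```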