
# Dataset Building


Classes and utilities for creating custom dataset builders and configurations. The dataset building system provides a framework for defining how datasets are downloaded, processed, and structured, with support for both generator-based and Arrow-based processing patterns.


## Capabilities


### Dataset Builder Base Class


Abstract base class for all datasets, providing the core infrastructure for dataset download, preparation, and access.

```python { .api }
class DatasetBuilder(ABC):
    """Abstract base class for all datasets."""

    # Class attributes (set in subclass)
    VERSION: Optional[str] = None
    BUILDER_CONFIG_CLASS: Type[BuilderConfig] = BuilderConfig
    BUILDER_CONFIGS: List[BuilderConfig] = []
    DEFAULT_CONFIG_NAME: Optional[str] = None
    DEFAULT_WRITER_BATCH_SIZE: Optional[int] = 1000

    # Core abstract methods (must be implemented)
    @abc.abstractmethod
    def _info(self) -> DatasetInfo:
        """Construct the DatasetInfo object with dataset metadata."""

    @abc.abstractmethod
    def _split_generators(self, dl_manager: Union[DownloadManager, StreamingDownloadManager]) -> List[SplitGenerator]:
        """Return list of SplitGenerators defining how to generate data and splits."""

    @abc.abstractmethod
    def _prepare_split(self, split_generator: SplitGenerator, **kwargs):
        """Generate examples and record them on disk."""

    # Main public methods
    def download_and_prepare(
        self,
        output_dir: Optional[str] = None,
        download_config: Optional[DownloadConfig] = None,
        download_mode: Optional[Union[DownloadMode, str]] = None,
        verification_mode: Optional[Union[VerificationMode, str]] = None,
        dl_manager: Optional[DownloadManager] = None,
        base_path: Optional[str] = None,
        file_format: str = "arrow",
        max_shard_size: Optional[Union[int, str]] = None,
        num_proc: Optional[int] = None,
        storage_options: Optional[dict] = None,
        **kwargs,
    ) -> None: ...

    def as_dataset(
        self,
        split: Optional[Union[str, Split, List[str], List[Split]]] = None,
        run_post_process: bool = True,
        verification_mode: Optional[Union[VerificationMode, str]] = None,
        in_memory: bool = False,
    ) -> Union[Dataset, DatasetDict]: ...

    # Properties
    @property
    def cache_dir(self) -> str: ...

    @property
    def manual_download_instructions(self) -> Optional[str]: ...

    @classproperty
    @classmethod
    def builder_configs(cls) -> Dict[str, BuilderConfig]: ...
```
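
**Usage Example:**

Every builder subclass shares the same public workflow: prepare the data once, then load it. A minimal sketch using `MyTextClassificationBuilder` from the Generator-Based Builder example further down this page:

```python
# Assumes MyTextClassificationBuilder from the usage example below.
builder = MyTextClassificationBuilder()

# Download source files and write the prepared dataset to the cache.
builder.download_and_prepare()

# Load all splits as a DatasetDict, or a single split as a Dataset.
all_splits = builder.as_dataset()
train_ds = builder.as_dataset(split="train")
```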


### Generator-Based Builder


Dataset builder for datasets produced by Python generators that yield dictionaries. Best suited for custom data processing and complex transformations.

```python { .api }
class GeneratorBasedBuilder(DatasetBuilder):
    """Base class for datasets with data generation based on dict generators."""

    @abc.abstractmethod
    def _generate_examples(self, **kwargs):
        """
        Default function generating examples for each SplitGenerator.

        Args:
            **kwargs: Arguments forwarded from the SplitGenerator.gen_kwargs

        Yields:
            key: Union[str, int] - A unique deterministic example identification key
            example: Dict[str, Any] - A feature dictionary ready to be encoded
        """
```


**Usage Example:**

```python
import json

from datasets import GeneratorBasedBuilder, DatasetInfo, Features, Value, ClassLabel, Split, SplitGenerator

class MyTextClassificationBuilder(GeneratorBasedBuilder):

    def _info(self) -> DatasetInfo:
        return DatasetInfo(
            description="A custom text classification dataset",
            features=Features({
                "text": Value("string"),
                "label": ClassLabel(names=["positive", "negative", "neutral"]),
                "confidence": Value("float32"),
            }),
            citation="Custom dataset citation",
            license="MIT",
        )

    def _split_generators(self, dl_manager):
        # Download files using the download manager
        train_file = dl_manager.download("https://example.com/train.jsonl")
        test_file = dl_manager.download("https://example.com/test.jsonl")

        return [
            SplitGenerator(
                name=Split.TRAIN,
                gen_kwargs={"filepath": train_file, "split": "train"}
            ),
            SplitGenerator(
                name=Split.TEST,
                gen_kwargs={"filepath": test_file, "split": "test"}
            ),
        ]

    def _generate_examples(self, filepath, split):
        """Generate examples from the downloaded files."""
        with open(filepath, 'r', encoding='utf-8') as f:
            for idx, line in enumerate(f):
                data = json.loads(line.strip())
                yield f"{split}_{idx}", {
                    "text": data["text"],
                    "label": data["label"],
                    "confidence": data.get("confidence", 1.0),
                }
```


### Arrow-Based Builder


Dataset builder for datasets generated from Arrow tables. More efficient for large datasets and standard formats (CSV, JSON, Parquet).

```python { .api }
class ArrowBasedBuilder(DatasetBuilder):
    """Base class for datasets with data generation based on Arrow loading functions."""

    @abc.abstractmethod
    def _generate_tables(self, **kwargs):
        """
        Default function generating tables for each SplitGenerator.

        Args:
            **kwargs: Arguments forwarded from the SplitGenerator.gen_kwargs

        Yields:
            key: Union[str, int] - A unique deterministic example identification key
            table: pyarrow.Table - A feature table ready to be written to disk
        """
```


**Usage Example:**

```python
import pyarrow as pa
import pandas as pd
from datasets import ArrowBasedBuilder, DatasetInfo, Features, Value, Split, SplitGenerator

class MyCSVDatasetBuilder(ArrowBasedBuilder):

    def _info(self) -> DatasetInfo:
        return DatasetInfo(
            description="A dataset built from CSV files",
            features=Features({
                "id": Value("int64"),
                "text": Value("string"),
                "score": Value("float64"),
                "category": Value("string"),
            })
        )

    def _split_generators(self, dl_manager):
        # Download multiple CSV files
        urls = {
            "train": ["https://example.com/train1.csv", "https://example.com/train2.csv"],
            "test": ["https://example.com/test.csv"]
        }

        downloaded_files = {}
        for split, split_urls in urls.items():
            downloaded_files[split] = [dl_manager.download(url) for url in split_urls]

        return [
            SplitGenerator(name=Split.TRAIN, gen_kwargs={"files": downloaded_files["train"]}),
            SplitGenerator(name=Split.TEST, gen_kwargs={"files": downloaded_files["test"]}),
        ]

    def _generate_tables(self, files):
        """Generate Arrow tables from CSV files."""
        for idx, filepath in enumerate(files):
            # Read each CSV into a pandas DataFrame
            df = pd.read_csv(filepath)

            # Convert to an Arrow table
            table = pa.Table.from_pandas(df)

            yield idx, table
```


### Builder Configuration


Configuration class for dataset builders that defines named configurations and their parameters.

```python { .api }
class BuilderConfig:
    """Base class for DatasetBuilder data configuration."""

    def __init__(
        self,
        name: str = "default",
        version: Optional[Union[str, Version]] = "0.0.0",
        data_dir: Optional[str] = None,
        data_files: Optional[Union[DataFilesDict, DataFilesPatternsDict]] = None,
        description: Optional[str] = None,
    ): ...

    def create_config_id(
        self,
        config_kwargs: dict,
        custom_features: Optional[Features] = None,
    ) -> str: ...
```


**Usage Example:**

```python
import json
from typing import Optional, Union

from datasets import (
    BuilderConfig,
    ClassLabel,
    DatasetInfo,
    Features,
    GeneratorBasedBuilder,
    Split,
    SplitGenerator,
    Value,
    Version,
)
from datasets.data_files import DataFilesDict, DataFilesPatternsDict  # types used in the config signature

class MyBuilderConfig(BuilderConfig):
    """Custom configuration with additional parameters."""

    def __init__(
        self,
        name: str = "default",
        version: Optional[Union[str, Version]] = "1.0.0",
        data_dir: Optional[str] = None,
        data_files: Optional[Union[DataFilesDict, DataFilesPatternsDict]] = None,
        description: Optional[str] = None,
        # Custom parameters
        language: str = "en",
        preprocessing: str = "standard",
        **kwargs,
    ):
        super().__init__(
            name=name,
            version=version,
            data_dir=data_dir,
            data_files=data_files,
            description=description,
            **kwargs,
        )
        self.language = language
        self.preprocessing = preprocessing

class MyConfigurableBuilder(GeneratorBasedBuilder):

    BUILDER_CONFIG_CLASS = MyBuilderConfig
    BUILDER_CONFIGS = [
        MyBuilderConfig(
            name="en_standard",
            description="English dataset with standard preprocessing",
            language="en",
            preprocessing="standard",
        ),
        MyBuilderConfig(
            name="en_minimal",
            description="English dataset with minimal preprocessing",
            language="en",
            preprocessing="minimal",
        ),
        MyBuilderConfig(
            name="es_standard",
            description="Spanish dataset with standard preprocessing",
            language="es",
            preprocessing="standard",
        ),
    ]
    DEFAULT_CONFIG_NAME = "en_standard"

    def _info(self) -> DatasetInfo:
        return DatasetInfo(
            description=f"Dataset in {self.config.language} with {self.config.preprocessing} preprocessing",
            features=Features({
                "text": Value("string"),
                "label": ClassLabel(names=["pos", "neg"]),
                "language": Value("string"),
            })
        )

    def _split_generators(self, dl_manager):
        # Use config parameters to determine data sources
        url = f"https://example.com/{self.config.language}/data.jsonl"
        filepath = dl_manager.download(url)

        return [
            SplitGenerator(
                name=Split.TRAIN,
                gen_kwargs={"filepath": filepath, "preprocessing": self.config.preprocessing}
            )
        ]

    def _generate_examples(self, filepath, preprocessing):
        # Use the preprocessing parameter to choose the processing logic
        with open(filepath, 'r') as f:
            for idx, line in enumerate(f):
                data = json.loads(line)

                text = data["text"]
                if preprocessing == "standard":
                    text = text.lower().strip()
                elif preprocessing == "minimal":
                    text = text.strip()

                yield idx, {
                    "text": text,
                    "label": data["label"],
                    "language": self.config.language,
                }
```
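
A specific configuration can then be selected by name when instantiating the builder. A short sketch, assuming the `config_name` keyword accepted by `DatasetBuilder.__init__` (when loading through `load_dataset`, the same selection is made with the `name` argument):

```python
# Select one of the named configs; omitting config_name falls back to
# DEFAULT_CONFIG_NAME ("en_standard" in this example).
builder = MyConfigurableBuilder(config_name="es_standard")
print(builder.config.language, builder.config.preprocessing)  # -> es standard

builder.download_and_prepare()
spanish_train = builder.as_dataset(split="train")
```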


## Advanced Dataset Building Patterns


### Multi-Format Dataset Builder

```python
import json

import pandas as pd
from datasets import GeneratorBasedBuilder, Split, SplitGenerator

class MultiFormatBuilder(GeneratorBasedBuilder):
    """Builder that can handle multiple input formats.

    Assumes a custom BuilderConfig exposing csv_urls, json_urls, and txt_urls.
    """

    def _split_generators(self, dl_manager):
        # Handle different file types
        files = {
            "csv_files": [dl_manager.download(url) for url in self.config.csv_urls],
            "json_files": [dl_manager.download(url) for url in self.config.json_urls],
            "txt_files": [dl_manager.download(url) for url in self.config.txt_urls],
        }

        return [
            SplitGenerator(name=Split.TRAIN, gen_kwargs=files)
        ]

    def _generate_examples(self, csv_files, json_files, txt_files):
        example_id = 0

        # Process CSV files
        for filepath in csv_files:
            df = pd.read_csv(filepath)
            for _, row in df.iterrows():
                yield example_id, {"text": row["text"], "source": "csv"}
                example_id += 1

        # Process JSON files
        for filepath in json_files:
            with open(filepath) as f:
                data = json.load(f)
            for item in data:
                yield example_id, {"text": item["text"], "source": "json"}
                example_id += 1

        # Process text files
        for filepath in txt_files:
            with open(filepath) as f:
                for line in f:
                    yield example_id, {"text": line.strip(), "source": "txt"}
                    example_id += 1
```


### Dataset with Manual Download

```python
import os

from datasets import GeneratorBasedBuilder, Split, SplitGenerator

class ManualDownloadBuilder(GeneratorBasedBuilder):
    """Builder for datasets requiring manual download."""

    MANUAL_DOWNLOAD_INSTRUCTIONS = """
    Please download the dataset files manually from: https://example.com/dataset
    Extract the files and place them in: {manual_dir}
    The expected files are:
    - train.jsonl
    - test.jsonl
    - metadata.json
    """

    def _split_generators(self, dl_manager):
        # dl_manager.manual_dir points to the manually downloaded files
        manual_dir = dl_manager.manual_dir

        return [
            SplitGenerator(
                name=Split.TRAIN,
                gen_kwargs={"filepath": os.path.join(manual_dir, "train.jsonl")}
            ),
            SplitGenerator(
                name=Split.TEST,
                gen_kwargs={"filepath": os.path.join(manual_dir, "test.jsonl")}
            ),
        ]
```
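
For context, a sketch of how a user typically supplies the manual directory: in the Hugging Face datasets library, the `data_dir` argument of `load_dataset` is surfaced to the builder as `dl_manager.manual_dir` (the script path below is hypothetical):

```python
from datasets import load_dataset

# The directory containing the manually downloaded files is passed as data_dir;
# inside _split_generators it appears as dl_manager.manual_dir.
dataset = load_dataset(
    "path/to/manual_download_builder.py",  # hypothetical script path
    data_dir="/path/to/manually/downloaded/files",
)
```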


### Performance Optimization

```python
import numpy as np
import orjson  # third-party JSON parser, faster than the stdlib json module

from datasets import GeneratorBasedBuilder

class OptimizedBuilder(GeneratorBasedBuilder):
    """Builder with performance optimizations."""

    # Optimize batch size for writing
    DEFAULT_WRITER_BATCH_SIZE = 10000

    def download_and_prepare(self, **kwargs):
        # Use multiprocessing for faster preparation
        kwargs.setdefault("num_proc", 4)

        # Use a larger shard size for fewer output files
        kwargs.setdefault("max_shard_size", "1GB")

        super().download_and_prepare(**kwargs)

    def _generate_examples(self, filepath):
        # Read in binary mode and parse each line with orjson
        with open(filepath, 'rb') as f:
            for idx, line in enumerate(f):
                data = orjson.loads(line)
                yield idx, self._process_example(data)

    def _process_example(self, data):
        # Efficient data processing
        return {
            "text": data["text"],
            "label": data["label"],
            "features": np.array(data["features"], dtype=np.float32),
        }
```
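
One caveat worth noting: `num_proc` only speeds up preparation when a split's `gen_kwargs` can be sharded across workers, which in practice means list-valued entries the library can divide up. A minimal sketch of a shardable split definition (the shard URLs are placeholders):

```python
def _split_generators(self, dl_manager):
    # The list-valued gen_kwarg ("filepaths") can be split across num_proc
    # workers; scalar-only gen_kwargs leave nothing to shard.
    shard_urls = [f"https://example.com/shard_{i}.jsonl" for i in range(8)]
    filepaths = dl_manager.download(shard_urls)

    return [
        SplitGenerator(name=Split.TRAIN, gen_kwargs={"filepaths": filepaths}),
    ]
```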


## Best Practices


### Error Handling and Validation

```python
import json
import logging

logger = logging.getLogger(__name__)

def _generate_examples(self, filepath):
    """Generate examples with proper error handling."""
    try:
        with open(filepath, 'r', encoding='utf-8') as f:
            for idx, line in enumerate(f):
                try:
                    data = json.loads(line.strip())

                    # Validate required fields
                    if "text" not in data or "label" not in data:
                        logger.warning(f"Skipping incomplete example at line {idx}")
                        continue

                    yield idx, {
                        "text": str(data["text"]),
                        "label": str(data["label"]),
                    }

                except json.JSONDecodeError as e:
                    logger.warning(f"Failed to parse JSON at line {idx}: {e}")
                    continue

    except FileNotFoundError:
        raise FileNotFoundError(f"Data file not found: {filepath}")
```


### Testing Dataset Builders

```python
def test_builder():
    """Test the custom dataset builder."""
    # Test builder instantiation
    builder = MyTextClassificationBuilder()

    # Test info generation
    info = builder._info()
    assert "text" in info.features
    assert "label" in info.features

    # Test dataset building
    builder.download_and_prepare()
    dataset = builder.as_dataset()

    # Validate the resulting dataset
    assert len(dataset["train"]) > 0
    assert all(key in dataset["train"].features for key in ["text", "label"])
```


This dataset building system provides flexible, efficient tools for creating custom datasets that integrate with the Hugging Face datasets ecosystem, with support for multiprocessing, caching, streaming, and multiple output formats.