# Dataset Building

Classes and utilities for creating custom dataset builders and configurations for new datasets. The dataset building system provides a robust framework for defining how datasets are downloaded, processed, and structured, with support for both generator-based and Arrow-based processing patterns.

## Capabilities

### Dataset Builder Base Class

Abstract base class for all datasets providing the core infrastructure for dataset download, preparation, and access.

```python { .api }
class DatasetBuilder(ABC):
    """Abstract base class for all datasets."""

    # Class attributes (set in subclass)
    VERSION: Optional[str] = None
    BUILDER_CONFIG_CLASS: Type[BuilderConfig] = BuilderConfig
    BUILDER_CONFIGS: List[BuilderConfig] = []
    DEFAULT_CONFIG_NAME: Optional[str] = None
    DEFAULT_WRITER_BATCH_SIZE: Optional[int] = 1000

    # Core abstract methods (must be implemented)
    @abc.abstractmethod
    def _info(self) -> DatasetInfo:
        """Construct the DatasetInfo object with dataset metadata."""

    @abc.abstractmethod
    def _split_generators(self, dl_manager: Union[DownloadManager, StreamingDownloadManager]) -> List[SplitGenerator]:
        """Return a list of SplitGenerators defining how to generate data and splits."""

    @abc.abstractmethod
    def _prepare_split(self, split_generator: SplitGenerator, **kwargs):
        """Generate examples and record them on disk."""

    # Main public methods
    def download_and_prepare(
        self,
        output_dir: Optional[str] = None,
        download_config: Optional[DownloadConfig] = None,
        download_mode: Optional[Union[DownloadMode, str]] = None,
        verification_mode: Optional[Union[VerificationMode, str]] = None,
        dl_manager: Optional[DownloadManager] = None,
        base_path: Optional[str] = None,
        file_format: str = "arrow",
        max_shard_size: Optional[Union[int, str]] = None,
        num_proc: Optional[int] = None,
        storage_options: Optional[dict] = None,
        **kwargs,
    ) -> None: ...

    def as_dataset(
        self,
        split: Optional[Union[str, Split, List[str], List[Split]]] = None,
        run_post_process: bool = True,
        verification_mode: Optional[Union[VerificationMode, str]] = None,
        in_memory: bool = False,
    ) -> Union[Dataset, DatasetDict]: ...

    # Properties
    @property
    def cache_dir(self) -> str: ...

    @property
    def manual_download_instructions(self) -> Optional[str]: ...

    @classproperty
    @classmethod
    def builder_configs(cls) -> Dict[str, BuilderConfig]: ...
```
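
**Usage Example:**

A minimal sketch of the two main entry points, using `load_dataset_builder` to instantiate a builder for a dataset already on the Hub (the dataset name here is illustrative):

```python
from datasets import load_dataset_builder

builder = load_dataset_builder("rotten_tomatoes")

# Download the source data and write the prepared Arrow files to the cache
builder.download_and_prepare()

# Materialize the prepared data; pass a split name to get a single Dataset
dataset = builder.as_dataset(split="train")
```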

### Generator-Based Builder

Dataset builder for datasets generated from Python generators yielding dictionaries. Best for custom data processing and complex transformations.

```python { .api }
class GeneratorBasedBuilder(DatasetBuilder):
    """Base class for datasets with data generation based on dict generators."""

    @abc.abstractmethod
    def _generate_examples(self, **kwargs):
        """
        Default function generating examples for each SplitGenerator.

        Args:
            **kwargs: Arguments forwarded from the SplitGenerator.gen_kwargs

        Yields:
            key: Union[str, int] - A unique deterministic example identification key
            example: Dict[str, Any] - A feature dictionary ready to be encoded
        """
```

**Usage Example:**

```python
import json

from datasets import GeneratorBasedBuilder, DatasetInfo, Features, Value, ClassLabel, Split, SplitGenerator

class MyTextClassificationBuilder(GeneratorBasedBuilder):

    def _info(self) -> DatasetInfo:
        return DatasetInfo(
            description="A custom text classification dataset",
            features=Features({
                "text": Value("string"),
                "label": ClassLabel(names=["positive", "negative", "neutral"]),
                "confidence": Value("float32"),
            }),
            citation="Custom dataset citation",
            license="MIT",
        )

    def _split_generators(self, dl_manager):
        # Download files using the download manager
        train_file = dl_manager.download("https://example.com/train.jsonl")
        test_file = dl_manager.download("https://example.com/test.jsonl")

        return [
            SplitGenerator(
                name=Split.TRAIN,
                gen_kwargs={"filepath": train_file, "split": "train"},
            ),
            SplitGenerator(
                name=Split.TEST,
                gen_kwargs={"filepath": test_file, "split": "test"},
            ),
        ]

    def _generate_examples(self, filepath, split):
        """Generate examples from the downloaded files."""
        with open(filepath, "r", encoding="utf-8") as f:
            for idx, line in enumerate(f):
                data = json.loads(line.strip())
                yield f"{split}_{idx}", {
                    "text": data["text"],
                    "label": data["label"],
                    "confidence": data.get("confidence", 1.0),
                }
```

### Arrow-Based Builder

Dataset builder for datasets generated from Arrow tables. More efficient for large datasets and standard formats (CSV, JSON, Parquet).

```python { .api }
class ArrowBasedBuilder(DatasetBuilder):
    """Base class for datasets with data generation based on Arrow loading functions."""

    @abc.abstractmethod
    def _generate_tables(self, **kwargs):
        """
        Default function generating tables for each SplitGenerator.

        Args:
            **kwargs: Arguments forwarded from the SplitGenerator.gen_kwargs

        Yields:
            key: Union[str, int] - A unique deterministic example identification key
            table: pyarrow.Table - A feature table ready to be written to disk
        """
```

**Usage Example:**

```python
import pandas as pd
import pyarrow as pa

from datasets import ArrowBasedBuilder, DatasetInfo, Features, Value, Split, SplitGenerator

class MyCSVDatasetBuilder(ArrowBasedBuilder):

    def _info(self) -> DatasetInfo:
        return DatasetInfo(
            description="A dataset built from CSV files",
            features=Features({
                "id": Value("int64"),
                "text": Value("string"),
                "score": Value("float64"),
                "category": Value("string"),
            }),
        )

    def _split_generators(self, dl_manager):
        # Download multiple CSV files
        urls = {
            "train": ["https://example.com/train1.csv", "https://example.com/train2.csv"],
            "test": ["https://example.com/test.csv"],
        }

        downloaded_files = {}
        for split, split_urls in urls.items():
            downloaded_files[split] = [dl_manager.download(url) for url in split_urls]

        return [
            SplitGenerator(name=Split.TRAIN, gen_kwargs={"files": downloaded_files["train"]}),
            SplitGenerator(name=Split.TEST, gen_kwargs={"files": downloaded_files["test"]}),
        ]

    def _generate_tables(self, files):
        """Generate Arrow tables from CSV files."""
        for idx, filepath in enumerate(files):
            # Read CSV into a pandas DataFrame, then convert to an Arrow table
            df = pd.read_csv(filepath)
            table = pa.Table.from_pandas(df)
            yield idx, table
```
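
For large files, pandas can be skipped entirely: `pyarrow.csv.read_csv` parses a CSV straight into an Arrow table. A sketch of an alternative `_generate_tables` under that assumption:

```python
from pyarrow import csv

def _generate_tables(self, files):
    """Generate Arrow tables by reading CSV files directly with pyarrow."""
    for idx, filepath in enumerate(files):
        # read_csv returns a pyarrow.Table, avoiding the pandas round-trip
        yield idx, csv.read_csv(filepath)
```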

### Builder Configuration

Configuration class for dataset builders that defines named configurations and their parameters.

```python { .api }
class BuilderConfig:
    """Base class for DatasetBuilder data configuration."""

    def __init__(
        self,
        name: str = "default",
        version: Optional[Union[str, Version]] = "0.0.0",
        data_dir: Optional[str] = None,
        data_files: Optional[Union[DataFilesDict, DataFilesPatternsDict]] = None,
        description: Optional[str] = None,
    ): ...

    def create_config_id(
        self,
        config_kwargs: dict,
        custom_features: Optional[Features] = None,
    ) -> str: ...
```

**Usage Example:**

```python
import json
from typing import Optional, Union

from datasets import (
    BuilderConfig,
    ClassLabel,
    DatasetInfo,
    Features,
    GeneratorBasedBuilder,
    Split,
    SplitGenerator,
    Value,
    Version,
)
from datasets.data_files import DataFilesDict, DataFilesPatternsDict

class MyBuilderConfig(BuilderConfig):
    """Custom configuration with additional parameters."""

    def __init__(
        self,
        name: str = "default",
        version: Optional[Union[str, Version]] = "1.0.0",
        data_dir: Optional[str] = None,
        data_files: Optional[Union[DataFilesDict, DataFilesPatternsDict]] = None,
        description: Optional[str] = None,
        # Custom parameters
        language: str = "en",
        preprocessing: str = "standard",
        **kwargs,
    ):
        super().__init__(
            name=name,
            version=version,
            data_dir=data_dir,
            data_files=data_files,
            description=description,
            **kwargs,
        )
        self.language = language
        self.preprocessing = preprocessing

class MyConfigurableBuilder(GeneratorBasedBuilder):

    BUILDER_CONFIG_CLASS = MyBuilderConfig
    BUILDER_CONFIGS = [
        MyBuilderConfig(
            name="en_standard",
            description="English dataset with standard preprocessing",
            language="en",
            preprocessing="standard",
        ),
        MyBuilderConfig(
            name="en_minimal",
            description="English dataset with minimal preprocessing",
            language="en",
            preprocessing="minimal",
        ),
        MyBuilderConfig(
            name="es_standard",
            description="Spanish dataset with standard preprocessing",
            language="es",
            preprocessing="standard",
        ),
    ]
    DEFAULT_CONFIG_NAME = "en_standard"

    def _info(self) -> DatasetInfo:
        return DatasetInfo(
            description=f"Dataset in {self.config.language} with {self.config.preprocessing} preprocessing",
            features=Features({
                "text": Value("string"),
                "label": ClassLabel(names=["pos", "neg"]),
                "language": Value("string"),
            }),
        )

    def _split_generators(self, dl_manager):
        # Use config parameters to determine data sources
        url = f"https://example.com/{self.config.language}/data.jsonl"
        filepath = dl_manager.download(url)

        return [
            SplitGenerator(
                name=Split.TRAIN,
                gen_kwargs={"filepath": filepath, "preprocessing": self.config.preprocessing},
            )
        ]

    def _generate_examples(self, filepath, preprocessing):
        # Use the preprocessing parameter to select the processing logic
        with open(filepath, "r", encoding="utf-8") as f:
            for idx, line in enumerate(f):
                data = json.loads(line)

                text = data["text"]
                if preprocessing == "standard":
                    text = text.lower().strip()
                elif preprocessing == "minimal":
                    text = text.strip()

                yield idx, {
                    "text": text,
                    "label": data["label"],
                    "language": self.config.language,
                }
```
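
Named configurations are then selected at load time. A sketch, assuming the builder above is packaged at a hypothetical dataset path:

```python
from datasets import load_dataset_builder

# Pick one of the BUILDER_CONFIGS by name (the path is hypothetical)
builder = load_dataset_builder("path/to/my_dataset", "es_standard")
print(builder.config.language)       # "es"
print(builder.config.preprocessing)  # "standard"
```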

## Advanced Dataset Building Patterns

### Multi-Format Dataset Builder

```python
import json

import pandas as pd

class MultiFormatBuilder(GeneratorBasedBuilder):
    """Builder that can handle multiple input formats."""

    def _split_generators(self, dl_manager):
        # Handle different file types; csv_urls, json_urls, and txt_urls are
        # expected on a custom BuilderConfig (a sketch follows this block)
        files = {
            "csv_files": [dl_manager.download(url) for url in self.config.csv_urls],
            "json_files": [dl_manager.download(url) for url in self.config.json_urls],
            "txt_files": [dl_manager.download(url) for url in self.config.txt_urls],
        }

        return [
            SplitGenerator(name=Split.TRAIN, gen_kwargs=files)
        ]

    def _generate_examples(self, csv_files, json_files, txt_files):
        example_id = 0

        # Process CSV files
        for filepath in csv_files:
            df = pd.read_csv(filepath)
            for _, row in df.iterrows():
                yield example_id, {"text": row["text"], "source": "csv"}
                example_id += 1

        # Process JSON files
        for filepath in json_files:
            with open(filepath, encoding="utf-8") as f:
                data = json.load(f)
            for item in data:
                yield example_id, {"text": item["text"], "source": "json"}
                example_id += 1

        # Process text files
        for filepath in txt_files:
            with open(filepath, encoding="utf-8") as f:
                for line in f:
                    yield example_id, {"text": line.strip(), "source": "txt"}
                    example_id += 1
```
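
The example above assumes a custom configuration carrying the URL lists. A minimal sketch of that hypothetical config class (`csv_urls`, `json_urls`, and `txt_urls` are illustrative names, not library attributes):

```python
class MultiFormatConfig(BuilderConfig):
    """Hypothetical config holding per-format URL lists."""

    def __init__(self, csv_urls=(), json_urls=(), txt_urls=(), **kwargs):
        super().__init__(**kwargs)
        self.csv_urls = list(csv_urls)
        self.json_urls = list(json_urls)
        self.txt_urls = list(txt_urls)
```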

### Dataset with Manual Download

```python
import os

class ManualDownloadBuilder(GeneratorBasedBuilder):
    """Builder for datasets requiring manual download."""

    # The base class exposes manual_download_instructions as a property,
    # so subclasses override it the same way
    @property
    def manual_download_instructions(self):
        return (
            "Please download the dataset files manually from: https://example.com/dataset\n"
            "Extract the files and place them in the directory passed as data_dir.\n"
            "The expected files are:\n"
            "- train.jsonl\n"
            "- test.jsonl\n"
            "- metadata.json"
        )

    def _split_generators(self, dl_manager):
        # dl_manager.manual_dir points to the manually downloaded files
        manual_dir = dl_manager.manual_dir

        return [
            SplitGenerator(
                name=Split.TRAIN,
                gen_kwargs={"filepath": os.path.join(manual_dir, "train.jsonl")},
            ),
            SplitGenerator(
                name=Split.TEST,
                gen_kwargs={"filepath": os.path.join(manual_dir, "test.jsonl")},
            ),
        ]
```
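
Users then point the loader at the manually downloaded files via `data_dir`, which the builder sees as `dl_manager.manual_dir`. A sketch (the dataset path is hypothetical):

```python
from datasets import load_dataset

dataset = load_dataset("path/to/manual_dataset", data_dir="/path/to/downloaded/files")
```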

### Performance Optimization

```python
import numpy as np
import orjson  # third-party package; faster JSON parsing than the stdlib json module

class OptimizedBuilder(GeneratorBasedBuilder):
    """Builder with performance optimizations."""

    # Larger write batches mean fewer, larger record batches on disk
    DEFAULT_WRITER_BATCH_SIZE = 10000

    def download_and_prepare(self, **kwargs):
        # Use multiprocessing for faster preparation
        kwargs.setdefault("num_proc", 4)

        # Use a larger shard size for fewer output files
        kwargs.setdefault("max_shard_size", "1GB")

        super().download_and_prepare(**kwargs)

    def _generate_examples(self, filepath):
        # Read in binary mode; orjson.loads accepts bytes directly
        with open(filepath, "rb") as f:
            for idx, line in enumerate(f):
                data = orjson.loads(line)
                yield idx, self._process_example(data)

    def _process_example(self, data):
        return {
            "text": data["text"],
            "label": data["label"],
            "features": np.array(data["features"], dtype=np.float32),
        }
```
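
Note that `num_proc` only helps when the preparation work can be sharded; in practice that means list-valued `gen_kwargs` (for example, a list of files) that can be split across worker processes. A sketch of a shard-friendly `_split_generators` (URLs are illustrative):

```python
def _split_generators(self, dl_manager):
    # A list in gen_kwargs gives download_and_prepare(num_proc=...)
    # independent units of work to distribute across processes
    files = dl_manager.download([f"https://example.com/shard_{i}.jsonl" for i in range(8)])
    return [SplitGenerator(name=Split.TRAIN, gen_kwargs={"files": files})]
```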

## Best Practices

### Error Handling and Validation

```python
import json
import logging

logger = logging.getLogger(__name__)

def _generate_examples(self, filepath):
    """Generate examples with proper error handling."""
    try:
        with open(filepath, "r", encoding="utf-8") as f:
            for idx, line in enumerate(f):
                try:
                    data = json.loads(line.strip())

                    # Validate required fields before yielding
                    if "text" not in data or "label" not in data:
                        logger.warning(f"Skipping incomplete example at line {idx}")
                        continue

                    yield idx, {
                        "text": str(data["text"]),
                        "label": str(data["label"]),
                    }

                except json.JSONDecodeError as e:
                    logger.warning(f"Failed to parse JSON at line {idx}: {e}")
                    continue

    except FileNotFoundError:
        raise FileNotFoundError(f"Data file not found: {filepath}")
```

### Testing Dataset Builders

```python
def test_builder():
    """Test the custom dataset builder."""
    # Test builder instantiation
    builder = MyTextClassificationBuilder()

    # Test info generation
    info = builder._info()
    assert "text" in info.features
    assert "label" in info.features

    # Test dataset building
    builder.download_and_prepare()
    dataset = builder.as_dataset()

    # Validate the prepared dataset
    assert len(dataset["train"]) > 0
    assert all(key in dataset["train"].features for key in ["text", "label"])
```

This comprehensive dataset building system provides flexible, efficient tools for creating custom datasets that integrate seamlessly with the Hugging Face datasets ecosystem, supporting features like multiprocessing, caching, streaming, and various output formats.