# Data Loading and Processing

Comprehensive data loading system built around the DataBlock API and transform pipelines. Provides flexible, composable data processing for all fastai domains.

## Capabilities

### DataLoaders

Main class for managing training and validation data with integrated transforms.
```python { .api }
class DataLoaders:
    """
    Container for train/valid DataLoader pairs.

    Parameters:
    - *loaders: DataLoader instances (typically train, valid)
    - path: Base path for saving/loading
    - device: Device to place data on
    """
    def __init__(self, *loaders, path='.', device=None): ...

    @classmethod
    def from_dblock(cls, dblock, source, path='.', **kwargs):
        """
        Create DataLoaders from DataBlock.

        Parameters:
        - dblock: DataBlock defining data processing
        - source: Data source (path, list, etc.)
        - path: Base path
        - **kwargs: Additional arguments

        Returns:
        - DataLoaders instance
        """

    def show_batch(self, b=None, max_n=9, ctxs=None, show=True, **kwargs):
        """Display a batch of data."""

    @property
    def train(self):
        """Training DataLoader."""

    @property
    def valid(self):
        """Validation DataLoader."""

    def one_batch(self):
        """Get one batch from training data."""

    def save(self, file='data_loaders.pkl'):
        """Save DataLoaders to disk."""

    @classmethod
    def load(cls, path, file='data_loaders.pkl'):
        """Load DataLoaders from disk."""
```
### DataBlock API

Flexible API for constructing data processing pipelines from modular components.
```python { .api }
class DataBlock:
    """
    Flexible data processing pipeline constructor.

    Parameters:
    - blocks: Transform blocks for inputs and targets
    - dl_type: DataLoader type to use
    - getters: Functions to extract data from source
    - n_inp: Number of input elements
    - item_tfms: Item-level transforms
    - batch_tfms: Batch-level transforms
    - **kwargs: Additional DataLoader arguments
    """
    def __init__(self, blocks=(TransformBlock,), dl_type=None, getters=None,
                 n_inp=None, item_tfms=None, batch_tfms=None, **kwargs): ...

    def dataloaders(self, source, path='.', verbose=False, **kwargs):
        """
        Create DataLoaders from data source.

        Parameters:
        - source: Data source
        - path: Base path
        - verbose: Show processing information
        - **kwargs: DataLoader arguments

        Returns:
        - DataLoaders instance
        """

    def datasets(self, source, verbose=False, **kwargs):
        """Create datasets without DataLoaders."""

    def summary(self, source, **kwargs):
        """Show summary of data processing pipeline."""
```
### Transform Blocks

Building blocks for different data types in the DataBlock API.
```python { .api }
class TransformBlock:
    """Base class for transform blocks."""

    def __init__(self, type_tfms=None, item_tfms=None, batch_tfms=None,
                 dl_type=None, dls_kwargs=None): ...

class ImageBlock(TransformBlock):
    """Transform block for image data."""

    def __init__(self, cls=PILImage): ...

class CategoryBlock(TransformBlock):
    """Transform block for categorical labels."""

    def __init__(self, vocab=None, sort=True, add_na=False): ...

class MultiCategoryBlock(TransformBlock):
    """Transform block for multi-label categorical data."""

    def __init__(self, encoded=False, vocab=None, add_na=False): ...

class RegressionBlock(TransformBlock):
    """Transform block for regression targets."""

class MaskBlock(TransformBlock):
    """Transform block for segmentation masks."""

    def __init__(self, codes=None): ...

class PointBlock(TransformBlock):
    """Transform block for point/keypoint data."""

class BBoxBlock(TransformBlock):
    """Transform block for bounding boxes."""

class BBoxLblBlock(TransformBlock):
    """Transform block for labeled bounding boxes."""
```
### Data Splitting

Functions and classes for splitting data into train/validation sets.
```python { .api }
class RandomSplitter:
    """Random train/validation split."""

    def __init__(self, valid_pct=0.2, seed=None): ...

    def __call__(self, o):
        """
        Split data randomly.

        Parameters:
        - o: Data items to split

        Returns:
        - Train indices, validation indices
        """

class TrainTestSplitter:
    """Split based on test set."""

    def __init__(self, test_name='test', valid_name='valid'): ...

def RandomSubsetSplitter(valid_pct=0.2, n=None, **kwargs):
    """Random subset splitter for large datasets."""

def FuncSplitter(func):
    """Split based on function result."""

def MaskSplitter(mask):
    """Split based on boolean mask."""

def FileSplitter(fname):
    """Split based on filenames in text file."""

def GrandparentSplitter(train_name='train', valid_name='valid'):
    """Split based on grandparent folder names."""

def IndexSplitter(valid_idx):
    """Split based on specific indices."""
```
### File and Dataset Utilities

Utilities for working with files and external datasets.
```python { .api }
def get_files(path, extensions=None, recurse=True, folders=None, followlinks=True):
    """
    Get list of files with optional filtering.

    Parameters:
    - path: Directory path
    - extensions: File extensions to include
    - recurse: Search subdirectories
    - folders: Folder names to include/exclude
    - followlinks: Follow symbolic links

    Returns:
    - List of Path objects
    """

def get_image_files(path, recurse=True, folders=None):
    """Get image files from directory."""

def get_text_files(path, recurse=True, folders=None):
    """Get text files from directory."""

def untar_data(url, dest=None, c_key='data', force_download=False, extract=True):
    """
    Download and extract fastai datasets.

    Parameters:
    - url: Dataset URL or URLs enum value
    - dest: Destination directory
    - c_key: Config key for base path
    - force_download: Re-download if exists
    - extract: Extract after download

    Returns:
    - Path to extracted data
    """

class URLs:
    """Predefined dataset URLs."""
    PETS = 'https://s3.amazonaws.com/fast-ai-imageclas/oxford-iiit-pet.tgz'
    MNIST = 'https://s3.amazonaws.com/fast-ai-sample/mnist_png.tgz'
    CIFAR = 'https://s3.amazonaws.com/fast-ai-sample/cifar10.tgz'
    IMDB = 'https://s3.amazonaws.com/fast-ai-nlp/imdb.tgz'
    # ... many more dataset URLs

def download_url(url, dest=None, timeout=None, show_progress=True):
    """Download file from URL."""

def fastai_path():
    """Get fastai data directory path."""
```
### Transforms

Core transform classes for data preprocessing.
```python { .api }
class Transform:
    """Base class for transforms."""

    def __init__(self, enc=None, dec=None, split_idx=None, order=None): ...

    def __call__(self, x, **kwargs): ...

class ToTensor(Transform):
    """Convert to tensor."""

class IntToFloatTensor(Transform):
    """Convert integer tensor to float."""

class Normalize(Transform):
    """Normalize with mean and standard deviation."""

    def __init__(self, mean=None, std=None, axes=None): ...

class CategoryMap(Transform):
    """Map categories to integers."""

    def __init__(self, vocab=None, add_na=False, sort=True): ...

class MultiCategoryMap(Transform):
    """Map multi-categories to multi-hot encoding."""

    def __init__(self, vocab=None, add_na=False, c2i=None): ...

class Resize(Transform):
    """Resize images to specified size."""

    def __init__(self, size, method='crop', pad_mode='reflection'): ...
```
### TfmdLists and Datasets

Advanced data containers with integrated transforms.
```python { .api }
class TfmdLists:
    """Lists with integrated transform pipeline."""

    def __init__(self, items, tfms, use_list=None, do_setup=True, split_idx=None,
                 train_setup=True, splits=None, types=None, verbose=False): ...

    def subset(self, i):
        """Get subset by index."""

    def new_empty(self):
        """Create new empty instance."""

class Datasets:
    """Multiple TfmdLists that create tuples."""

    def __init__(self, items, tfms=None, tls=None, n_inp=None, dl_type=None, **kwargs): ...

    def subset(self, i):
        """Get subset by split index."""

    @property
    def train(self):
        """Training dataset."""

    @property
    def valid(self):
        """Validation dataset."""
```