or run

npx @tessl/cli init
Log in

Version

Tile

Overview

Evals

Files

Files

docs

callbacks.md · collaborative-filtering.md · core-training.md · data-loading.md · index.md · interpretation.md · medical.md · metrics-losses.md · tabular.md · text.md · vision.md

docs/data-loading.md

# Data Loading and Processing

Comprehensive data loading system built around the DataBlock API and transform pipelines. Provides flexible, composable data processing for all fastai domains.

## Capabilities

### DataLoaders

Main class for managing training and validation data with integrated transforms.

```python { .api }
class DataLoaders:
    """
    Container for train/valid DataLoader pairs.

    Parameters:
    - *loaders: DataLoader instances (typically train, valid)
    - path: Base path for saving/loading
    - device: Device to place data on
    """
    def __init__(self, *loaders, path='.', device=None): ...

    @classmethod
    def from_dblock(cls, dblock, source, path='.', **kwargs):
        """
        Create DataLoaders from DataBlock.

        Parameters:
        - dblock: DataBlock defining data processing
        - source: Data source (path, list, etc.)
        - path: Base path
        - **kwargs: Additional arguments

        Returns:
        - DataLoaders instance
        """

    def show_batch(self, b=None, max_n=9, ctxs=None, show=True, **kwargs):
        """Display a batch of data."""

    @property
    def train(self):
        """Training DataLoader."""

    @property
    def valid(self):
        """Validation DataLoader."""

    def one_batch(self):
        """Get one batch from training data."""

    def save(self, file='data_loaders.pkl'):
        """Save DataLoaders to disk."""

    @classmethod
    def load(cls, path, file='data_loaders.pkl'):
        """Load DataLoaders from disk."""
```

### DataBlock API

Flexible API for constructing data processing pipelines from modular components.

```python { .api }
class DataBlock:
    """
    Flexible data processing pipeline constructor.

    Parameters:
    - blocks: Transform blocks for inputs and targets
    - dl_type: DataLoader type to use
    - getters: Functions to extract data from source
    - n_inp: Number of input elements
    - item_tfms: Item-level transforms
    - batch_tfms: Batch-level transforms
    - **kwargs: Additional DataLoader arguments
    """
    def __init__(self, blocks=(TransformBlock,), dl_type=None, getters=None,
                 n_inp=None, item_tfms=None, batch_tfms=None, **kwargs): ...

    def dataloaders(self, source, path='.', verbose=False, **kwargs):
        """
        Create DataLoaders from data source.

        Parameters:
        - source: Data source
        - path: Base path
        - verbose: Show processing information
        - **kwargs: DataLoader arguments

        Returns:
        - DataLoaders instance
        """

    def datasets(self, source, verbose=False, **kwargs):
        """Create datasets without DataLoaders."""

    def summary(self, source, **kwargs):
        """Show summary of data processing pipeline."""
```

### Transform Blocks

Building blocks for different data types in the DataBlock API.

```python { .api }
class TransformBlock:
    """Base class for transform blocks."""

    def __init__(self, type_tfms=None, item_tfms=None, batch_tfms=None,
                 dl_type=None, dls_kwargs=None): ...

class ImageBlock(TransformBlock):
    """Transform block for image data."""

    def __init__(self, cls=PILImage): ...

class CategoryBlock(TransformBlock):
    """Transform block for categorical labels."""

    def __init__(self, vocab=None, sort=True, add_na=False): ...

class MultiCategoryBlock(TransformBlock):
    """Transform block for multi-label categorical data."""

    def __init__(self, encoded=False, vocab=None, add_na=False): ...

class RegressionBlock(TransformBlock):
    """Transform block for regression targets."""

class MaskBlock(TransformBlock):
    """Transform block for segmentation masks."""

    def __init__(self, codes=None): ...

class PointBlock(TransformBlock):
    """Transform block for point/keypoint data."""

class BBoxBlock(TransformBlock):
    """Transform block for bounding boxes."""

class BBoxLblBlock(TransformBlock):
    """Transform block for labeled bounding boxes."""
```

### Data Splitting

Functions and classes for splitting data into train/validation sets.

```python { .api }
class RandomSplitter:
    """Random train/validation split."""

    def __init__(self, valid_pct=0.2, seed=None): ...

    def __call__(self, o):
        """
        Split data randomly.

        Parameters:
        - o: Data items to split

        Returns:
        - Train indices, validation indices
        """

class TrainTestSplitter:
    """Split based on test set."""

    def __init__(self, test_name='test', valid_name='valid'): ...

def RandomSubsetSplitter(valid_pct=0.2, n=None, **kwargs):
    """Random subset splitter for large datasets."""

def FuncSplitter(func):
    """Split based on function result."""

def MaskSplitter(mask):
    """Split based on boolean mask."""

def FileSplitter(fname):
    """Split based on filenames in text file."""

def GrandparentSplitter(train_name='train', valid_name='valid'):
    """Split based on grandparent folder names."""

def IndexSplitter(valid_idx):
    """Split based on specific indices."""
```

### File and Dataset Utilities

Utilities for working with files and external datasets.

```python { .api }
def get_files(path, extensions=None, recurse=True, folders=None, followlinks=True):
    """
    Get list of files with optional filtering.

    Parameters:
    - path: Directory path
    - extensions: File extensions to include
    - recurse: Search subdirectories
    - folders: Folder names to include/exclude
    - followlinks: Follow symbolic links

    Returns:
    - List of Path objects
    """

def get_image_files(path, recurse=True, folders=None):
    """Get image files from directory."""

def get_text_files(path, recurse=True, folders=None):
    """Get text files from directory."""

def untar_data(url, dest=None, c_key='data', force_download=False, extract=True):
    """
    Download and extract fastai datasets.

    Parameters:
    - url: Dataset URL or URLs enum value
    - dest: Destination directory
    - c_key: Config key for base path
    - force_download: Re-download if exists
    - extract: Extract after download

    Returns:
    - Path to extracted data
    """

class URLs:
    """Predefined dataset URLs."""
    PETS = 'https://s3.amazonaws.com/fast-ai-imageclas/oxford-iiit-pet.tgz'
    MNIST = 'https://s3.amazonaws.com/fast-ai-sample/mnist_png.tgz'
    CIFAR = 'https://s3.amazonaws.com/fast-ai-sample/cifar10.tgz'
    IMDB = 'https://s3.amazonaws.com/fast-ai-nlp/imdb.tgz'
    # ... many more dataset URLs

def download_url(url, dest=None, timeout=None, show_progress=True):
    """Download file from URL."""

def fastai_path():
    """Get fastai data directory path."""
```

### Transforms

Core transform classes for data preprocessing.

```python { .api }
class Transform:
    """Base class for transforms."""

    def __init__(self, enc=None, dec=None, split_idx=None, order=None): ...

    def __call__(self, x, **kwargs): ...

class ToTensor(Transform):
    """Convert to tensor."""

class IntToFloatTensor(Transform):
    """Convert integer tensor to float."""

class Normalize(Transform):
    """Normalize with mean and standard deviation."""

    def __init__(self, mean=None, std=None, axes=None): ...

class CategoryMap(Transform):
    """Map categories to integers."""

    def __init__(self, vocab=None, add_na=False, sort=True): ...

class MultiCategoryMap(Transform):
    """Map multi-categories to multi-hot encoding."""

    def __init__(self, vocab=None, add_na=False, c2i=None): ...

class Resize(Transform):
    """Resize images to specified size."""

    def __init__(self, size, method='crop', pad_mode='reflection'): ...
```

### TfmdLists and Datasets

Advanced data containers with integrated transforms.

```python { .api }
class TfmdLists:
    """Lists with integrated transform pipeline."""

    def __init__(self, items, tfms, use_list=None, do_setup=True, split_idx=None,
                 train_setup=True, splits=None, types=None, verbose=False): ...

    def subset(self, i):
        """Get subset by index."""

    def new_empty(self):
        """Create new empty instance."""

class Datasets:
    """Multiple TfmdLists that create tuples."""

    def __init__(self, items, tfms=None, tls=None, n_inp=None, dl_type=None, **kwargs): ...

    def subset(self, i):
        """Get subset by split index."""

    @property
    def train(self):
        """Training dataset."""

    @property
    def valid(self):
        """Validation dataset."""
```