or run

npx @tessl/cli init
Log in

Version

Tile

Overview

Evals

Files

Files

docs

activations.md applications.md backend-config.md core-framework.md index.md initializers.md layers.md losses-metrics.md operations.md optimizers.md preprocessing.md regularizers.md training-callbacks.md

docs/preprocessing.md

0

# Data Processing

1

2

Comprehensive data preprocessing utilities for images, text, audio, and numerical data with built-in augmentation capabilities, dataset creation functions, and feature preprocessing layers.

3

4

## Capabilities

5

6

### Dataset Creation

7

8

Functions for creating datasets from various data sources and formats.

9

10

```python { .api }

11

def image_dataset_from_directory(directory, labels='inferred', label_mode='int',

12

class_names=None, color_mode='rgb', batch_size=32,

13

image_size=(256, 256), shuffle=True, seed=None,

14

validation_split=None, subset=None, **kwargs):

15

"""

16

Create image dataset from directory structure.

17

18

Args:

19

directory (str): Path to directory containing subdirectories of images

20

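labels (str, list, or None): How to generate labels: 'inferred' (from the subdirectory structure), None (no labels), or an explicit list/tuple of integer labels, one per file found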

21

label_mode (str): Type of labels ('int', 'categorical', 'binary', None)

22

class_names (list, optional): Explicit list of class names

23

color_mode (str): Image color mode ('grayscale', 'rgb', 'rgba')

24

batch_size (int): Batch size

25

image_size (tuple): Target image size

26

shuffle (bool): Whether to shuffle data

27

seed (int, optional): Random seed

28

validation_split (float, optional): Fraction for validation

29

subset (str, optional): Subset to return ('training' or 'validation')

30

31

Returns:

32

Dataset: Configured image dataset

33

"""

34

35

def text_dataset_from_directory(directory, labels='inferred', label_mode='int',

36

class_names=None, batch_size=32, max_length=None,

37

shuffle=True, seed=None, validation_split=None,

38

subset=None, **kwargs): ...

39

40

def timeseries_dataset_from_array(data, targets, sequence_length, sequence_stride=1,

41

sampling_rate=1, batch_size=128, shuffle=False,

42

seed=None, start_index=None, end_index=None): ...

43

44

def audio_dataset_from_directory(directory, labels='inferred', label_mode='int',

45

class_names=None, batch_size=32, sampling_rate=16000,

46

output_sequence_length=16000, **kwargs): ...

47

```

48

49

### Text Processing Layers

50

51

Preprocessing layers for text and sequence data including vectorization and encoding.

52

53

```python { .api }

54

class TextVectorization:

55

"""

56

Text vectorization layer for converting text to sequences.

57

58

Args:

59

max_tokens (int, optional): Maximum vocabulary size

60

standardize (str or callable): Text standardization ('lower_and_strip_punctuation', 'lower', 'strip_punctuation', or callable)

61

split (str or callable): Text splitting strategy ('whitespace', 'character', or callable)

62

ngrams (int, optional): N-gram size

63

output_mode (str): Output format ('int', 'multi_hot', 'count', 'tf_idf')

64

output_sequence_length (int, optional): Output sequence length

65

pad_to_max_tokens (bool): Whether to pad to max_tokens

66

vocabulary (list, optional): Pre-existing vocabulary

67

idf_weights (array, optional): IDF weights for tf-idf mode

68

sparse (bool): Whether to return sparse tensors

69

ragged (bool): Whether to return ragged tensors

70

"""

71

def __init__(self, max_tokens=None, standardize='lower_and_strip_punctuation',

72

split='whitespace', ngrams=None, output_mode='int',

73

output_sequence_length=None, **kwargs): ...

74

75

def adapt(self, data, batch_size=None, steps=None): ...

76

def get_vocabulary(self): ...

77

def set_vocabulary(self, vocabulary, idf_weights=None): ...

78

79

class StringLookup:

80

"""

81

String to integer lookup layer.

82

83

Args:

84

max_tokens (int, optional): Maximum vocabulary size

85

num_oov_indices (int): Number of out-of-vocabulary indices

86

mask_token (str, optional): Token to use for masking

87

oov_token (str): Token to use for out-of-vocabulary

88

vocabulary (list, optional): Pre-existing vocabulary

89

idf_weights (array, optional): IDF weights

90

invert (bool): Whether to invert the lookup

91

output_mode (str): Output format ('int', 'multi_hot', 'count', 'one_hot', 'tf_idf')

92

sparse (bool): Whether to return sparse tensors

93

pad_to_max_tokens (bool): Whether to pad to max_tokens

94

"""

95

def __init__(self, max_tokens=None, num_oov_indices=1, mask_token=None,

96

oov_token='[UNK]', vocabulary=None, **kwargs): ...

97

98

class IntegerLookup:

99

"""Integer to integer lookup layer."""

100

def __init__(self, max_tokens=None, num_oov_indices=1, mask_token=None,

101

oov_token=-1, vocabulary=None, **kwargs): ...

102

103

class CategoryEncoding:

104

"""

105

Categorical encoding layer.

106

107

Args:

108

num_tokens (int, optional): Total number of tokens

109

output_mode (str): Output format ('multi_hot', 'one_hot', 'count')

110

sparse (bool): Whether to return sparse tensors

111

"""

112

def __init__(self, num_tokens=None, output_mode='multi_hot', sparse=False, **kwargs): ...

113

```

114

115

### Image Processing Layers

116

117

Preprocessing layers for image data including resizing, augmentation, and transformations.

118

119

```python { .api }

120

class Resizing:

121

"""

122

Resize images to target size.

123

124

Args:

125

height (int): Target height

126

width (int): Target width

127

interpolation (str): Interpolation method ('bilinear', 'nearest', 'bicubic', 'area', 'lanczos3', 'lanczos5', 'gaussian', 'mitchellcubic')

128

crop_to_aspect_ratio (bool): Whether to crop to maintain aspect ratio

129

"""

130

def __init__(self, height, width, interpolation='bilinear', crop_to_aspect_ratio=False, **kwargs): ...

131

132

class CenterCrop:

133

"""

134

Crop images to specified size from center.

135

136

Args:

137

height (int): Target height

138

width (int): Target width

139

"""

140

def __init__(self, height, width, **kwargs): ...

141

142

class Rescaling:

143

"""

144

Rescale pixel values.

145

146

Args:

147

scale (float): Scaling factor

148

offset (float): Offset value

149

"""

150

def __init__(self, scale, offset=0.0, **kwargs): ...

151

152

# Data augmentation layers

153

class RandomFlip:

154

"""

155

Random image flipping.

156

157

Args:

158

mode (str): Flip mode ('horizontal', 'vertical', 'horizontal_and_vertical')

159

seed (int, optional): Random seed

160

"""

161

def __init__(self, mode='horizontal_and_vertical', seed=None, **kwargs): ...

162

163

class RandomRotation:

164

"""

165

Random image rotation.

166

167

Args:

168

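factor (float or tuple): Rotation range as a fraction of 2π; a single float f means a random rotation in [-f, f] (e.g. 0.2 rotates by up to ±20% of a full turn), a tuple gives explicit (lower, upper) bounds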

169

fill_mode (str): Fill mode for transformed pixels

170

interpolation (str): Interpolation method

171

seed (int, optional): Random seed

172

fill_value (float): Fill value for constant fill mode

173

"""

174

def __init__(self, factor, fill_mode='reflect', interpolation='bilinear',

175

seed=None, fill_value=0.0, **kwargs): ...

176

177

class RandomZoom:

178

"""Random image zooming."""

179

def __init__(self, height_factor, width_factor=None, fill_mode='reflect',

180

interpolation='bilinear', seed=None, fill_value=0.0, **kwargs): ...

181

182

class RandomTranslation:

183

"""Random image translation."""

184

def __init__(self, height_factor, width_factor, fill_mode='reflect',

185

interpolation='bilinear', seed=None, fill_value=0.0, **kwargs): ...

186

187

class RandomCrop:

188

"""Random image cropping."""

189

def __init__(self, height, width, seed=None, **kwargs): ...

190

191

class RandomBrightness:

192

"""Random brightness adjustment."""

193

def __init__(self, factor, value_range=(0, 255), seed=None, **kwargs): ...

194

195

class RandomContrast:

196

"""Random contrast adjustment."""

197

def __init__(self, factor, seed=None, **kwargs): ...

198

```

199

200

### Numerical Processing Layers

201

202

Preprocessing layers for numerical data including normalization and discretization.

203

204

```python { .api }

205

class Normalization:

206

"""

207

Feature normalization layer.

208

209

Args:

210

axis (int): Axis to normalize along

211

mean (array, optional): Pre-computed mean

212

variance (array, optional): Pre-computed variance

213

invert (bool): Whether to invert normalization

214

"""

215

def __init__(self, axis=-1, mean=None, variance=None, invert=False, **kwargs): ...

216

217

def adapt(self, data, batch_size=None, steps=None): ...

218

219

class Discretization:

220

"""

221

Value discretization layer.

222

223

Args:

224

bin_boundaries (array, optional): Bin boundary values

225

num_bins (int, optional): Number of bins

226

epsilon (float): Error tolerance for the quantile approximation used when bin boundaries are learned via adapt(); typically a small fraction close to zero

227

output_mode (str): Output format ('int', 'one_hot', 'multi_hot', 'count')

228

sparse (bool): Whether to return sparse tensors

229

"""

230

def __init__(self, bin_boundaries=None, num_bins=None, epsilon=0.01,

231

output_mode='int', sparse=False, **kwargs): ...

232

233

def adapt(self, data, batch_size=None, steps=None): ...

234

```

235

236

### Audio Processing Layers

237

238

Specialized layers for audio signal processing.

239

240

```python { .api }

241

class MelSpectrogram:

242

"""

243

Mel-frequency spectrogram layer.

244

245

Args:

246

fft_length (int): FFT length

247

sequence_stride (int): Hop length between frames

248

sequence_length (int): Window length

249

window (str): Window function

250

sampling_rate (int): Audio sampling rate

251

num_mel_bins (int): Number of mel frequency bins

252

min_freq (float): Minimum frequency

253

max_freq (float): Maximum frequency

254

power_to_db (bool): Whether to convert power to decibels

255

top_db (float): Dynamic range for dB conversion

256

mag_exp (float): Magnitude exponent

257

"""

258

def __init__(self, fft_length=2048, sequence_stride=512, sequence_length=None,

259

window='hann', sampling_rate=16000, num_mel_bins=128, **kwargs): ...

260

261

class STFTSpectrogram:

262

"""Short-time Fourier transform spectrogram layer."""

263

def __init__(self, fft_length=2048, sequence_stride=512, sequence_length=None,

264

window='hann', **kwargs): ...

265

```

266

267

### Utility Functions

268

269

Additional preprocessing utilities and helper functions.

270

271

```python { .api }

272

def split_dataset(dataset, left_size=None, right_size=None, shuffle=False, seed=None):

273

"""

274

Split dataset into two parts.

275

276

Args:

277

dataset: Dataset to split

278

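left_size (float or int, optional): Size of left split; a float in [0, 1] is the fraction of the data, an int is the number of samples; if None, defaults to the complement of right_size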

279

right_size (float or int, optional): Size of right split

280

shuffle (bool): Whether to shuffle before splitting

281

seed (int, optional): Random seed

282

283

Returns:

284

tuple: (left_dataset, right_dataset)

285

"""

286

287

def to_categorical(y, num_classes=None, dtype='float32'):

288

"""

289

Convert integer labels to categorical encoding.

290

291

Args:

292

y (array): Integer labels

293

num_classes (int, optional): Total number of classes

294

dtype (str): Output data type

295

296

Returns:

297

array: Categorical encoded labels

298

"""

299

300

def normalize(x, axis=-1, order=2):

301

"""

302

Normalize arrays along specified axis.

303

304

Args:

305

x (array): Input array

306

axis (int): Normalization axis

307

order (int): Norm order

308

309

Returns:

310

array: Normalized array

311

"""

312

313

def pad_sequences(sequences, maxlen=None, dtype='int32', padding='pre',

314

truncating='pre', value=0.0):

315

"""

316

Pad sequences to same length.

317

318

Args:

319

sequences (list): List of sequences

320

maxlen (int, optional): Maximum length

321

dtype (str): Output data type

322

padding (str): Padding strategy ('pre' or 'post')

323

truncating (str): Truncation strategy ('pre' or 'post')

324

value (float): Padding value

325

326

Returns:

327

array: Padded sequences

328

"""

329

```

330

331

## Usage Examples

332

333

### Image Data Pipeline

334

335

```python

336

import keras

337

from keras import layers

338

339

# Create dataset from directory

340

train_dataset = keras.utils.image_dataset_from_directory(

341

'path/to/train',

342

validation_split=0.2,

343

subset='training',

344

seed=123,

345

image_size=(224, 224),

346

batch_size=32

347

)

348

349

val_dataset = keras.utils.image_dataset_from_directory(

350

'path/to/train',

351

validation_split=0.2,

352

subset='validation',

353

seed=123,

354

image_size=(224, 224),

355

batch_size=32

356

)

357

358

# Build preprocessing pipeline

359

data_augmentation = keras.Sequential([

360

layers.RandomFlip('horizontal'),

361

layers.RandomRotation(0.2),

362

layers.RandomZoom(0.2),

363

layers.RandomBrightness(0.2),

364

layers.RandomContrast(0.2)

365

])

366

367

# Apply augmentation to the training dataset only

368

train_dataset = train_dataset.map(lambda x, y: (data_augmentation(x, training=True), y))

369

370

# Rescale pixel values from [0, 255] to [0, 1]

371

normalization = layers.Rescaling(1./255)

372

train_dataset = train_dataset.map(lambda x, y: (normalization(x), y))

373

val_dataset = val_dataset.map(lambda x, y: (normalization(x), y))

374

```

375

376

### Text Data Pipeline

377

378

```python

379

import keras

380

from keras import layers

381

382

# Create text dataset

383

train_dataset = keras.utils.text_dataset_from_directory(

384

'path/to/text_data',

385

batch_size=32,

386

validation_split=0.2,

387

subset='training',

388

seed=123

389

)

390

391

# Text vectorization

392

vectorize_layer = layers.TextVectorization(

393

max_tokens=10000,

394

output_sequence_length=100,

395

standardize='lower_and_strip_punctuation'

396

)

397

398

# Adapt to training data

399

text_only_dataset = train_dataset.map(lambda x, y: x)

400

vectorize_layer.adapt(text_only_dataset)

401

402

# Apply vectorization

403

train_dataset = train_dataset.map(lambda x, y: (vectorize_layer(x), y))

404

```