0
# Data Processing
1
2
Comprehensive data preprocessing utilities for images, text, audio, and numerical data with built-in augmentation capabilities, dataset creation functions, and feature preprocessing layers.
3
4
## Capabilities
5
6
### Dataset Creation
7
8
Functions for creating datasets from various data sources and formats.
9
10
```python { .api }
11
def image_dataset_from_directory(directory, labels='inferred', label_mode='int',
12
class_names=None, color_mode='rgb', batch_size=32,
13
image_size=(256, 256), shuffle=True, seed=None,
14
validation_split=None, subset=None, **kwargs):
15
"""
16
Create image dataset from directory structure.
17
18
Args:
19
directory (str): Path to directory containing subdirectories of images
20
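labels (str, list, or None): How to generate labels: 'inferred' (from the subdirectory structure), None (no labels), or an explicit list/tuple of integer labels ordered like the alphanumerically sorted image file paths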
21
label_mode (str): Type of labels ('int', 'categorical', 'binary', None)
22
class_names (list, optional): Explicit list of class names
23
color_mode (str): Image color mode ('grayscale', 'rgb', 'rgba')
24
batch_size (int): Batch size
25
image_size (tuple): Target image size
26
shuffle (bool): Whether to shuffle data
27
seed (int, optional): Random seed
28
validation_split (float, optional): Fraction of the data, between 0 and 1, to reserve for validation
29
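subset (str, optional): Subset to return ('training', 'validation', or 'both'); only used when validation_split is set, and 'both' returns a (training, validation) tuple of datasets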
30
31
Returns:
32
Dataset: Configured image dataset
33
"""
34
35
def text_dataset_from_directory(directory, labels='inferred', label_mode='int',
36
class_names=None, batch_size=32, max_length=None,
37
shuffle=True, seed=None, validation_split=None,
38
subset=None, **kwargs): ...
39
40
def timeseries_dataset_from_array(data, targets, sequence_length, sequence_stride=1,
41
sampling_rate=1, batch_size=128, shuffle=False,
42
seed=None, start_index=None, end_index=None): ...
43
44
def audio_dataset_from_directory(directory, labels='inferred', label_mode='int',
45
class_names=None, batch_size=32, sampling_rate=16000,
46
output_sequence_length=16000, **kwargs): ...
47
```
48
49
### Text Processing Layers
50
51
Preprocessing layers for text and sequence data including vectorization and encoding.
52
53
```python { .api }
54
class TextVectorization:
55
"""
56
Text vectorization layer for converting text to sequences.
57
58
Args:
59
max_tokens (int, optional): Maximum vocabulary size
60
standardize (str, callable, or None): Text standardization ('lower_and_strip_punctuation', 'lower', 'strip_punctuation', a callable, or None for no standardization)
61
split (str, callable, or None): Text splitting strategy ('whitespace', 'character', a callable, or None for no splitting)
62
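ngrams (int or tuple of ints, optional): N-gram sizes; an int creates n-grams up to that size, a tuple creates n-grams for exactly the listed sizes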
63
output_mode (str): Output format ('int', 'multi_hot', 'count', 'tf_idf')
64
output_sequence_length (int, optional): Output sequence length
65
pad_to_max_tokens (bool): Whether to pad to max_tokens
66
vocabulary (list, optional): Pre-existing vocabulary
67
idf_weights (array, optional): IDF weights for tf-idf mode
68
sparse (bool): Whether to return sparse tensors
69
ragged (bool): Whether to return ragged tensors
70
"""
71
def __init__(self, max_tokens=None, standardize='lower_and_strip_punctuation',
72
split='whitespace', ngrams=None, output_mode='int',
73
output_sequence_length=None, **kwargs): ...
74
75
def adapt(self, data, batch_size=None, steps=None): ...
76
def get_vocabulary(self): ...
77
def set_vocabulary(self, vocabulary, idf_weights=None): ...
78
79
class StringLookup:
80
"""
81
String to integer lookup layer.
82
83
Args:
84
max_tokens (int, optional): Maximum vocabulary size
85
num_oov_indices (int): Number of out-of-vocabulary indices
86
mask_token (str, optional): Token to use for masking
87
oov_token (str): Token to use for out-of-vocabulary
88
vocabulary (list, optional): Pre-existing vocabulary
89
idf_weights (array, optional): IDF weights
90
invert (bool): If True, map indices back to vocabulary strings instead of mapping strings to indices
91
output_mode (str): Output format ('int', 'multi_hot', 'count', 'one_hot', 'tf_idf')
92
sparse (bool): Whether to return sparse tensors
93
pad_to_max_tokens (bool): Whether to pad to max_tokens
94
"""
95
def __init__(self, max_tokens=None, num_oov_indices=1, mask_token=None,
96
oov_token='[UNK]', vocabulary=None, **kwargs): ...
97
98
class IntegerLookup:
99
"""Integer to integer lookup layer."""
100
def __init__(self, max_tokens=None, num_oov_indices=1, mask_token=None,
101
oov_token=-1, vocabulary=None, **kwargs): ...
102
103
class CategoryEncoding:
104
"""
105
Categorical encoding layer.
106
107
Args:
108
num_tokens (int, optional): Total number of tokens
109
output_mode (str): Output format ('multi_hot', 'one_hot', 'count')
110
sparse (bool): Whether to return sparse tensors
111
"""
112
def __init__(self, num_tokens=None, output_mode='multi_hot', sparse=False, **kwargs): ...
113
```
114
115
### Image Processing Layers
116
117
Preprocessing layers for image data including resizing, augmentation, and transformations.
118
119
```python { .api }
120
class Resizing:
121
"""
122
Resize images to target size.
123
124
Args:
125
height (int): Target height
126
width (int): Target width
127
interpolation (str): Interpolation method ('bilinear', 'nearest', 'bicubic', 'area', 'lanczos3', 'lanczos5', 'gaussian', 'mitchellcubic')
128
crop_to_aspect_ratio (bool): If True, resize without distorting the aspect ratio by center-cropping the largest window that matches the target aspect ratio
129
"""
130
def __init__(self, height, width, interpolation='bilinear', crop_to_aspect_ratio=False, **kwargs): ...
131
132
class CenterCrop:
133
"""
134
Crop images to specified size from center.
135
136
Args:
137
height (int): Target height
138
width (int): Target width
139
"""
140
def __init__(self, height, width, **kwargs): ...
141
142
class Rescaling:
143
"""
144
Rescale pixel values.
145
146
Args:
147
scale (float): Scaling factor
148
offset (float): Offset value
149
"""
150
def __init__(self, scale, offset=0.0, **kwargs): ...
151
152
# Data augmentation layers
153
class RandomFlip:
154
"""
155
Random image flipping.
156
157
Args:
158
mode (str): Flip mode ('horizontal', 'vertical', 'horizontal_and_vertical')
159
seed (int, optional): Random seed
160
"""
161
def __init__(self, mode='horizontal_and_vertical', seed=None, **kwargs): ...
162
163
class RandomRotation:
164
"""
165
Random image rotation.
166
167
Args:
168
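factor (float or tuple): Rotation range as a fraction of 2π; a single float f means a random rotation in [-f * 2π, f * 2π], a tuple gives explicit (lower, upper) bounds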
169
fill_mode (str): Fill mode for transformed pixels
170
interpolation (str): Interpolation method
171
seed (int, optional): Random seed
172
fill_value (float): Fill value for constant fill mode
173
"""
174
def __init__(self, factor, fill_mode='reflect', interpolation='bilinear',
175
seed=None, fill_value=0.0, **kwargs): ...
176
177
class RandomZoom:
178
"""Random image zooming."""
179
def __init__(self, height_factor, width_factor=None, fill_mode='reflect',
180
interpolation='bilinear', seed=None, fill_value=0.0, **kwargs): ...
181
182
class RandomTranslation:
183
"""Random image translation."""
184
def __init__(self, height_factor, width_factor, fill_mode='reflect',
185
interpolation='bilinear', seed=None, fill_value=0.0, **kwargs): ...
186
187
class RandomCrop:
188
"""Random image cropping."""
189
def __init__(self, height, width, seed=None, **kwargs): ...
190
191
class RandomBrightness:
192
"""Random brightness adjustment."""
193
def __init__(self, factor, value_range=(0, 255), seed=None, **kwargs): ...
194
195
class RandomContrast:
196
"""Random contrast adjustment."""
197
def __init__(self, factor, seed=None, **kwargs): ...
198
```
199
200
### Numerical Processing Layers
201
202
Preprocessing layers for numerical data including normalization and discretization.
203
204
```python { .api }
205
class Normalization:
206
"""
207
Feature normalization layer.
208
209
Args:
210
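axis (int, tuple of ints, or None): Axis or axes that keep a separate mean and variance per index (typically the feature axis); None normalizes all elements with a single scalar mean and variance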
211
mean (array, optional): Pre-computed mean
212
variance (array, optional): Pre-computed variance
213
invert (bool): If True, apply the inverse transformation, mapping normalized inputs back to their original scale
214
"""
215
def __init__(self, axis=-1, mean=None, variance=None, invert=False, **kwargs): ...
216
217
def adapt(self, data, batch_size=None, steps=None): ...
218
219
class Discretization:
220
"""
221
Value discretization layer.
222
223
Args:
224
bin_boundaries (array, optional): Bin boundary values
225
num_bins (int, optional): Number of bins
226
epsilon (float): Error tolerance for the quantile approximation used by adapt() to compute bin boundaries; typically a small fraction close to zero
227
output_mode (str): Output format ('int', 'one_hot', 'multi_hot', 'count')
228
sparse (bool): Whether to return sparse tensors
229
"""
230
def __init__(self, bin_boundaries=None, num_bins=None, epsilon=0.01,
231
output_mode='int', sparse=False, **kwargs): ...
232
233
def adapt(self, data, batch_size=None, steps=None): ...
234
```
235
236
### Audio Processing Layers
237
238
Specialized layers for audio signal processing.
239
240
```python { .api }
241
class MelSpectrogram:
242
"""
243
Mel-frequency spectrogram layer.
244
245
Args:
246
fft_length (int): FFT length
247
sequence_stride (int): Hop length between frames
248
sequence_length (int, optional): Window length applied to each audio frame; defaults to fft_length when None
249
window (str): Window function
250
sampling_rate (int): Audio sampling rate
251
num_mel_bins (int): Number of mel frequency bins
252
min_freq (float): Minimum frequency
253
max_freq (float): Maximum frequency
254
power_to_db (bool): Whether to convert power to decibels
255
top_db (float): Dynamic range for dB conversion
256
mag_exp (float): Magnitude exponent
257
"""
258
def __init__(self, fft_length=2048, sequence_stride=512, sequence_length=None,
259
window='hann', sampling_rate=16000, num_mel_bins=128, **kwargs): ...
260
261
class STFTSpectrogram:
262
"""Short-time Fourier transform spectrogram layer."""
263
def __init__(self, fft_length=2048, sequence_stride=512, sequence_length=None,
264
window='hann', **kwargs): ...
265
```
266
267
### Utility Functions
268
269
Additional preprocessing utilities and helper functions.
270
271
```python { .api }
272
def split_dataset(dataset, left_size=None, right_size=None, shuffle=False, seed=None):
273
"""
274
Split dataset into two parts.
275
276
Args:
277
dataset: Dataset to split
278
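left_size (float or int, optional): Size of left split; a float in [0, 1] is a fraction of the data, an int is a number of samples, and None defaults to the complement of right_size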
279
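right_size (float or int, optional): Size of right split; a float in [0, 1] is a fraction of the data, an int is a number of samples, and None defaults to the complement of left_size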
280
shuffle (bool): Whether to shuffle before splitting
281
seed (int, optional): Random seed
282
283
Returns:
284
tuple: (left_dataset, right_dataset)
285
"""
286
287
def to_categorical(y, num_classes=None, dtype='float32'):
288
"""
289
Convert integer labels to categorical encoding.
290
291
Args:
292
y (array): Integer labels
293
num_classes (int, optional): Total number of classes
294
dtype (str): Output data type
295
296
Returns:
297
array: Categorical encoded labels
298
"""
299
300
def normalize(x, axis=-1, order=2):
301
"""
302
Normalize arrays along specified axis.
303
304
Args:
305
x (array): Input array
306
axis (int): Normalization axis
307
order (int): Norm order
308
309
Returns:
310
array: Normalized array
311
"""
312
313
def pad_sequences(sequences, maxlen=None, dtype='int32', padding='pre',
314
truncating='pre', value=0.0):
315
"""
316
Pad sequences to same length.
317
318
Args:
319
sequences (list): List of sequences
320
maxlen (int, optional): Maximum length
321
dtype (str): Output data type
322
padding (str): Padding strategy ('pre' or 'post')
323
truncating (str): Truncation strategy ('pre' or 'post')
324
value (float): Padding value
325
326
Returns:
327
array: Padded sequences
328
"""
329
```
330
331
## Usage Examples
332
333
### Image Data Pipeline
334
335
```python
336
import keras
337
from keras import layers
338
339
# Create dataset from directory
340
train_dataset = keras.utils.image_dataset_from_directory(
341
'path/to/train',
342
validation_split=0.2,
343
subset='training',
344
seed=123,
345
image_size=(224, 224),
346
batch_size=32
347
)
348
349
val_dataset = keras.utils.image_dataset_from_directory(
350
'path/to/train',
351
validation_split=0.2,
352
subset='validation',
353
seed=123,
354
image_size=(224, 224),
355
batch_size=32
356
)
357
358
# Build preprocessing pipeline
359
data_augmentation = keras.Sequential([
360
layers.RandomFlip('horizontal'),
361
layers.RandomRotation(0.2),
362
layers.RandomZoom(0.2),
363
layers.RandomBrightness(0.2),
364
layers.RandomContrast(0.2)
365
])
366
367
# Apply augmentation to the training dataset only (validation data is left unaugmented)
368
train_dataset = train_dataset.map(lambda x, y: (data_augmentation(x, training=True), y))
369
370
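# Rescale pixel values from [0, 255] to [0, 1]; done after augmentation because RandomBrightness defaults to value_range=(0, 255)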
371
normalization = layers.Rescaling(1./255)
372
train_dataset = train_dataset.map(lambda x, y: (normalization(x), y))
373
val_dataset = val_dataset.map(lambda x, y: (normalization(x), y))
374
```
375
376
### Text Data Pipeline
377
378
```python
379
import keras
380
from keras import layers
381
382
# Create text dataset
383
train_dataset = keras.utils.text_dataset_from_directory(
384
'path/to/text_data',
385
batch_size=32,
386
validation_split=0.2,
387
subset='training',
388
seed=123
389
)
390
391
# Text vectorization
392
vectorize_layer = layers.TextVectorization(
393
max_tokens=10000,
394
output_sequence_length=100,
395
standardize='lower_and_strip_punctuation'
396
)
397
398
# Adapt to training data
399
text_only_dataset = train_dataset.map(lambda x, y: x)
400
vectorize_layer.adapt(text_only_dataset)
401
402
# Apply vectorization
403
train_dataset = train_dataset.map(lambda x, y: (vectorize_layer(x), y))
404
```