0
# Data Processing and Utilities
1
2
This document covers the built-in datasets, data preprocessing utilities, image processing functions, and assorted helper utilities that Keras provides for machine learning workflows. These tools simplify data preparation and provide ready-to-use datasets for experimentation.
3
4
## Capabilities
5
6
### Built-in Datasets
7
8
Standard datasets commonly used for machine learning research and experimentation, pre-loaded and ready to use.
9
10
```python { .api }
11
# MNIST handwritten digits dataset (available as keras.datasets.mnist.load_data())
def load_data(path='mnist.npz'):
    """
    Load the MNIST dataset of 28x28 grayscale handwritten digits.

    Parameters:
    - path: Where to cache the dataset locally (relative to ~/.keras/datasets)

    Returns:
    Tuple of ((x_train, y_train), (x_test, y_test))
    - x_train: uint8 array of grayscale image data with shape (60000, 28, 28)
    - y_train: uint8 array of digit labels (0-9) with shape (60000,)
    - x_test: uint8 array of grayscale image data with shape (10000, 28, 28)
    - y_test: uint8 array of digit labels (0-9) with shape (10000,)
    """
21
22
# Fashion-MNIST dataset (available as keras.datasets.fashion_mnist.load_data())
23
# CIFAR-10 dataset (available as keras.datasets.cifar10.load_data())
24
# CIFAR-100 dataset (available as keras.datasets.cifar100.load_data())
25
# IMDB movie reviews dataset (available as keras.datasets.imdb.load_data())
26
# Reuters newswire dataset (available as keras.datasets.reuters.load_data())
27
# Boston housing dataset (available as keras.datasets.boston_housing.load_data())
28
# California housing dataset (available as keras.datasets.california_housing.load_data())
29
```
30
31
### Image Processing Utilities
32
33
Functions for loading, saving, and manipulating images for machine learning workflows.
34
35
```python { .api }
36
def load_img(path, color_mode='rgb', target_size=None, interpolation='nearest',
             keep_aspect_ratio=False):
    """
    Load an image file into a PIL Image instance.

    Parameters:
    - path: Path to the image file
    - color_mode: One of 'grayscale', 'rgb', or 'rgba'; the loaded image is
      converted to this format
    - target_size: None (keep the original size) or a tuple (height, width)
      to resize the image to
    - interpolation: Interpolation method used when resizing, e.g. 'nearest',
      'bilinear', or 'bicubic'
    - keep_aspect_ratio: If True, the image is center-cropped to the target
      aspect ratio before resizing so that it is not distorted

    Returns:
    PIL Image instance
    """
51
52
def save_img(path, x, data_format=None, file_format=None, scale=True, **kwargs):
    """
    Save an image stored as a numpy array to disk.

    Parameters:
    - path: Path (or file object) to save the image to
    - x: Image data as a numpy array
    - data_format: Image data format, 'channels_first' or 'channels_last'
    - file_format: Image file format ('png', 'jpeg', etc.); when omitted, the
      format is determined from the file name extension
    - scale: Whether to rescale pixel values to the range [0, 255]
    - **kwargs: Additional keyword arguments passed to PIL.Image.save()
    """
63
64
def img_to_array(img, data_format=None, dtype=None):
    """
    Convert a PIL Image instance to a numpy array.

    Parameters:
    - img: PIL Image instance
    - data_format: Image data format ('channels_first' or 'channels_last');
      None uses the global Keras image data format
    - dtype: Data type for the output array; None uses the global Keras float
      type ('float32' unless it was changed)

    Returns:
    3D numpy array representation of the image
    """
76
77
def array_to_img(x, data_format=None, scale=True, dtype=None):
    """
    Convert a 3D numpy array to a PIL Image instance.

    Parameters:
    - x: Input data, in any form that can be converted to a numpy array
    - data_format: Image data format ('channels_first' or 'channels_last');
      None uses the global Keras image data format
    - scale: Whether to rescale the values so that the minimum and maximum
      map to 0 and 255 respectively
    - dtype: Data type to use; None uses the global Keras float type

    Returns:
    PIL Image instance
    """
90
```
91
92
### Data Transformation Utilities
93
94
Functions for common data preprocessing tasks including categorical encoding, normalization, and sequence processing.
95
96
```python { .api }
97
def to_categorical(y, num_classes=None, dtype='float32'):
    """
    Convert a vector of integer class labels to a one-hot (binary) matrix.

    Parameters:
    - y: Array-like of class labels (integers from 0 to num_classes - 1)
    - num_classes: Total number of classes; if None, it is inferred as
      max(y) + 1
    - dtype: Data type for the output matrix

    Returns:
    Binary matrix representation of the input as a numpy array, with the
    class axis placed last
    """
109
110
def normalize(x, axis=-1, order=2):
    """
    Normalize an array along the specified axis.

    Parameters:
    - x: Array to normalize
    - axis: Axis along which to normalize
    - order: Order of the norm (1 for L1, 2 for L2)

    Returns:
    Normalized copy of the array
    """
122
123
def pad_sequences(sequences, maxlen=None, dtype='int32', padding='pre',
                  truncating='pre', value=0.0):
    """
    Pad (and, if necessary, truncate) sequences to the same length.

    Parameters:
    - sequences: List of sequences (each sequence is a list of values)
    - maxlen: Length of the output sequences; if None, sequences are padded
      to the length of the longest input sequence
    - dtype: Data type for the output array
    - padding: 'pre' or 'post' - pad before or after each sequence
    - truncating: 'pre' or 'post' - drop values from the beginning or the end
      of sequences longer than maxlen
    - value: Padding value

    Returns:
    2D numpy array with shape (len(sequences), maxlen)
    """
139
```
140
141
### Dataset Creation Utilities
142
143
Functions that build `tf.data.Dataset` objects from directories of files or from in-memory arrays for efficient data loading.
144
145
```python { .api }
146
def image_dataset_from_directory(directory, labels='inferred', label_mode='int',
                                 class_names=None, color_mode='rgb', batch_size=32,
                                 image_size=(256, 256), shuffle=True, seed=None,
                                 validation_split=None, subset=None, interpolation='bilinear',
                                 follow_links=False, crop_to_aspect_ratio=False):
    """
    Generate a dataset from image files in a directory.

    Parameters:
    - directory: Path to the directory containing one subdirectory of images
      per class (when labels='inferred')
    - labels: 'inferred' (from the directory structure), None (no labels), or
      an explicit list of integer labels
    - label_mode: 'int', 'categorical', 'binary', or None
    - class_names: Explicit list of class names (must match the subdirectory
      names); controls the order of the classes
    - color_mode: 'grayscale', 'rgb', or 'rgba'
    - batch_size: Batch size
    - image_size: Size (height, width) to resize images to
    - shuffle: Whether to shuffle the data
    - seed: Random seed for shuffling
    - validation_split: Fraction of data (between 0 and 1) to reserve for
      validation
    - subset: 'training' or 'validation' (only used when validation_split is
      set)
    - interpolation: Interpolation method used when resizing images
    - follow_links: Whether to follow symlinks inside the directory
    - crop_to_aspect_ratio: If True, crop images to the target aspect ratio
      before resizing instead of distorting them

    Returns:
    tf.data.Dataset object yielding (images, labels) batches, or only images
    when label_mode is None
    """
173
174
def text_dataset_from_directory(directory, labels='inferred', label_mode='int',
                                class_names=None, batch_size=32, max_length=None,
                                shuffle=True, seed=None, validation_split=None,
                                subset=None, follow_links=False):
    """
    Generate a dataset from text files in a directory.

    Parameters:
    - directory: Path to the directory containing one subdirectory of text
      files per class (when labels='inferred')
    - labels: 'inferred' (from the directory structure), None (no labels), or
      an explicit list of integer labels
    - label_mode: 'int', 'categorical', 'binary', or None
    - class_names: Explicit list of class names (must match the subdirectory
      names)
    - batch_size: Batch size
    - max_length: Maximum size of a text string; longer texts are truncated
    - shuffle: Whether to shuffle the data
    - seed: Random seed for shuffling
    - validation_split: Fraction of data (between 0 and 1) to reserve for
      validation
    - subset: 'training' or 'validation' (only used when validation_split is
      set)
    - follow_links: Whether to follow symlinks inside the directory

    Returns:
    tf.data.Dataset object yielding (texts, labels) batches, or only texts
    when label_mode is None
    """
197
198
def timeseries_dataset_from_array(data, targets, sequence_length, sequence_stride=1,
                                  sampling_rate=1, batch_size=128, shuffle=False,
                                  seed=None, start_index=None, end_index=None):
    """
    Create a dataset of sliding windows over time series data.

    Parameters:
    - data: Array of consecutive data points (timesteps); axis 0 is time
    - targets: Targets aligned with data; targets[i] must be the target for
      the window that starts at index i. Pass None to yield inputs only
    - sequence_length: Length of the output sequences (in timesteps)
    - sequence_stride: Period between the starts of successive output
      sequences
    - sampling_rate: Period between successive timesteps within a sequence
    - batch_size: Number of sequences per batch
    - shuffle: Whether to shuffle the output sequences instead of drawing
      them in chronological order
    - seed: Random seed for shuffling
    - start_index: Data points before this index are not used
    - end_index: Data points after this index are not used

    Returns:
    tf.data.Dataset object yielding (inputs, targets) tuples, or only inputs
    when targets is None
    """
219
```
220
221
### Data Utilities
222
223
Utility classes and functions for advanced data handling including custom datasets, feature engineering, and data packing.
224
225
```python { .api }
226
class Sequence:
    """
    Base class for fitting a model to a sequence of data, one batch at a time.

    Subclasses must implement __getitem__ and __len__; on_epoch_end is an
    optional hook for modifying the dataset between epochs.
    """

    def __init__(self):
        """Initialize sequence."""

    def __getitem__(self, index):
        """
        Get the batch at the given position.

        Parameters:
        - index: Batch index

        Returns:
        One complete batch of data
        """

    def __len__(self):
        """
        Number of batches in the sequence.

        Returns:
        Number of batches
        """

    def on_epoch_end(self):
        """Method called at the end of every epoch."""
253
254
class FeatureSpace:
    """Utility for structured-data feature preprocessing and engineering."""

    def __init__(self, features, output_mode='concat'):
        """
        Initialize feature space.

        Parameters:
        - features: Dict mapping feature names to their type specification,
          either a string such as 'float_normalized' or a feature object such
          as FeatureSpace.float_normalized(). Bare preprocessing layers are
          not accepted; wrap a custom layer with
          FeatureSpace.feature(preprocessor=..., dtype=..., output_mode=...)
        - output_mode: 'concat' (all features are concatenated into a single
          vector) or 'dict' (a dict of individually encoded features)
        """

    def adapt(self, dataset):
        """
        Fit the feature preprocessing on a dataset.

        Parameters:
        - dataset: tf.data.Dataset of feature dicts (without labels) to adapt
          to
        """

    def __call__(self, data):
        """
        Apply feature preprocessing.

        Parameters:
        - data: Dict of raw input features

        Returns:
        Preprocessed features (a single tensor in 'concat' mode, a dict in
        'dict' mode)
        """
284
285
def pack_x_y_sample_weight(x, y=None, sample_weight=None):
    """
    Pack user-provided data into the tuple format used by Model.fit().

    Parameters:
    - x: Input data (features)
    - y: Target data (labels), optional
    - sample_weight: Sample weights, optional

    Returns:
    Packed data tuple
    """
297
298
def unpack_x_y_sample_weight(data):
    """
    Unpack a user-provided data tuple.

    Parameters:
    - data: Packed data tuple of the form (x,), (x, y), or
      (x, y, sample_weight)

    Returns:
    Tuple of (x, y, sample_weight), with None for y and sample_weight when
    they are not provided
    """
308
309
def split_dataset(dataset, left_size=None, right_size=None, shuffle=False, seed=None):
    """
    Split a dataset into a left half and a right half (e.g. train / test).

    Parameters:
    - dataset: Dataset to split
    - left_size: Size of the left split: a float in [0, 1] is a fraction of
      the data, an integer is a number of samples; None means the complement
      of right_size
    - right_size: Size of the right split, interpreted like left_size; None
      means the complement of left_size
    - shuffle: Whether to shuffle the data before splitting
    - seed: Random seed for shuffling

    Returns:
    Tuple of (left_dataset, right_dataset)
    """
323
```
324
325
### File and Download Utilities
326
327
Functions for downloading files and managing data assets.
328
329
```python { .api }
330
def get_file(fname=None, origin=None, untar=False, md5_hash=None, file_hash=None,
             cache_subdir='datasets', hash_algorithm='auto', extract=False,
             archive_format='auto', cache_dir=None):
    """
    Download a file from a URL if it is not already in the cache.

    Parameters:
    - fname: Name to save the file under; if None, the name of the file at
      origin is used
    - origin: Original URL of the file
    - untar: Whether to untar the file after download (deprecated in favor
      of extract)
    - md5_hash: MD5 hash for verification (deprecated in favor of file_hash)
    - file_hash: Expected hash of the file after download, used for
      verification
    - cache_subdir: Subdirectory under the cache directory where the file is
      saved
    - hash_algorithm: Hash algorithm used to verify the file ('md5',
      'sha256', 'auto')
    - extract: Whether to extract the file as an archive after download
    - archive_format: Archive format to try when extracting ('auto', 'tar',
      'zip')
    - cache_dir: Location to store cached files

    Returns:
    Path to the downloaded file
    """
351
```
352
353
### Configuration and Random Utilities
354
355
Utilities for setting random seeds, managing global configuration, and displaying training progress.
356
357
```python { .api }
358
def set_random_seed(seed=None):
    """
    Set all random seeds (Python, NumPy, and the backend framework) so that
    results are reproducible.

    Parameters:
    - seed: Integer random seed value
    """
365
366
class Config:
    """Global configuration utility."""

    def enable(self, feature):
        """
        Enable a configuration feature.

        Parameters:
        - feature: Configuration feature to enable
        """

    def disable(self, feature):
        """
        Disable a configuration feature.

        Parameters:
        - feature: Configuration feature to disable
        """

    def is_enabled(self, feature):
        """
        Check whether a configuration feature is enabled.

        Parameters:
        - feature: Configuration feature to check
        """
377
378
class Progbar:
    """Progress bar utility for training loops."""

    def __init__(self, target, width=30, verbose=1, interval=0.05,
                 stateful_metrics=None, unit_name='step'):
        """
        Initialize progress bar.

        Parameters:
        - target: Total number of steps expected (None if unknown)
        - width: Progress bar width on screen
        - verbose: Verbosity mode (0 = silent, 1 = verbose, 2 = semi-verbose)
        - interval: Minimum visual update interval, in seconds
        - stateful_metrics: Names of metrics that should be displayed as-is
          instead of being averaged over time
        - unit_name: Display name for step units (usually 'step' or 'sample')
        """

    def update(self, current, values=None, finalize=None):
        """
        Update the progress bar.

        Parameters:
        - current: Index of the current step
        - values: List of tuples (name, value_for_last_step) for metrics;
          metrics not listed in stateful_metrics are shown as a running
          average
        - finalize: Whether this is the last update of the progress bar; if
          None, it is inferred from current reaching target
        """
404
```
405
406
## Usage Examples
407
408
### Loading and Preprocessing Images
409
410
```python
411
import keras
412
from keras.utils import load_img, img_to_array, to_categorical
413
import numpy as np
414
415
# Load a single image and shape it into a one-image batch
img_path = 'cat.jpg'
img = load_img(img_path, target_size=(224, 224))
img_array = np.expand_dims(img_to_array(img), axis=0)  # add the batch axis
img_array /= 255.0  # Normalize to [0, 1]

# Convert labels to categorical
labels = [0, 1, 2, 1, 0]  # Class indices
categorical_labels = to_categorical(labels, num_classes=3)
print(categorical_labels)
# [[1. 0. 0.]
#  [0. 1. 0.]
#  [0. 0. 1.]
#  [0. 1. 0.]
#  [1. 0. 0.]]
431
```
432
433
### Creating Datasets from Directories
434
435
```python
436
import keras
437
438
# Create image datasets from a directory structure.
# Both subsets MUST be built from the same directory with the same
# validation_split and seed; otherwise the training and validation samples
# can overlap. Sharing one kwargs dict keeps the two calls in sync.
split_kwargs = dict(
    labels='inferred',
    label_mode='categorical',
    color_mode='rgb',
    batch_size=32,
    image_size=(224, 224),
    shuffle=True,
    validation_split=0.2,
    seed=123,
)

train_dataset = keras.utils.image_dataset_from_directory(
    'path/to/train_data/',
    subset='training',
    **split_kwargs
)

val_dataset = keras.utils.image_dataset_from_directory(
    'path/to/train_data/',
    subset='validation',
    **split_kwargs
)

# Use datasets for training
# model.fit(train_dataset, validation_data=val_dataset, epochs=10)
467
```
468
469
### Working with Built-in Datasets
470
471
```python
472
import keras
473
from keras.datasets import mnist, cifar10
474
from keras.utils import to_categorical
475
476
# Load MNIST dataset
(x_train, y_train), (x_test, y_test) = mnist.load_data()

# Preprocess data: scale pixels to [0, 1] and append a trailing channel axis
x_train = (x_train.astype('float32') / 255.0).reshape(x_train.shape[0], 28, 28, 1)
x_test = (x_test.astype('float32') / 255.0).reshape(x_test.shape[0], 28, 28, 1)

# Convert labels to categorical
y_train, y_test = to_categorical(y_train, 10), to_categorical(y_test, 10)

print(f"Training data shape: {x_train.shape}")
print(f"Training labels shape: {y_train.shape}")
491
```
492
493
### Custom Data Sequence
494
495
```python
496
import keras
497
import numpy as np
498
499
class CustomDataSequence(keras.utils.Sequence):
    """Serve (x, y) mini-batches from in-memory arrays, reshuffled each epoch.

    Parameters:
    - x_data: Array of inputs, indexable by an array of integer indices
    - y_data: Array of targets aligned with x_data
    - batch_size: Number of samples per batch (must be positive)
    - **kwargs: Extra options forwarded to the base class constructor
    """

    def __init__(self, x_data, y_data, batch_size, **kwargs):
        # The base class must be initialized so its own options are set up.
        super().__init__(**kwargs)
        if batch_size <= 0:
            raise ValueError(f"batch_size must be positive, got {batch_size}")
        self.x_data = x_data
        self.y_data = y_data
        self.batch_size = batch_size
        self.indices = np.arange(len(self.x_data))

    def __len__(self):
        # Ceiling division: the final, smaller batch is kept rather than
        # silently dropped when the data size is not a multiple of batch_size.
        return (len(self.x_data) + self.batch_size - 1) // self.batch_size

    def __getitem__(self, index):
        start = index * self.batch_size
        # Slicing past the end is safe and simply yields a shorter last batch.
        batch_indices = self.indices[start:start + self.batch_size]
        batch_x = self.x_data[batch_indices]
        batch_y = self.y_data[batch_indices]
        return batch_x, batch_y

    def on_epoch_end(self):
        # Reshuffle the sample order so every epoch sees different batches.
        np.random.shuffle(self.indices)
517
518
# Use custom sequence
519
# train_sequence = CustomDataSequence(x_train, y_train, batch_size=32)
520
# model.fit(train_sequence, epochs=10)
521
```
522
523
### Feature Engineering with FeatureSpace
524
525
```python
526
import keras
527
from keras import layers
528
from keras.utils import FeatureSpace
529
530
# Define feature preprocessing.
# Each value must be a FeatureSpace feature spec (or its string shorthand,
# e.g. 'float_normalized'); a bare preprocessing layer is rejected as an
# invalid feature type. To use a custom layer, wrap it with
# FeatureSpace.feature(preprocessor=..., dtype=..., output_mode=...).
feature_space = FeatureSpace(
    features={
        'age': FeatureSpace.float_normalized(),
        'category': FeatureSpace.string_categorical(output_mode='one_hot'),
        'price': FeatureSpace.float_discretized(num_bins=10),
    },
    output_mode='concat'
)
539
540
# Adapt to training data
541
# feature_space.adapt(train_dataset)
542
543
# Apply preprocessing
544
# processed_features = feature_space(raw_features)
545
```
546
547
### Creating Time Series Dataset
548
549
```python
550
import keras
551
import numpy as np
552
553
# Generate sample time series data
series = np.sin(np.arange(1000) * 0.1)
sequence_length = 10

# targets[i] must be the target for the window that STARTS at index i.
# To predict the value that follows each window, shift the targets by
# sequence_length and drop the last sequence_length inputs; otherwise the
# target would be a value that is already inside the input window.
data = series[:-sequence_length]
targets = series[sequence_length:]

# Create dataset for sequence prediction
dataset = keras.utils.timeseries_dataset_from_array(
    data=data,
    targets=targets,
    sequence_length=sequence_length,
    batch_size=32,
    shuffle=True
)

# Use for training RNN models
# model.fit(dataset, epochs=10)
568
```