Tessl Tile for pypi/keras@3.11.0

or run

npx @tessl/cli init

Version

Tile

Overview

Evals

Files

docs

activations.md applications.md data-utils.md index.md initializers.md layers.md models.md operations.md random.md regularizers.md saving.md training.md

docs/data-utils.md

0
# Data Processing and Utilities
1

2
Built-in datasets, data preprocessing utilities, image processing functions, and various helper utilities for machine learning workflows. These tools simplify data preparation and provide ready-to-use datasets for experimentation.
3

4
## Capabilities
5

6
### Built-in Datasets
7

8
Standard datasets commonly used for machine learning research and experimentation, pre-loaded and ready to use.
9

10
```python { .api }
11
# MNIST handwritten digits dataset
12
def load_data():
13
    """
14
    Load MNIST dataset.
15
    
16
    Returns:
17
    Tuple of ((x_train, y_train), (x_test, y_test))
18
    - x_train, x_test: uint8 arrays of grayscale image data with shape (num_samples, 28, 28)
19
    - y_train, y_test: uint8 arrays of digit labels (0-9) with shape (num_samples,)
20
    """
21

22
# Fashion-MNIST dataset (available as keras.datasets.fashion_mnist.load_data())
23
# CIFAR-10 dataset (available as keras.datasets.cifar10.load_data())
24
# CIFAR-100 dataset (available as keras.datasets.cifar100.load_data())
25
# IMDB movie reviews dataset (available as keras.datasets.imdb.load_data())
26
# Reuters newswire dataset (available as keras.datasets.reuters.load_data())
27
# Boston housing dataset (available as keras.datasets.boston_housing.load_data())
28
# California housing dataset (available as keras.datasets.california_housing.load_data())
29
```
30

31
### Image Processing Utilities
32

33
Functions for loading, saving, and manipulating images for machine learning workflows.
34

35
```python { .api }
36
def load_img(path, color_mode='rgb', target_size=None, interpolation='nearest', 
37
             keep_aspect_ratio=False):
38
    """
39
    Load an image into PIL format.
40
    
41
    Parameters:
42
    - path: Path to image file
43
    - color_mode: 'grayscale', 'rgb', 'rgba'
44
    - target_size: Tuple (height, width) to resize image
45
    - interpolation: Interpolation method for resizing
46
    - keep_aspect_ratio: Whether to keep aspect ratio when resizing
47
    
48
    Returns:
49
    PIL Image instance
50
    """
51

52
def save_img(path, x, data_format=None, file_format=None, scale=True, **kwargs):
53
    """
54
    Save an image to disk.
55
    
56
    Parameters:
57
    - path: Path to save image
58
    - x: Image array data
59
    - data_format: Image data format
60
    - file_format: Image file format ('png', 'jpeg', etc.)
61
    - scale: Whether to scale pixel values to [0, 255]
62
    """
63

64
def img_to_array(img, data_format=None, dtype=None):
65
    """
66
    Convert PIL Image to numpy array.
67
    
68
    Parameters:
69
    - img: PIL Image instance
70
    - data_format: Image data format ('channels_first' or 'channels_last')
71
    - dtype: Data type for output array
72
    
73
    Returns:
74
    Numpy array representation of image
75
    """
76

77
def array_to_img(x, data_format=None, scale=True, dtype=None):
78
    """
79
    Convert numpy array to PIL Image.
80
    
81
    Parameters:
82
    - x: Input array
83
    - data_format: Image data format
84
    - scale: Whether to scale values to [0, 255]
85
    - dtype: Data type
86
    
87
    Returns:
88
    PIL Image instance
89
    """
90
```
91

92
### Data Transformation Utilities
93

94
Functions for common data preprocessing tasks including categorical encoding, normalization, and sequence processing.
95

96
```python { .api }
97
def to_categorical(y, num_classes=None, dtype='float32'):
98
    """
99
    Convert class vector to categorical (one-hot) matrix.
100
    
101
    Parameters:
102
    - y: Array of class labels to convert
103
    - num_classes: Total number of classes (optional)
104
    - dtype: Data type for output matrix
105
    
106
    Returns:
107
    Binary matrix representation of input as numpy array
108
    """
109

110
def normalize(x, axis=-1, order=2):
111
    """
112
    Normalize array along specified axis.
113
    
114
    Parameters:
115
    - x: Array to normalize
116
    - axis: Axis along which to normalize
117
    - order: Normalization order (1 for L1, 2 for L2)
118
    
119
    Returns:
120
    Normalized array
121
    """
122

123
def pad_sequences(sequences, maxlen=None, dtype='int32', padding='pre',
124
                  truncating='pre', value=0.0):
125
    """
126
    Pad sequences to same length.
127
    
128
    Parameters:
129
    - sequences: List of sequences to pad
130
    - maxlen: Maximum length of sequences
131
    - dtype: Data type for output
132
    - padding: 'pre' or 'post' padding
133
    - truncating: 'pre' or 'post' truncation
134
    - value: Padding value
135
    
136
    Returns:
137
    2D numpy array with shape (len(sequences), maxlen)
138
    """
139
```
140

141
### Dataset Creation Utilities
142

143
Functions for creating tf.data.Dataset objects from directories and arrays for efficient data loading.
144

145
```python { .api }
146
def image_dataset_from_directory(directory, labels='inferred', label_mode='int',
147
                                class_names=None, color_mode='rgb', batch_size=32,
148
                                image_size=(256, 256), shuffle=True, seed=None,
149
                                validation_split=None, subset=None, interpolation='bilinear',
150
                                follow_links=False, crop_to_aspect_ratio=False):
151
    """
152
    Generate dataset from image directory.
153
    
154
    Parameters:
155
    - directory: Path to directory containing subdirectories of images
156
    - labels: 'inferred' (from directory structure) or list of labels
157
    - label_mode: 'int', 'categorical', 'binary', or None
158
    - class_names: List of class names (overrides inferred names)
159
    - color_mode: 'grayscale', 'rgb', or 'rgba'
160
    - batch_size: Batch size
161
    - image_size: Size to resize images to
162
    - shuffle: Whether to shuffle data
163
    - seed: Random seed for shuffling
164
    - validation_split: Fraction of data for validation
165
    - subset: 'training' or 'validation' (when validation_split is set)
166
    - interpolation: Interpolation method for resizing
167
    - follow_links: Whether to follow symlinks
168
    - crop_to_aspect_ratio: Whether to crop to maintain aspect ratio
169
    
170
    Returns:
171
    tf.data.Dataset object
172
    """
173

174
def text_dataset_from_directory(directory, labels='inferred', label_mode='int',
175
                               class_names=None, batch_size=32, max_length=None,
176
                               shuffle=True, seed=None, validation_split=None,
177
                               subset=None, follow_links=False):
178
    """
179
    Generate dataset from text directory.
180
    
181
    Parameters:
182
    - directory: Path to directory containing text files
183
    - labels: 'inferred' or list of labels
184
    - label_mode: 'int', 'categorical', 'binary', or None
185
    - class_names: List of class names
186
    - batch_size: Batch size
187
    - max_length: Maximum sequence length
188
    - shuffle: Whether to shuffle data
189
    - seed: Random seed
190
    - validation_split: Fraction for validation
191
    - subset: 'training' or 'validation'
192
    - follow_links: Whether to follow symlinks
193
    
194
    Returns:
195
    tf.data.Dataset object
196
    """
197

198
def timeseries_dataset_from_array(data, targets, sequence_length, sequence_stride=1,
199
                                 sampling_rate=1, batch_size=128, shuffle=False,
200
                                 seed=None, start_index=None, end_index=None):
201
    """
202
    Create dataset from time series data.
203
    
204
    Parameters:
205
    - data: Array of data points
206
    - targets: Array of targets corresponding to data
207
    - sequence_length: Length of output sequences
208
    - sequence_stride: Stride between successive output sequences
209
    - sampling_rate: Rate to sample data points within sequences
210
    - batch_size: Batch size
211
    - shuffle: Whether to shuffle data
212
    - seed: Random seed
213
    - start_index: Start index for data
214
    - end_index: End index for data
215
    
216
    Returns:
217
    tf.data.Dataset object yielding (inputs, targets) tuples
218
    """
219
```
220

221
### Data Utilities
222

223
Utility classes and functions for advanced data handling including custom datasets, feature engineering, and data packing.
224

225
```python { .api }
226
class Sequence:
227
    """Base class for fitting to sequence of data."""
228
    
229
    def __init__(self):
230
        """Initialize sequence."""
231
        
232
    def __getitem__(self, index):
233
        """
234
        Get batch at index.
235
        
236
        Parameters:
237
        - index: Batch index
238
        
239
        Returns:
240
        Batch data
241
        """
242
    
243
    def __len__(self):
244
        """
245
        Number of batches in sequence.
246
        
247
        Returns:
248
        Number of batches
249
        """
250
    
251
    def on_epoch_end(self):
252
        """Method called at end of every epoch."""
253

254
class FeatureSpace:
255
    """Utility for feature preprocessing and engineering."""
256
    
257
    def __init__(self, features, output_mode='concat'):
258
        """
259
        Initialize feature space.
260
        
261
        Parameters:
262
        - features: Dict mapping feature names to preprocessing layers
263
        - output_mode: 'concat' or 'dict'
264
        """
265
    
266
    def adapt(self, dataset):
267
        """
268
        Fit feature preprocessing on dataset.
269
        
270
        Parameters:
271
        - dataset: Dataset to adapt to
272
        """
273
    
274
    def __call__(self, data):
275
        """
276
        Apply feature preprocessing.
277
        
278
        Parameters:
279
        - data: Input data
280
        
281
        Returns:
282
        Preprocessed features
283
        """
284

285
def pack_x_y_sample_weight(x, y=None, sample_weight=None):
286
    """
287
    Pack user-provided data into tuple.
288
    
289
    Parameters:
290
    - x: Input data
291
    - y: Target data
292
    - sample_weight: Sample weights
293
    
294
    Returns:
295
    Packed data tuple
296
    """
297

298
def unpack_x_y_sample_weight(data):
299
    """
300
    Unpack user-provided data tuple.
301
    
302
    Parameters:
303
    - data: Packed data tuple
304
    
305
    Returns:
306
    Tuple of (x, y, sample_weight)
307
    """
308

309
def split_dataset(dataset, left_size=None, right_size=None, shuffle=False, seed=None):
310
    """
311
    Split dataset into two datasets.
312
    
313
    Parameters:
314
    - dataset: Dataset to split
315
    - left_size: Size of left split
316
    - right_size: Size of right split
317
    - shuffle: Whether to shuffle before splitting
318
    - seed: Random seed
319
    
320
    Returns:
321
    Tuple of (left_dataset, right_dataset)
322
    """
323
```
324

325
### File and Download Utilities
326

327
Functions for downloading files and managing data assets.
328

329
```python { .api }
330
def get_file(fname=None, origin=None, untar=False, md5_hash=None, file_hash=None,
331
             cache_subdir='datasets', hash_algorithm='auto', extract=False,
332
             archive_format='auto', cache_dir=None):
333
    """
334
    Download file from URL if not already cached.
335
    
336
    Parameters:
337
    - fname: Name of file (if origin has different name)
338
    - origin: Original URL of file
339
    - untar: Whether to untar file after download
340
    - md5_hash: MD5 hash for verification (deprecated)
341
    - file_hash: Hash for verification
342
    - cache_subdir: Subdirectory under cache directory
343
    - hash_algorithm: Hash algorithm ('md5', 'sha256', 'auto')
344
    - extract: Whether to extract archive after download
345
    - archive_format: Archive format ('auto', 'tar', 'zip')
346
    - cache_dir: Location to store cached files
347
    
348
    Returns:
349
    Path to downloaded file
350
    """
351
```
352

353
### Configuration and Random Utilities
354

355
Utilities for setting random seeds and managing global configuration.
356

357
```python { .api }
358
def set_random_seed(seed=None):
359
    """
360
    Set random seed for reproducible results.
361
    
362
    Parameters:
363
    - seed: Random seed value
364
    """
365

366
class Config:
367
    """Global configuration utility."""
368
    
369
    def enable(self, feature):
370
        """Enable configuration feature."""
371
        
372
    def disable(self, feature):
373
        """Disable configuration feature."""
374
        
375
    def is_enabled(self, feature):
376
        """Check if feature is enabled."""
377

378
class Progbar:
379
    """Progress bar utility for training loops."""
380
    
381
    def __init__(self, target, width=30, verbose=1, interval=0.05,
382
                 stateful_metrics=None, unit_name='step'):
383
        """
384
        Initialize progress bar.
385
        
386
        Parameters:
387
        - target: Total number of steps expected
388
        - width: Progress bar width
389
        - verbose: Verbosity mode
390
        - interval: Minimum update interval
391
        - stateful_metrics: Metrics that shouldn't be averaged
392
        - unit_name: Display name for step units
393
        """
394
    
395
    def update(self, current, values=None, finalize=None):
396
        """
397
        Update progress bar.
398
        
399
        Parameters:
400
        - current: Current step index
401
        - values: List of tuples (name, value) for metrics
402
        - finalize: Whether to finalize progress bar
403
        """
404
```
405

406
## Usage Examples
407

408
### Loading and Preprocessing Images
409

410
```python
411
import keras
412
from keras.utils import load_img, img_to_array, to_categorical
413
import numpy as np
414

415
# Load and preprocess single image
416
img_path = 'cat.jpg'
417
img = load_img(img_path, target_size=(224, 224))
418
img_array = img_to_array(img)
419
img_array = np.expand_dims(img_array, axis=0)
420
img_array /= 255.0  # Normalize to [0, 1]
421

422
# Convert labels to categorical
423
labels = [0, 1, 2, 1, 0]  # Class indices
424
categorical_labels = to_categorical(labels, num_classes=3)
425
print(categorical_labels)
426
# [[1. 0. 0.]
427
#  [0. 1. 0.]
428
#  [0. 0. 1.]
429
#  [0. 1. 0.]
430
#  [1. 0. 0.]]
431
```
432

433
### Creating Datasets from Directories
434

435
```python
436
import keras
437

438
# Create image dataset from directory structure
439
train_dataset = keras.utils.image_dataset_from_directory(
440
    'path/to/train_data/',
441
    labels='inferred',
442
    label_mode='categorical',
443
    color_mode='rgb',
444
    batch_size=32,
445
    image_size=(224, 224),
446
    shuffle=True,
447
    validation_split=0.2,
448
    subset='training',
449
    seed=123
450
)
451

452
val_dataset = keras.utils.image_dataset_from_directory(
453
    'path/to/train_data/',
454
    labels='inferred',
455
    label_mode='categorical',
456
    color_mode='rgb',
457
    batch_size=32,
458
    image_size=(224, 224),
459
    shuffle=True,
460
    validation_split=0.2,
461
    subset='validation',
462
    seed=123
463
)
464

465
# Use datasets for training
466
# model.fit(train_dataset, validation_data=val_dataset, epochs=10)
467
```
468

469
### Working with Built-in Datasets
470

471
```python
472
import keras
473
from keras.datasets import mnist, cifar10
474
from keras.utils import to_categorical
475

476
# Load MNIST dataset
477
(x_train, y_train), (x_test, y_test) = mnist.load_data()
478

479
# Preprocess data
480
x_train = x_train.astype('float32') / 255.0
481
x_test = x_test.astype('float32') / 255.0
482
x_train = x_train.reshape(x_train.shape[0], 28, 28, 1)
483
x_test = x_test.reshape(x_test.shape[0], 28, 28, 1)
484

485
# Convert labels to categorical
486
y_train = to_categorical(y_train, 10)
487
y_test = to_categorical(y_test, 10)
488

489
print(f"Training data shape: {x_train.shape}")
490
print(f"Training labels shape: {y_train.shape}")
491
```
492

493
### Custom Data Sequence
494

495
```python
496
import keras
497
import numpy as np
498

499
class CustomDataSequence(keras.utils.Sequence):
500
    def __init__(self, x_data, y_data, batch_size):
501
        self.x_data = x_data
502
        self.y_data = y_data
503
        self.batch_size = batch_size
504
        self.indices = np.arange(len(self.x_data))
505
    
506
    def __len__(self):
507
        return len(self.x_data) // self.batch_size
508
    
509
    def __getitem__(self, index):
510
        batch_indices = self.indices[index * self.batch_size:(index + 1) * self.batch_size]
511
        batch_x = self.x_data[batch_indices]
512
        batch_y = self.y_data[batch_indices]
513
        return batch_x, batch_y
514
    
515
    def on_epoch_end(self):
516
        np.random.shuffle(self.indices)
517

518
# Use custom sequence
519
# train_sequence = CustomDataSequence(x_train, y_train, batch_size=32)
520
# model.fit(train_sequence, epochs=10)
521
```
522

523
### Feature Engineering with FeatureSpace
524

525
```python
526
import keras
527
from keras import layers
528
from keras.utils import FeatureSpace
529

530
# Define feature preprocessing
531
feature_space = FeatureSpace(
532
    features={
533
        'age': layers.Normalization(),
534
        'category': layers.StringLookup(output_mode='one_hot'),
535
        'price': layers.Discretization(num_bins=10),
536
    },
537
    output_mode='concat'
538
)
539

540
# Adapt to training data
541
# feature_space.adapt(train_dataset)
542

543
# Apply preprocessing
544
# processed_features = feature_space(raw_features)
545
```
546

547
### Creating Time Series Dataset
548

549
```python
550
import keras
551
import numpy as np
552

553
# Generate sample time series data
554
data = np.sin(np.arange(1000) * 0.1)
555
targets = np.sin(np.arange(1000) * 0.1 + 0.1)
556

557
# Create dataset for sequence prediction
558
dataset = keras.utils.timeseries_dataset_from_array(
559
    data=data,
560
    targets=targets,
561
    sequence_length=10,
562
    batch_size=32,
563
    shuffle=True
564
)
565

566
# Use for training RNN models
567
# model.fit(dataset, epochs=10)
568
```

Version

Tile

Files

docs/data-utils.md

docs/data-utils.md