Tessl Tile for pypi/keras-nightly@3.11.0

or run

npx @tessl/cli init

Version

Tile

Overview

Evals

Files

docs

activations.md applications.md backend-config.md core-framework.md index.md initializers.md layers.md losses-metrics.md operations.md optimizers.md preprocessing.md regularizers.md training-callbacks.md

docs/preprocessing.md

0
# Data Processing
1

2
Comprehensive data preprocessing utilities for images, text, audio, and numerical data with built-in augmentation capabilities, dataset creation functions, and feature preprocessing layers.
3

4
## Capabilities
5

6
### Dataset Creation
7

8
Functions for creating datasets from various data sources and formats.
9

10
```python { .api }
11
def image_dataset_from_directory(directory, labels='inferred', label_mode='int',
12
                                class_names=None, color_mode='rgb', batch_size=32,
13
                                image_size=(256, 256), shuffle=True, seed=None,
14
                                validation_split=None, subset=None, **kwargs):
15
    """
16
    Create image dataset from directory structure.
17
    
18
    Args:
19
        directory (str): Path to directory containing subdirectories of images
20
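        labels (str, list, or None): How to generate labels ('inferred' from the subdirectory names, None for no labels, or an explicit list/tuple of integer labels, one per image file)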
21
        label_mode (str): Type of labels ('int', 'categorical', 'binary', None)
22
        class_names (list, optional): Explicit list of class names
23
        color_mode (str): Image color mode ('grayscale', 'rgb', 'rgba')
24
        batch_size (int): Batch size
25
        image_size (tuple): Target image size
26
        shuffle (bool): Whether to shuffle data
27
        seed (int, optional): Random seed
28
        validation_split (float, optional): Fraction for validation
29
        subset (str, optional): Subset to return ('training' or 'validation')
30
        
31
    Returns:
32
        Dataset: Configured image dataset
33
    """
34

35
def text_dataset_from_directory(directory, labels='inferred', label_mode='int',
36
                                class_names=None, batch_size=32, max_length=None,
37
                                shuffle=True, seed=None, validation_split=None,
38
                                subset=None, **kwargs): ...
39

40
def timeseries_dataset_from_array(data, targets, sequence_length, sequence_stride=1,
41
                                 sampling_rate=1, batch_size=128, shuffle=False,
42
                                 seed=None, start_index=None, end_index=None): ...
43

44
def audio_dataset_from_directory(directory, labels='inferred', label_mode='int',
45
                                class_names=None, batch_size=32, sampling_rate=16000,
46
                                output_sequence_length=16000, **kwargs): ...
47
```
48

49
### Text Processing Layers
50

51
Preprocessing layers for text and categorical data, including text vectorization, string/integer vocabulary lookup, and category encoding.
52

53
```python { .api }
54
class TextVectorization:
55
    """
56
    Text vectorization layer for converting text to sequences.
57
    
58
    Args:
59
        max_tokens (int, optional): Maximum vocabulary size
60
        standardize (str or callable): Text standardization ('lower_and_strip_punctuation', 'lower', 'strip_punctuation', or callable)
61
        split (str or callable, optional): Text splitting strategy (None for no splitting, 'whitespace', 'character', or a callable)
62
        ngrams (int, optional): N-gram size
63
        output_mode (str): Output format ('int', 'multi_hot', 'count', 'tf_idf')
64
        output_sequence_length (int, optional): Output sequence length
65
        pad_to_max_tokens (bool): Whether to pad to max_tokens
66
        vocabulary (list, optional): Pre-existing vocabulary
67
        idf_weights (array, optional): IDF weights for tf-idf mode
68
        sparse (bool): Whether to return sparse tensors
69
        ragged (bool): Whether to return ragged tensors
70
    """
71
    def __init__(self, max_tokens=None, standardize='lower_and_strip_punctuation',
72
                 split='whitespace', ngrams=None, output_mode='int',
73
                 output_sequence_length=None, **kwargs): ...
74
    
75
    def adapt(self, data, batch_size=None, steps=None): ...
76
    def get_vocabulary(self): ...
77
    def set_vocabulary(self, vocabulary, idf_weights=None): ...
78

79
class StringLookup:
80
    """
81
    String to integer lookup layer.
82
    
83
    Args:
84
        max_tokens (int, optional): Maximum vocabulary size
85
        num_oov_indices (int): Number of out-of-vocabulary indices
86
        mask_token (str, optional): Token to use for masking
87
        oov_token (str): Token to use for out-of-vocabulary
88
        vocabulary (list, optional): Pre-existing vocabulary
89
        idf_weights (array, optional): IDF weights
90
        invert (bool): Whether to invert the lookup
91
        output_mode (str): Output format ('int', 'multi_hot', 'count', 'one_hot', 'tf_idf')
92
        sparse (bool): Whether to return sparse tensors
93
        pad_to_max_tokens (bool): Whether to pad to max_tokens
94
    """
95
    def __init__(self, max_tokens=None, num_oov_indices=1, mask_token=None,
96
                 oov_token='[UNK]', vocabulary=None, **kwargs): ...
97

98
class IntegerLookup:
99
    """Integer to integer lookup layer."""
100
    def __init__(self, max_tokens=None, num_oov_indices=1, mask_token=None,
101
                 oov_token=-1, vocabulary=None, **kwargs): ...
102

103
class CategoryEncoding:
104
    """
105
    Categorical encoding layer.
106
    
107
    Args:
108
        num_tokens (int, optional): Total number of tokens
109
        output_mode (str): Output format ('multi_hot', 'one_hot', 'count')
110
        sparse (bool): Whether to return sparse tensors
111
    """
112
    def __init__(self, num_tokens=None, output_mode='multi_hot', sparse=False, **kwargs): ...
113
```
114

115
### Image Processing Layers
116

117
Preprocessing layers for image data including resizing, augmentation, and transformations.
118

119
```python { .api }
120
class Resizing:
121
    """
122
    Resize images to target size.
123
    
124
    Args:
125
        height (int): Target height
126
        width (int): Target width
127
        interpolation (str): Interpolation method ('bilinear', 'nearest', 'bicubic', 'area', 'lanczos3', 'lanczos5', 'gaussian', 'mitchellcubic')
128
        crop_to_aspect_ratio (bool): Whether to crop to maintain aspect ratio
129
    """
130
    def __init__(self, height, width, interpolation='bilinear', crop_to_aspect_ratio=False, **kwargs): ...
131

132
class CenterCrop:
133
    """
134
    Crop images to specified size from center.
135
    
136
    Args:
137
        height (int): Target height
138
        width (int): Target width
139
    """
140
    def __init__(self, height, width, **kwargs): ...
141

142
class Rescaling:
143
    """
144
    Rescale pixel values.
145
    
146
    Args:
147
        scale (float): Scaling factor
148
        offset (float): Offset value
149
    """
150
    def __init__(self, scale, offset=0.0, **kwargs): ...
151

152
# Data augmentation layers
153
class RandomFlip:
154
    """
155
    Random image flipping.
156
    
157
    Args:
158
        mode (str): Flip mode ('horizontal', 'vertical', 'horizontal_and_vertical')
159
        seed (int, optional): Random seed
160
    """
161
    def __init__(self, mode='horizontal_and_vertical', seed=None, **kwargs): ...
162

163
class RandomRotation:
164
    """
165
    Random image rotation.
166
    
167
    Args:
168
        factor (float or tuple): Rotation factor as fraction of 2π
169
        fill_mode (str): Fill mode for transformed pixels
170
        interpolation (str): Interpolation method
171
        seed (int, optional): Random seed
172
        fill_value (float): Fill value for constant fill mode
173
    """
174
    def __init__(self, factor, fill_mode='reflect', interpolation='bilinear',
175
                 seed=None, fill_value=0.0, **kwargs): ...
176

177
class RandomZoom:
178
    """Random image zooming."""
179
    def __init__(self, height_factor, width_factor=None, fill_mode='reflect',
180
                 interpolation='bilinear', seed=None, fill_value=0.0, **kwargs): ...
181

182
class RandomTranslation:
183
    """Random image translation."""
184
    def __init__(self, height_factor, width_factor, fill_mode='reflect',
185
                 interpolation='bilinear', seed=None, fill_value=0.0, **kwargs): ...
186

187
class RandomCrop:
188
    """Random image cropping."""
189
    def __init__(self, height, width, seed=None, **kwargs): ...
190

191
class RandomBrightness:
192
    """Random brightness adjustment."""
193
    def __init__(self, factor, value_range=(0, 255), seed=None, **kwargs): ...
194

195
class RandomContrast:
196
    """Random contrast adjustment."""
197
    def __init__(self, factor, seed=None, **kwargs): ...
198
```
199

200
### Numerical Processing Layers
201

202
Preprocessing layers for numerical data including normalization and discretization.
203

204
```python { .api }
205
class Normalization:
206
    """
207
    Feature normalization layer.
208
    
209
    Args:
210
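        axis (int, tuple of ints, or None): Axis or axes that each keep their own mean and variance (typically the feature axis); None normalizes all elements with a single scalar mean and variance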
211
        mean (array, optional): Pre-computed mean
212
        variance (array, optional): Pre-computed variance
213
        invert (bool): Whether to invert normalization
214
    """
215
    def __init__(self, axis=-1, mean=None, variance=None, invert=False, **kwargs): ...
216
    
217
    def adapt(self, data, batch_size=None, steps=None): ...
218

219
class Discretization:
220
    """
221
    Value discretization layer.
222
    
223
    Args:
224
        bin_boundaries (array, optional): Bin boundary values
225
        num_bins (int, optional): Number of bins
226
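        epsilon (float): Error tolerance of the quantile approximation used by adapt() to compute bin boundaries when num_bins is set; larger values are cheaper but give less evenly sized bins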
227
        output_mode (str): Output format ('int', 'one_hot', 'multi_hot', 'count')
228
        sparse (bool): Whether to return sparse tensors
229
    """
230
    def __init__(self, bin_boundaries=None, num_bins=None, epsilon=0.01,
231
                 output_mode='int', sparse=False, **kwargs): ...
232
    
233
    def adapt(self, data, batch_size=None, steps=None): ...
234
```
235

236
### Audio Processing Layers
237

238
Specialized layers for audio signal processing.
239

240
```python { .api }
241
class MelSpectrogram:
242
    """
243
    Mel-frequency spectrogram layer.
244
    
245
    Args:
246
        fft_length (int): FFT length
247
        sequence_stride (int): Hop length between frames
248
        sequence_length (int): Window length
249
        window (str): Window function
250
        sampling_rate (int): Audio sampling rate
251
        num_mel_bins (int): Number of mel frequency bins
252
        min_freq (float): Minimum frequency
253
        max_freq (float): Maximum frequency
254
        power_to_db (bool): Whether to convert power to decibels
255
        top_db (float): Dynamic range for dB conversion
256
        mag_exp (float): Magnitude exponent
257
    """
258
    def __init__(self, fft_length=2048, sequence_stride=512, sequence_length=None,
259
                 window='hann', sampling_rate=16000, num_mel_bins=128, **kwargs): ...
260

261
class STFTSpectrogram:
262
    """Short-time Fourier transform spectrogram layer."""
263
    def __init__(self, fft_length=2048, sequence_stride=512, sequence_length=None,
264
                 window='hann', **kwargs): ...
265
```
266

267
### Utility Functions
268

269
Additional preprocessing utilities and helper functions.
270

271
```python { .api }
272
def split_dataset(dataset, left_size=None, right_size=None, shuffle=False, seed=None):
273
    """
274
    Split dataset into two parts.
275
    
276
    Args:
277
        dataset: Dataset to split
278
        left_size (float or int, optional): Size of left split
279
        right_size (float or int, optional): Size of right split  
280
        shuffle (bool): Whether to shuffle before splitting
281
        seed (int, optional): Random seed
282
        
283
    Returns:
284
        tuple: (left_dataset, right_dataset)
285
    """
286

287
def to_categorical(y, num_classes=None, dtype='float32'):
288
    """
289
    Convert integer labels to categorical encoding.
290
    
291
    Args:
292
        y (array): Integer labels
293
        num_classes (int, optional): Total number of classes
294
        dtype (str): Output data type
295
        
296
    Returns:
297
        array: Categorical encoded labels
298
    """
299

300
def normalize(x, axis=-1, order=2):
301
    """
302
    Normalize arrays along specified axis.
303
    
304
    Args:
305
        x (array): Input array
306
        axis (int): Normalization axis
307
        order (int): Norm order
308
        
309
    Returns:
310
        array: Normalized array
311
    """
312

313
def pad_sequences(sequences, maxlen=None, dtype='int32', padding='pre',
314
                 truncating='pre', value=0.0):
315
    """
316
    Pad sequences to same length.
317
    
318
    Args:
319
        sequences (list): List of sequences
320
        maxlen (int, optional): Maximum length
321
        dtype (str): Output data type
322
        padding (str): Padding strategy ('pre' or 'post')
323
        truncating (str): Truncation strategy ('pre' or 'post')
324
        value (float): Padding value
325
        
326
    Returns:
327
        array: Padded sequences
328
    """
329
```
330

331
## Usage Examples
332

333
### Image Data Pipeline
334

335
```python
336
import keras
337
from keras import layers
338

339
# Create dataset from directory
340
train_dataset = keras.utils.image_dataset_from_directory(
341
    'path/to/train',
342
    validation_split=0.2,
343
    subset='training',
344
    seed=123,
345
    image_size=(224, 224),
346
    batch_size=32
347
)
348

349
val_dataset = keras.utils.image_dataset_from_directory(
350
    'path/to/train',
351
    validation_split=0.2,
352
    subset='validation',
353
    seed=123,
354
    image_size=(224, 224),
355
    batch_size=32
356
)
357

358
# Build preprocessing pipeline
359
data_augmentation = keras.Sequential([
360
    layers.RandomFlip('horizontal'),
361
    layers.RandomRotation(0.2),
362
    layers.RandomZoom(0.2),
363
    layers.RandomBrightness(0.2),
364
    layers.RandomContrast(0.2)
365
])
366

367
# Apply augmentation to the training dataset only (validation data is left unaugmented)
368
train_dataset = train_dataset.map(lambda x, y: (data_augmentation(x, training=True), y))
369

370
# Normalize pixel values
371
normalization = layers.Rescaling(1./255)
372
train_dataset = train_dataset.map(lambda x, y: (normalization(x), y))
373
val_dataset = val_dataset.map(lambda x, y: (normalization(x), y))
374
```
375

376
### Text Data Pipeline
377

378
```python
379
import keras
380
from keras import layers
381

382
# Create text dataset
383
train_dataset = keras.utils.text_dataset_from_directory(
384
    'path/to/text_data',
385
    batch_size=32,
386
    validation_split=0.2,
387
    subset='training',
388
    seed=123
389
)
390

391
# Text vectorization
392
vectorize_layer = layers.TextVectorization(
393
    max_tokens=10000,
394
    output_sequence_length=100,
395
    standardize='lower_and_strip_punctuation'
396
)
397

398
# Adapt to training data
399
text_only_dataset = train_dataset.map(lambda x, y: x)
400
vectorize_layer.adapt(text_only_dataset)
401

402
# Apply vectorization
403
train_dataset = train_dataset.map(lambda x, y: (vectorize_layer(x), y))
404
```

Version

Tile

Files

docs/preprocessing.md

docs/preprocessing.md