# Data Processing

Dataset creation, transformation, and preprocessing pipeline operations for efficient data handling and training workflows. The tf.data API provides powerful tools for building scalable input pipelines.

## Capabilities

### Dataset Creation

Create datasets from various data sources.

```python { .api }
class Dataset:
    """A potentially large set of elements."""

    @staticmethod
    def from_tensor_slices(tensors, name=None):
        """
        Creates a Dataset whose elements are slices of the given tensors.

        Parameters:
        - tensors: A dataset element, whose components have the same first dimension
        - name: Optional name for the tf.data operation

        Returns:
        A Dataset
        """

    @staticmethod
    def from_tensors(tensors, name=None):
        """
        Creates a Dataset with a single element, comprising the given tensors.

        Parameters:
        - tensors: A dataset element
        - name: Optional name for the tf.data operation

        Returns:
        A Dataset
        """

    @staticmethod
    def from_generator(generator, output_signature, args=None):
        """
        Creates a Dataset whose elements are generated by generator.

        Parameters:
        - generator: A callable object that returns an object that supports the iter() protocol
        - output_signature: A nested structure of tf.TypeSpec objects corresponding to each component of an element yielded by generator
        - args: A tf.Tensor object or a tuple of tf.Tensor objects to pass as arguments to generator

        Returns:
        A Dataset
        """

    @staticmethod
    def range(*args, **kwargs):
        """
        Creates a Dataset of a step-separated range of values.

        Parameters:
        - *args: follows the same semantics as Python's built-in range
        - **kwargs: optional keyword arguments

        Returns:
        A RangeDataset
        """

    @staticmethod
    def zip(datasets):
        """
        Creates a Dataset by zipping together the given datasets.

        Parameters:
        - datasets: A nested structure of datasets

        Returns:
        A Dataset
        """
```
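
A common use of `Dataset.zip` is pairing a feature dataset with a label dataset. A minimal sketch (the values here are arbitrary placeholders):

```python
import tensorflow as tf

# Pair features with labels element-by-element via Dataset.zip.
features = tf.data.Dataset.from_tensor_slices([1.0, 2.0, 3.0])
labels = tf.data.Dataset.from_tensor_slices([0, 1, 0])
pairs = tf.data.Dataset.zip((features, labels))

for x, y in pairs:
    print(x.numpy(), y.numpy())  # 1.0 0, then 2.0 1, then 3.0 0
```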

### Dataset Transformation

Transform and manipulate dataset elements.

```python { .api }
def map(self, map_func, num_parallel_calls=None, deterministic=None, name=None):
    """
    Maps map_func across the elements of this dataset.

    Parameters:
    - map_func: A function mapping a dataset element to another dataset element
    - num_parallel_calls: A tf.int32 scalar tf.Tensor, representing the number of elements to process asynchronously in parallel
    - deterministic: A boolean controlling whether the map is allowed to return elements out of order
    - name: Optional name for the tf.data operation

    Returns:
    A Dataset
    """

def filter(self, predicate, name=None):
    """
    Filters this dataset according to predicate.

    Parameters:
    - predicate: A function mapping a dataset element to a boolean
    - name: Optional name for the tf.data operation

    Returns:
    A Dataset
    """

def flat_map(self, map_func, name=None):
    """
    Maps map_func across this dataset and flattens the result.

    Parameters:
    - map_func: A function mapping a dataset element to a dataset
    - name: Optional name for the tf.data operation

    Returns:
    A Dataset
    """

def interleave(self, map_func, cycle_length=None, block_length=None,
               num_parallel_calls=None, deterministic=None, name=None):
    """
    Maps map_func across this dataset, and interleaves the results.

    Parameters:
    - map_func: A function mapping a dataset element to a dataset
    - cycle_length: The number of input elements that will be processed concurrently
    - block_length: The number of consecutive elements to produce from each input element before cycling to another input element
    - num_parallel_calls: The number of parallel calls for map_func
    - deterministic: A boolean controlling whether the interleave is allowed to return elements out of order
    - name: Optional name for the tf.data operation

    Returns:
    A Dataset
    """
```
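
`interleave` is the least obvious of these transformations, so here is a minimal sketch of its round-robin behavior, where each input value expands into a small range dataset:

```python
import tensorflow as tf

# Each input element n expands to the two-element dataset [10n, 10n + 1];
# interleave draws from cycle_length inputs in round-robin blocks.
base = tf.data.Dataset.range(1, 4)  # 1, 2, 3
interleaved = base.interleave(
    lambda n: tf.data.Dataset.range(n * 10, n * 10 + 2),
    cycle_length=2,   # two input elements open at a time
    block_length=1,   # emit one element from each before cycling
)

print(list(interleaved.as_numpy_iterator()))
# [10, 20, 11, 21, 30, 31]
```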

### Dataset Batching and Sampling

Operations for batching and sampling data.

```python { .api }
def batch(self, batch_size, drop_remainder=False, num_parallel_calls=None,
          deterministic=None, name=None):
    """
    Combines consecutive elements of this dataset into batches.

    Parameters:
    - batch_size: A tf.int64 scalar tf.Tensor, representing the number of consecutive elements of this dataset to combine in a single batch
    - drop_remainder: A tf.bool scalar tf.Tensor, representing whether the last batch should be dropped in the case it has fewer than batch_size elements
    - num_parallel_calls: A tf.int32 scalar tf.Tensor, representing the number of elements to process in parallel
    - deterministic: A boolean controlling whether the batch is allowed to return elements out of order
    - name: Optional name for the tf.data operation

    Returns:
    A Dataset
    """

def padded_batch(self, batch_size, padded_shapes=None, padding_values=None,
                 drop_remainder=False, name=None):
    """
    Combines consecutive elements of this dataset into padded batches.

    Parameters:
    - batch_size: A tf.int64 scalar tf.Tensor, representing the number of consecutive elements of this dataset to combine in a single batch
    - padded_shapes: A nested structure of tf.TensorShape or tf.int64 vector tensor-like objects representing the shape to which the respective component of each input element should be padded prior to batching
    - padding_values: A nested structure of scalar-shaped tf.Tensor, representing the padding values to use for the respective components
    - drop_remainder: A tf.bool scalar tf.Tensor, representing whether the last batch should be dropped in the case it has fewer than batch_size elements
    - name: Optional name for the tf.data operation

    Returns:
    A Dataset
    """

def unbatch(self, name=None):
    """
    Splits elements of a dataset into multiple elements on the batch dimension.

    Parameters:
    - name: Optional name for the tf.data operation

    Returns:
    A Dataset
    """

def shuffle(self, buffer_size, seed=None, reshuffle_each_iteration=None, name=None):
    """
    Randomly shuffles the elements of this dataset.

    Parameters:
    - buffer_size: A tf.int64 scalar tf.Tensor, representing the number of elements from this dataset from which the new dataset will sample
    - seed: Optional tf.int64 scalar tf.Tensor, representing the random seed that will be used to create the distribution
    - reshuffle_each_iteration: If true, the dataset will be reshuffled each time it is iterated over (defaults to True)
    - name: Optional name for the tf.data operation

    Returns:
    A Dataset
    """

def repeat(self, count=None, name=None):
    """
    Repeats this dataset so each original value is seen count times.

    Parameters:
    - count: A tf.int64 scalar tf.Tensor, representing the number of times the dataset should be repeated; if None or -1, the dataset is repeated indefinitely
    - name: Optional name for the tf.data operation

    Returns:
    A Dataset
    """

def take(self, count, name=None):
    """
    Creates a Dataset with at most count elements from this dataset.

    Parameters:
    - count: A tf.int64 scalar tf.Tensor, representing the number of elements of this dataset that should be taken to form the new dataset
    - name: Optional name for the tf.data operation

    Returns:
    A Dataset
    """

def skip(self, count, name=None):
    """
    Creates a Dataset that skips count elements from this dataset.

    Parameters:
    - count: A tf.int64 scalar tf.Tensor, representing the number of elements of this dataset that should be skipped to form the new dataset
    - name: Optional name for the tf.data operation

    Returns:
    A Dataset
    """
```
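
Padding is easiest to see with variable-length elements. A minimal sketch in which element n is the vector [n, ..., n] of length n:

```python
import tensorflow as tf

# Elements have different lengths: [1], [2, 2], [3, 3, 3], [4, 4, 4, 4].
dataset = tf.data.Dataset.range(1, 5).map(lambda n: tf.fill([n], n))

# Each batch is padded with zeros to the longest element it contains.
for batch in dataset.padded_batch(2):
    print(batch.numpy())
# [[1 0]
#  [2 2]]
# [[3 3 3 0]
#  [4 4 4 4]]
```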

### Performance Optimization

Operations for optimizing dataset performance.

```python { .api }
def cache(self, filename="", name=None):
    """
    Caches the elements in this dataset.

    Parameters:
    - filename: A tf.string scalar tf.Tensor, representing the name of a directory on the filesystem to use for caching elements in this Dataset; if no filename is provided, the dataset is cached in memory
    - name: Optional name for the tf.data operation

    Returns:
    A Dataset
    """

def prefetch(self, buffer_size, name=None):
    """
    Creates a Dataset that prefetches elements from this dataset.

    Parameters:
    - buffer_size: A tf.int64 scalar tf.Tensor, representing the maximum number of elements that will be buffered when prefetching
    - name: Optional name for the tf.data operation

    Returns:
    A Dataset
    """

def parallel_interleave(map_func, cycle_length, block_length=1,
                        sloppy=False, buffer_output_elements=None,
                        prefetch_input_elements=None):
    """
    A parallel version of the Dataset.interleave() transformation
    (tf.data.experimental.parallel_interleave; deprecated in favor of
    Dataset.interleave with num_parallel_calls set).

    Parameters:
    - map_func: A function mapping a nested structure of tensors to a Dataset
    - cycle_length: The number of input elements that will be processed concurrently
    - block_length: The number of consecutive elements to produce from each input element before cycling to another input element
    - sloppy: If false, the relative order of records produced by this transformation is deterministic
    - buffer_output_elements: The number of elements each iterator being interleaved should buffer
    - prefetch_input_elements: The number of input elements to transform to iterators in parallel and keep buffered

    Returns:
    A Dataset transformation function
    """
```
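
Ordering matters when combining these operations. One common arrangement (a sketch, not the only valid pipeline) caches after deterministic preprocessing, shuffles and batches afterwards, and prefetches last so input production overlaps with training:

```python
import tensorflow as tf

def preprocess(x):
    # Stand-in for an expensive per-element transformation.
    return tf.cast(x, tf.float32) / 255.0

dataset = (tf.data.Dataset.range(1000)
           .map(preprocess, num_parallel_calls=tf.data.AUTOTUNE)
           .cache()                      # memoize preprocessed elements in memory
           .shuffle(buffer_size=100)     # shuffle after cache so order varies per epoch
           .batch(32)
           .prefetch(tf.data.AUTOTUNE))  # overlap the input pipeline with training
```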

### Dataset Properties and Utilities

Utility methods for inspecting and manipulating datasets.

```python { .api }
@property
def element_spec(self):
    """
    The type specification of an element of this dataset.

    Returns:
    A nested structure of tf.TypeSpec objects matching the structure of an element of this dataset
    """

def cardinality(self):
    """
    Returns the cardinality of the dataset, if known.

    Returns:
    A scalar tf.int64 Tensor representing the cardinality of the dataset
    (tf.data.INFINITE_CARDINALITY if the dataset is infinite, or
    tf.data.UNKNOWN_CARDINALITY if the cardinality cannot be determined)
    """

def enumerate(self, start=0, name=None):
    """
    Enumerates the elements of this dataset.

    Parameters:
    - start: A tf.int64 scalar tf.Tensor, representing the start value for enumeration
    - name: Optional name for the tf.data operation

    Returns:
    A Dataset
    """

def concatenate(self, dataset):
    """
    Creates a Dataset by concatenating the given dataset with this dataset.

    Parameters:
    - dataset: Dataset to be concatenated

    Returns:
    A Dataset
    """

def reduce(self, initial_state, reduce_func, name=None):
    """
    Reduces the input dataset to a single element.

    Parameters:
    - initial_state: An element representing the initial state of the reduction
    - reduce_func: A function that maps (old_state, input_element) to new_state
    - name: Optional name for the tf.data operation

    Returns:
    A dataset element
    """

def apply(self, transformation_func):
    """
    Applies a transformation function to this dataset.

    Parameters:
    - transformation_func: A function that takes one Dataset argument and returns a Dataset

    Returns:
    The Dataset returned by applying transformation_func to this dataset
    """
```
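
A short sketch exercising these utilities together:

```python
import tensorflow as tf

ds = tf.data.Dataset.range(5)

print(ds.element_spec)           # TensorSpec(shape=(), dtype=tf.int64, name=None)
print(ds.cardinality().numpy())  # 5

indexed = ds.enumerate(start=100)                       # (100, 0), (101, 1), ...
combined = ds.concatenate(tf.data.Dataset.range(5, 8))  # 0..4 then 5..7

# reduce folds the dataset into a single element; here, a running sum.
total = combined.reduce(tf.constant(0, tf.int64), lambda state, x: state + x)
print(total.numpy())             # 0 + 1 + ... + 7 = 28

# apply threads the dataset through a user-supplied transformation function.
doubled = ds.apply(lambda d: d.map(lambda x: x * 2))
```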

## Usage Examples

```python
import tensorflow as tf
import numpy as np

# Create datasets from different sources
# From tensor slices
data = np.array([1, 2, 3, 4, 5])
dataset = tf.data.Dataset.from_tensor_slices(data)

# From tensors (single element)
single_element = tf.data.Dataset.from_tensors([1, 2, 3, 4, 5])

# From generator
def gen():
    for i in range(100):
        yield i

dataset_gen = tf.data.Dataset.from_generator(
    gen,
    output_signature=tf.TensorSpec(shape=(), dtype=tf.int32)
)

# Range dataset
range_dataset = tf.data.Dataset.range(10)

# Dataset transformations
# Map transformation
squared_dataset = dataset.map(lambda x: x ** 2)

# Filter transformation
even_dataset = range_dataset.filter(lambda x: x % 2 == 0)

# Batch transformation
batched_dataset = range_dataset.batch(3)

# Shuffle and repeat
shuffled_dataset = range_dataset.shuffle(buffer_size=10).repeat(2)

# Complex pipeline example
(train_images, train_labels) = np.random.random((1000, 28, 28, 1)), np.random.randint(0, 10, 1000)

train_dataset = tf.data.Dataset.from_tensor_slices((train_images, train_labels))
train_dataset = (train_dataset
                 .map(lambda x, y: (tf.cast(x, tf.float32) / 255.0, y))  # Normalize
                 .shuffle(buffer_size=100)
                 .batch(32)
                 .prefetch(tf.data.AUTOTUNE))

# Performance optimizations
# Cache dataset
cached_dataset = train_dataset.cache()

# Prefetch for performance
prefetched_dataset = train_dataset.prefetch(tf.data.AUTOTUNE)

# Parallel map
parallel_mapped = range_dataset.map(
    lambda x: x * 2,
    num_parallel_calls=tf.data.AUTOTUNE
)

# Text processing example
text_data = ["hello world", "tensorflow data", "machine learning"]
text_dataset = tf.data.Dataset.from_tensor_slices(text_data)

# Split text into words
word_dataset = text_dataset.flat_map(
    lambda x: tf.data.Dataset.from_tensor_slices(tf.strings.split(x))
)

# Iterate through dataset
for element in range_dataset.take(5):
    print(element.numpy())

# Convert dataset to list (for small datasets)
dataset_list = list(range_dataset.take(5).as_numpy_iterator())
```