# Data Processing

Dataset creation, transformation, and preprocessing pipeline operations for efficient data handling and training workflows. The tf.data API provides powerful tools for building scalable input pipelines.

## Capabilities

### Dataset Creation

Create datasets from various data sources.

```python { .api }
class Dataset:
    """A potentially large set of elements."""

    @staticmethod
    def from_tensor_slices(tensors, name=None):
        """
        Creates a Dataset whose elements are slices of the given tensors.

        Parameters:
        - tensors: A dataset element, whose components have the same first dimension
        - name: Optional name for the tf.data operation

        Returns:
        A Dataset
        """

    @staticmethod
    def from_tensors(tensors, name=None):
        """
        Creates a Dataset with a single element, comprising the given tensors.

        Parameters:
        - tensors: A dataset element
        - name: Optional name for the tf.data operation

        Returns:
        A Dataset
        """

    @staticmethod
    def from_generator(generator, output_signature, args=None):
        """
        Creates a Dataset whose elements are generated by generator.

        Parameters:
        - generator: A callable object that returns an object that supports the iter() protocol
        - output_signature: A nested structure of tf.TypeSpec objects corresponding to each component of an element yielded by generator
        - args: A tf.Tensor object or a tuple of tf.Tensor objects to pass as arguments to generator

        Returns:
        A Dataset
        """

    @staticmethod
    def range(*args, **kwargs):
        """
        Creates a Dataset of a step-separated range of values.

        Parameters:
        - *args: follows the same semantics as Python's built-in range
        - **kwargs: optional keyword arguments

        Returns:
        A RangeDataset
        """

    @staticmethod
    def zip(datasets):
        """
        Creates a Dataset by zipping together the given datasets.

        Parameters:
        - datasets: A nested structure of datasets

        Returns:
        A Dataset
        """
```
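
`Dataset.zip` is the one constructor above that the usage examples at the end of this document do not cover; a minimal sketch, pairing a features dataset with a labels dataset:

```python
import tensorflow as tf

# Two datasets with the same number of elements
features = tf.data.Dataset.range(4)       # 0, 1, 2, 3
labels = tf.data.Dataset.range(100, 104)  # 100, 101, 102, 103

# zip pairs them element-wise, like Python's built-in zip
pairs = tf.data.Dataset.zip((features, labels))
for x, y in pairs:
    print(x.numpy(), y.numpy())  # (0, 100), (1, 101), ...
```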

### Dataset Transformation

Transform and manipulate dataset elements.

```python { .api }
def map(self, map_func, num_parallel_calls=None, deterministic=None, name=None):
    """
    Maps map_func across the elements of this dataset.

    Parameters:
    - map_func: A function mapping a dataset element to another dataset element
    - num_parallel_calls: A tf.int32 scalar tf.Tensor, representing the number of elements to process asynchronously in parallel
    - deterministic: A boolean controlling whether the map is allowed to return elements out of order
    - name: Optional name for the tf.data operation

    Returns:
    A Dataset
    """

def filter(self, predicate, name=None):
    """
    Filters this dataset according to predicate.

    Parameters:
    - predicate: A function mapping a dataset element to a boolean
    - name: Optional name for the tf.data operation

    Returns:
    A Dataset
    """

def flat_map(self, map_func, name=None):
    """
    Maps map_func across this dataset and flattens the result.

    Parameters:
    - map_func: A function mapping a dataset element to a dataset
    - name: Optional name for the tf.data operation

    Returns:
    A Dataset
    """

def interleave(self, map_func, cycle_length=None, block_length=None,
               num_parallel_calls=None, deterministic=None, name=None):
    """
    Maps map_func across this dataset, and interleaves the results.

    Parameters:
    - map_func: A function mapping a dataset element to a dataset
    - cycle_length: The number of input elements that will be processed concurrently
    - block_length: The number of consecutive elements to produce from each input element before cycling to another input element
    - num_parallel_calls: The number of parallel calls for map_func
    - deterministic: A boolean controlling whether the interleave is allowed to return elements out of order
    - name: Optional name for the tf.data operation

    Returns:
    A Dataset
    """
```
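
`interleave` is the least obvious of these transformations; a minimal sketch with small range datasets (in a real pipeline `map_func` would typically open a file, e.g. return a `tf.data.TFRecordDataset`):

```python
import tensorflow as tf

# Each input element x becomes its own sub-dataset [x, x, x];
# cycle_length=2 draws from two sub-datasets at a time, and
# block_length=2 takes two consecutive elements before switching.
ds = tf.data.Dataset.range(1, 4).interleave(
    lambda x: tf.data.Dataset.from_tensors(x).repeat(3),
    cycle_length=2,
    block_length=2,
)
print(list(ds.as_numpy_iterator()))  # [1, 1, 2, 2, 1, 2, 3, 3, 3]
```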

### Dataset Batching and Sampling

Operations for batching and sampling data.

```python { .api }
def batch(self, batch_size, drop_remainder=False, num_parallel_calls=None,
          deterministic=None, name=None):
    """
    Combines consecutive elements of this dataset into batches.

    Parameters:
    - batch_size: A tf.int64 scalar tf.Tensor, representing the number of consecutive elements of this dataset to combine in a single batch
    - drop_remainder: A tf.bool scalar tf.Tensor, representing whether the last batch should be dropped in the case it has fewer than batch_size elements
    - num_parallel_calls: A tf.int32 scalar tf.Tensor, representing the number of elements to process in parallel
    - deterministic: A boolean controlling whether the batch is allowed to return elements out of order
    - name: Optional name for the tf.data operation

    Returns:
    A Dataset
    """

def padded_batch(self, batch_size, padded_shapes=None, padding_values=None,
                 drop_remainder=False, name=None):
    """
    Combines consecutive elements of this dataset into padded batches.

    Parameters:
    - batch_size: A tf.int64 scalar tf.Tensor, representing the number of consecutive elements of this dataset to combine in a single batch
    - padded_shapes: A nested structure of tf.TensorShape or tf.int64 vector tensor-like objects representing the shape to which the respective component of each input element should be padded prior to batching
    - padding_values: A nested structure of scalar-shaped tf.Tensor, representing the padding values to use for the respective components
    - drop_remainder: A tf.bool scalar tf.Tensor, representing whether the last batch should be dropped in the case it has fewer than batch_size elements
    - name: Optional name for the tf.data operation

    Returns:
    A Dataset
    """

def unbatch(self, name=None):
    """
    Splits elements of a dataset into multiple elements on the batch dimension.

    Parameters:
    - name: Optional name for the tf.data operation

    Returns:
    A Dataset
    """

def shuffle(self, buffer_size, seed=None, reshuffle_each_iteration=None, name=None):
    """
    Randomly shuffles the elements of this dataset.

    Parameters:
    - buffer_size: A tf.int64 scalar tf.Tensor, representing the number of elements from this dataset from which the new dataset will sample
    - seed: Optional tf.int64 scalar tf.Tensor, representing the random seed that will be used to create the distribution
    - reshuffle_each_iteration: If true, the dataset will be reshuffled each time it is iterated over
    - name: Optional name for the tf.data operation

    Returns:
    A Dataset
    """

def repeat(self, count=None, name=None):
    """
    Repeats this dataset so each original value is seen count times.

    Parameters:
    - count: A tf.int64 scalar tf.Tensor, representing the number of times the dataset should be repeated
    - name: Optional name for the tf.data operation

    Returns:
    A Dataset
    """

def take(self, count, name=None):
    """
    Creates a Dataset with at most count elements from this dataset.

    Parameters:
    - count: A tf.int64 scalar tf.Tensor, representing the number of elements of this dataset that should be taken to form the new dataset
    - name: Optional name for the tf.data operation

    Returns:
    A Dataset
    """

def skip(self, count, name=None):
    """
    Creates a Dataset that skips count elements from this dataset.

    Parameters:
    - count: A tf.int64 scalar tf.Tensor, representing the number of elements of this dataset that should be skipped to form the new dataset
    - name: Optional name for the tf.data operation

    Returns:
    A Dataset
    """
```
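
`padded_batch` and `unbatch` do not appear in the usage examples below; a minimal sketch, assuming variable-length integer sequences fed through `from_generator`:

```python
import tensorflow as tf

# Variable-length integer sequences
ragged = tf.data.Dataset.from_generator(
    lambda: iter([[1, 2], [3, 4, 5], [6]]),
    output_signature=tf.TensorSpec(shape=(None,), dtype=tf.int32),
)

# Pad each element to the longest element in its batch
padded = ragged.padded_batch(2, padded_shapes=[None], padding_values=0)
for batch in padded:
    print(batch.numpy())
# [[1 2 0]
#  [3 4 5]]
# [[6]]

# unbatch reverses batching: each row becomes its own element again
flat = padded.unbatch()
```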

### Performance Optimization

Operations for optimizing dataset performance.

```python { .api }
def cache(self, filename="", name=None):
    """
    Caches the elements in this dataset.

    Parameters:
    - filename: A tf.string scalar tf.Tensor, representing the name of a directory on the filesystem to use for caching elements in this Dataset
    - name: Optional name for the tf.data operation

    Returns:
    A Dataset
    """

def prefetch(self, buffer_size, name=None):
    """
    Creates a Dataset that prefetches elements from this dataset.

    Parameters:
    - buffer_size: A tf.int64 scalar tf.Tensor, representing the maximum number of elements that will be buffered when prefetching
    - name: Optional name for the tf.data operation

    Returns:
    A Dataset
    """

def parallel_interleave(map_func, cycle_length, block_length=1,
                        sloppy=False, buffer_output_elements=None,
                        prefetch_input_elements=None):
    """
    A parallel version of the Dataset.interleave() transformation.

    Parameters:
    - map_func: A function mapping a nested structure of tensors to a Dataset
    - cycle_length: The number of input elements that will be processed concurrently
    - block_length: The number of consecutive elements to produce from each input element before cycling to another input element
    - sloppy: If false, the relative order of records produced by this transformation is deterministic
    - buffer_output_elements: The number of elements each iterator being interleaved should buffer
    - prefetch_input_elements: The number of input elements to transform to iterators in parallel and keep buffered

    Returns:
    A Dataset transformation function
    """
```
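
The placement of these calls in a pipeline matters: the usual pattern is to `cache` after expensive per-element work, `shuffle` after `cache` so element order still varies between epochs, and `prefetch` last. A minimal sketch (`expensive_preprocess` is a hypothetical stand-in):

```python
import tensorflow as tf

def expensive_preprocess(x):
    # Hypothetical stand-in for costly per-element preprocessing
    return tf.cast(x, tf.float32) / 255.0

ds = (tf.data.Dataset.range(1000)
      .map(expensive_preprocess, num_parallel_calls=tf.data.AUTOTUNE)
      .cache()                      # reuse preprocessed elements across epochs (in memory here)
      .shuffle(buffer_size=256)     # after cache, so each epoch sees a fresh order
      .batch(32)
      .prefetch(tf.data.AUTOTUNE))  # overlap input pipeline with model execution; keep last
```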

### Dataset Properties and Utilities

Utility methods for inspecting and manipulating datasets.

```python { .api }
@property
def element_spec(self):
    """
    The type specification of an element of this dataset.

    Returns:
    A nested structure of tf.TypeSpec objects matching the structure of an element of this dataset
    """

def cardinality(self):
    """
    Returns the cardinality of the dataset, if known.

    Returns:
    A scalar tf.int64 Tensor representing the cardinality of the dataset
    """

def enumerate(self, start=0, name=None):
    """
    Enumerates the elements of this dataset.

    Parameters:
    - start: A tf.int64 scalar tf.Tensor, representing the start value for enumeration
    - name: Optional name for the tf.data operation

    Returns:
    A Dataset
    """

def concatenate(self, dataset):
    """
    Creates a Dataset by concatenating the given dataset with this dataset.

    Parameters:
    - dataset: Dataset to be concatenated

    Returns:
    A Dataset
    """

def reduce(self, initial_state, reduce_func, name=None):
    """
    Reduces the input dataset to a single element.

    Parameters:
    - initial_state: An element representing the initial state of the reduction
    - reduce_func: A function that maps (old_state, input_element) to new_state
    - name: Optional name for the tf.data operation

    Returns:
    A dataset element
    """

def apply(self, transformation_func):
    """
    Applies a transformation function to this dataset.

    Parameters:
    - transformation_func: A function that takes one Dataset argument and returns a Dataset

    Returns:
    The Dataset returned by applying transformation_func to this dataset
    """
```
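
A minimal sketch exercising these utilities; note that `reduce`'s `initial_state` must match the element dtype (int64 for `range`):

```python
import tensorflow as tf
import numpy as np

ds = tf.data.Dataset.range(5)

print(ds.element_spec)           # TensorSpec(shape=(), dtype=tf.int64, name=None)
print(ds.cardinality().numpy())  # 5

# enumerate attaches an index; concatenate appends another dataset
indexed = ds.enumerate(start=10)                      # (10, 0), (11, 1), ...
combined = ds.concatenate(tf.data.Dataset.range(5, 8))

# reduce folds the dataset into a single value
total = ds.reduce(np.int64(0), lambda state, x: state + x)
print(total.numpy())             # 10

# apply threads the dataset through a user-defined transformation
doubled = ds.apply(lambda d: d.map(lambda x: x * 2))
```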

## Usage Examples

```python
import tensorflow as tf
import numpy as np

# Create datasets from different sources
# From tensor slices
data = np.array([1, 2, 3, 4, 5])
dataset = tf.data.Dataset.from_tensor_slices(data)

# From tensors (single element)
single_element = tf.data.Dataset.from_tensors([1, 2, 3, 4, 5])

# From generator
def gen():
    for i in range(100):
        yield i

dataset_gen = tf.data.Dataset.from_generator(
    gen,
    output_signature=tf.TensorSpec(shape=(), dtype=tf.int32)
)

# Range dataset
range_dataset = tf.data.Dataset.range(10)

# Dataset transformations
# Map transformation
squared_dataset = dataset.map(lambda x: x ** 2)

# Filter transformation
even_dataset = range_dataset.filter(lambda x: x % 2 == 0)

# Batch transformation
batched_dataset = range_dataset.batch(3)

# Shuffle and repeat
shuffled_dataset = range_dataset.shuffle(buffer_size=10).repeat(2)

# Complex pipeline example
(train_images, train_labels) = np.random.random((1000, 28, 28, 1)), np.random.randint(0, 10, 1000)

train_dataset = tf.data.Dataset.from_tensor_slices((train_images, train_labels))
train_dataset = (train_dataset
                 .map(lambda x, y: (tf.cast(x, tf.float32) / 255.0, y))  # Normalize
                 .shuffle(buffer_size=100)
                 .batch(32)
                 .prefetch(tf.data.AUTOTUNE))

# Performance optimizations
# Cache dataset
cached_dataset = train_dataset.cache()

# Prefetch for performance
prefetched_dataset = train_dataset.prefetch(tf.data.AUTOTUNE)

# Parallel map
parallel_mapped = range_dataset.map(
    lambda x: x * 2,
    num_parallel_calls=tf.data.AUTOTUNE
)

# Text processing example
text_data = ["hello world", "tensorflow data", "machine learning"]
text_dataset = tf.data.Dataset.from_tensor_slices(text_data)

# Split text into words
word_dataset = text_dataset.flat_map(
    lambda x: tf.data.Dataset.from_tensor_slices(tf.strings.split(x))
)

# Iterate through dataset
for element in range_dataset.take(5):
    print(element.numpy())

# Convert dataset to list (for small datasets)
dataset_list = list(range_dataset.take(5).as_numpy_iterator())
```