or run

npx @tessl/cli init
Log in

Version

Tile

Overview

Evals

Files

Files

docs

activations.md applications.md data-utils.md index.md initializers.md layers.md models.md operations.md random.md regularizers.md saving.md training.md

docs/data-utils.md

0

# Data Processing and Utilities

1

2

Keras provides built-in datasets, data preprocessing utilities, image processing functions, and various helper utilities for machine learning workflows. These tools simplify data preparation and provide ready-to-use datasets for experimentation.

3

4

## Capabilities

5

6

### Built-in Datasets

7

8

Standard datasets commonly used for machine learning research and experimentation, each available through a `load_data()` function in its own `keras.datasets` submodule.

9

10

```python { .api }

11

# MNIST handwritten digits dataset (available as keras.datasets.mnist.load_data())

12

def load_data():

13

"""

14

Load MNIST dataset.

15

16

Returns:

17

Tuple of ((x_train, y_train), (x_test, y_test))

18

- x_train, x_test: uint8 arrays of grayscale image data with shape (num_samples, 28, 28)

19

- y_train, y_test: uint8 arrays of digit labels (0-9) with shape (num_samples,)

20

"""

21

22

# Fashion-MNIST dataset (available as keras.datasets.fashion_mnist.load_data())

23

# CIFAR-10 dataset (available as keras.datasets.cifar10.load_data())

24

# CIFAR-100 dataset (available as keras.datasets.cifar100.load_data())

25

# IMDB movie reviews dataset (available as keras.datasets.imdb.load_data())

26

# Reuters newswire dataset (available as keras.datasets.reuters.load_data())

27

# Boston housing dataset (available as keras.datasets.boston_housing.load_data())

28

# California housing dataset (available as keras.datasets.california_housing.load_data())

29

```

30

31

### Image Processing Utilities

32

33

Functions for loading, saving, and manipulating images for machine learning workflows.

34

35

```python { .api }

36

def load_img(path, color_mode='rgb', target_size=None, interpolation='nearest',

37

keep_aspect_ratio=False):

38

"""

39

Load an image into PIL format.

40

41

Parameters:

42

- path: Path to image file

43

- color_mode: 'grayscale', 'rgb', 'rgba'

44

- target_size: Tuple (height, width) to resize image

45

- interpolation: Interpolation method for resizing

46

- keep_aspect_ratio: Whether to keep aspect ratio when resizing

47

48

Returns:

49

PIL Image instance

50

"""

51

52

def save_img(path, x, data_format=None, file_format=None, scale=True, **kwargs):

53

"""

54

Save an image to disk.

55

56

Parameters:

57

- path: Path to save image

58

- x: Image array data

59

- data_format: Image data format

60

- file_format: Image file format ('png', 'jpeg', etc.)

61

- scale: Whether to scale pixel values to [0, 255]

62

"""

63

64

def img_to_array(img, data_format=None, dtype=None):

65

"""

66

Convert PIL Image to numpy array.

67

68

Parameters:

69

- img: PIL Image instance

70

- data_format: Image data format ('channels_first' or 'channels_last')

71

- dtype: Data type for output array

72

73

Returns:

74

Numpy array representation of image

75

"""

76

77

def array_to_img(x, data_format=None, scale=True, dtype=None):

78

"""

79

Convert numpy array to PIL Image.

80

81

Parameters:

82

- x: Input array

83

- data_format: Image data format

84

- scale: Whether to scale values to [0, 255]

85

- dtype: Data type

86

87

Returns:

88

PIL Image instance

89

"""

90

```

91

92

### Data Transformation Utilities

93

94

Functions for common data preprocessing tasks including categorical encoding, normalization, and sequence processing.

95

96

```python { .api }

97

def to_categorical(y, num_classes=None, dtype='float32'):

98

"""

99

Convert class vector to categorical (one-hot) matrix.

100

101

Parameters:

102

- y: Array of class labels to convert

103

- num_classes: Total number of classes (optional)

104

- dtype: Data type for output matrix

105

106

Returns:

107

Binary matrix representation of input as numpy array

108

"""

109

110

def normalize(x, axis=-1, order=2):

111

"""

112

Normalize array along specified axis.

113

114

Parameters:

115

- x: Array to normalize

116

- axis: Axis along which to normalize

117

- order: Normalization order (1 for L1, 2 for L2)

118

119

Returns:

120

Normalized array

121

"""

122

123

def pad_sequences(sequences, maxlen=None, dtype='int32', padding='pre',

124

truncating='pre', value=0.0):

125

"""

126

Pad sequences to same length.

127

128

Parameters:

129

- sequences: List of sequences to pad

130

- maxlen: Maximum length of sequences

131

- dtype: Data type for output

132

- padding: 'pre' or 'post' padding

133

- truncating: 'pre' or 'post' truncation

134

- value: Padding value

135

136

Returns:

137

2D numpy array with shape (len(sequences), maxlen)

138

"""

139

```

140

141

### Dataset Creation Utilities

142

143

Functions for creating tf.data.Dataset objects from directories and arrays for efficient data loading.

144

145

```python { .api }

146

def image_dataset_from_directory(directory, labels='inferred', label_mode='int',

147

class_names=None, color_mode='rgb', batch_size=32,

148

image_size=(256, 256), shuffle=True, seed=None,

149

validation_split=None, subset=None, interpolation='bilinear',

150

follow_links=False, crop_to_aspect_ratio=False):

151

"""

152

Generate dataset from image directory.

153

154

Parameters:

155

- directory: Path to directory containing subdirectories of images

156

- labels: 'inferred' (from directory structure) or list of labels

157

- label_mode: 'int', 'categorical', 'binary', or None

158

- class_names: List of class names (overrides inferred names)

159

- color_mode: 'grayscale', 'rgb', or 'rgba'

160

- batch_size: Batch size

161

- image_size: Size to resize images to

162

- shuffle: Whether to shuffle data

163

- seed: Random seed for shuffling

164

- validation_split: Fraction of data for validation

165

- subset: 'training' or 'validation' (when validation_split is set)

166

- interpolation: Interpolation method for resizing

167

- follow_links: Whether to follow symlinks

168

- crop_to_aspect_ratio: Whether to crop to maintain aspect ratio

169

170

Returns:

171

tf.data.Dataset object

172

"""

173

174

def text_dataset_from_directory(directory, labels='inferred', label_mode='int',

175

class_names=None, batch_size=32, max_length=None,

176

shuffle=True, seed=None, validation_split=None,

177

subset=None, follow_links=False):

178

"""

179

Generate dataset from text directory.

180

181

Parameters:

182

- directory: Path to directory containing text files

183

- labels: 'inferred' or list of labels

184

- label_mode: 'int', 'categorical', 'binary', or None

185

- class_names: List of class names

186

- batch_size: Batch size

187

- max_length: Maximum sequence length

188

- shuffle: Whether to shuffle data

189

- seed: Random seed

190

- validation_split: Fraction for validation

191

- subset: 'training' or 'validation'

192

- follow_links: Whether to follow symlinks

193

194

Returns:

195

tf.data.Dataset object

196

"""

197

198

def timeseries_dataset_from_array(data, targets, sequence_length, sequence_stride=1,

199

sampling_rate=1, batch_size=128, shuffle=False,

200

seed=None, start_index=None, end_index=None):

201

"""

202

Create dataset from time series data.

203

204

Parameters:

205

- data: Array of data points

206

- targets: Array of targets corresponding to data

207

- sequence_length: Length of output sequences

208

- sequence_stride: Stride between successive output sequences

209

- sampling_rate: Rate to sample data points within sequences

210

- batch_size: Batch size

211

- shuffle: Whether to shuffle data

212

- seed: Random seed

213

- start_index: Start index for data

214

- end_index: End index for data

215

216

Returns:

217

tf.data.Dataset object yielding (inputs, targets) tuples

218

"""

219

```

220

221

### Data Utilities

222

223

Utility classes and functions for advanced data handling, including custom datasets, feature engineering, data packing, and dataset splitting.

224

225

```python { .api }

226

class Sequence:

227

"""Base class for fitting to sequence of data."""

228

229

def __init__(self):

230

"""Initialize sequence."""

231

232

def __getitem__(self, index):

233

"""

234

Get batch at index.

235

236

Parameters:

237

- index: Batch index

238

239

Returns:

240

Batch data

241

"""

242

243

def __len__(self):

244

"""

245

Number of batches in sequence.

246

247

Returns:

248

Number of batches

249

"""

250

251

def on_epoch_end(self):

252

"""Method called at end of every epoch."""

253

254

class FeatureSpace:

255

"""Utility for feature preprocessing and engineering."""

256

257

def __init__(self, features, output_mode='concat'):

258

"""

259

Initialize feature space.

260

261

Parameters:

262

- features: Dict mapping feature names to preprocessing layers

263

- output_mode: 'concat' or 'dict'

264

"""

265

266

def adapt(self, dataset):

267

"""

268

Fit feature preprocessing on dataset.

269

270

Parameters:

271

- dataset: Dataset to adapt to

272

"""

273

274

def __call__(self, data):

275

"""

276

Apply feature preprocessing.

277

278

Parameters:

279

- data: Input data

280

281

Returns:

282

Preprocessed features

283

"""

284

285

def pack_x_y_sample_weight(x, y=None, sample_weight=None):

286

"""

287

Pack user-provided data into tuple.

288

289

Parameters:

290

- x: Input data

291

- y: Target data

292

- sample_weight: Sample weights

293

294

Returns:

295

Packed data tuple

296

"""

297

298

def unpack_x_y_sample_weight(data):

299

"""

300

Unpack user-provided data tuple.

301

302

Parameters:

303

- data: Packed data tuple

304

305

Returns:

306

Tuple of (x, y, sample_weight)

307

"""

308

309

def split_dataset(dataset, left_size=None, right_size=None, shuffle=False, seed=None):

310

"""

311

Split dataset into two datasets.

312

313

Parameters:

314

- dataset: Dataset to split

315

- left_size: Size of left split

316

- right_size: Size of right split

317

- shuffle: Whether to shuffle before splitting

318

- seed: Random seed

319

320

Returns:

321

Tuple of (left_dataset, right_dataset)

322

"""

323

```

324

325

### File and Download Utilities

326

327

Functions for downloading files and managing data assets.

328

329

```python { .api }

330

def get_file(fname=None, origin=None, untar=False, md5_hash=None, file_hash=None,

331

cache_subdir='datasets', hash_algorithm='auto', extract=False,

332

archive_format='auto', cache_dir=None):

333

"""

334

Download file from URL if not already cached.

335

336

Parameters:

337

- fname: Name of file (if origin has different name)

338

- origin: Original URL of file

339

- untar: Whether to untar file after download

340

- md5_hash: MD5 hash for verification (deprecated)

341

- file_hash: Hash for verification

342

- cache_subdir: Subdirectory under cache directory

343

- hash_algorithm: Hash algorithm ('md5', 'sha256', 'auto')

344

- extract: Whether to extract archive after download

345

- archive_format: Archive format ('auto', 'tar', 'zip')

346

- cache_dir: Location to store cached files

347

348

Returns:

349

Path to downloaded file

350

"""

351

```

352

353

### Configuration and Random Utilities

354

355

Utilities for setting random seeds, managing global configuration, and reporting progress during training loops.

356

357

```python { .api }

358

def set_random_seed(seed=None):

359

"""

360

Set random seed for reproducible results.

361

362

Parameters:

363

- seed: Random seed value

364

"""

365

366

class Config:

367

"""Global configuration utility."""

368

369

def enable(self, feature):

370

"""Enable configuration feature."""

371

372

def disable(self, feature):

373

"""Disable configuration feature."""

374

375

def is_enabled(self, feature):

376

"""Check if feature is enabled."""

377

378

class Progbar:

379

"""Progress bar utility for training loops."""

380

381

def __init__(self, target, width=30, verbose=1, interval=0.05,

382

stateful_metrics=None, unit_name='step'):

383

"""

384

Initialize progress bar.

385

386

Parameters:

387

- target: Total number of steps expected

388

- width: Progress bar width

389

- verbose: Verbosity mode

390

- interval: Minimum update interval

391

- stateful_metrics: Metrics that shouldn't be averaged

392

- unit_name: Display name for step units

393

"""

394

395

def update(self, current, values=None, finalize=None):

396

"""

397

Update progress bar.

398

399

Parameters:

400

- current: Current step index

401

- values: List of tuples (name, value) for metrics

402

- finalize: Whether to finalize progress bar

403

"""

404

```

405

406

## Usage Examples

407

408

### Loading and Preprocessing Images

409

410

```python

411

import keras

412

from keras.utils import load_img, img_to_array, to_categorical

413

import numpy as np

414

415

# Load and preprocess single image

416

img_path = 'cat.jpg'

417

img = load_img(img_path, target_size=(224, 224))

418

img_array = img_to_array(img)

419

img_array = np.expand_dims(img_array, axis=0)

420

img_array /= 255.0 # Normalize to [0, 1]

421

422

# Convert labels to categorical

423

labels = [0, 1, 2, 1, 0] # Class indices

424

categorical_labels = to_categorical(labels, num_classes=3)

425

print(categorical_labels)

426

# [[1. 0. 0.]

427

# [0. 1. 0.]

428

# [0. 0. 1.]

429

# [0. 1. 0.]

430

# [1. 0. 0.]]

431

```

432

433

### Creating Datasets from Directories

434

435

```python

436

import keras

437

438

# Create image dataset from directory structure

439

train_dataset = keras.utils.image_dataset_from_directory(

440

'path/to/train_data/',

441

labels='inferred',

442

label_mode='categorical',

443

color_mode='rgb',

444

batch_size=32,

445

image_size=(224, 224),

446

shuffle=True,

447

validation_split=0.2,

448

subset='training',

449

seed=123

450

)

451

452

val_dataset = keras.utils.image_dataset_from_directory(

453

'path/to/train_data/',

454

labels='inferred',

455

label_mode='categorical',

456

color_mode='rgb',

457

batch_size=32,

458

image_size=(224, 224),

459

shuffle=True,

460

validation_split=0.2,

461

subset='validation',

462

seed=123

463

)

464

465

# Use datasets for training

466

# model.fit(train_dataset, validation_data=val_dataset, epochs=10)

467

```

468

469

### Working with Built-in Datasets

470

471

```python

472

import keras

473

from keras.datasets import mnist, cifar10

474

from keras.utils import to_categorical

475

476

# Load MNIST dataset

477

(x_train, y_train), (x_test, y_test) = mnist.load_data()

478

479

# Preprocess data

480

x_train = x_train.astype('float32') / 255.0

481

x_test = x_test.astype('float32') / 255.0

482

x_train = x_train.reshape(x_train.shape[0], 28, 28, 1)

483

x_test = x_test.reshape(x_test.shape[0], 28, 28, 1)

484

485

# Convert labels to categorical

486

y_train = to_categorical(y_train, 10)

487

y_test = to_categorical(y_test, 10)

488

489

print(f"Training data shape: {x_train.shape}")

490

print(f"Training labels shape: {y_train.shape}")

491

```

492

493

### Custom Data Sequence

494

495

```python

496

import keras

497

import numpy as np

498

499

class CustomDataSequence(keras.utils.Sequence):

500

def __init__(self, x_data, y_data, batch_size):

501

self.x_data = x_data

502

self.y_data = y_data

503

self.batch_size = batch_size

504

self.indices = np.arange(len(self.x_data))

505

506

def __len__(self):

507

return len(self.x_data) // self.batch_size

508

509

def __getitem__(self, index):

510

batch_indices = self.indices[index * self.batch_size:(index + 1) * self.batch_size]

511

batch_x = self.x_data[batch_indices]

512

batch_y = self.y_data[batch_indices]

513

return batch_x, batch_y

514

515

def on_epoch_end(self):

516

np.random.shuffle(self.indices)

517

518

# Use custom sequence

519

# train_sequence = CustomDataSequence(x_train, y_train, batch_size=32)

520

# model.fit(train_sequence, epochs=10)

521

```

522

523

### Feature Engineering with FeatureSpace

524

525

```python

526

import keras

527

from keras import layers

528

from keras.utils import FeatureSpace

529

530

# Define feature preprocessing

531

feature_space = FeatureSpace(

532

features={

533

'age': layers.Normalization(),

534

'category': layers.StringLookup(output_mode='one_hot'),

535

'price': layers.Discretization(num_bins=10),

536

},

537

output_mode='concat'

538

)

539

540

# Adapt to training data

541

# feature_space.adapt(train_dataset)

542

543

# Apply preprocessing

544

# processed_features = feature_space(raw_features)

545

```

546

547

### Creating Time Series Dataset

548

549

```python

550

import keras

551

import numpy as np

552

553

# Generate sample time series data

554

data = np.sin(np.arange(1000) * 0.1)

555

targets = np.sin(np.arange(1000) * 0.1 + 0.1)

556

557

# Create dataset for sequence prediction

558

dataset = keras.utils.timeseries_dataset_from_array(

559

data=data,

560

targets=targets,

561

sequence_length=10,

562

batch_size=32,

563

shuffle=True

564

)

565

566

# Use for training RNN models

567

# model.fit(dataset, epochs=10)

568

```