# Data Utilities

HDMF provides data utilities for handling large datasets: chunk iterators, I/O configuration wrappers, and helpers for manipulating and validating array data. These utilities enable efficient memory management, streaming data operations, and customizable data handling patterns for scientific datasets.

## Capabilities

### Data Chunk Iterators

Iterator classes for processing large datasets in chunks without loading entire datasets into memory.

```python { .api }
class AbstractDataChunkIterator:
    """
    Abstract base class for iterating over data in chunks.

    Enables processing of large datasets by providing them in manageable
    chunks, reducing memory usage and enabling streaming operations.
    """

    def __init__(self, **kwargs):
        """Initialize abstract data chunk iterator."""

    def __iter__(self):
        """Return iterator object."""

    def __next__(self):
        """Get next data chunk."""

    @property
    def recommended_chunk_shape(self) -> tuple:
        """Recommended chunk shape for efficient processing."""

    @property
    def recommended_data_shape(self) -> tuple:
        """Recommended overall data shape."""

class GenericDataChunkIterator(AbstractDataChunkIterator):
    """
    Generic implementation of data chunk iterator.

    Provides chunk iteration over array-like data with configurable
    chunk sizes and processing patterns.
    """

    def __init__(self, data, **kwargs):
        """
        Initialize generic chunk iterator.

        Args:
            data: Array-like data to iterate over
            **kwargs: Iterator options:
                - chunk_shape: Shape of chunks to yield
                - buffer_size: Size of internal buffer
                - iter_axis: Axis to iterate along
        """

    def __next__(self) -> 'DataChunk':
        """
        Get next data chunk.

        Returns:
            DataChunk object containing chunk data and metadata
        """

    @property
    def maxshape(self) -> tuple:
        """Maximum shape of the data."""

class DataChunkIterator(GenericDataChunkIterator):
    """
    Specific implementation for HDMF data chunk iteration.

    Optimized for HDMF data patterns with support for compression,
    data validation, and backend-specific optimizations.
    """

    def __init__(self, data, **kwargs):
        """
        Initialize HDMF data chunk iterator.

        Args:
            data: Data to iterate over
            **kwargs: HDMF-specific options:
                - dtype: Data type for chunks
                - compression: Compression settings
                - shuffle: Enable shuffle filter
        """

class DataChunk:
    """
    Represents a chunk of data with associated metadata.

    Properties:
    - data: The actual chunk data
    - selection: Selection information for the chunk
    - chunk_i: Chunk index
    """

    def __init__(self, data, selection: tuple = None, chunk_i: int = None):
        """
        Initialize data chunk.

        Args:
            data: Chunk data
            selection: Selection tuple for the chunk
            chunk_i: Index of this chunk
        """

    @property
    def data(self):
        """Access to chunk data."""

    @property
    def selection(self) -> tuple:
        """Selection information for this chunk."""
```
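
A `DataChunk` pairs a block of values with the selection that places it within the full dataset, which is how downstream writers know where each chunk lands. The following is a minimal sketch of constructing one by hand, using only the signature shown above (the slice-based `selection` and the `chunk_i` argument follow this document's spec and are illustrative, not a required convention):

```python
from hdmf.data_utils import DataChunk
import numpy as np

# A block holding rows 0-99 of a larger (N, 10) dataset
block = np.random.randn(100, 10)
chunk = DataChunk(
    data=block,
    selection=(slice(0, 100), slice(None)),  # placement within the full dataset
    chunk_i=0,                               # chunk index, per the spec above
)

print(chunk.data.shape)  # (100, 10)
print(chunk.selection)   # (slice(0, 100, None), slice(None, None, None))
```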

### Data I/O Configuration

Configuration classes for customizing data I/O behavior across different backends.

```python { .api }
class DataIO:
    """
    Generic data I/O configuration wrapper.

    Provides backend-agnostic configuration for data storage options
    including compression, chunking, and filtering settings.
    """

    def __init__(self, data, **kwargs):
        """
        Initialize DataIO wrapper.

        Args:
            data: Data to be written
            **kwargs: I/O configuration options:
                - compression: Compression algorithm
                - compression_opts: Compression parameters
                - chunks: Chunking configuration
                - fillvalue: Fill value for uninitialized data
        """

    @property
    def data(self):
        """Access to wrapped data."""

    @property
    def io_settings(self) -> dict:
        """Dictionary of I/O settings."""

class InvalidDataIOError(Exception):
    """
    Exception for invalid DataIO configurations.

    Raised when DataIO settings are incompatible or invalid
    for the specified backend or data type.
    """
    pass
```
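
Because misconfigured wrappers surface as `InvalidDataIOError`, callers can guard construction and fall back to unwrapped data when a setting is rejected. A short sketch under the spec above (the gzip settings mirror the usage example later in this document):

```python
from hdmf.data_utils import DataIO, InvalidDataIOError
import numpy as np

raw = np.arange(10_000, dtype=np.int32)

try:
    wrapped = DataIO(data=raw, compression='gzip', compression_opts=9)
    _ = wrapped.data  # touch the wrapper so configuration problems surface early
except InvalidDataIOError as err:
    # Fall back to writing the unwrapped data
    print(f"DataIO configuration rejected: {err}")
    wrapped = raw
```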

### Data Manipulation Utilities

Utility functions for data manipulation and validation operations.

```python { .api }
def append_data(data, new_data):
    """
    Append data to existing array-like structure.

    Args:
        data: Existing data array
        new_data: Data to append

    Returns:
        Combined data array
    """

def extend_data(data, extension_data):
    """
    Extend data with additional elements.

    Args:
        data: Existing data array
        extension_data: Data to extend with

    Returns:
        Extended data array
    """

def assertEqualShape(data1, data2, ignore_axes: list = None):
    """
    Check that two data arrays have equal shapes.

    Args:
        data1: First data array
        data2: Second data array
        ignore_axes: List of axes to ignore in comparison

    Returns:
        ShapeValidatorResult describing whether the shapes match
    """
```

### Shape Validation

Classes and utilities for validating data shapes and dimensions.

```python { .api }
class ShapeValidatorResult:
    """
    Result object for shape validation operations.

    Contains validation status, error messages, and corrective suggestions
    for data shape validation operations.
    """

    def __init__(self, valid: bool, message: str = None, **kwargs):
        """
        Initialize shape validation result.

        Args:
            valid: Whether validation passed
            message: Validation message or error description
            **kwargs: Additional result metadata
        """

    @property
    def valid(self) -> bool:
        """Whether validation passed."""

    @property
    def message(self) -> str:
        """Validation message or error description."""

    @property
    def errors(self) -> list:
        """List of validation errors."""
```
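
Since `assertEqualShape` returns a `ShapeValidatorResult` rather than raising, callers branch on `valid` to handle mismatches. A brief sketch using the properties defined above:

```python
from hdmf.data_utils import assertEqualShape
import numpy as np

result = assertEqualShape(np.zeros((4, 3)), np.zeros((4, 5)))
if not result.valid:
    print(result.message)        # human-readable summary of the mismatch
    for error in result.errors:  # axis-level details, per the spec above
        print(" -", error)
```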

## Usage Examples

### Working with Data Chunk Iterators

```python
from hdmf.data_utils import DataChunkIterator
import numpy as np

# Create large dataset
large_data = np.random.randn(10000, 1000)

# Process in chunks to save memory
chunk_iter = DataChunkIterator(
    data=large_data,
    chunk_shape=(1000, 1000),
    dtype=np.float64
)

# Process chunks incrementally
for chunk in chunk_iter:
    # Process each chunk
    processed_chunk = chunk.data * 2.0
    print(f"Processed chunk {chunk.chunk_i} with shape {chunk.data.shape}")
```

### Configuring Data I/O

```python
from hdmf.data_utils import DataIO
from hdmf.backends.hdf5 import HDF5IO
from hdmf.container import Container, Data
import numpy as np

# Create data with custom I/O settings
data = np.random.randn(5000, 200)

# Configure compression and chunking
data_io = DataIO(
    data=data,
    compression='gzip',
    compression_opts=9,
    chunks=(500, 200),
    fillvalue=-1
)

# Use with HDF5 backend
with HDF5IO('configured_data.h5', mode='w') as io:
    container = Container(name='experiment')
    data_container = Data(name='measurements', data=data_io)
    container.add_child(data_container)
    io.write(container)
```

### Data Manipulation Utilities

```python
from hdmf.data_utils import append_data, extend_data, assertEqualShape
import numpy as np

# Initial data
initial_data = np.array([[1, 2, 3], [4, 5, 6]])

# Append new rows
new_rows = np.array([[7, 8, 9], [10, 11, 12]])
combined_data = append_data(initial_data, new_rows)

# Extend with additional elements
extension = [13, 14, 15, 16]
extended_data = extend_data(combined_data.flatten(), extension)

# Validate shapes match
data1 = np.random.randn(100, 50)
data2 = np.random.randn(100, 50)
assert assertEqualShape(data1, data2).valid  # Shapes match

# Ignore specific axes in shape comparison
data3 = np.random.randn(100, 60)  # Different second dimension
assert assertEqualShape(data1, data3, ignore_axes=[1]).valid  # Passes, axis 1 ignored
```

### Custom Chunk Processing

```python
from hdmf.data_utils import GenericDataChunkIterator, DataChunk
import numpy as np

class CustomProcessor:
    def __init__(self, data, chunk_size=1000):
        # Chunk along the first axis; keep the remaining axes whole
        self.chunk_iter = GenericDataChunkIterator(
            data=data,
            chunk_shape=(chunk_size,) + data.shape[1:]
        )
        self.results = []

    def process_all_chunks(self):
        """Process all chunks and collect results."""
        for chunk in self.chunk_iter:
            # Apply custom processing
            processed = self.custom_transform(chunk.data)
            self.results.append({
                'chunk_index': chunk.chunk_i,
                'original_shape': chunk.data.shape,
                'processed_data': processed
            })

        return self.results

    def custom_transform(self, data):
        """Custom transformation function."""
        return np.mean(data, axis=-1)

# Usage
large_dataset = np.random.randn(50000, 100)
processor = CustomProcessor(large_dataset, chunk_size=5000)
results = processor.process_all_chunks()

print(f"Processed {len(results)} chunks")
for result in results[:3]:  # Show first 3 results
    print(f"Chunk {result['chunk_index']}: {result['original_shape']} -> {result['processed_data'].shape}")
```