# Data Utilities

HDMF provides data utilities for handling large datasets, including chunk iterators and I/O configuration wrappers. These utilities enable efficient memory management, streaming data operations, and customizable data-handling patterns for scientific datasets.

## Capabilities

### Data Chunk Iterators

Iterator classes for processing large datasets in chunks without loading the entire dataset into memory.

```python { .api }
class AbstractDataChunkIterator:
    """
    Abstract base class for iterating over data in chunks.

    Enables processing of large datasets by providing them in manageable
    chunks, reducing memory usage and enabling streaming operations.
    """

    def __init__(self, **kwargs):
        """Initialize abstract data chunk iterator."""

    def __iter__(self):
        """Return iterator object."""

    def __next__(self):
        """Get next data chunk."""

    @property
    def recommended_chunk_shape(self) -> tuple:
        """Recommended chunk shape for efficient processing."""

    @property
    def recommended_data_shape(self) -> tuple:
        """Recommended overall data shape."""

class GenericDataChunkIterator(AbstractDataChunkIterator):
    """
    Generic implementation of data chunk iterator.

    Provides chunk iteration over array-like data with configurable
    chunk sizes and processing patterns.
    """

    def __init__(self, data, **kwargs):
        """
        Initialize generic chunk iterator.

        Args:
            data: Array-like data to iterate over
            **kwargs: Iterator options:
                - chunk_shape: Shape of chunks to yield
                - buffer_size: Size of internal buffer
                - iter_axis: Axis to iterate along
        """

    def __next__(self) -> 'DataChunk':
        """
        Get next data chunk.

        Returns:
            DataChunk object containing chunk data and metadata
        """

    @property
    def maxshape(self) -> tuple:
        """Maximum shape of the data."""

class DataChunkIterator(GenericDataChunkIterator):
    """
    Specific implementation for HDMF data chunk iteration.

    Optimized for HDMF data patterns with support for compression,
    data validation, and backend-specific optimizations.
    """

    def __init__(self, data, **kwargs):
        """
        Initialize HDMF data chunk iterator.

        Args:
            data: Data to iterate over
            **kwargs: HDMF-specific options:
                - dtype: Data type for chunks
                - compression: Compression settings
                - shuffle: Enable shuffle filter
        """

class DataChunk:
    """
    Represents a chunk of data with associated metadata.

    Properties:
        - data: The actual chunk data
        - selection: Selection information for the chunk
        - chunk_i: Chunk index
    """

    def __init__(self, data, selection: tuple = None, chunk_i: int = None):
        """
        Initialize data chunk.

        Args:
            data: Chunk data
            selection: Selection tuple for the chunk
            chunk_i: Index of this chunk
        """

    @property
    def data(self):
        """Access to chunk data."""

    @property
    def selection(self) -> tuple:
        """Selection information for this chunk."""
```
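The `selection` carried by a `DataChunk` records where the chunk belongs in the full dataset, which is what lets a writer place chunks without ever holding the whole array. A minimal sketch, using only the `data` and `selection` arguments documented above (the in-memory destination array stands in for a backend dataset):

```python
from hdmf.data_utils import DataChunk
import numpy as np

# A chunk covering rows 10-19 of a larger (100, 3) dataset
chunk = DataChunk(data=np.ones((10, 3)), selection=(slice(10, 20), slice(None)))

# The selection tuple indexes directly into the full-size destination
full = np.zeros((100, 3))
full[chunk.selection] = chunk.data
print(full[10:20].sum())  # 30.0, the ten inserted rows of ones
```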
### Data I/O Configuration

Configuration classes for customizing data I/O behavior across different backends.

```python { .api }
class DataIO:
    """
    Generic data I/O configuration wrapper.

    Provides backend-agnostic configuration for data storage options
    including compression, chunking, and filtering settings.
    """

    def __init__(self, data, **kwargs):
        """
        Initialize DataIO wrapper.

        Args:
            data: Data to be written
            **kwargs: I/O configuration options:
                - compression: Compression algorithm
                - compression_opts: Compression parameters
                - chunks: Chunking configuration
                - fillvalue: Fill value for uninitialized data
        """

    @property
    def data(self):
        """Access to wrapped data."""

    @property
    def io_settings(self) -> dict:
        """Dictionary of I/O settings."""

class InvalidDataIOError(Exception):
    """
    Exception for invalid DataIO configurations.

    Raised when DataIO settings are incompatible or invalid
    for the specified backend or data type.
    """
    pass
```
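`InvalidDataIOError` is the exception to catch (or raise) when configured storage options cannot be honored. The sketch below is illustrative only: the `check_settings` helper and the option names are hypothetical, not part of HDMF.

```python
from hdmf.data_utils import InvalidDataIOError

def check_settings(settings: dict, supported: set) -> None:
    # Hypothetical helper: reject options the target backend does not support
    unknown = set(settings) - supported
    if unknown:
        raise InvalidDataIOError(f"Unsupported I/O options: {sorted(unknown)}")

try:
    check_settings({'compression': 'gzip', 'quantize': 4}, supported={'compression', 'chunks'})
except InvalidDataIOError as err:
    print(f"Configuration rejected: {err}")
```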
### Data Manipulation Utilities

Utility functions for data manipulation and validation operations.

```python { .api }
def append_data(data, new_data):
    """
    Append data to existing array-like structure.

    Args:
        data: Existing data array
        new_data: Data to append

    Returns:
        Combined data array
    """

def extend_data(data, extension_data):
    """
    Extend data with additional elements.

    Args:
        data: Existing data array
        extension_data: Data to extend with

    Returns:
        Extended data array
    """

def assertEqualShape(data1, data2, ignore_axes: list = None):
    """
    Assert that two data arrays have equal shapes.

    Args:
        data1: First data array
        data2: Second data array
        ignore_axes: List of axes to ignore in comparison

    Raises:
        AssertionError: If shapes don't match
    """
```
### Shape Validation

Classes and utilities for validating data shapes and dimensions.

```python { .api }
class ShapeValidatorResult:
    """
    Result object for shape validation operations.

    Contains the validation status, error messages, and corrective
    suggestions produced by a shape validation check.
    """

    def __init__(self, valid: bool, message: str = None, **kwargs):
        """
        Initialize shape validation result.

        Args:
            valid: Whether validation passed
            message: Validation message or error description
            **kwargs: Additional result metadata
        """

    @property
    def valid(self) -> bool:
        """Whether validation passed."""

    @property
    def message(self) -> str:
        """Validation message or error description."""

    @property
    def errors(self) -> list:
        """List of validation errors."""
```
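A short sketch of how validation code can consume a `ShapeValidatorResult`. It assumes the constructor and properties exactly as documented above (the installed hdmf version may use different argument names), and the `compare_shapes` helper is hypothetical:

```python
from hdmf.data_utils import ShapeValidatorResult
import numpy as np

def compare_shapes(a, b) -> ShapeValidatorResult:
    # Hypothetical helper: wrap a plain shape comparison in a result object,
    # assuming the (valid, message) constructor documented above
    a, b = np.asarray(a), np.asarray(b)
    if a.shape == b.shape:
        return ShapeValidatorResult(valid=True)
    return ShapeValidatorResult(valid=False, message=f"shape mismatch: {a.shape} vs {b.shape}")

result = compare_shapes(np.zeros((100, 50)), np.zeros((100, 60)))
if not result.valid:
    print(result.message)
```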
## Usage Examples

### Working with Data Chunk Iterators

```python
from hdmf.data_utils import DataChunkIterator
import numpy as np

# Create a large dataset
large_data = np.random.randn(10000, 1000)

# Process in chunks to save memory
chunk_iter = DataChunkIterator(
    data=large_data,
    chunk_shape=(1000, 1000),
    dtype=np.float64
)

# Process chunks incrementally
for chunk in chunk_iter:
    # Process each chunk
    processed_chunk = chunk.data * 2.0
    print(f"Processed chunk {chunk.chunk_i} with shape {chunk.data.shape}")
```
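Iterating by hand is useful for custom processing, but the more common pattern is to pass the iterator itself as a container's data so the I/O backend pulls chunks on demand at write time. A brief sketch, assuming the `Data` container from `hdmf.container` and the HDF5 write flow shown in the next example:

```python
from hdmf.container import Data
from hdmf.data_utils import DataChunkIterator
import numpy as np

# Wrap the source array in an iterator instead of handing the writer the full array
streamed = DataChunkIterator(data=np.random.randn(10000, 1000), buffer_size=1000)

# The iterator stands in wherever array data is expected; the backend consumes
# it chunk-by-chunk when the enclosing container is written (see below)
measurements = Data(name='measurements', data=streamed)
```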
### Configuring Data I/O

```python
from hdmf.data_utils import DataIO
from hdmf.container import Container, Data
from hdmf.backends.hdf5 import HDF5IO
import numpy as np

# Create data with custom I/O settings
data = np.random.randn(5000, 200)

# Configure compression and chunking
data_io = DataIO(
    data=data,
    compression='gzip',
    compression_opts=9,
    chunks=(500, 200),
    fillvalue=-1
)

# Use with the HDF5 backend
with HDF5IO('configured_data.h5', mode='w') as io:
    container = Container(name='experiment')
    data_container = Data(name='measurements', data=data_io)
    container.add_child(data_container)
    io.write(container)
```
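When the target backend is HDF5 specifically, the HDF5-specific `DataIO` subclass `H5DataIO` (from `hdmf.backends.hdf5`) accepts these storage options directly and exposes the collected settings via `io_settings`. A brief sketch with illustrative settings:

```python
from hdmf.backends.hdf5 import H5DataIO
import numpy as np

# H5DataIO maps its options onto h5py dataset creation settings
wrapped = H5DataIO(
    data=np.random.randn(5000, 200),
    compression='gzip',
    compression_opts=4,
    chunks=(500, 200),
    shuffle=True,
)

print(wrapped.io_settings)  # settings applied to the HDF5 dataset at write time
```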
### Manipulating and Validating Data

```python
from hdmf.data_utils import append_data, extend_data, assertEqualShape
import numpy as np

# Initial data
initial_data = np.array([[1, 2, 3], [4, 5, 6]])

# Append new rows
new_rows = np.array([[7, 8, 9], [10, 11, 12]])
combined_data = append_data(initial_data, new_rows)

# Extend with additional elements
extension = [13, 14, 15, 16]
extended_data = extend_data(combined_data.flatten(), extension)

# Validate that shapes match
data1 = np.random.randn(100, 50)
data2 = np.random.randn(100, 50)
assertEqualShape(data1, data2)  # Passes

# Ignore specific axes in the shape comparison
data3 = np.random.randn(100, 60)  # Different second dimension
assertEqualShape(data1, data3, ignore_axes=[1])  # Passes, ignoring axis 1
```
### Custom Chunk Processing

```python
from hdmf.data_utils import GenericDataChunkIterator
import numpy as np

class CustomProcessor:
    def __init__(self, data, chunk_size=1000):
        # Chunk along the first axis; the chunk shape must cover every dimension
        self.chunk_iter = GenericDataChunkIterator(
            data=data,
            chunk_shape=(chunk_size, data.shape[1])
        )
        self.results = []

    def process_all_chunks(self):
        """Process all chunks and collect results."""
        for chunk in self.chunk_iter:
            # Apply custom processing
            processed = self.custom_transform(chunk.data)
            self.results.append({
                'chunk_index': chunk.chunk_i,
                'original_shape': chunk.data.shape,
                'processed_data': processed
            })

        return self.results

    def custom_transform(self, data):
        """Reduce each row of the chunk to its mean."""
        return np.mean(data, axis=-1)

# Usage
large_dataset = np.random.randn(50000, 100)
processor = CustomProcessor(large_dataset, chunk_size=5000)
results = processor.process_all_chunks()

print(f"Processed {len(results)} chunks")
for result in results[:3]:  # Show first 3 results
    print(f"Chunk {result['chunk_index']}: {result['original_shape']} -> {result['processed_data'].shape}")
```