# Data Utilities

HDMF provides data utilities for handling large datasets, including chunk iterators and I/O configuration wrappers. These utilities enable efficient memory management, streaming data operations, and customizable data-handling patterns for scientific datasets.

## Capabilities

### Data Chunk Iterators

Iterator classes for processing large datasets in chunks without loading the entire dataset into memory.

```python { .api }
class AbstractDataChunkIterator:
    """
    Abstract base class for iterating over data in chunks.

    Enables processing of large datasets by providing them in manageable
    chunks, reducing memory usage and enabling streaming operations.
    """

    def __init__(self, **kwargs):
        """Initialize abstract data chunk iterator."""

    def __iter__(self):
        """Return iterator object."""

    def __next__(self):
        """Get next data chunk."""

    @property
    def recommended_chunk_shape(self) -> tuple:
        """Recommended chunk shape for efficient processing."""

    @property
    def recommended_data_shape(self) -> tuple:
        """Recommended overall data shape."""

class GenericDataChunkIterator(AbstractDataChunkIterator):
    """
    Generic implementation of data chunk iterator.

    Provides chunk iteration over array-like data with configurable
    chunk sizes and processing patterns.
    """

    def __init__(self, data, **kwargs):
        """
        Initialize generic chunk iterator.

        Args:
            data: Array-like data to iterate over
            **kwargs: Iterator options:
                - chunk_shape: Shape of chunks to yield
                - buffer_size: Size of internal buffer
                - iter_axis: Axis to iterate along
        """

    def __next__(self) -> 'DataChunk':
        """
        Get next data chunk.

        Returns:
            DataChunk object containing chunk data and metadata
        """

    @property
    def maxshape(self) -> tuple:
        """Maximum shape of the data."""

class DataChunkIterator(GenericDataChunkIterator):
    """
    Specific implementation for HDMF data chunk iteration.

    Optimized for HDMF data patterns with support for compression,
    data validation, and backend-specific optimizations.
    """

    def __init__(self, data, **kwargs):
        """
        Initialize HDMF data chunk iterator.

        Args:
            data: Data to iterate over
            **kwargs: HDMF-specific options:
                - dtype: Data type for chunks
                - compression: Compression settings
                - shuffle: Enable shuffle filter
        """

class DataChunk:
    """
    Represents a chunk of data with associated metadata.

    Properties:
        - data: The actual chunk data
        - selection: Selection information for the chunk
        - chunk_i: Chunk index
    """

    def __init__(self, data, selection: tuple = None, chunk_i: int = None):
        """
        Initialize data chunk.

        Args:
            data: Chunk data
            selection: Selection tuple for the chunk
            chunk_i: Index of this chunk
        """

    @property
    def data(self):
        """Access to chunk data."""

    @property
    def selection(self) -> tuple:
        """Selection information for this chunk."""
```
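The `selection` carried by a `DataChunk` records where the chunk belongs in the full dataset, which is what lets a writer place chunks without ever holding the whole array. A minimal sketch, using only the `data` and `selection` arguments documented above (the in-memory destination array stands in for a backend dataset):

```python
from hdmf.data_utils import DataChunk
import numpy as np

# A chunk covering rows 10-19 of a larger (100, 3) dataset
chunk = DataChunk(data=np.ones((10, 3)), selection=(slice(10, 20), slice(None)))

# The selection tuple indexes directly into the full-size destination
full = np.zeros((100, 3))
full[chunk.selection] = chunk.data
print(full[10:20].sum())  # 30.0, the ten inserted rows of ones
```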
### Data I/O Configuration

Configuration classes for customizing data I/O behavior across different backends.

```python { .api }
class DataIO:
    """
    Generic data I/O configuration wrapper.

    Provides backend-agnostic configuration for data storage options
    including compression, chunking, and filtering settings.
    """

    def __init__(self, data, **kwargs):
        """
        Initialize DataIO wrapper.

        Args:
            data: Data to be written
            **kwargs: I/O configuration options:
                - compression: Compression algorithm
                - compression_opts: Compression parameters
                - chunks: Chunking configuration
                - fillvalue: Fill value for uninitialized data
        """

    @property
    def data(self):
        """Access to wrapped data."""

    @property
    def io_settings(self) -> dict:
        """Dictionary of I/O settings."""

class InvalidDataIOError(Exception):
    """
    Exception for invalid DataIO configurations.

    Raised when DataIO settings are incompatible or invalid
    for the specified backend or data type.
    """
    pass
```
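`InvalidDataIOError` is the exception to catch (or raise) when configured storage options cannot be honored. The sketch below is illustrative only: the `check_settings` helper and the option names are hypothetical, not part of HDMF.

```python
from hdmf.data_utils import InvalidDataIOError

def check_settings(settings: dict, supported: set) -> None:
    # Hypothetical helper: reject options the target backend does not support
    unknown = set(settings) - supported
    if unknown:
        raise InvalidDataIOError(f"Unsupported I/O options: {sorted(unknown)}")

try:
    check_settings({'compression': 'gzip', 'quantize': 4}, supported={'compression', 'chunks'})
except InvalidDataIOError as err:
    print(f"Configuration rejected: {err}")
```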
### Data Manipulation Utilities

Utility functions for data manipulation and validation operations.

```python { .api }
def append_data(data, new_data):
    """
    Append data to existing array-like structure.

    Args:
        data: Existing data array
        new_data: Data to append

    Returns:
        Combined data array
    """

def extend_data(data, extension_data):
    """
    Extend data with additional elements.

    Args:
        data: Existing data array
        extension_data: Data to extend with

    Returns:
        Extended data array
    """

def assertEqualShape(data1, data2, ignore_axes: list = None):
    """
    Assert that two data arrays have equal shapes.

    Args:
        data1: First data array
        data2: Second data array
        ignore_axes: List of axes to ignore in comparison

    Raises:
        AssertionError: If shapes don't match
    """
```
### Shape Validation

Classes and utilities for validating data shapes and dimensions.

```python { .api }
class ShapeValidatorResult:
    """
    Result object for shape validation operations.

    Contains the validation status, error messages, and corrective
    suggestions produced by a shape validation check.
    """

    def __init__(self, valid: bool, message: str = None, **kwargs):
        """
        Initialize shape validation result.

        Args:
            valid: Whether validation passed
            message: Validation message or error description
            **kwargs: Additional result metadata
        """

    @property
    def valid(self) -> bool:
        """Whether validation passed."""

    @property
    def message(self) -> str:
        """Validation message or error description."""

    @property
    def errors(self) -> list:
        """List of validation errors."""
```
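A short sketch of how validation code can consume a `ShapeValidatorResult`. It assumes the constructor and properties exactly as documented above (the installed hdmf version may use different argument names), and the `compare_shapes` helper is hypothetical:

```python
from hdmf.data_utils import ShapeValidatorResult
import numpy as np

def compare_shapes(a, b) -> ShapeValidatorResult:
    # Hypothetical helper: wrap a plain shape comparison in a result object,
    # assuming the (valid, message) constructor documented above
    a, b = np.asarray(a), np.asarray(b)
    if a.shape == b.shape:
        return ShapeValidatorResult(valid=True)
    return ShapeValidatorResult(valid=False, message=f"shape mismatch: {a.shape} vs {b.shape}")

result = compare_shapes(np.zeros((100, 50)), np.zeros((100, 60)))
if not result.valid:
    print(result.message)
```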
## Usage Examples

### Working with Data Chunk Iterators

```python
from hdmf.data_utils import DataChunkIterator
import numpy as np

# Create a large dataset
large_data = np.random.randn(10000, 1000)

# Process in chunks to save memory
chunk_iter = DataChunkIterator(
    data=large_data,
    chunk_shape=(1000, 1000),
    dtype=np.float64
)

# Process chunks incrementally
for chunk in chunk_iter:
    # Process each chunk
    processed_chunk = chunk.data * 2.0
    print(f"Processed chunk {chunk.chunk_i} with shape {chunk.data.shape}")
```
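Iterating by hand is useful for custom processing, but the more common pattern is to pass the iterator itself as a container's data so the I/O backend pulls chunks on demand at write time. A brief sketch, assuming the `Data` container from `hdmf.container` and the HDF5 write flow shown in the next example:

```python
from hdmf.container import Data
from hdmf.data_utils import DataChunkIterator
import numpy as np

# Wrap the source array in an iterator instead of handing the writer the full array
streamed = DataChunkIterator(data=np.random.randn(10000, 1000), buffer_size=1000)

# The iterator stands in wherever array data is expected; the backend consumes
# it chunk-by-chunk when the enclosing container is written (see below)
measurements = Data(name='measurements', data=streamed)
```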
### Configuring Data I/O

```python
from hdmf.data_utils import DataIO
from hdmf.container import Container, Data
from hdmf.backends.hdf5 import HDF5IO
import numpy as np

# Create data with custom I/O settings
data = np.random.randn(5000, 200)

# Configure compression and chunking
data_io = DataIO(
    data=data,
    compression='gzip',
    compression_opts=9,
    chunks=(500, 200),
    fillvalue=-1
)

# Use with the HDF5 backend
with HDF5IO('configured_data.h5', mode='w') as io:
    container = Container(name='experiment')
    data_container = Data(name='measurements', data=data_io)
    container.add_child(data_container)
    io.write(container)
```
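When the target backend is HDF5 specifically, the HDF5-specific `DataIO` subclass `H5DataIO` (from `hdmf.backends.hdf5`) accepts these storage options directly and exposes the collected settings via `io_settings`. A brief sketch with illustrative settings:

```python
from hdmf.backends.hdf5 import H5DataIO
import numpy as np

# H5DataIO maps its options onto h5py dataset creation settings
wrapped = H5DataIO(
    data=np.random.randn(5000, 200),
    compression='gzip',
    compression_opts=4,
    chunks=(500, 200),
    shuffle=True,
)

print(wrapped.io_settings)  # settings applied to the HDF5 dataset at write time
```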
### Manipulating and Validating Data

```python
from hdmf.data_utils import append_data, extend_data, assertEqualShape
import numpy as np

# Initial data
initial_data = np.array([[1, 2, 3], [4, 5, 6]])

# Append new rows
new_rows = np.array([[7, 8, 9], [10, 11, 12]])
combined_data = append_data(initial_data, new_rows)

# Extend with additional elements
extension = [13, 14, 15, 16]
extended_data = extend_data(combined_data.flatten(), extension)

# Validate that shapes match
data1 = np.random.randn(100, 50)
data2 = np.random.randn(100, 50)
assertEqualShape(data1, data2)  # Passes

# Ignore specific axes in the shape comparison
data3 = np.random.randn(100, 60)  # Different second dimension
assertEqualShape(data1, data3, ignore_axes=[1])  # Passes, ignoring axis 1
```
### Custom Chunk Processing

```python
from hdmf.data_utils import GenericDataChunkIterator
import numpy as np

class CustomProcessor:
    def __init__(self, data, chunk_size=1000):
        # Chunk along the first axis; the chunk shape must cover every dimension
        self.chunk_iter = GenericDataChunkIterator(
            data=data,
            chunk_shape=(chunk_size, data.shape[1])
        )
        self.results = []

    def process_all_chunks(self):
        """Process all chunks and collect results."""
        for chunk in self.chunk_iter:
            # Apply custom processing
            processed = self.custom_transform(chunk.data)
            self.results.append({
                'chunk_index': chunk.chunk_i,
                'original_shape': chunk.data.shape,
                'processed_data': processed
            })

        return self.results

    def custom_transform(self, data):
        """Reduce each row of the chunk to its mean."""
        return np.mean(data, axis=-1)

# Usage
large_dataset = np.random.randn(50000, 100)
processor = CustomProcessor(large_dataset, chunk_size=5000)
results = processor.process_all_chunks()

print(f"Processed {len(results)} chunks")
for result in results[:3]:  # Show first 3 results
    print(f"Chunk {result['chunk_index']}: {result['original_shape']} -> {result['processed_data'].shape}")
```