HDMF — a hierarchical data modeling framework for modern science data standards.

HDMF provides essential data utilities for handling large datasets, chunk iterators, and I/O configurations. These utilities enable efficient memory management, streaming data operations, and customizable data handling patterns for scientific datasets.

Iterator classes for processing large datasets in chunks without loading entire datasets into memory.
class AbstractDataChunkIterator:
    """
    Abstract base class for iterating over data in chunks.

    Enables processing of large datasets by providing them in manageable
    chunks, reducing memory usage and enabling streaming operations.

    Concrete subclasses must override the iterator protocol methods and the
    ``recommended_*`` properties. The base implementations raise
    ``NotImplementedError`` so that a subclass that forgets to override them
    fails loudly instead of silently yielding ``None``.
    """

    def __init__(self, **kwargs):
        """Initialize abstract data chunk iterator; options are subclass-specific."""

    def __iter__(self):
        """Return iterator object."""
        raise NotImplementedError("Subclasses must implement __iter__")

    def __next__(self):
        """Get next data chunk."""
        raise NotImplementedError("Subclasses must implement __next__")

    @property
    def recommended_chunk_shape(self) -> tuple:
        """Recommended chunk shape for efficient processing."""
        raise NotImplementedError("Subclasses must implement recommended_chunk_shape")

    @property
    def recommended_data_shape(self) -> tuple:
        """Recommended overall data shape."""
        raise NotImplementedError("Subclasses must implement recommended_data_shape")
class GenericDataChunkIterator(AbstractDataChunkIterator):
    """
    Generic implementation of data chunk iterator.

    Provides chunk iteration over array-like data with configurable
    chunk sizes and processing patterns.

    NOTE(review): this is an interface stub — no method bodies are visible
    here; the descriptions below restate the documented contract only.
    """

    def __init__(self, data, **kwargs):
        """
        Initialize generic chunk iterator.

        Args:
            data: Array-like data to iterate over.
            **kwargs: Iterator options:
                - chunk_shape: Shape of chunks to yield.
                - buffer_size: Size of internal buffer.
                - iter_axis: Axis to iterate along.
        """

    def __next__(self) -> 'DataChunk':
        """
        Get next data chunk.

        Returns:
            DataChunk object containing chunk data and metadata.
        """

    @property
    def maxshape(self) -> tuple:
        """Maximum shape of the data."""
class DataChunkIterator(GenericDataChunkIterator):
    """
    Specific implementation for HDMF data chunk iteration.

    Optimized for HDMF data patterns with support for compression,
    data validation, and backend-specific optimizations.

    NOTE(review): interface stub — no method body is visible here.
    """

    def __init__(self, data, **kwargs):
        """
        Initialize HDMF data chunk iterator.

        Args:
            data: Data to iterate over.
            **kwargs: HDMF-specific options:
                - dtype: Data type for chunks.
                - compression: Compression settings.
                - shuffle: Enable shuffle filter.
        """
class DataChunk:
    """
    Represents a chunk of data with associated metadata.

    Attributes:
        data: The actual chunk data (read-only property).
        selection: Selection information locating this chunk within the
            full dataset (read-only property).
        chunk_i: Index of this chunk within the overall iteration.
    """

    def __init__(self, data, selection: tuple = None, chunk_i: int = None):
        """
        Initialize data chunk.

        Args:
            data: Chunk data.
            selection: Selection tuple for the chunk.
            chunk_i: Index of this chunk.
        """
        self._data = data
        self._selection = selection
        # Public attribute: the examples read chunk.chunk_i directly.
        self.chunk_i = chunk_i

    @property
    def data(self):
        """Access to chunk data."""
        return self._data

    @property
    def selection(self) -> tuple:
        """Selection information for this chunk."""
        return self._selection


# Configuration classes for customizing data I/O behavior across different backends.
class DataIO:
    """
    Generic data I/O configuration wrapper.

    Provides backend-agnostic configuration for data storage options
    including compression, chunking, and filtering settings. The wrapper
    holds the data and a plain dict of settings; interpreting those
    settings is left to the backend.
    """

    def __init__(self, data, **kwargs):
        """
        Initialize DataIO wrapper.

        Args:
            data: Data to be written.
            **kwargs: I/O configuration options, e.g.:
                - compression: Compression algorithm.
                - compression_opts: Compression parameters.
                - chunks: Chunking configuration.
                - fillvalue: Fill value for uninitialized data.
        """
        self._data = data
        # Copy so later mutation of the caller's kwargs dict cannot leak in.
        self._io_settings = dict(kwargs)

    @property
    def data(self):
        """Access to wrapped data."""
        return self._data

    @property
    def io_settings(self) -> dict:
        """Dictionary of I/O settings (a copy; mutating it does not affect the wrapper)."""
        return dict(self._io_settings)
class InvalidDataIOError(Exception):
    """
    Exception for invalid DataIO configurations.

    Raised when DataIO settings are incompatible or invalid
    for the specified backend or data type.
    """

    pass


# Utility functions for data manipulation and validation operations.
def append_data(data, new_data):
    """
    Append data to an existing array-like structure.

    Lists are appended to in place (``new_data`` becomes one new element);
    anything numpy can coerce to an array is concatenated along the leading
    axis into a new array, matching the row-append usage in the examples.

    Args:
        data: Existing data array (list or array-like).
        new_data: Data to append.

    Returns:
        Combined data array (the same list object, or a new numpy array).
    """
    if isinstance(data, list):
        data.append(new_data)
        return data
    # Array-like fallback: concatenate along axis 0.
    import numpy as np

    return np.append(np.asarray(data), np.asarray(new_data), axis=0)
def extend_data(data, extension_data):
    """
    Extend data with additional elements.

    Lists are extended in place; anything numpy can coerce to an array is
    concatenated element-wise along the leading axis into a new array.

    Args:
        data: Existing data array (list or array-like).
        extension_data: Iterable of elements to extend with.

    Returns:
        Extended data array (the same list object, or a new numpy array).
    """
    if isinstance(data, list):
        data.extend(extension_data)
        return data
    # Array-like fallback: concatenate along axis 0.
    import numpy as np

    return np.append(np.asarray(data), np.asarray(extension_data), axis=0)
def assertEqualShape(data1, data2, ignore_axes: list = None):
    """
    Assert that two data arrays have equal shapes.

    Args:
        data1: First data array (anything ``numpy.shape`` accepts).
        data2: Second data array.
        ignore_axes: List of axis indices to ignore in the comparison.

    Raises:
        AssertionError: If the number of dimensions differs, or any
            non-ignored axis has a different length.
    """
    import numpy as np

    shape1 = np.shape(data1)
    shape2 = np.shape(data2)
    if len(shape1) != len(shape2):
        raise AssertionError(
            f"Number of dimensions differs: {len(shape1)} != {len(shape2)}"
        )
    skipped = set(ignore_axes) if ignore_axes else set()
    for axis, (dim1, dim2) in enumerate(zip(shape1, shape2)):
        if axis in skipped:
            continue
        if dim1 != dim2:
            raise AssertionError(
                f"Shapes differ on axis {axis}: {shape1} vs {shape2}"
            )


# Classes and utilities for validating data shapes and dimensions.
class ShapeValidatorResult:
    """
    Result object for shape validation operations.

    Contains validation status, error messages, and corrective suggestions
    for data shape validation operations.
    """

    def __init__(self, valid: bool, message: str = None, **kwargs):
        """
        Initialize shape validation result.

        Args:
            valid: Whether validation passed.
            message: Validation message or error description.
            **kwargs: Additional result metadata. An ``errors`` entry
                (iterable of error descriptions) is exposed via ``errors``.
        """
        self._valid = bool(valid)
        self._message = message
        # Pull out an explicit error list; everything else is kept as metadata.
        self._errors = list(kwargs.pop("errors", []))
        self._metadata = kwargs

    @property
    def valid(self) -> bool:
        """Whether validation passed."""
        return self._valid

    @property
    def message(self) -> str:
        """Validation message or error description."""
        return self._message

    @property
    def errors(self) -> list:
        """List of validation errors."""
        return list(self._errors)
        """List of validation errors."""

from hdmf.data_utils import DataChunkIterator
import numpy as np

# Build a large in-memory dataset to demonstrate chunked processing.
large_data = np.random.randn(10000, 1000)

# Wrap the array in a chunk iterator so it can be consumed piecewise
# instead of being handed to downstream code all at once.
chunk_iter = DataChunkIterator(data=large_data, chunk_shape=(1000, 1000), dtype=np.float64)

# Consume the chunks one at a time.
for chunk in chunk_iter:
    # Example per-chunk transformation: scale the values.
    processed_chunk = chunk.data * 2.0
    print(f"Processed chunk {chunk.chunk_i} with shape {chunk.data.shape}")

from hdmf.data_utils import DataIO
from hdmf.backends.hdf5 import HDF5IO
# Fix: Container and Data were used below but never imported in the original example.
from hdmf.container import Container, Data
import numpy as np

# Create data with custom I/O settings
data = np.random.randn(5000, 200)

# Configure compression and chunking; the DataIO wrapper carries these
# settings to the backend alongside the data.
data_io = DataIO(
    data=data,
    compression='gzip',
    compression_opts=9,
    chunks=(500, 200),
    fillvalue=-1
)

# Use with HDF5 backend
with HDF5IO('configured_data.h5', mode='w') as io:
    container = Container(name='experiment')
    data_container = Data(name='measurements', data=data_io)
    container.add_child(data_container)
    io.write(container)

from hdmf.data_utils import append_data, extend_data, assertEqualShape
import numpy as np

# Start from a small 2-D array.
initial_data = np.array([[1, 2, 3], [4, 5, 6]])

# append_data combines the existing rows with the new ones.
new_rows = np.array([[7, 8, 9], [10, 11, 12]])
combined_data = append_data(initial_data, new_rows)

# extend_data adds individual elements onto a flattened copy.
extension = [13, 14, 15, 16]
extended_data = extend_data(combined_data.flatten(), extension)

# Shape validation: identical shapes pass without raising.
data1 = np.random.randn(100, 50)
data2 = np.random.randn(100, 50)
assertEqualShape(data1, data2)  # Passes

# A mismatched axis can be excluded from the comparison.
data3 = np.random.randn(100, 60)  # Different second dimension
assertEqualShape(data1, data3, ignore_axes=[1])  # Passes, ignoring axis 1

from hdmf.data_utils import GenericDataChunkIterator, DataChunk
import numpy as np


class CustomProcessor:
    """Drives a GenericDataChunkIterator and collects per-chunk results."""

    def __init__(self, data, chunk_size=1000):
        # One-dimensional chunking along the leading axis.
        self.chunk_iter = GenericDataChunkIterator(data=data, chunk_shape=(chunk_size,))
        self.results = []

    def process_all_chunks(self):
        """Process all chunks and collect results."""
        for chunk in self.chunk_iter:
            processed = self.custom_transform(chunk.data)
            record = {
                'chunk_index': chunk.chunk_i,
                'original_shape': chunk.data.shape,
                'processed_data': processed,
            }
            self.results.append(record)
        return self.results

    def custom_transform(self, data):
        """Custom transformation function."""
        return np.mean(data, axis=-1)


# Usage
large_dataset = np.random.randn(50000, 100)
processor = CustomProcessor(large_dataset, chunk_size=5000)
results = processor.process_all_chunks()
print(f"Processed {len(results)} chunks")
for result in results[:3]:  # Show first 3 results
    print(f"Chunk {result['chunk_index']}: {result['original_shape']} -> {result['processed_data'].shape}")

# Install with Tessl CLI
#   npx tessl i tessl/pypi-hdmf