A hierarchical data modeling framework for modern science data standards
—
HDMF's query system provides powerful querying and filtering capabilities for datasets and containers with reference resolution and advanced data access patterns. It enables efficient data exploration and analysis without loading entire datasets into memory.
Interface for querying HDF5-like datasets with lazy loading and efficient data access.
class HDMFDataset:
    """
    Dataset query interface providing querying capabilities for HDF5-like datasets.

    Enables efficient data access with lazy loading, slicing, and filtering
    without requiring full dataset loading into memory.

    NOTE: this is an API stub (signatures and docstrings only); the
    implementations live in the hdmf package itself.
    """
    def __init__(self, dataset, io, **kwargs):
        """
        Initialize HDMF dataset wrapper.

        Args:
            dataset: Underlying dataset object (e.g., h5py.Dataset)
            io: I/O backend for data access
            **kwargs: Additional dataset properties
        """
    def __getitem__(self, key):
        """
        Get data slice from dataset with advanced indexing support.

        Args:
            key: Index, slice, or advanced indexing specification
        Returns:
            Data slice from the dataset
        Examples:
            dataset[0:100]          # Simple slice
            dataset[:, [0, 5, 10]]  # Column selection
            dataset[mask]           # Boolean indexing
        """
    def __setitem__(self, key, value):
        """
        Set data slice in dataset.

        Args:
            key: Index or slice specification
            value: Data to set
        """
    def append(self, data):
        """
        Append data to dataset (if resizable).

        Args:
            data: Data to append
        """
    def query(self, condition: str, **kwargs):
        """
        Query dataset with condition string.

        Args:
            condition: Query condition string
            **kwargs: Additional query parameters
        Returns:
            Filtered data matching the condition
        """
    def where(self, condition):
        """
        Find indices where condition is True.

        Args:
            condition: Boolean condition or callable
        Returns:
            Array of indices where condition is satisfied
        """
    @property
    def shape(self) -> tuple:
        """Shape of the dataset."""
    @property
    def dtype(self):
        """Data type of the dataset."""
    @property
    def size(self) -> int:
        """Total number of elements."""
    @property
    def ndim(self) -> int:
        """Number of dimensions."""

System for resolving references between containers and builders in the data hierarchy.
class ReferenceResolver:
    """
    Abstract base class for resolving references between containers/builders.

    Provides the interface for resolving object references, region references,
    and other cross-references within HDMF data structures.

    NOTE: API stub — subclasses below provide the concrete behavior.
    """
    def __init__(self, **kwargs):
        """Initialize reference resolver."""
    def get_object(self, ref) -> object:
        """
        Get object from reference.

        Args:
            ref: Reference to resolve
        Returns:
            Referenced object
        """
    def get_region(self, ref) -> tuple:
        """
        Get region from region reference.

        Args:
            ref: Region reference to resolve
        Returns:
            Tuple of (object, selection)
        """
class BuilderResolver(ReferenceResolver):
    """
    Reference resolver for Builder objects.

    Resolves references between builders during the build process,
    enabling cross-references in storage representations.
    """
    def __init__(self, builder_map: dict, **kwargs):
        """
        Initialize builder resolver.

        Args:
            builder_map: Dictionary mapping objects to builders
        """
    def get_object(self, ref):
        """
        Get builder from reference.

        Args:
            ref: Reference to builder
        Returns:
            Builder object
        """
class ContainerResolver(ReferenceResolver):
    """
    Reference resolver for Container objects.

    Resolves references between containers in the constructed object hierarchy,
    enabling navigation and cross-references in the in-memory representation.
    """
    def __init__(self, type_map: 'TypeMap', container: 'Container', **kwargs):
        """
        Initialize container resolver.

        Args:
            type_map: Type mapping for container resolution
            container: Root container for resolution context
        """
    def get_object(self, ref):
        """
        Get container from reference.

        Args:
            ref: Reference to container
        Returns:
            Container object
        """
    def get_region(self, ref):
        """
        Get region from container reference.

        Args:
            ref: Region reference
        Returns:
            Tuple of (container, selection)
        """

Utility functions and classes for advanced querying and data filtering.
def query_dataset(dataset: HDMFDataset, query_str: str, **kwargs):
    """
    Query dataset using query string syntax.

    Args:
        dataset: Dataset to query
        query_str: Query string with conditions
        **kwargs: Additional query parameters
    Returns:
        Query results
    Examples:
        query_dataset(data, "column > 5 AND column < 10")
        query_dataset(data, "name LIKE 'neuron_*'")
    """
def filter_data(data, condition_func, **kwargs):
    """
    Filter data using condition function.

    Args:
        data: Data to filter
        condition_func: Function returning a boolean mask over the data
    Keyword Args:
        **kwargs: Additional filtering options
    Returns:
        Filtered data
    """
class QueryResult:
    """
    Result object for query operations with lazy evaluation.

    Provides access to query results with efficient memory usage
    and support for chaining additional operations.
    """
    def __init__(self, source_dataset, indices, **kwargs):
        """
        Initialize query result.

        Args:
            source_dataset: Source dataset
            indices: Selected indices
        """
    def to_array(self):
        """
        Convert query result to numpy array.

        Returns:
            NumPy array with query results
        """
    def __getitem__(self, key):
        """Access subset of query results."""
    def __len__(self) -> int:
        """Number of results."""
    def __iter__(self):
        """Iterate over results."""

from hdmf.backends.hdf5 import HDF5IO
from hdmf.query import HDMFDataset
import numpy as np

# Open HDF5 file with data.
# NOTE(review): HDF5IO is imported in the previous snippet of this page
# (from hdmf.backends.hdf5 import HDF5IO) — each snippet is not standalone.
with HDF5IO('experiment.h5', mode='r') as io:
    container = io.read()
    # Get dataset as HDMFDataset for querying
    neural_data = container.neural_data.data  # This is an HDMFDataset
    # Basic slicing operations — only the requested slices are loaded
    first_1000_samples = neural_data[0:1000, :]
    specific_channels = neural_data[:, [0, 5, 10, 15]]
    time_window = neural_data[5000:10000, :]
    print(f"Dataset shape: {neural_data.shape}")
    print(f"First 1000 samples shape: {first_1000_samples.shape}")
    print(f"Selected channels shape: {specific_channels.shape}")

# Advanced indexing with boolean masks
with HDF5IO('experiment.h5', mode='r') as io:
    container = io.read()
    voltage_data = container.voltage_traces.data
    # Create boolean mask for high-activity periods.
    # NOTE(review): voltage_data[:] materializes the full dataset in memory,
    # which cuts against the lazy-loading theme — confirm this is intended
    # for the example.
    mean_activity = np.mean(voltage_data[:], axis=1)
    high_activity_mask = mean_activity > np.percentile(mean_activity, 95)
    # Extract high activity periods
    high_activity_data = voltage_data[high_activity_mask, :]
    print(f"High activity periods: {high_activity_data.shape}")

from hdmf.common import DynamicTable
from hdmf.query import query_dataset  # NOTE(review): imported but not used in this snippet

# Create sample table
subjects_table = DynamicTable(
    name='subjects',
    description='Subject information'
)
subjects_table.add_column('subject_id', 'Subject ID')
subjects_table.add_column('age', 'Age in months', dtype='int')
subjects_table.add_column('weight', 'Weight in grams', dtype='float')
subjects_table.add_column('genotype', 'Genotype')

# Add sample data.
# NOTE(review): np is not imported in this snippet — relies on the
# `import numpy as np` from an earlier snippet on this page.
for i in range(50):
    subjects_table.add_row(
        subject_id=f'subject_{i:03d}',
        age=np.random.randint(3, 24),
        weight=np.random.normal(25.0, 3.0),
        genotype=np.random.choice(['WT', 'KO'])
    )

# Query using table methods.
# NOTE(review): DynamicTable.which(...) with Django-style suffixes
# (age__gt=12) is assumed here — confirm it exists in the installed
# hdmf version.
adult_subjects = subjects_table.which(age__gt=12)
print(f"Adult subjects: {len(adult_subjects)}")
heavy_subjects = subjects_table.which(weight__gt=27.0)
print(f"Heavy subjects: {len(heavy_subjects)}")
ko_subjects = subjects_table.which(genotype='KO')
print(f"KO subjects: {len(ko_subjects)}")

# Complex queries combining conditions (manual row-by-row scan)
adult_ko = []
for idx in range(len(subjects_table)):
    row = subjects_table[idx]
    if row['age'] > 12 and row['genotype'] == 'KO':
        adult_ko.append(idx)
print(f"Adult KO subjects: {len(adult_ko)}")

from hdmf.query import ContainerResolver
from hdmf.common import DynamicTable, DynamicTableRegion, get_type_map

# Create referenced data structure
neurons_table = DynamicTable(name='neurons', description='Neuron data')
neurons_table.add_column('neuron_id', 'Neuron ID')
neurons_table.add_column('cell_type', 'Cell type')

# Add neurons: even rows are pyramidal, odd rows are interneurons
for i in range(20):
    neurons_table.add_row(
        neuron_id=f'neuron_{i:03d}',
        cell_type='pyramidal' if i % 2 == 0 else 'interneuron'
    )

# Create table region referring to subset
pyramidal_region = DynamicTableRegion(
    name='pyramidal_neurons',
    data=[i for i in range(0, 20, 2)],  # Even indices (pyramidal cells)
    description='Pyramidal neurons only',
    table=neurons_table
)

# Create analysis table using references
analysis_table = DynamicTable(name='analysis', description='Analysis results')
analysis_table.add_column('neuron_group', 'Group of neurons')
analysis_table.add_column('avg_firing_rate', 'Average firing rate', dtype='float')
analysis_table.add_row(
    neuron_group=pyramidal_region,
    avg_firing_rate=15.3
)

# Resolve references using ContainerResolver.
# NOTE(review): ContainerResolver is constructed with positional
# (type_map, container) arguments — confirm against its signature.
type_map = get_type_map()
resolver = ContainerResolver(type_map, neurons_table)

# Access referenced data through resolver
referenced_neurons = analysis_table.get_column('neuron_group').data[0]
resolved_neurons = resolver.get_object(referenced_neurons)
print(f"Referenced neurons: {len(referenced_neurons)} neurons")
print(f"First referenced neuron: {resolved_neurons[0]}")

from hdmf.backends.hdf5 import HDF5IO
from hdmf.query import filter_data, QueryResult  # NOTE(review): imported but unused below
import numpy as np

# Load time series data.
# NOTE(review): the handles grabbed here are used after this `with` block
# exits; lazy dataset handles may be invalid once the file is closed —
# confirm whether the later code should live inside the block.
with HDF5IO('timeseries.h5', mode='r') as io:
    container = io.read()
    timestamps = container.timestamps.data
    neural_data = container.neural_data.data

# Define filtering conditions
def high_variance_condition(data_slice):
    """Return a boolean row mask marking time points with high variance.

    A row (time point) is flagged when its variance across channels is
    above the 90th percentile of the per-row variances of this slice.

    Args:
        data_slice: 2-D array, shape (time, channels).
    Returns:
        1-D boolean array with one entry per row of ``data_slice``.
    """
    # Compute the per-row variance once; the original evaluated
    # np.var(data_slice, axis=1) twice for the same result.
    row_variance = np.var(data_slice, axis=1)
    return row_variance > np.percentile(row_variance, 90)
def specific_frequency_condition(data_slice, target_freq=40.0, sampling_rate=1000.0):
    """Find periods with specific frequency content.

    Takes the FFT along the time axis, picks the frequency bin closest to
    ``target_freq``, and reports True when the mean spectral magnitude at
    that bin exceeds its own 95th percentile across channels.
    """
    n_samples = data_slice.shape[0]
    # Spectrum per channel (FFT over the time axis).
    spectrum = np.fft.fft(data_slice, axis=0)
    bin_freqs = np.fft.fftfreq(n_samples, 1 / sampling_rate)
    # Nearest FFT bin to the frequency of interest.
    nearest_bin = np.abs(bin_freqs - target_freq).argmin()
    channel_power = np.abs(spectrum[nearest_bin, :])
    threshold = np.percentile(channel_power, 95)
    return np.mean(channel_power) > threshold
# Apply filters with a half-overlapping sliding window
window_size = 1000 # 1 second windows at 1kHz
high_var_periods = []
freq_periods = []
# NOTE(review): neural_data was obtained inside the (now-closed) `with`
# block above — confirm the handle is still readable here.
for start_idx in range(0, len(neural_data) - window_size, window_size//2):
    window_data = neural_data[start_idx:start_idx + window_size, :]
    if high_variance_condition(window_data):
        high_var_periods.append((start_idx, start_idx + window_size))
    if specific_frequency_condition(window_data):
        freq_periods.append((start_idx, start_idx + window_size))
print(f"High variance periods: {len(high_var_periods)}")
print(f"Target frequency periods: {len(freq_periods)}")

# Extract filtered data from the first flagged window, if any
if high_var_periods:
    first_high_var = neural_data[high_var_periods[0][0]:high_var_periods[0][1], :]
    print(f"First high variance period shape: {first_high_var.shape}")

from hdmf.backends.hdf5 import HDF5IO
import numpy as np

def query_large_dataset_efficiently(file_path: str, query_condition, chunk_size: int = 10000):
    """
    Efficiently query large datasets using chunked processing.

    Args:
        file_path: Path to HDF5 file
        query_condition: Function mapping a 2-D chunk (rows x columns) to a
            boolean mask over its rows
        chunk_size: Size of data chunks to process
    Returns:
        List of matching data indices
    """
    matching_indices = []
    with HDF5IO(file_path, mode='r') as io:
        container = io.read()
        # NOTE(review): assumes the container exposes a `large_dataset`
        # attribute — confirm against the file schema.
        dataset = container.large_dataset.data
        total_samples = dataset.shape[0]
        # Process dataset in chunks so only chunk_size rows are resident at once
        for start_idx in range(0, total_samples, chunk_size):
            end_idx = min(start_idx + chunk_size, total_samples)
            # Load chunk
            chunk_data = dataset[start_idx:end_idx, :]
            # Apply condition to chunk
            chunk_mask = query_condition(chunk_data)
            # Convert chunk-local indices to global dataset indices
            local_matches = np.where(chunk_mask)[0]
            global_matches = local_matches + start_idx
            matching_indices.extend(global_matches)
            print(f"Processed {end_idx}/{total_samples} samples, "
            f"found {len(local_matches)} matches in chunk")
    return matching_indices
# Example usage
def find_outliers(data_chunk, threshold=3.0):
    """Return a per-row boolean mask marking outlier samples.

    A row is an outlier when any of its per-column z-scores exceeds
    ``threshold`` (default 3 standard deviations).

    Args:
        data_chunk: 2-D array, shape (samples, columns).
        threshold: z-score cutoff for flagging a value as an outlier.
    Returns:
        1-D boolean array with one entry per row of ``data_chunk``.
    """
    col_mean = np.mean(data_chunk, axis=0)
    col_std = np.std(data_chunk, axis=0)
    # Guard against zero-variance columns: the original divided by zero,
    # producing NaNs and a RuntimeWarning; a constant column contributes a
    # z-score of 0 (never an outlier), which matches the original's
    # effective result without the warning.
    safe_std = np.where(col_std == 0, 1.0, col_std)
    z_scores = np.abs((data_chunk - col_mean) / safe_std)
    return np.any(z_scores > threshold, axis=1)
# Run the chunked query with the outlier detector over the whole file
outlier_indices = query_large_dataset_efficiently(
    'large_experiment.h5',
    find_outliers,
    chunk_size=5000
)
print(f"Found {len(outlier_indices)} outlier samples")

from hdmf.common import DynamicTable, DynamicTableRegion
from hdmf.query import ContainerResolver
def cross_table_analysis(subjects_table, sessions_table, results_table):
    """
    Perform analysis across multiple related tables.

    Args:
        subjects_table: Table with subject information
        sessions_table: Table with session information (kept for interface
            compatibility; session rows are reached through the 'session'
            references stored in results_table, so it is not read directly)
        results_table: Table with analysis results
    Returns:
        Dict with high-performer result indices, the matched subjects'
        ages, and their genotype counts.
    """
    # Find high-performing sessions (performance above threshold)
    high_performance_threshold = 0.85
    high_performers = [
        i for i in range(len(results_table))
        if results_table[i]['performance_score'] > high_performance_threshold
    ]
    # Follow each result's session reference to recover the subject ID.
    # A reference exposes .table (the referenced table) and .data (row indices).
    high_performer_subjects = []
    for result_idx in high_performers:
        session_ref = results_table[result_idx]['session']
        session_info = session_ref.table[session_ref.data[0]]
        high_performer_subjects.append(session_info['subject_id'])
    # Look up each subject's characteristics in the subjects table
    subject_ages = []
    subject_genotypes = []
    for subject_id in high_performer_subjects:
        subject_indices = subjects_table.which(subject_id=subject_id)
        if subject_indices:
            subject_info = subjects_table[subject_indices[0]]
            subject_ages.append(subject_info['age'])
            subject_genotypes.append(subject_info['genotype'])
    # Summary statistics. Plain stdlib arithmetic instead of np.mean: the
    # original used `np` without importing it in this snippet, and np.mean
    # of an empty list emits a RuntimeWarning; guard the empty case.
    avg_age = sum(subject_ages) / len(subject_ages) if subject_ages else float('nan')
    genotype_counts = {}
    for genotype in subject_genotypes:
        genotype_counts[genotype] = genotype_counts.get(genotype, 0) + 1
    print(f"High performers: {len(high_performers)} sessions")
    print(f"Average age: {avg_age:.1f} months")
    print(f"Genotype distribution: {genotype_counts}")
    return {
        'high_performer_indices': high_performers,
        'subject_ages': subject_ages,
        'genotype_distribution': genotype_counts
    }
# Example usage would require setting up the related tables
# with proper cross-references between subjects, sessions, and results

from hdmf.query import HDMFDataset
import numpy as np
from functools import lru_cache
class CachedQueryDataset:
    """Dataset wrapper that memoizes query results for better performance.

    Wraps any dataset object exposing a ``query(condition, **kwargs)``
    method (e.g. HDMFDataset) and caches its results with an LRU policy.
    """
    def __init__(self, dataset: "HDMFDataset", cache_size: int = 128):
        """
        Args:
            dataset: Dataset exposing a ``query`` method. (The annotation
                is a string so the class is importable even when
                HDMFDataset is not yet in scope.)
            cache_size: Maximum number of cached query results.
        """
        self.dataset = dataset
        self.cache_size = cache_size
        # Build the cache per instance (not as a class-level decorator) so
        # results are not shared between wrappers and the cache does not
        # keep other instances alive.
        self._cached_query = lru_cache(maxsize=cache_size)(self._query_impl)

    def _query_impl(self, condition, kwargs_items):
        """Run the underlying query; lru_cache keys on these arguments."""
        return self.dataset.query(condition, **dict(kwargs_items))

    def query_with_cache(self, condition: str, **kwargs):
        """Query with result caching based on the condition and kwargs.

        The cache key is the condition string plus the sorted keyword
        items, which is stable and collision-free. (The original derived a
        key via ``str(hash(...))``, which is salted per process and could
        collide; lru_cache already keys on the arguments themselves.)
        """
        return self._cached_query(condition, tuple(sorted(kwargs.items())))

    def clear_cache(self):
        """Clear query result cache."""
        self._cached_query.cache_clear()

    def cache_info(self):
        """Get cache statistics (hits, misses, maxsize, currsize)."""
        return self._cached_query.cache_info()
# Usage example.
# NOTE(review): HDF5IO is not imported in this snippet — relies on an
# earlier `from hdmf.backends.hdf5 import HDF5IO` on this page.
with HDF5IO('experiment.h5', mode='r') as io:
    container = io.read()
    # Wrap dataset with caching
    cached_dataset = CachedQueryDataset(container.neural_data.data)
    # Repeated queries will be cached
    result1 = cached_dataset.query_with_cache("value > 0.5")
    result2 = cached_dataset.query_with_cache("value > 0.5")  # From cache
    print(f"Cache info: {cached_dataset.cache_info()}")

Install with Tessl CLI
npx tessl i tessl/pypi-hdmf