A hierarchical data modeling framework for modern science data standards
—
HDMF's query system provides powerful querying and filtering capabilities for datasets and containers with reference resolution and advanced data access patterns. It enables efficient data exploration and analysis without loading entire datasets into memory.
Interface for querying HDF5-like datasets with lazy loading and efficient data access.
class HDMFDataset:
    """
    Dataset query interface providing querying capabilities for HDF5-like datasets.

    Enables efficient data access with lazy loading, slicing, and filtering
    without requiring full dataset loading into memory.

    NOTE: this is an API stub (signatures and docstrings only); the
    implementations live in the hdmf package itself.
    """
    def __init__(self, dataset, io, **kwargs):
        """
        Initialize HDMF dataset wrapper.

        Args:
            dataset: Underlying dataset object (e.g., h5py.Dataset)
            io: I/O backend for data access
            **kwargs: Additional dataset properties
        """
    def __getitem__(self, key):
        """
        Get data slice from dataset with advanced indexing support.

        Args:
            key: Index, slice, or advanced indexing specification
        Returns:
            Data slice from the dataset
        Examples:
            dataset[0:100]          # Simple slice
            dataset[:, [0, 5, 10]]  # Column selection
            dataset[mask]           # Boolean indexing
        """
    def __setitem__(self, key, value):
        """
        Set data slice in dataset.

        Args:
            key: Index or slice specification
            value: Data to set
        """
    def append(self, data):
        """
        Append data to dataset (if resizable).

        Args:
            data: Data to append
        """
    def query(self, condition: str, **kwargs):
        """
        Query dataset with condition string.

        Args:
            condition: Query condition string
            **kwargs: Additional query parameters
        Returns:
            Filtered data matching the condition
        """
    def where(self, condition):
        """
        Find indices where condition is True.

        Args:
            condition: Boolean condition or callable
        Returns:
            Array of indices where condition is satisfied
        """
    @property
    def shape(self) -> tuple:
        """Shape of the dataset."""
    @property
    def dtype(self):
        """Data type of the dataset."""
    @property
    def size(self) -> int:
        """Total number of elements."""
    @property
    def ndim(self) -> int:
        """Number of dimensions."""

System for resolving references between containers and builders in the data hierarchy.
class ReferenceResolver:
    """
    Abstract base class for resolving references between containers/builders.

    Provides the interface for resolving object references, region references,
    and other cross-references within HDMF data structures.

    NOTE: API stub — subclasses below provide the concrete behavior.
    """
    def __init__(self, **kwargs):
        """Initialize reference resolver."""
    def get_object(self, ref) -> object:
        """
        Get object from reference.

        Args:
            ref: Reference to resolve
        Returns:
            Referenced object
        """
    def get_region(self, ref) -> tuple:
        """
        Get region from region reference.

        Args:
            ref: Region reference to resolve
        Returns:
            Tuple of (object, selection)
        """
class BuilderResolver(ReferenceResolver):
    """
    Reference resolver for Builder objects.

    Resolves references between builders during the build process,
    enabling cross-references in storage representations.
    """
    def __init__(self, builder_map: dict, **kwargs):
        """
        Initialize builder resolver.

        Args:
            builder_map: Dictionary mapping objects to builders
        """
    def get_object(self, ref):
        """
        Get builder from reference.

        Args:
            ref: Reference to builder
        Returns:
            Builder object
        """
class ContainerResolver(ReferenceResolver):
    """
    Reference resolver for Container objects.

    Resolves references between containers in the constructed object hierarchy,
    enabling navigation and cross-references in the in-memory representation.
    """
    def __init__(self, type_map: 'TypeMap', container: 'Container', **kwargs):
        """
        Initialize container resolver.

        Args:
            type_map: Type mapping for container resolution
            container: Root container for resolution context
        """
    def get_object(self, ref):
        """
        Get container from reference.

        Args:
            ref: Reference to container
        Returns:
            Container object
        """
    def get_region(self, ref):
        """
        Get region from container reference.

        Args:
            ref: Region reference
        Returns:
            Tuple of (container, selection)
        """

Utility functions and classes for advanced querying and data filtering.
def query_dataset(dataset: HDMFDataset, query_str: str, **kwargs):
    """
    Query dataset using query string syntax.

    Args:
        dataset: Dataset to query
        query_str: Query string with conditions
        **kwargs: Additional query parameters
    Returns:
        Query results
    Examples:
        query_dataset(data, "column > 5 AND column < 10")
        query_dataset(data, "name LIKE 'neuron_*'")
    """
def filter_data(data, condition_func, **kwargs):
    """
    Filter data using condition function.

    Args:
        data: Data to filter
        condition_func: Function returning a boolean mask over the data
    Keyword Args:
        **kwargs: Additional filtering options
    Returns:
        Filtered data
    """
class QueryResult:
    """
    Result object for query operations with lazy evaluation.

    Provides access to query results with efficient memory usage
    and support for chaining additional operations.
    """
    def __init__(self, source_dataset, indices, **kwargs):
        """
        Initialize query result.

        Args:
            source_dataset: Source dataset
            indices: Selected indices
        """
    def to_array(self):
        """
        Convert query result to numpy array.

        Returns:
            NumPy array with query results
        """
    def __getitem__(self, key):
        """Access subset of query results."""
    def __len__(self) -> int:
        """Number of results."""
    def __iter__(self):
        """Iterate over results."""

from hdmf.backends.hdf5 import HDF5IO
from hdmf.query import HDMFDataset
import numpy as np

# Open HDF5 file with data.
# NOTE(review): HDF5IO is imported in the previous snippet of this page
# (from hdmf.backends.hdf5 import HDF5IO) — each snippet is not standalone.
with HDF5IO('experiment.h5', mode='r') as io:
    container = io.read()
    # Get dataset as HDMFDataset for querying
    neural_data = container.neural_data.data  # This is an HDMFDataset
    # Basic slicing operations — only the requested slices are loaded
    first_1000_samples = neural_data[0:1000, :]
    specific_channels = neural_data[:, [0, 5, 10, 15]]
    time_window = neural_data[5000:10000, :]
    print(f"Dataset shape: {neural_data.shape}")
    print(f"First 1000 samples shape: {first_1000_samples.shape}")
    print(f"Selected channels shape: {specific_channels.shape}")

# Advanced indexing with boolean masks
with HDF5IO('experiment.h5', mode='r') as io:
    container = io.read()
    voltage_data = container.voltage_traces.data
    # Create boolean mask for high-activity periods.
    # NOTE(review): voltage_data[:] materializes the full dataset in memory,
    # which cuts against the lazy-loading theme — confirm this is intended
    # for the example.
    mean_activity = np.mean(voltage_data[:], axis=1)
    high_activity_mask = mean_activity > np.percentile(mean_activity, 95)
    # Extract high activity periods
    high_activity_data = voltage_data[high_activity_mask, :]
    print(f"High activity periods: {high_activity_data.shape}")

from hdmf.common import DynamicTable
from hdmf.query import query_dataset  # NOTE(review): imported but not used in this snippet

# Create sample table
subjects_table = DynamicTable(
    name='subjects',
    description='Subject information'
)
subjects_table.add_column('subject_id', 'Subject ID')
subjects_table.add_column('age', 'Age in months', dtype='int')
subjects_table.add_column('weight', 'Weight in grams', dtype='float')
subjects_table.add_column('genotype', 'Genotype')

# Add sample data.
# NOTE(review): np is not imported in this snippet — relies on the
# `import numpy as np` from an earlier snippet on this page.
for i in range(50):
    subjects_table.add_row(
        subject_id=f'subject_{i:03d}',
        age=np.random.randint(3, 24),
        weight=np.random.normal(25.0, 3.0),
        genotype=np.random.choice(['WT', 'KO'])
    )

# Query using table methods.
# NOTE(review): DynamicTable.which(...) with Django-style suffixes
# (age__gt=12) is assumed here — confirm it exists in the installed
# hdmf version.
adult_subjects = subjects_table.which(age__gt=12)
print(f"Adult subjects: {len(adult_subjects)}")
heavy_subjects = subjects_table.which(weight__gt=27.0)
print(f"Heavy subjects: {len(heavy_subjects)}")
ko_subjects = subjects_table.which(genotype='KO')
print(f"KO subjects: {len(ko_subjects)}")

# Complex queries combining conditions (manual row-by-row scan)
adult_ko = []
for idx in range(len(subjects_table)):
    row = subjects_table[idx]
    if row['age'] > 12 and row['genotype'] == 'KO':
        adult_ko.append(idx)
print(f"Adult KO subjects: {len(adult_ko)}")

from hdmf.query import ContainerResolver
from hdmf.common import DynamicTable, DynamicTableRegion, get_type_map

# Create referenced data structure
neurons_table = DynamicTable(name='neurons', description='Neuron data')
neurons_table.add_column('neuron_id', 'Neuron ID')
neurons_table.add_column('cell_type', 'Cell type')

# Add neurons: even rows are pyramidal, odd rows are interneurons
for i in range(20):
    neurons_table.add_row(
        neuron_id=f'neuron_{i:03d}',
        cell_type='pyramidal' if i % 2 == 0 else 'interneuron'
    )

# Create table region referring to subset
pyramidal_region = DynamicTableRegion(
    name='pyramidal_neurons',
    data=[i for i in range(0, 20, 2)],  # Even indices (pyramidal cells)
    description='Pyramidal neurons only',
    table=neurons_table
)

# Create analysis table using references
analysis_table = DynamicTable(name='analysis', description='Analysis results')
analysis_table.add_column('neuron_group', 'Group of neurons')
analysis_table.add_column('avg_firing_rate', 'Average firing rate', dtype='float')
analysis_table.add_row(
    neuron_group=pyramidal_region,
    avg_firing_rate=15.3
)

# Resolve references using ContainerResolver.
# NOTE(review): ContainerResolver is constructed with positional
# (type_map, container) arguments — confirm against its signature.
type_map = get_type_map()
resolver = ContainerResolver(type_map, neurons_table)

# Access referenced data through resolver
referenced_neurons = analysis_table.get_column('neuron_group').data[0]
resolved_neurons = resolver.get_object(referenced_neurons)
print(f"Referenced neurons: {len(referenced_neurons)} neurons")
print(f"First referenced neuron: {resolved_neurons[0]}")

from hdmf.backends.hdf5 import HDF5IO
from hdmf.query import filter_data, QueryResult  # NOTE(review): imported but unused below
import numpy as np

# Load time series data.
# NOTE(review): the handles grabbed here are used after this `with` block
# exits; lazy dataset handles may be invalid once the file is closed —
# confirm whether the later code should live inside the block.
with HDF5IO('timeseries.h5', mode='r') as io:
    container = io.read()
    timestamps = container.timestamps.data
    neural_data = container.neural_data.data

# Define filtering conditions
def high_variance_condition(data_slice):
    """Return a boolean row mask marking time points with high variance.

    A row (time point) is flagged when its variance across channels is
    above the 90th percentile of the per-row variances of this slice.

    Args:
        data_slice: 2-D array, shape (time, channels).
    Returns:
        1-D boolean array with one entry per row of ``data_slice``.
    """
    # Compute the per-row variance once; the original evaluated
    # np.var(data_slice, axis=1) twice for the same result.
    row_variance = np.var(data_slice, axis=1)
    return row_variance > np.percentile(row_variance, 90)
def specific_frequency_condition(data_slice, target_freq=40.0, sampling_rate=1000.0):
    """Find periods with specific frequency content.

    Takes the FFT along the time axis, picks the frequency bin closest to
    ``target_freq``, and reports True when the mean spectral magnitude at
    that bin exceeds its own 95th percentile across channels.
    """
    n_samples = data_slice.shape[0]
    # Spectrum per channel (FFT over the time axis).
    spectrum = np.fft.fft(data_slice, axis=0)
    bin_freqs = np.fft.fftfreq(n_samples, 1 / sampling_rate)
    # Nearest FFT bin to the frequency of interest.
    nearest_bin = np.abs(bin_freqs - target_freq).argmin()
    channel_power = np.abs(spectrum[nearest_bin, :])
    threshold = np.percentile(channel_power, 95)
    return np.mean(channel_power) > threshold
# Apply filters with a half-overlapping sliding window
window_size = 1000 # 1 second windows at 1kHz
high_var_periods = []
freq_periods = []
# NOTE(review): neural_data was obtained inside the (now-closed) `with`
# block above — confirm the handle is still readable here.
for start_idx in range(0, len(neural_data) - window_size, window_size//2):
    window_data = neural_data[start_idx:start_idx + window_size, :]
    if high_variance_condition(window_data):
        high_var_periods.append((start_idx, start_idx + window_size))
    if specific_frequency_condition(window_data):
        freq_periods.append((start_idx, start_idx + window_size))
print(f"High variance periods: {len(high_var_periods)}")
print(f"Target frequency periods: {len(freq_periods)}")

# Extract filtered data from the first flagged window, if any
if high_var_periods:
    first_high_var = neural_data[high_var_periods[0][0]:high_var_periods[0][1], :]
    print(f"First high variance period shape: {first_high_var.shape}")

from hdmf.backends.hdf5 import HDF5IO
import numpy as np

def query_large_dataset_efficiently(file_path: str, query_condition, chunk_size: int = 10000):
    """
    Efficiently query large datasets using chunked processing.

    Args:
        file_path: Path to HDF5 file
        query_condition: Function mapping a 2-D chunk (rows x columns) to a
            boolean mask over its rows
        chunk_size: Size of data chunks to process
    Returns:
        List of matching data indices
    """
    matching_indices = []
    with HDF5IO(file_path, mode='r') as io:
        container = io.read()
        # NOTE(review): assumes the container exposes a `large_dataset`
        # attribute — confirm against the file schema.
        dataset = container.large_dataset.data
        total_samples = dataset.shape[0]
        # Process dataset in chunks so only chunk_size rows are resident at once
        for start_idx in range(0, total_samples, chunk_size):
            end_idx = min(start_idx + chunk_size, total_samples)
            # Load chunk
            chunk_data = dataset[start_idx:end_idx, :]
            # Apply condition to chunk
            chunk_mask = query_condition(chunk_data)
            # Convert chunk-local indices to global dataset indices
            local_matches = np.where(chunk_mask)[0]
            global_matches = local_matches + start_idx
            matching_indices.extend(global_matches)
            print(f"Processed {end_idx}/{total_samples} samples, "
            f"found {len(local_matches)} matches in chunk")
    return matching_indices
# Example usage
def find_outliers(data_chunk, threshold=3.0):
    """Return a per-row boolean mask marking outlier samples.

    A row is an outlier when any of its per-column z-scores exceeds
    ``threshold`` (default 3 standard deviations).

    Args:
        data_chunk: 2-D array, shape (samples, columns).
        threshold: z-score cutoff for flagging a value as an outlier.
    Returns:
        1-D boolean array with one entry per row of ``data_chunk``.
    """
    col_mean = np.mean(data_chunk, axis=0)
    col_std = np.std(data_chunk, axis=0)
    # Guard against zero-variance columns: the original divided by zero,
    # producing NaNs and a RuntimeWarning; a constant column contributes a
    # z-score of 0 (never an outlier), which matches the original's
    # effective result without the warning.
    safe_std = np.where(col_std == 0, 1.0, col_std)
    z_scores = np.abs((data_chunk - col_mean) / safe_std)
    return np.any(z_scores > threshold, axis=1)
# Run the chunked query with the outlier detector over the whole file
outlier_indices = query_large_dataset_efficiently(
    'large_experiment.h5',
    find_outliers,
    chunk_size=5000
)
print(f"Found {len(outlier_indices)} outlier samples")

from hdmf.common import DynamicTable, DynamicTableRegion
from hdmf.query import ContainerResolver
def cross_table_analysis(subjects_table, sessions_table, results_table):
    """
    Perform analysis across multiple related tables.

    Args:
        subjects_table: Table with subject information
        sessions_table: Table with session information (kept for interface
            compatibility; session rows are reached through the 'session'
            references stored in results_table, so it is not read directly)
        results_table: Table with analysis results
    Returns:
        Dict with high-performer result indices, the matched subjects'
        ages, and their genotype counts.
    """
    # Find high-performing sessions (performance above threshold)
    high_performance_threshold = 0.85
    high_performers = [
        i for i in range(len(results_table))
        if results_table[i]['performance_score'] > high_performance_threshold
    ]
    # Follow each result's session reference to recover the subject ID.
    # A reference exposes .table (the referenced table) and .data (row indices).
    high_performer_subjects = []
    for result_idx in high_performers:
        session_ref = results_table[result_idx]['session']
        session_info = session_ref.table[session_ref.data[0]]
        high_performer_subjects.append(session_info['subject_id'])
    # Look up each subject's characteristics in the subjects table
    subject_ages = []
    subject_genotypes = []
    for subject_id in high_performer_subjects:
        subject_indices = subjects_table.which(subject_id=subject_id)
        if subject_indices:
            subject_info = subjects_table[subject_indices[0]]
            subject_ages.append(subject_info['age'])
            subject_genotypes.append(subject_info['genotype'])
    # Summary statistics. Plain stdlib arithmetic instead of np.mean: the
    # original used `np` without importing it in this snippet, and np.mean
    # of an empty list emits a RuntimeWarning; guard the empty case.
    avg_age = sum(subject_ages) / len(subject_ages) if subject_ages else float('nan')
    genotype_counts = {}
    for genotype in subject_genotypes:
        genotype_counts[genotype] = genotype_counts.get(genotype, 0) + 1
    print(f"High performers: {len(high_performers)} sessions")
    print(f"Average age: {avg_age:.1f} months")
    print(f"Genotype distribution: {genotype_counts}")
    return {
        'high_performer_indices': high_performers,
        'subject_ages': subject_ages,
        'genotype_distribution': genotype_counts
    }
# Example usage would require setting up the related tables
# with proper cross-references between subjects, sessions, and results

from hdmf.query import HDMFDataset
import numpy as np
from functools import lru_cache
class CachedQueryDataset:
    """Dataset wrapper that memoizes query results for better performance.

    Wraps any dataset object exposing a ``query(condition, **kwargs)``
    method (e.g. HDMFDataset) and caches its results with an LRU policy.
    """
    def __init__(self, dataset: "HDMFDataset", cache_size: int = 128):
        """
        Args:
            dataset: Dataset exposing a ``query`` method. (The annotation
                is a string so the class is importable even when
                HDMFDataset is not yet in scope.)
            cache_size: Maximum number of cached query results.
        """
        self.dataset = dataset
        self.cache_size = cache_size
        # Build the cache per instance (not as a class-level decorator) so
        # results are not shared between wrappers and the cache does not
        # keep other instances alive.
        self._cached_query = lru_cache(maxsize=cache_size)(self._query_impl)

    def _query_impl(self, condition, kwargs_items):
        """Run the underlying query; lru_cache keys on these arguments."""
        return self.dataset.query(condition, **dict(kwargs_items))

    def query_with_cache(self, condition: str, **kwargs):
        """Query with result caching based on the condition and kwargs.

        The cache key is the condition string plus the sorted keyword
        items, which is stable and collision-free. (The original derived a
        key via ``str(hash(...))``, which is salted per process and could
        collide; lru_cache already keys on the arguments themselves.)
        """
        return self._cached_query(condition, tuple(sorted(kwargs.items())))

    def clear_cache(self):
        """Clear query result cache."""
        self._cached_query.cache_clear()

    def cache_info(self):
        """Get cache statistics (hits, misses, maxsize, currsize)."""
        return self._cached_query.cache_info()
# Usage example.
# NOTE(review): HDF5IO is not imported in this snippet — relies on an
# earlier `from hdmf.backends.hdf5 import HDF5IO` on this page.
with HDF5IO('experiment.h5', mode='r') as io:
    container = io.read()
    # Wrap dataset with caching
    cached_dataset = CachedQueryDataset(container.neural_data.data)
    # Repeated queries will be cached
    result1 = cached_dataset.query_with_cache("value > 0.5")
    result2 = cached_dataset.query_with_cache("value > 0.5")  # From cache
    print(f"Cache info: {cached_dataset.cache_info()}")

Install with Tessl CLI
npx tessl i tessl/pypi-hdmf