A hierarchical data modeling framework for modern science data standards
—
HDMF provides comprehensive validation of data against specifications with detailed error reporting and schema compliance checking. The validation system ensures data integrity, specification compliance, and provides detailed feedback for debugging and quality assurance.
Core validator classes for different types of data validation against specifications.
# NOTE(review): documentation stub — method bodies are docstring-only; the
# implementations live in the hdmf.validate package.
class Validator:
"""
Base validator class for validating data against specifications.
Provides the foundation for all validation operations in HDMF,
including schema validation, type checking, and constraint verification.
"""
def __init__(self, spec, **kwargs):
"""
Initialize validator.
Args:
spec: Specification object to validate against
**kwargs: Additional validator options:
- strict: Enable strict validation mode
- ignore_missing: Ignore missing optional fields
"""
def validate(self, builder, **kwargs) -> list:
"""
Validate builder against specification.
Args:
builder: Builder object to validate
**kwargs: Validation options
Returns:
List of validation errors (empty if valid)
"""
### Validation Error Classes
Specific error classes for different types of validation failures with detailed error reporting.
```python { .api }
# Root of the validation-error hierarchy documented below; all concrete
# error classes subclass Error.
class Error(Exception):
"""
Base class for HDMF validation errors.
Provides structured error reporting with location information
and detailed messages for debugging validation failures.
"""
def __init__(self, location: str, message: str = None):
"""
Initialize validation error.
Args:
location: Location where error occurred (path within the data hierarchy)
message: Detailed error message
"""
# Concrete Error subclasses — each marks one distinct validation failure
# mode; they add no behavior beyond the Error base class.
class DtypeError(Error):
"""
Error for data type mismatches in validation.
Raised when data types don't match specification requirements.
"""
pass
class MissingError(Error):
"""
Error for missing required components.
Raised when required datasets, groups, or attributes are missing.
"""
pass
class ExpectedArrayError(Error):
"""
Error for expected array data validation failures.
Raised when array-like data doesn't meet shape or type requirements.
"""
pass
class ShapeError(Error):
"""
Error for array shape validation failures.
Raised when array shapes don't match specification constraints.
"""
pass
class MissingDataType(Error):
"""
Error for missing data type specifications.
Raised when referenced data types are not found in namespace.
"""
pass
class IllegalLinkError(Error):
"""
Error for illegal link operations in validation.
Raised when links violate specification constraints.
"""
pass
class IncorrectDataType(Error):
"""
Error for incorrect data type usage.
Raised when data types are incorrect for the context.
"""
pass
class IncorrectQuantityError(Error):
"""
Error for incorrect quantity specifications.
Raised when quantities don't match cardinality constraints.
"""
pass

def check_type(self, builder) -> list:
"""
Check data type compliance.
Args:
builder: Builder to check
Returns:
List of type validation errors
"""
# Validator check helpers (stubs): each returns a list of error objects,
# empty when the builder passes the corresponding check.
def check_shape(self, builder) -> list:
"""
Check data shape compliance.
Args:
builder: Builder to check
Returns:
List of shape validation errors
"""
def check_attributes(self, builder) -> list:
"""
Check attribute requirements and values.
Args:
builder: Builder to check
Returns:
List of attribute validation errors
"""
@property
def spec(self):
"""Specification being validated against."""class GroupValidator(Validator): """ Validator for group (container) specifications.
Validates hierarchical container structures including nested groups,
datasets, attributes, and links against group specifications.
"""
# GroupValidator method stubs: validate() runs the full group check; the
# check_* helpers each cover one aspect of the group structure.
def __init__(self, spec, **kwargs):
"""
Initialize group validator.
Args:
spec: GroupSpec to validate against
"""
def validate(self, builder, **kwargs) -> list:
"""
Validate group builder against specification.
Args:
builder: GroupBuilder to validate
Returns:
List of validation errors
"""
def check_groups(self, builder) -> list:
"""
Check nested group requirements.
Args:
builder: GroupBuilder to check
Returns:
List of group validation errors
"""
def check_datasets(self, builder) -> list:
"""
Check dataset requirements.
Args:
builder: GroupBuilder to check
Returns:
List of dataset validation errors
"""
def check_links(self, builder) -> list:
"""
Check link requirements and targets.
Args:
builder: GroupBuilder to check
Returns:
List of link validation errors
"""class DatasetValidator(Validator): """ Validator for dataset specifications.
Validates dataset structures including data types, shapes,
dimensions, and associated attributes against dataset specifications.
"""
# DatasetValidator method stubs: dataset-level checks for type, shape, and
# attributes against a DatasetSpec.
def __init__(self, spec, **kwargs):
"""
Initialize dataset validator.
Args:
spec: DatasetSpec to validate against
"""
def validate(self, builder, **kwargs) -> list:
"""
Validate dataset builder against specification.
Args:
builder: DatasetBuilder to validate
Returns:
List of validation errors
"""
def check_data_type(self, builder) -> list:
"""
Check data type compliance including compound types.
Args:
builder: DatasetBuilder to check
Returns:
List of data type validation errors
"""
def check_dimensions(self, builder) -> list:
"""
Check dimension names and constraints.
Args:
builder: DatasetBuilder to check
Returns:
List of dimension validation errors
"""class AttributeValidator(Validator): """ Validator for attribute specifications.
Validates metadata attributes including values, types,
and constraints against attribute specifications.
"""
# AttributeValidator method stubs: validates a single attribute value
# against an AttributeSpec.
def __init__(self, spec, **kwargs):
"""
Initialize attribute validator.
Args:
spec: AttributeSpec to validate against
"""
def validate(self, builder, **kwargs) -> list:
"""
Validate attribute against specification.
Args:
builder: Builder containing the attribute
Returns:
List of validation errors
"""
def check_value_constraints(self, value) -> list:
"""
Check value against specification constraints.
Args:
value: Attribute value to check
Returns:
List of constraint validation errors
"""### Validator Management
Classes for managing and coordinating validation across different data types.
```python { .api }
# Registry mapping data-type names to Validator classes; get_validator()
# instantiates the registered class for a given spec.
class ValidatorMap:
"""
Mapping system for validators across different data types.
Manages the association between data types and their corresponding
validators, enabling automatic validator selection and coordination.
"""
def __init__(self, **kwargs):
"""Initialize validator map."""
def register_validator(self, neurodata_type: str, validator_class):
"""
Register validator class for a data type.
Args:
neurodata_type: Name of the data type
validator_class: Validator class to register
"""
def get_validator(self, neurodata_type: str, spec) -> Validator:
"""
Get validator instance for a data type.
Args:
neurodata_type: Name of the data type
spec: Specification to validate against
Returns:
Validator instance for the data type
"""
def validate_builder(self, builder, spec, **kwargs) -> list:
"""
Validate builder using appropriate validator.
Args:
builder: Builder to validate
spec: Specification to validate against
Returns:
List of validation errors
"""Comprehensive error classes for different types of validation failures.
class ValidationError(Exception):
    """
    Base class for validation errors.

    Stores both the human-readable message and the optional location in the
    data hierarchy where the error occurred, so callers can consume errors in
    a structured way instead of parsing str(error).
    """

    def __init__(self, message: str, location: str = None, **kwargs):
        """
        Initialize validation error.

        Args:
            message: Error message
            location: Location in data where error occurred
        """
        super().__init__(message)
        # Expose the message as an attribute as well, so structured error
        # reporting does not have to go through str(self).
        self.message = message
        self.location = location


class SpecValidationError(ValidationError):
    """Error for specification compliance failures."""

    def __init__(self, spec_type: str, message: str, **kwargs):
        """
        Initialize specification validation error.

        Args:
            spec_type: Type of specification that failed
            message: Error message
        """
        super().__init__(message, **kwargs)
        self.spec_type = spec_type


class TypeValidationError(ValidationError):
    """Error for data type validation failures."""

    def __init__(self, expected_type, actual_type, **kwargs):
        """
        Initialize type validation error.

        Args:
            expected_type: Expected data type
            actual_type: Actual data type found
        """
        message = f"Expected type {expected_type}, got {actual_type}"
        super().__init__(message, **kwargs)
        self.expected_type = expected_type
        self.actual_type = actual_type


class ShapeValidationError(ValidationError):
    """Error for data shape validation failures."""

    def __init__(self, expected_shape, actual_shape, **kwargs):
        """
        Initialize shape validation error.

        Args:
            expected_shape: Expected data shape
            actual_shape: Actual data shape found
        """
        message = f"Expected shape {expected_shape}, got {actual_shape}"
        super().__init__(message, **kwargs)
        self.expected_shape = expected_shape
        self.actual_shape = actual_shape


class RequiredValueError(ValidationError):
    """Error for missing required values."""

    def __init__(self, field_name: str, **kwargs):
        """
        Initialize required value error.

        Args:
            field_name: Name of required field that is missing
        """
        message = f"Required field '{field_name}' is missing"
        super().__init__(message, **kwargs)
        self.field_name = field_name
class ConstraintViolationError(ValidationError):
"""Error raised when a value violates a specification constraint."""
def __init__(self, constraint: str, value, **kwargs):
"""
Initialize constraint violation error.
Args:
constraint: Description of violated constraint
value: Value that violated the constraint
"""
message = f"Constraint violation: {constraint}, value: {value}"
super().__init__(message, **kwargs)
self.constraint = constraint
self.value = value

Utility functions for performing validation operations and reporting results.
# Module-level utility stubs: high-level entry points for validating files
# and containers and for reporting results.
def validate_file(file_path: str, namespace: str = None, **kwargs) -> dict:
"""
Validate entire file against namespace specifications.
Args:
file_path: Path to file to validate
namespace: Namespace to validate against (default: auto-detect)
**kwargs: Validation options:
- strict: Enable strict validation
- detailed: Include detailed error information
Returns:
Dictionary with validation results:
{
'valid': bool,
'errors': list,
'warnings': list,
'summary': dict
}
"""
def validate_container(container, **kwargs) -> dict:
"""
Validate container object against its specification.
Args:
container: Container object to validate
**kwargs: Validation options
Returns:
Dictionary with validation results
"""
def generate_validation_report(validation_results: dict, output_path: str = None) -> str:
"""
Generate human-readable validation report.
Args:
validation_results: Results from validation operation
output_path: Optional path to save report
Returns:
Formatted validation report string
"""
def check_specification_compliance(builder, spec, **kwargs) -> bool:
"""
Quick compliance check for builder against specification.
Args:
builder: Builder to check
spec: Specification to check against
Returns:
True if compliant, False otherwise
"""from hdmf.validate import validate_file, generate_validation_report
# Example: validate a whole HDF5 file and report the results.
from hdmf.backends.hdf5 import HDF5IO
# Validate entire HDF5 file
validation_results = validate_file(
'experiment.h5',
namespace='hdmf-common',
strict=True,
detailed=True
)
print(f"File is valid: {validation_results['valid']}")
print(f"Number of errors: {len(validation_results['errors'])}")
print(f"Number of warnings: {len(validation_results['warnings'])}")
# Generate detailed report
if not validation_results['valid']:
report = generate_validation_report(validation_results)
print("Validation Report:")
print(report)
# Save report to file
with open('validation_report.txt', 'w') as f:
f.write(report)
# Summary statistics (keys here follow the 'summary' dict of validate_file)
summary = validation_results['summary']
print(f"Total containers validated: {summary.get('containers_checked', 0)}")
print(f"Total datasets validated: {summary.get('datasets_checked', 0)}")from hdmf.validate import validate_container, ValidationError
# Example: validate a single in-memory container (DynamicTable).
from hdmf.common import DynamicTable, VectorData
from hdmf import Container
import numpy as np
# Create container with potential validation issues
table = DynamicTable(
name='test_table',
description='Test table for validation'
)
# Add column with correct data
table.add_column('valid_column', 'Valid column', data=np.arange(10))
# Add column with problematic data (wrong type)
try:
table.add_column('problem_column', 'Problematic column',
data=['string', 'data', 'in', 'numeric', 'column'])
except Exception as e:
print(f"Column creation warning: {e}")
# Validate the container
validation_results = validate_container(
table,
strict=False, # Allow some flexibility
detailed=True
)
print(f"Container validation results:")
print(f"Valid: {validation_results['valid']}")
for error in validation_results['errors']:
print(f"Error: {error}")
for warning in validation_results['warnings']:
print(f"Warning: {warning}")from hdmf.validate import Validator, ValidationError
from hdmf.spec import DatasetSpec
import numpy as np
class NeuralDataValidator(Validator):
    """
    Custom validator for neural data with domain-specific checks.

    Extends the base Validator with checks for signal amplitude, dead
    (constant) channels, sampling-rate bounds, and channel counts.
    """
    def __init__(self, spec, **kwargs):
        """
        Initialize the validator.

        Args:
            spec: Specification to validate against
            **kwargs: Options forwarded to Validator, plus:
                - sampling_rate_min: lowest acceptable rate in Hz (default 1.0)
                - sampling_rate_max: highest acceptable rate in Hz (default 100000.0)
        """
        super().__init__(spec, **kwargs)
        self.sampling_rate_min = kwargs.get('sampling_rate_min', 1.0)
        self.sampling_rate_max = kwargs.get('sampling_rate_max', 100000.0)
    def validate(self, builder, **kwargs):
        """Validate neural data with custom rules; returns a list of errors."""
        errors = super().validate(builder, **kwargs)
        # Add domain-specific validations on top of the spec-level checks
        errors.extend(self._check_neural_data_quality(builder))
        errors.extend(self._check_sampling_rate(builder))
        errors.extend(self._check_channel_count(builder))
        return errors
    def _check_neural_data_quality(self, builder):
        """Check neural data for quality issues (amplitude, dead channels)."""
        errors = []
        if hasattr(builder, 'data') and builder.data is not None:
            # asarray avoids copying when builder.data is already an ndarray
            data = np.asarray(builder.data)
            # Check for unrealistic voltage values
            if np.any(np.abs(data) > 10000):  # > 10mV in µV
                errors.append(ValidationError(
                    "Neural data contains unrealistic voltage values (>10mV)",
                    location=f"{builder.name}/data"
                ))
            # Check for constant channels (likely broken)
            if data.ndim > 1:
                for ch_idx in range(data.shape[1]):
                    if np.std(data[:, ch_idx]) < 1e-6:
                        errors.append(ValidationError(
                            f"Channel {ch_idx} appears to be constant (possibly broken)",
                            location=f"{builder.name}/data/channel_{ch_idx}"
                        ))
        return errors
    def _check_sampling_rate(self, builder):
        """Check sampling rate is within the configured bounds."""
        errors = []
        # Guard against builders without an 'attributes' mapping, mirroring
        # the hasattr() guards used for 'data' in the other checks; the
        # original unconditionally dereferenced builder.attributes.
        attributes = getattr(builder, 'attributes', None) or {}
        if 'sampling_rate' in attributes:
            rate = attributes['sampling_rate']
            if rate < self.sampling_rate_min:
                errors.append(ValidationError(
                    f"Sampling rate {rate} Hz is too low (min: {self.sampling_rate_min})",
                    location=f"{builder.name}/sampling_rate"
                ))
            elif rate > self.sampling_rate_max:
                errors.append(ValidationError(
                    f"Sampling rate {rate} Hz is too high (max: {self.sampling_rate_max})",
                    location=f"{builder.name}/sampling_rate"
                ))
        return errors
    def _check_channel_count(self, builder):
        """Check channel count is plausible."""
        errors = []
        if hasattr(builder, 'data') and builder.data is not None:
            data = np.asarray(builder.data)
            if data.ndim > 1:
                n_channels = data.shape[1]
                if n_channels > 1000:
                    errors.append(ValidationError(
                        f"Very high channel count ({n_channels}), please verify",
                        location=f"{builder.name}/data"
                    ))
                elif n_channels == 0:
                    errors.append(ValidationError(
                        "No channels found in neural data",
                        location=f"{builder.name}/data"
                    ))
        return errors
# Usage: build a spec, configure the custom validator, and validate a
# synthetic 64-channel recording builder.
neural_spec = DatasetSpec(
doc='Neural recording data',
name='neural_data',
dtype='float64',
shape=(None, None),
dims=['time', 'channels']
)
neural_validator = NeuralDataValidator(
neural_spec,
sampling_rate_min=100.0,
sampling_rate_max=50000.0
)
# Validate neural data builder
from hdmf.build import DatasetBuilder
neural_builder = DatasetBuilder(
name='neural_data',
data=np.random.randn(30000, 64) * 100, # 64 channels, 30k samples
attributes={'sampling_rate': 30000.0}
)
validation_errors = neural_validator.validate(neural_builder)
if validation_errors:
for error in validation_errors:
print(f"Validation error: {error}")
else:
print("Neural data passed validation")from hdmf.validate import validate_file
import os
from pathlib import Path
import json
def batch_validate_files(directory_path: str, file_pattern: str = "*.h5",
                         namespace: str = 'hdmf-common') -> dict:
    """
    Validate all files matching pattern in directory.

    Args:
        directory_path: Directory containing files to validate
        file_pattern: File pattern to match
        namespace: Namespace to validate against

    Returns:
        Dictionary mapping file path to a result dict with keys 'valid',
        'error_count', 'warning_count', 'errors' (first 5, as strings),
        and 'summary'. All values are JSON-serializable.
    """
    results = {}
    directory = Path(directory_path)
    # Find all matching files
    files_to_validate = list(directory.glob(file_pattern))
    print(f"Found {len(files_to_validate)} files to validate")
    for file_path in files_to_validate:
        print(f"Validating {file_path.name}...")
        try:
            validation_result = validate_file(
                str(file_path),
                namespace=namespace,
                strict=False,
                detailed=True
            )
            results[str(file_path)] = {
                'valid': validation_result['valid'],
                'error_count': len(validation_result['errors']),
                'warning_count': len(validation_result['warnings']),
                # Stringify so the results stay JSON-serializable even when
                # the validator returns error objects rather than strings
                # (the results are dumped with json.dump downstream).
                'errors': [str(e) for e in validation_result['errors'][:5]],
                'summary': validation_result['summary']
            }
        except Exception as e:
            # Best-effort batch run: record the failure and keep going.
            results[str(file_path)] = {
                'valid': False,
                'error_count': 1,
                'warning_count': 0,
                'errors': [f"Validation failed: {str(e)}"],
                'summary': {}
            }
    return results
# Run batch validation over a directory and summarize the outcome.
validation_results = batch_validate_files(
'./experiment_data/',
file_pattern='*.h5',
namespace='hdmf-common'
)
# Generate summary report
total_files = len(validation_results)
valid_files = sum(1 for r in validation_results.values() if r['valid'])
total_errors = sum(r['error_count'] for r in validation_results.values())
print(f"\nBatch Validation Summary:")
print(f"Total files: {total_files}")
print(f"Valid files: {valid_files}")
print(f"Invalid files: {total_files - valid_files}")
print(f"Total errors: {total_errors}")
# Save detailed results (entries must be JSON-serializable)
with open('batch_validation_results.json', 'w') as f:
json.dump(validation_results, f, indent=2)
# Print problematic files
print(f"\nProblematic files:")
for file_path, result in validation_results.items():
if not result['valid']:
print(f"  {Path(file_path).name}: {result['error_count']} errors")
for error in result['errors'][:3]: # Show first 3 errors
print(f" - {error}")from hdmf.validate import Validator, validate_container
from hdmf.common import DynamicTable
from hdmf import docval, getargs
import numpy as np
class ValidatedDynamicTable(DynamicTable):
    """
    DynamicTable with real-time validation during data entry.

    Rows added through add_validated_row are checked against simple
    per-column type and range rules before being stored.
    """
    def __init__(self, **kwargs):
        """
        Initialize the table.

        Args:
            **kwargs: DynamicTable constructor arguments, plus:
                - validate_on_add (bool): validate each row added through
                  add_validated_row (default: True)
        """
        # Pop our own keyword before delegating: DynamicTable's docval-based
        # constructor rejects keyword arguments it does not declare, so
        # forwarding validate_on_add would raise.
        validate_on_add = kwargs.pop('validate_on_add', True)
        super().__init__(**kwargs)
        self.validation_enabled = validate_on_add
        self.validation_errors = []
    @docval({'name': 'data', 'type': dict, 'doc': 'Row data to add'},
            {'name': 'strict', 'type': bool, 'default': True,
             'doc': 'Raise ValueError on validation failure instead of warning'})
    def add_validated_row(self, **kwargs):
        """Add row with validation.

        Raises:
            ValueError: if validation fails and strict is True.
        """
        # 'strict' must be declared in docval above, otherwise docval rejects
        # callers that pass it (as the usage example below does).
        data = getargs('data', kwargs)
        if self.validation_enabled:
            # Validate data before adding
            validation_errors = self._validate_row_data(data)
            if validation_errors:
                error_msg = f"Row validation failed: {validation_errors}"
                if kwargs.get('strict', True):
                    raise ValueError(error_msg)
                else:
                    print(f"Warning: {error_msg}")
                    self.validation_errors.extend(validation_errors)
        # Add row if validation passes or warnings allowed
        self.add_row(**data)
    def _validate_row_data(self, data):
        """Validate individual row data; returns a list of error strings."""
        errors = []
        # Check required columns
        for col_name in self.colnames:
            if col_name not in data:
                errors.append(f"Missing required column: {col_name}")
        # Check column data types and ranges
        for col_name, value in data.items():
            if col_name in self.colnames:
                # NOTE(review): assumes DynamicTable exposes a get_column
                # accessor — confirm against the hdmf version in use.
                column = self.get_column(col_name)
                # Basic type checking
                if hasattr(column, 'dtype'):
                    expected_dtype = column.dtype
                    if expected_dtype == 'int' and not isinstance(value, int):
                        errors.append(f"Column {col_name} expects int, got {type(value)}")
                    elif expected_dtype == 'float' and not isinstance(value, (int, float)):
                        errors.append(f"Column {col_name} expects float, got {type(value)}")
                # Range checking for numeric columns
                if col_name == 'age' and isinstance(value, (int, float)):
                    if value < 0 or value > 365:  # Days
                        errors.append(f"Age {value} is outside valid range [0, 365]")
                elif col_name == 'weight' and isinstance(value, (int, float)):
                    if value < 0 or value > 100:  # Grams
                        errors.append(f"Weight {value} is outside valid range [0, 100]")
        return errors
    def validate_table(self):
        """Validate entire table and return results."""
        return validate_container(self, detailed=True)
    def get_validation_summary(self):
        """Get summary of validation issues collected so far."""
        return {
            'total_errors': len(self.validation_errors),
            'errors': self.validation_errors,
            'rows': len(self)
        }
# Usage: build a validated table, add one valid and one invalid row, then
# inspect the collected validation state.
validated_table = ValidatedDynamicTable(
name='subjects',
description='Subject data with validation',
validate_on_add=True
)
validated_table.add_column('subject_id', 'Subject ID')
validated_table.add_column('age', 'Age in days', dtype='int')
validated_table.add_column('weight', 'Weight in grams', dtype='float')
# Add valid data
try:
validated_table.add_validated_row(
data={'subject_id': 'mouse_001', 'age': 90, 'weight': 25.5}
)
print("Successfully added valid row")
except ValueError as e:
print(f"Validation error: {e}")
# Try to add invalid data
try:
validated_table.add_validated_row(
data={'subject_id': 'mouse_002', 'age': -10, 'weight': 150.0}, # Invalid values
strict=False # Allow warnings
)
print("Added row with warnings")
except ValueError as e:
print(f"Validation error: {e}")
# Check validation summary
summary = validated_table.get_validation_summary()
print(f"Validation summary: {summary}")
# Final table validation
final_validation = validated_table.validate_table()
print(f"Final table validation: {final_validation['valid']}")Install with Tessl CLI
npx tessl i tessl/pypi-hdmf