A hierarchical data modeling framework for modern science data standards
—
HDMF provides comprehensive validation of data against specifications with detailed error reporting and schema compliance checking. The validation system ensures data integrity, specification compliance, and provides detailed feedback for debugging and quality assurance.
Core validator classes for different types of data validation against specifications.
# NOTE(review): documentation stub — method bodies are docstring-only; the
# implementations live in the hdmf.validate package.
class Validator:
"""
Base validator class for validating data against specifications.
Provides the foundation for all validation operations in HDMF,
including schema validation, type checking, and constraint verification.
"""
def __init__(self, spec, **kwargs):
"""
Initialize validator.
Args:
spec: Specification object to validate against
**kwargs: Additional validator options:
- strict: Enable strict validation mode
- ignore_missing: Ignore missing optional fields
"""
def validate(self, builder, **kwargs) -> list:
"""
Validate builder against specification.
Args:
builder: Builder object to validate
**kwargs: Validation options
Returns:
List of validation errors (empty if valid)
"""
### Validation Error Classes
Specific error classes for different types of validation failures with detailed error reporting.
```python { .api }
# Root of the validation-error hierarchy documented below; all concrete
# error classes subclass Error.
class Error(Exception):
"""
Base class for HDMF validation errors.
Provides structured error reporting with location information
and detailed messages for debugging validation failures.
"""
def __init__(self, location: str, message: str = None):
"""
Initialize validation error.
Args:
location: Location where error occurred (path within the data hierarchy)
message: Detailed error message
"""
# Concrete Error subclasses — each marks one distinct validation failure
# mode; they add no behavior beyond the Error base class.
class DtypeError(Error):
"""
Error for data type mismatches in validation.
Raised when data types don't match specification requirements.
"""
pass
class MissingError(Error):
"""
Error for missing required components.
Raised when required datasets, groups, or attributes are missing.
"""
pass
class ExpectedArrayError(Error):
"""
Error for expected array data validation failures.
Raised when array-like data doesn't meet shape or type requirements.
"""
pass
class ShapeError(Error):
"""
Error for array shape validation failures.
Raised when array shapes don't match specification constraints.
"""
pass
class MissingDataType(Error):
"""
Error for missing data type specifications.
Raised when referenced data types are not found in namespace.
"""
pass
class IllegalLinkError(Error):
"""
Error for illegal link operations in validation.
Raised when links violate specification constraints.
"""
pass
class IncorrectDataType(Error):
"""
Error for incorrect data type usage.
Raised when data types are incorrect for the context.
"""
pass
class IncorrectQuantityError(Error):
"""
Error for incorrect quantity specifications.
Raised when quantities don't match cardinality constraints.
"""
pass

def check_type(self, builder) -> list:
"""
Check data type compliance.
Args:
builder: Builder to check
Returns:
List of type validation errors
"""
# Validator check helpers (stubs): each returns a list of error objects,
# empty when the builder passes the corresponding check.
def check_shape(self, builder) -> list:
"""
Check data shape compliance.
Args:
builder: Builder to check
Returns:
List of shape validation errors
"""
def check_attributes(self, builder) -> list:
"""
Check attribute requirements and values.
Args:
builder: Builder to check
Returns:
List of attribute validation errors
"""
@property
def spec(self):
"""Specification being validated against."""class GroupValidator(Validator): """ Validator for group (container) specifications.
Validates hierarchical container structures including nested groups,
datasets, attributes, and links against group specifications.
"""
# GroupValidator method stubs: validate() runs the full group check; the
# check_* helpers each cover one aspect of the group structure.
def __init__(self, spec, **kwargs):
"""
Initialize group validator.
Args:
spec: GroupSpec to validate against
"""
def validate(self, builder, **kwargs) -> list:
"""
Validate group builder against specification.
Args:
builder: GroupBuilder to validate
Returns:
List of validation errors
"""
def check_groups(self, builder) -> list:
"""
Check nested group requirements.
Args:
builder: GroupBuilder to check
Returns:
List of group validation errors
"""
def check_datasets(self, builder) -> list:
"""
Check dataset requirements.
Args:
builder: GroupBuilder to check
Returns:
List of dataset validation errors
"""
def check_links(self, builder) -> list:
"""
Check link requirements and targets.
Args:
builder: GroupBuilder to check
Returns:
List of link validation errors
"""class DatasetValidator(Validator): """ Validator for dataset specifications.
Validates dataset structures including data types, shapes,
dimensions, and associated attributes against dataset specifications.
"""
# DatasetValidator method stubs: dataset-level checks for type, shape, and
# attributes against a DatasetSpec.
def __init__(self, spec, **kwargs):
"""
Initialize dataset validator.
Args:
spec: DatasetSpec to validate against
"""
def validate(self, builder, **kwargs) -> list:
"""
Validate dataset builder against specification.
Args:
builder: DatasetBuilder to validate
Returns:
List of validation errors
"""
def check_data_type(self, builder) -> list:
"""
Check data type compliance including compound types.
Args:
builder: DatasetBuilder to check
Returns:
List of data type validation errors
"""
def check_dimensions(self, builder) -> list:
"""
Check dimension names and constraints.
Args:
builder: DatasetBuilder to check
Returns:
List of dimension validation errors
"""class AttributeValidator(Validator): """ Validator for attribute specifications.
Validates metadata attributes including values, types,
and constraints against attribute specifications.
"""
# AttributeValidator method stubs: validates a single attribute value
# against an AttributeSpec.
def __init__(self, spec, **kwargs):
"""
Initialize attribute validator.
Args:
spec: AttributeSpec to validate against
"""
def validate(self, builder, **kwargs) -> list:
"""
Validate attribute against specification.
Args:
builder: Builder containing the attribute
Returns:
List of validation errors
"""
def check_value_constraints(self, value) -> list:
"""
Check value against specification constraints.
Args:
value: Attribute value to check
Returns:
List of constraint validation errors
"""### Validator Management
Classes for managing and coordinating validation across different data types.
```python { .api }
# Registry mapping data-type names to Validator classes; get_validator()
# instantiates the registered class for a given spec.
class ValidatorMap:
"""
Mapping system for validators across different data types.
Manages the association between data types and their corresponding
validators, enabling automatic validator selection and coordination.
"""
def __init__(self, **kwargs):
"""Initialize validator map."""
def register_validator(self, neurodata_type: str, validator_class):
"""
Register validator class for a data type.
Args:
neurodata_type: Name of the data type
validator_class: Validator class to register
"""
def get_validator(self, neurodata_type: str, spec) -> Validator:
"""
Get validator instance for a data type.
Args:
neurodata_type: Name of the data type
spec: Specification to validate against
Returns:
Validator instance for the data type
"""
def validate_builder(self, builder, spec, **kwargs) -> list:
"""
Validate builder using appropriate validator.
Args:
builder: Builder to validate
spec: Specification to validate against
Returns:
List of validation errors
"""Comprehensive error classes for different types of validation failures.
class ValidationError(Exception):
    """
    Base class for validation errors.

    Stores both the human-readable message and the optional location in the
    data hierarchy where the error occurred, so callers can consume errors in
    a structured way instead of parsing str(error).
    """

    def __init__(self, message: str, location: str = None, **kwargs):
        """
        Initialize validation error.

        Args:
            message: Error message
            location: Location in data where error occurred
        """
        super().__init__(message)
        # Expose the message as an attribute as well, so structured error
        # reporting does not have to go through str(self).
        self.message = message
        self.location = location


class SpecValidationError(ValidationError):
    """Error for specification compliance failures."""

    def __init__(self, spec_type: str, message: str, **kwargs):
        """
        Initialize specification validation error.

        Args:
            spec_type: Type of specification that failed
            message: Error message
        """
        super().__init__(message, **kwargs)
        self.spec_type = spec_type


class TypeValidationError(ValidationError):
    """Error for data type validation failures."""

    def __init__(self, expected_type, actual_type, **kwargs):
        """
        Initialize type validation error.

        Args:
            expected_type: Expected data type
            actual_type: Actual data type found
        """
        message = f"Expected type {expected_type}, got {actual_type}"
        super().__init__(message, **kwargs)
        self.expected_type = expected_type
        self.actual_type = actual_type


class ShapeValidationError(ValidationError):
    """Error for data shape validation failures."""

    def __init__(self, expected_shape, actual_shape, **kwargs):
        """
        Initialize shape validation error.

        Args:
            expected_shape: Expected data shape
            actual_shape: Actual data shape found
        """
        message = f"Expected shape {expected_shape}, got {actual_shape}"
        super().__init__(message, **kwargs)
        self.expected_shape = expected_shape
        self.actual_shape = actual_shape


class RequiredValueError(ValidationError):
    """Error for missing required values."""

    def __init__(self, field_name: str, **kwargs):
        """
        Initialize required value error.

        Args:
            field_name: Name of required field that is missing
        """
        message = f"Required field '{field_name}' is missing"
        super().__init__(message, **kwargs)
        self.field_name = field_name
class ConstraintViolationError(ValidationError):
"""Error raised when a value violates a specification constraint."""
def __init__(self, constraint: str, value, **kwargs):
"""
Initialize constraint violation error.
Args:
constraint: Description of violated constraint
value: Value that violated the constraint
"""
message = f"Constraint violation: {constraint}, value: {value}"
super().__init__(message, **kwargs)
self.constraint = constraint
self.value = value

Utility functions for performing validation operations and reporting results.
# Module-level utility stubs: high-level entry points for validating files
# and containers and for reporting results.
def validate_file(file_path: str, namespace: str = None, **kwargs) -> dict:
"""
Validate entire file against namespace specifications.
Args:
file_path: Path to file to validate
namespace: Namespace to validate against (default: auto-detect)
**kwargs: Validation options:
- strict: Enable strict validation
- detailed: Include detailed error information
Returns:
Dictionary with validation results:
{
'valid': bool,
'errors': list,
'warnings': list,
'summary': dict
}
"""
def validate_container(container, **kwargs) -> dict:
"""
Validate container object against its specification.
Args:
container: Container object to validate
**kwargs: Validation options
Returns:
Dictionary with validation results
"""
def generate_validation_report(validation_results: dict, output_path: str = None) -> str:
"""
Generate human-readable validation report.
Args:
validation_results: Results from validation operation
output_path: Optional path to save report
Returns:
Formatted validation report string
"""
def check_specification_compliance(builder, spec, **kwargs) -> bool:
"""
Quick compliance check for builder against specification.
Args:
builder: Builder to check
spec: Specification to check against
Returns:
True if compliant, False otherwise
"""from hdmf.validate import validate_file, generate_validation_report
# Example: validate a whole HDF5 file and report the results.
from hdmf.backends.hdf5 import HDF5IO
# Validate entire HDF5 file
validation_results = validate_file(
'experiment.h5',
namespace='hdmf-common',
strict=True,
detailed=True
)
print(f"File is valid: {validation_results['valid']}")
print(f"Number of errors: {len(validation_results['errors'])}")
print(f"Number of warnings: {len(validation_results['warnings'])}")
# Generate detailed report
if not validation_results['valid']:
report = generate_validation_report(validation_results)
print("Validation Report:")
print(report)
# Save report to file
with open('validation_report.txt', 'w') as f:
f.write(report)
# Summary statistics (keys here follow the 'summary' dict of validate_file)
summary = validation_results['summary']
print(f"Total containers validated: {summary.get('containers_checked', 0)}")
print(f"Total datasets validated: {summary.get('datasets_checked', 0)}")from hdmf.validate import validate_container, ValidationError
# Example: validate a single in-memory container (DynamicTable).
from hdmf.common import DynamicTable, VectorData
from hdmf import Container
import numpy as np
# Create container with potential validation issues
table = DynamicTable(
name='test_table',
description='Test table for validation'
)
# Add column with correct data
table.add_column('valid_column', 'Valid column', data=np.arange(10))
# Add column with problematic data (wrong type)
try:
table.add_column('problem_column', 'Problematic column',
data=['string', 'data', 'in', 'numeric', 'column'])
except Exception as e:
print(f"Column creation warning: {e}")
# Validate the container
validation_results = validate_container(
table,
strict=False, # Allow some flexibility
detailed=True
)
print(f"Container validation results:")
print(f"Valid: {validation_results['valid']}")
for error in validation_results['errors']:
print(f"Error: {error}")
for warning in validation_results['warnings']:
print(f"Warning: {warning}")from hdmf.validate import Validator, ValidationError
from hdmf.spec import DatasetSpec
import numpy as np
class NeuralDataValidator(Validator):
    """
    Custom validator for neural data with domain-specific checks.

    Extends the base Validator with checks for signal amplitude, dead
    (constant) channels, sampling-rate bounds, and channel counts.
    """
    def __init__(self, spec, **kwargs):
        """
        Initialize the validator.

        Args:
            spec: Specification to validate against
            **kwargs: Options forwarded to Validator, plus:
                - sampling_rate_min: lowest acceptable rate in Hz (default 1.0)
                - sampling_rate_max: highest acceptable rate in Hz (default 100000.0)
        """
        super().__init__(spec, **kwargs)
        self.sampling_rate_min = kwargs.get('sampling_rate_min', 1.0)
        self.sampling_rate_max = kwargs.get('sampling_rate_max', 100000.0)
    def validate(self, builder, **kwargs):
        """Validate neural data with custom rules; returns a list of errors."""
        errors = super().validate(builder, **kwargs)
        # Add domain-specific validations on top of the spec-level checks
        errors.extend(self._check_neural_data_quality(builder))
        errors.extend(self._check_sampling_rate(builder))
        errors.extend(self._check_channel_count(builder))
        return errors
    def _check_neural_data_quality(self, builder):
        """Check neural data for quality issues (amplitude, dead channels)."""
        errors = []
        if hasattr(builder, 'data') and builder.data is not None:
            # asarray avoids copying when builder.data is already an ndarray
            data = np.asarray(builder.data)
            # Check for unrealistic voltage values
            if np.any(np.abs(data) > 10000):  # > 10mV in µV
                errors.append(ValidationError(
                    "Neural data contains unrealistic voltage values (>10mV)",
                    location=f"{builder.name}/data"
                ))
            # Check for constant channels (likely broken)
            if data.ndim > 1:
                for ch_idx in range(data.shape[1]):
                    if np.std(data[:, ch_idx]) < 1e-6:
                        errors.append(ValidationError(
                            f"Channel {ch_idx} appears to be constant (possibly broken)",
                            location=f"{builder.name}/data/channel_{ch_idx}"
                        ))
        return errors
    def _check_sampling_rate(self, builder):
        """Check sampling rate is within the configured bounds."""
        errors = []
        # Guard against builders without an 'attributes' mapping, mirroring
        # the hasattr() guards used for 'data' in the other checks; the
        # original unconditionally dereferenced builder.attributes.
        attributes = getattr(builder, 'attributes', None) or {}
        if 'sampling_rate' in attributes:
            rate = attributes['sampling_rate']
            if rate < self.sampling_rate_min:
                errors.append(ValidationError(
                    f"Sampling rate {rate} Hz is too low (min: {self.sampling_rate_min})",
                    location=f"{builder.name}/sampling_rate"
                ))
            elif rate > self.sampling_rate_max:
                errors.append(ValidationError(
                    f"Sampling rate {rate} Hz is too high (max: {self.sampling_rate_max})",
                    location=f"{builder.name}/sampling_rate"
                ))
        return errors
    def _check_channel_count(self, builder):
        """Check channel count is plausible."""
        errors = []
        if hasattr(builder, 'data') and builder.data is not None:
            data = np.asarray(builder.data)
            if data.ndim > 1:
                n_channels = data.shape[1]
                if n_channels > 1000:
                    errors.append(ValidationError(
                        f"Very high channel count ({n_channels}), please verify",
                        location=f"{builder.name}/data"
                    ))
                elif n_channels == 0:
                    errors.append(ValidationError(
                        "No channels found in neural data",
                        location=f"{builder.name}/data"
                    ))
        return errors
# Usage: build a spec, configure the custom validator, and validate a
# synthetic 64-channel recording builder.
neural_spec = DatasetSpec(
doc='Neural recording data',
name='neural_data',
dtype='float64',
shape=(None, None),
dims=['time', 'channels']
)
neural_validator = NeuralDataValidator(
neural_spec,
sampling_rate_min=100.0,
sampling_rate_max=50000.0
)
# Validate neural data builder
from hdmf.build import DatasetBuilder
neural_builder = DatasetBuilder(
name='neural_data',
data=np.random.randn(30000, 64) * 100, # 64 channels, 30k samples
attributes={'sampling_rate': 30000.0}
)
validation_errors = neural_validator.validate(neural_builder)
if validation_errors:
for error in validation_errors:
print(f"Validation error: {error}")
else:
print("Neural data passed validation")from hdmf.validate import validate_file
import os
from pathlib import Path
import json
def batch_validate_files(directory_path: str, file_pattern: str = "*.h5",
                         namespace: str = 'hdmf-common') -> dict:
    """
    Validate all files matching pattern in directory.

    Args:
        directory_path: Directory containing files to validate
        file_pattern: File pattern to match
        namespace: Namespace to validate against

    Returns:
        Dictionary mapping file path to a result dict with keys 'valid',
        'error_count', 'warning_count', 'errors' (first 5, as strings),
        and 'summary'. All values are JSON-serializable.
    """
    results = {}
    directory = Path(directory_path)
    # Find all matching files
    files_to_validate = list(directory.glob(file_pattern))
    print(f"Found {len(files_to_validate)} files to validate")
    for file_path in files_to_validate:
        print(f"Validating {file_path.name}...")
        try:
            validation_result = validate_file(
                str(file_path),
                namespace=namespace,
                strict=False,
                detailed=True
            )
            results[str(file_path)] = {
                'valid': validation_result['valid'],
                'error_count': len(validation_result['errors']),
                'warning_count': len(validation_result['warnings']),
                # Stringify so the results stay JSON-serializable even when
                # the validator returns error objects rather than strings
                # (the results are dumped with json.dump downstream).
                'errors': [str(e) for e in validation_result['errors'][:5]],
                'summary': validation_result['summary']
            }
        except Exception as e:
            # Best-effort batch run: record the failure and keep going.
            results[str(file_path)] = {
                'valid': False,
                'error_count': 1,
                'warning_count': 0,
                'errors': [f"Validation failed: {str(e)}"],
                'summary': {}
            }
    return results
# Run batch validation over a directory and summarize the outcome.
validation_results = batch_validate_files(
'./experiment_data/',
file_pattern='*.h5',
namespace='hdmf-common'
)
# Generate summary report
total_files = len(validation_results)
valid_files = sum(1 for r in validation_results.values() if r['valid'])
total_errors = sum(r['error_count'] for r in validation_results.values())
print(f"\nBatch Validation Summary:")
print(f"Total files: {total_files}")
print(f"Valid files: {valid_files}")
print(f"Invalid files: {total_files - valid_files}")
print(f"Total errors: {total_errors}")
# Save detailed results (entries must be JSON-serializable)
with open('batch_validation_results.json', 'w') as f:
json.dump(validation_results, f, indent=2)
# Print problematic files
print(f"\nProblematic files:")
for file_path, result in validation_results.items():
if not result['valid']:
print(f"  {Path(file_path).name}: {result['error_count']} errors")
for error in result['errors'][:3]: # Show first 3 errors
print(f" - {error}")from hdmf.validate import Validator, validate_container
from hdmf.common import DynamicTable
from hdmf import docval, getargs
import numpy as np
class ValidatedDynamicTable(DynamicTable):
    """
    DynamicTable with real-time validation during data entry.

    Rows added through add_validated_row are checked against simple
    per-column type and range rules before being stored.
    """
    def __init__(self, **kwargs):
        """
        Initialize the table.

        Args:
            **kwargs: DynamicTable constructor arguments, plus:
                - validate_on_add (bool): validate each row added through
                  add_validated_row (default: True)
        """
        # Pop our own keyword before delegating: DynamicTable's docval-based
        # constructor rejects keyword arguments it does not declare, so
        # forwarding validate_on_add would raise.
        validate_on_add = kwargs.pop('validate_on_add', True)
        super().__init__(**kwargs)
        self.validation_enabled = validate_on_add
        self.validation_errors = []
    @docval({'name': 'data', 'type': dict, 'doc': 'Row data to add'},
            {'name': 'strict', 'type': bool, 'default': True,
             'doc': 'Raise ValueError on validation failure instead of warning'})
    def add_validated_row(self, **kwargs):
        """Add row with validation.

        Raises:
            ValueError: if validation fails and strict is True.
        """
        # 'strict' must be declared in docval above, otherwise docval rejects
        # callers that pass it (as the usage example below does).
        data = getargs('data', kwargs)
        if self.validation_enabled:
            # Validate data before adding
            validation_errors = self._validate_row_data(data)
            if validation_errors:
                error_msg = f"Row validation failed: {validation_errors}"
                if kwargs.get('strict', True):
                    raise ValueError(error_msg)
                else:
                    print(f"Warning: {error_msg}")
                    self.validation_errors.extend(validation_errors)
        # Add row if validation passes or warnings allowed
        self.add_row(**data)
    def _validate_row_data(self, data):
        """Validate individual row data; returns a list of error strings."""
        errors = []
        # Check required columns
        for col_name in self.colnames:
            if col_name not in data:
                errors.append(f"Missing required column: {col_name}")
        # Check column data types and ranges
        for col_name, value in data.items():
            if col_name in self.colnames:
                # NOTE(review): assumes DynamicTable exposes a get_column
                # accessor — confirm against the hdmf version in use.
                column = self.get_column(col_name)
                # Basic type checking
                if hasattr(column, 'dtype'):
                    expected_dtype = column.dtype
                    if expected_dtype == 'int' and not isinstance(value, int):
                        errors.append(f"Column {col_name} expects int, got {type(value)}")
                    elif expected_dtype == 'float' and not isinstance(value, (int, float)):
                        errors.append(f"Column {col_name} expects float, got {type(value)}")
                # Range checking for numeric columns
                if col_name == 'age' and isinstance(value, (int, float)):
                    if value < 0 or value > 365:  # Days
                        errors.append(f"Age {value} is outside valid range [0, 365]")
                elif col_name == 'weight' and isinstance(value, (int, float)):
                    if value < 0 or value > 100:  # Grams
                        errors.append(f"Weight {value} is outside valid range [0, 100]")
        return errors
    def validate_table(self):
        """Validate entire table and return results."""
        return validate_container(self, detailed=True)
    def get_validation_summary(self):
        """Get summary of validation issues collected so far."""
        return {
            'total_errors': len(self.validation_errors),
            'errors': self.validation_errors,
            'rows': len(self)
        }
# Usage: build a validated table, add one valid and one invalid row, then
# inspect the collected validation state.
validated_table = ValidatedDynamicTable(
name='subjects',
description='Subject data with validation',
validate_on_add=True
)
validated_table.add_column('subject_id', 'Subject ID')
validated_table.add_column('age', 'Age in days', dtype='int')
validated_table.add_column('weight', 'Weight in grams', dtype='float')
# Add valid data
try:
validated_table.add_validated_row(
data={'subject_id': 'mouse_001', 'age': 90, 'weight': 25.5}
)
print("Successfully added valid row")
except ValueError as e:
print(f"Validation error: {e}")
# Try to add invalid data
try:
validated_table.add_validated_row(
data={'subject_id': 'mouse_002', 'age': -10, 'weight': 150.0}, # Invalid values
strict=False # Allow warnings
)
print("Added row with warnings")
except ValueError as e:
print(f"Validation error: {e}")
# Check validation summary
summary = validated_table.get_validation_summary()
print(f"Validation summary: {summary}")
# Final table validation
final_validation = validated_table.validate_table()
print(f"Final table validation: {final_validation['valid']}")Install with Tessl CLI
npx tessl i tessl/pypi-hdmf