0
# Term Sets and Ontologies
1
2
HDMF provides integration with ontologies and controlled vocabularies through term sets, type configuration, and semantic validation. This system enables standardized terminology usage, data validation against ontologies, and semantic interoperability across different data standards.
3
4
## Capabilities
5
6
### Term Set Implementation
7
8
Core implementation for working with ontological term sets and controlled vocabularies.
9
10
```python { .api }
11
class TermSet:
12
"""
13
Term set implementation for ontologies and controlled vocabularies.
14
15
Provides validation and lookup capabilities for terms from external
16
ontologies, enabling semantic consistency and data standardization.
17
"""
18
19
def __init__(self, term_schema_path: str = None, **kwargs):
20
"""
21
Initialize term set.
22
23
Args:
24
term_schema_path: Path to term schema file
25
**kwargs: Additional term set properties:
26
- sources: List of ontology sources
27
- view: Specific view or subset of terms
28
- name: Name for the term set
29
- view_set: Set of terms in the view
30
"""
31
32
def validate(self, value) -> bool:
33
"""
34
Validate value against term set.
35
36
Args:
37
value: Value to validate
38
39
Returns:
40
True if value is valid according to term set
41
42
Raises:
43
ValueError: If validation fails with details
44
"""
45
46
def __getitem__(self, key):
47
"""
48
Get term information by key.
49
50
Args:
51
key: Term identifier or label
52
53
Returns:
54
Term information dictionary
55
"""
56
57
def search_terms(self, query: str, **kwargs) -> list:
58
"""
59
Search for terms matching query.
60
61
Args:
62
query: Search query string
63
**kwargs: Search options:
64
- case_sensitive: Whether search is case sensitive
65
- fuzzy: Enable fuzzy matching
66
- limit: Maximum number of results
67
68
Returns:
69
List of matching terms
70
"""
71
72
def get_term_hierarchy(self, term_id: str) -> dict:
73
"""
74
Get hierarchical relationships for a term.
75
76
Args:
77
term_id: Term identifier
78
79
Returns:
80
Dictionary with parent/child relationships
81
"""
82
83
@property
84
def sources(self) -> list:
85
"""List of ontology sources."""
86
87
@property
88
def view(self) -> str:
89
"""Current view or subset name."""
90
91
@property
92
def name(self) -> str:
93
"""Name of the term set."""
94
95
@property
96
def view_set(self) -> set:
97
"""Set of terms in the current view."""
98
```
99
100
### Term Set Wrapper
101
102
Wrapper class that allows datasets and attributes to be associated with term sets for validation.
103
104
```python { .api }
105
class TermSetWrapper:
106
"""
107
Wrapper allowing datasets/attributes to have associated TermSets.
108
109
Enables any HDF5 dataset or attribute to be validated against
110
ontological terms while preserving the original data structure.
111
"""
112
113
def __init__(self, value, field: str, termset: TermSet, **kwargs):
114
"""
115
Initialize term set wrapper.
116
117
Args:
118
value: Original value to wrap
119
field: Field name being wrapped
120
termset: TermSet for validation
121
**kwargs: Additional wrapper properties:
122
- dtype: Data type for the wrapped value
123
- allow_multiple: Allow multiple term selection
124
"""
125
126
def append(self, data):
127
"""
128
Append data with term validation.
129
130
Args:
131
data: Data to append (will be validated)
132
133
Raises:
134
ValueError: If appended data fails term validation
135
"""
136
137
def extend(self, data):
138
"""
139
Extend with iterable data, validating each element.
140
141
Args:
142
data: Iterable data to extend with
143
144
Raises:
145
ValueError: If any element fails term validation
146
"""
147
148
def validate_value(self, value) -> bool:
149
"""
150
Validate individual value against term set.
151
152
Args:
153
value: Value to validate
154
155
Returns:
156
True if value is valid
157
"""
158
159
@property
160
def value(self):
161
"""Wrapped value."""
162
163
@property
164
def field(self) -> str:
165
"""Field name being wrapped."""
166
167
@property
168
def termset(self) -> TermSet:
169
"""Associated TermSet."""
170
171
@property
172
def dtype(self):
173
"""Data type of wrapped value."""
174
```
175
176
### Type Configuration
177
178
Global configuration system for managing data type validation with term sets.
179
180
```python { .api }
181
class TypeConfigurator:
182
"""
183
Global configuration manager for data type validation with TermSets.
184
185
Manages mappings between data types and their associated term sets,
186
enabling automatic validation and ontology enforcement across HDMF.
187
"""
188
189
@staticmethod
190
def get_config() -> dict:
191
"""
192
Get current type configuration.
193
194
Returns:
195
Dictionary with current type-to-termset mappings
196
"""
197
198
@staticmethod
199
def load_type_config(config_path: str):
200
"""
201
Load type configuration from file.
202
203
Args:
204
config_path: Path to configuration file (JSON/YAML)
205
206
The configuration file should specify mappings between
207
data types and their associated term sets:
208
209
{
210
"cell_type": {
211
"termset": "cell_ontology",
212
"view": "neurons"
213
},
214
"brain_region": {
215
"termset": "brain_atlas",
216
"view": "allen_mouse"
217
}
218
}
219
"""
220
221
@staticmethod
222
def unload_type_config():
223
"""Unload current type configuration and reset to defaults."""
224
225
@staticmethod
226
def register_termset(data_type: str, termset: TermSet, **kwargs):
227
"""
228
Register term set for a specific data type.
229
230
Args:
231
data_type: Data type identifier
232
termset: TermSet to associate with the type
233
**kwargs: Additional registration options
234
"""
235
236
@staticmethod
237
def get_termset(data_type: str) -> TermSet:
238
"""
239
Get term set for a data type.
240
241
Args:
242
data_type: Data type identifier
243
244
Returns:
245
TermSet associated with the data type, or None
246
"""
247
248
@staticmethod
249
def validate_type_value(data_type: str, value) -> bool:
250
"""
251
Validate value for a specific data type.
252
253
Args:
254
data_type: Data type identifier
255
value: Value to validate
256
257
Returns:
258
True if value is valid for the data type
259
"""
260
```
261
262
### Term Set Utilities
263
264
Utility functions for working with term sets and ontologies.
265
266
```python { .api }
267
def load_termset_from_file(file_path: str, **kwargs) -> TermSet:
268
"""
269
Load term set from file.
270
271
Args:
272
file_path: Path to term set file (JSON/YAML/OWL)
273
**kwargs: Loading options
274
275
Returns:
276
TermSet loaded from file
277
"""
278
279
def create_termset_from_list(terms: list, name: str, **kwargs) -> TermSet:
280
"""
281
Create simple term set from list of terms.
282
283
Args:
284
terms: List of allowed terms
285
name: Name for the term set
286
**kwargs: Additional term set properties
287
288
Returns:
289
TermSet created from the term list
290
"""
291
292
def validate_with_termset(data, termset: TermSet, **kwargs) -> dict:
293
"""
294
Validate data against term set with detailed results.
295
296
Args:
297
data: Data to validate (scalar, list, or array)
298
termset: TermSet for validation
299
**kwargs: Validation options
300
301
Returns:
302
Dictionary with validation results:
303
{
304
'valid': bool,
305
'invalid_values': list,
306
'suggestions': dict
307
}
308
"""
309
310
def find_common_termsets(termsets: list) -> TermSet:
311
"""
312
Find common terms across multiple term sets.
313
314
Args:
315
termsets: List of TermSet objects
316
317
Returns:
318
TermSet containing intersection of all input term sets
319
"""
320
```
321
322
## Usage Examples
323
324
### Basic Term Set Usage
325
326
```python
327
from hdmf.term_set import TermSet, TermSetWrapper
328
import json
329
330
# Create simple term set from predefined terms
331
cell_types = TermSet(
332
name='cell_types',
333
sources=['cell_ontology'],
334
view='basic_types',
335
view_set={
336
'pyramidal_neuron',
337
'interneuron',
338
'astrocyte',
339
'oligodendrocyte',
340
'microglia'
341
}
342
)
343
344
# Validate individual terms
345
valid_term = cell_types.validate('pyramidal_neuron') # True
346
invalid_term = cell_types.validate('unknown_cell') # Raises ValueError: 'unknown_cell' is not in the term set
347
348
print(f"Pyramidal neuron is valid: {valid_term}")
349
350
# Search for terms
351
search_results = cell_types.search_terms('neuron', fuzzy=True)
352
print(f"Neuron-related terms: {search_results}")
353
354
# Get term information
355
if 'pyramidal_neuron' in cell_types.view_set:
356
term_info = cell_types['pyramidal_neuron']
357
print(f"Term info: {term_info}")
358
```
359
360
### Using Term Set Wrappers with Data
361
362
```python
363
from hdmf.term_set import TermSet, TermSetWrapper
364
from hdmf.common import VectorData
365
import numpy as np
366
367
# Create term set for brain regions
368
brain_regions = TermSet(
369
name='brain_regions',
370
sources=['allen_brain_atlas'],
371
view='mouse_cortex',
372
view_set={
373
'primary_visual_cortex',
374
'primary_motor_cortex',
375
'somatosensory_cortex',
376
'auditory_cortex',
377
'prefrontal_cortex'
378
}
379
)
380
381
# Create data with term validation
382
region_data = [
383
'primary_visual_cortex',
384
'primary_motor_cortex',
385
'somatosensory_cortex'
386
]
387
388
# Wrap data with term set validation
389
validated_regions = TermSetWrapper(
390
value=region_data,
391
field='brain_region',
392
termset=brain_regions
393
)
394
395
print(f"Original data: {validated_regions.value}")
396
print(f"Field: {validated_regions.field}")
397
398
# Append new data (with validation)
399
try:
400
validated_regions.append('auditory_cortex') # Valid - will succeed
401
print("Successfully added auditory cortex")
402
except ValueError as e:
403
print(f"Validation error: {e}")
404
405
try:
406
validated_regions.append('invalid_region') # Invalid - will fail
407
except ValueError as e:
408
print(f"Validation error: {e}")
409
410
# Use in HDMF VectorData
411
region_vector = VectorData(
412
name='recording_regions',
413
description='Brain regions where recordings were made',
414
data=validated_regions
415
)
416
```
417
418
### Type Configuration Management
419
420
```python
421
from hdmf.term_set import TypeConfigurator, TermSet
422
import json
423
424
# Create configuration for different data types
425
cell_type_termset = TermSet(
426
name='cell_types',
427
view_set={'pyramidal', 'interneuron', 'astrocyte'}
428
)
429
430
behavior_termset = TermSet(
431
name='behaviors',
432
view_set={'running', 'grooming', 'resting', 'exploring'}
433
)
434
435
# Register term sets for specific data types
436
TypeConfigurator.register_termset('cell_type', cell_type_termset)
437
TypeConfigurator.register_termset('behavior_state', behavior_termset)
438
439
# Validate values using type configuration
440
cell_valid = TypeConfigurator.validate_type_value('cell_type', 'pyramidal')
441
behavior_valid = TypeConfigurator.validate_type_value('behavior_state', 'running')
442
443
print(f"Cell type 'pyramidal' is valid: {cell_valid}")
444
print(f"Behavior 'running' is valid: {behavior_valid}")
445
446
# Get current configuration
447
config = TypeConfigurator.get_config()
448
print(f"Current type configuration: {list(config.keys())}")
449
450
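# Save configuration to file for reuse
# NOTE(review): the load_type_config docstring shows entries keyed by "termset"/"view"; confirm the "terms"/"source" layout used here is accepted by the loader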
451
config_dict = {
452
'cell_type': {
453
'terms': list(cell_type_termset.view_set),
454
'source': 'cell_ontology'
455
},
456
'behavior_state': {
457
'terms': list(behavior_termset.view_set),
458
'source': 'behavior_ontology'
459
}
460
}
461
462
with open('type_config.json', 'w') as f:
463
json.dump(config_dict, f, indent=2)
464
465
# Load configuration from file
466
TypeConfigurator.load_type_config('type_config.json')
467
```
468
469
### Advanced Term Set Operations
470
471
```python
472
from hdmf.term_set import TermSet, find_common_termsets, validate_with_termset
473
474
# Create multiple overlapping term sets
475
mouse_terms = TermSet(
476
name='mouse_anatomy',
477
view_set={
478
'cortex', 'hippocampus', 'thalamus',
479
'cerebellum', 'brainstem', 'olfactory_bulb'
480
}
481
)
482
483
rat_terms = TermSet(
484
name='rat_anatomy',
485
view_set={
486
'cortex', 'hippocampus', 'thalamus',
487
'cerebellum', 'brainstem', 'striatum'
488
}
489
)
490
491
human_terms = TermSet(
492
name='human_anatomy',
493
view_set={
494
'cortex', 'hippocampus', 'thalamus',
495
'cerebellum', 'brainstem', 'amygdala'
496
}
497
)
498
499
# Find common terms across species
500
common_terms = find_common_termsets([mouse_terms, rat_terms, human_terms])
501
print(f"Common brain regions: {common_terms.view_set}")
502
503
# Validate data with detailed results
504
test_data = [
505
'cortex', # Valid in all
506
'hippocampus', # Valid in all
507
'olfactory_bulb', # Only in mouse
508
'invalid_region' # Invalid everywhere
509
]
510
511
validation_result = validate_with_termset(test_data, common_terms)
512
print(f"Validation results: {validation_result}")
513
514
# Results would show:
515
# {
516
# 'valid': False,
517
# 'invalid_values': ['olfactory_bulb', 'invalid_region'],
518
# 'suggestions': {
519
# 'olfactory_bulb': ['available_in_mouse_only'],
520
# 'invalid_region': ['did_you_mean: cortex, thalamus']
521
# }
522
# }
523
```
524
525
### Integration with HDMF Common Data Structures
526
527
```python
528
from hdmf.common import DynamicTable, VectorData
529
from hdmf.term_set import TermSet, TermSetWrapper, TypeConfigurator
530
531
# Set up term sets for experimental metadata
532
species_terms = TermSet(
533
name='species',
534
view_set={'mus_musculus', 'rattus_norvegicus', 'homo_sapiens'}
535
)
536
537
sex_terms = TermSet(
538
name='sex',
539
view_set={'male', 'female', 'unknown'}
540
)
541
542
# Register with type configurator
543
TypeConfigurator.register_termset('species', species_terms)
544
TypeConfigurator.register_termset('sex', sex_terms)
545
546
# Create subject table with term validation
547
subjects_table = DynamicTable(
548
name='subjects',
549
description='Subject information with ontology validation'
550
)
551
552
# Add columns (term values are validated explicitly in add_validated_subject below, not by add_column)
553
subjects_table.add_column('subject_id', 'Subject identifier')
554
subjects_table.add_column('species', 'Species (ontology validated)')
555
subjects_table.add_column('sex', 'Sex (ontology validated)')
556
subjects_table.add_column('age_days', 'Age in days', dtype='int')
557
558
# Add rows with automatic validation
559
def add_validated_subject(table, subject_id, species, sex, age_days):
560
"""Add subject with term validation."""
561
562
# Validate terms before adding
563
species_valid = TypeConfigurator.validate_type_value('species', species)
564
sex_valid = TypeConfigurator.validate_type_value('sex', sex)
565
566
if not species_valid:
567
raise ValueError(f"Invalid species: {species}")
568
if not sex_valid:
569
raise ValueError(f"Invalid sex: {sex}")
570
571
# Add row if validation passes
572
table.add_row(
573
subject_id=subject_id,
574
species=species,
575
sex=sex,
576
age_days=age_days
577
)
578
579
# Add subjects with validation
580
try:
581
add_validated_subject(subjects_table, 'mouse_001', 'mus_musculus', 'male', 90)
582
add_validated_subject(subjects_table, 'mouse_002', 'mus_musculus', 'female', 85)
583
print("Successfully added validated subjects")
584
except ValueError as e:
585
print(f"Validation error: {e}")
586
587
# Try to add invalid data
588
try:
589
add_validated_subject(subjects_table, 'invalid_001', 'invalid_species', 'male', 90)
590
except ValueError as e:
591
print(f"Expected validation error: {e}")
592
593
print(f"Subjects table has {len(subjects_table)} validated entries")
594
```
595
596
### Creating Custom Ontology Integrations
597
598
```python
599
from hdmf.term_set import TermSet
600
import requests
601
import json
602
603
class OntologyTermSet(TermSet):
604
"""
605
Extended TermSet that can load terms from external ontology APIs.
606
"""
607
608
def __init__(self, ontology_id: str, api_base_url: str, **kwargs):
609
self.ontology_id = ontology_id
610
self.api_base_url = api_base_url
611
612
# Load terms from API
613
terms = self._load_terms_from_api()
614
615
super().__init__(
616
name=ontology_id,
617
view_set=set(terms.keys()),
618
**kwargs
619
)
620
621
self.term_definitions = terms
622
623
def _load_terms_from_api(self) -> dict:
624
"""Load terms from ontology API."""
625
# This is a mock implementation - real implementation would
626
# connect to actual ontology services like OLS, BioPortal, etc.
627
628
mock_terms = {
629
'CL:0000540': {
630
'label': 'neuron',
631
'definition': 'A cell that is electrically active and specialized for the conduction and transmission of electrical signals.',
632
'synonyms': ['nerve cell']
633
},
634
'CL:0000129': {
635
'label': 'microglial cell',
636
'definition': 'A central nervous system macrophage found in the brain.',
637
'synonyms': ['microglia']
638
}
639
}
640
641
return mock_terms
642
643
def get_term_definition(self, term_id: str) -> str:
644
"""Get definition for a term."""
645
if term_id in self.term_definitions:
646
return self.term_definitions[term_id]['definition']
647
return None
648
649
def search_by_synonym(self, synonym: str) -> list:
650
"""Search terms by synonym."""
651
matches = []
652
for term_id, term_data in self.term_definitions.items():
653
if synonym.lower() in [s.lower() for s in term_data.get('synonyms', [])]:
654
matches.append(term_id)
655
return matches
656
657
# Usage
658
cell_ontology = OntologyTermSet(
659
ontology_id='cell_ontology',
660
api_base_url='https://www.ebi.ac.uk/ols/api/ontologies/cl'
661
)
662
663
# Use enhanced features
664
definition = cell_ontology.get_term_definition('CL:0000540')
665
print(f"Neuron definition: {definition}")
666
667
microglia_matches = cell_ontology.search_by_synonym('microglia')
668
print(f"Microglia term IDs: {microglia_matches}")
669
```