# I/O Utilities

Advanced I/O utilities, including an HDF5 wrapper with attribute-style access and memory-mapped arrays, for efficient handling of large proteomics datasets. Optimized for high-throughput workflows, memory efficiency, and seamless integration with pandas and numpy.

## Capabilities

### HDF5 File Interface

Comprehensive HDF5 wrapper providing attribute-style access and pandas integration for proteomics data storage.

```python { .api }
class HDF_File:
    """Main HDF5 file wrapper with comprehensive read/write functionality."""

    def __init__(self, filepath: str, mode: str = 'r', **kwargs):
        """
        Initialize HDF5 file wrapper.

        Parameters:
        - filepath: Path to HDF5 file
        - mode: File access mode ('r', 'w', 'a', 'r+')
        - **kwargs: Additional h5py.File options
        """

    def __getitem__(self, key: str):
        """
        Access datasets and groups using dictionary-style syntax.

        Parameters:
        - key: Dataset or group path

        Returns:
        HDF_Dataset, HDF_Group, or HDF_Dataframe object
        """

    def __setitem__(self, key: str, value):
        """
        Create or update datasets using dictionary-style syntax.

        Parameters:
        - key: Dataset path
        - value: Data to store (numpy array, pandas DataFrame, etc.)
        """

    def __contains__(self, key: str) -> bool:
        """Check if dataset or group exists in file."""

    def __enter__(self):
        """Context manager entry."""
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        """Context manager exit with automatic file closing."""

    def close(self) -> None:
        """Close HDF5 file."""

    def keys(self) -> list:
        """Get list of top-level datasets and groups."""

    def create_group(self, name: str) -> 'HDF_Group':
        """
        Create new HDF5 group.

        Parameters:
        - name: Group name/path

        Returns:
        HDF_Group wrapper object
        """

    def require_group(self, name: str) -> 'HDF_Group':
        """
        Get existing group or create it if it doesn't exist.

        Parameters:
        - name: Group name/path

        Returns:
        HDF_Group wrapper object
        """

class HDF_Group:
    """HDF group wrapper with attribute-style access."""

    def __init__(self, hdf_group):
        """Initialize from h5py Group object."""

    def __getitem__(self, key: str):
        """Access group contents using dictionary-style syntax."""

    def __setitem__(self, key: str, value):
        """Create datasets in group using dictionary-style syntax."""

    def __getattr__(self, name: str):
        """Access group contents using attribute-style syntax."""

    def __setattr__(self, name: str, value):
        """Create datasets using attribute-style syntax."""

    def keys(self) -> list:
        """Get list of datasets and subgroups."""

    def create_dataset(self, name: str, data=None, **kwargs):
        """
        Create dataset in group.

        Parameters:
        - name: Dataset name
        - data: Data to store
        - **kwargs: Dataset creation options
        """

class HDF_Dataset:
    """HDF dataset wrapper with NumPy-like interface."""

    def __init__(self, hdf_dataset):
        """Initialize from h5py Dataset object."""

    def __getitem__(self, key):
        """NumPy-style array indexing."""

    def __setitem__(self, key, value):
        """NumPy-style array assignment."""

    def __array__(self) -> np.ndarray:
        """Convert to numpy array."""

    @property
    def shape(self) -> tuple:
        """Dataset shape."""

    @property
    def dtype(self):
        """Dataset data type."""

    @property
    def size(self) -> int:
        """Total number of elements."""

    def resize(self, size: tuple) -> None:
        """
        Resize dataset.

        Parameters:
        - size: New dataset shape
        """

class HDF_Dataframe:
    """HDF DataFrame wrapper with pandas-like interface."""

    def __init__(self, hdf_group):
        """Initialize from HDF group containing DataFrame data."""

    def to_pandas(self) -> pd.DataFrame:
        """
        Convert to pandas DataFrame.

        Returns:
        pandas DataFrame with all data loaded into memory
        """

    def __getitem__(self, key) -> pd.Series:
        """
        Access DataFrame columns.

        Parameters:
        - key: Column name

        Returns:
        pandas Series with column data
        """

    def __setitem__(self, key: str, value):
        """
        Set DataFrame column.

        Parameters:
        - key: Column name
        - value: Column data
        """

    @property
    def columns(self) -> list:
        """Get DataFrame column names."""

    @property
    def shape(self) -> tuple:
        """Get DataFrame shape."""

    def head(self, n: int = 5) -> pd.DataFrame:
        """
        Get first n rows as pandas DataFrame.

        Parameters:
        - n: Number of rows to return

        Returns:
        pandas DataFrame with first n rows
        """

    def tail(self, n: int = 5) -> pd.DataFrame:
        """
        Get last n rows as pandas DataFrame.

        Parameters:
        - n: Number of rows to return

        Returns:
        pandas DataFrame with last n rows
        """

class HDF_Object:
    """Base class for HDF components with common functionality."""

    def __init__(self, hdf_obj):
        """Initialize from h5py object."""

    @property
    def attrs(self) -> dict:
        """Access HDF5 attributes as dictionary."""

    def set_attr(self, name: str, value) -> None:
        """
        Set HDF5 attribute.

        Parameters:
        - name: Attribute name
        - value: Attribute value
        """

    def get_attr(self, name: str, default=None):
        """
        Get HDF5 attribute.

        Parameters:
        - name: Attribute name
        - default: Default value if attribute doesn't exist

        Returns:
        Attribute value or default
        """
```
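
For orientation, here is a minimal sketch of the lazy, column-wise access the DataFrame wrapper is intended to provide; the file name and group layout are hypothetical, and the calls assume the classes behave as documented above (complete workflows follow under Usage Examples).

```python
from alphabase.io.hdf import HDF_File

# Hypothetical file containing a DataFrame stored under the key 'precursors'.
with HDF_File('library.h5', mode='r') as hf:
    precursors = hf['precursors']      # HDF_Dataframe wrapper; data stays on disk
    print(precursors.shape)            # dimensions without loading values
    print(precursors.columns)          # column names only
    charges = precursors['charge']     # one column as a pandas Series
    preview = precursors.head(10)      # first 10 rows as a pandas DataFrame
```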

### Memory-Mapped Arrays

High-performance memory-mapped array operations for handling large datasets that don't fit in memory.

```python { .api }
def redefine_temp_location(temp_dir: str) -> None:
    """
    Change temporary file storage location.

    Parameters:
    - temp_dir: New directory for temporary files
    """

def create_empty_mmap(filepath: str, shape: tuple, dtype=np.float64) -> None:
    """
    Initialize empty HDF5 file for memory mapping.

    Parameters:
    - filepath: Path for new HDF5 file
    - shape: Array shape to create
    - dtype: Data type
    """

def mmap_array_from_path(filepath: str, dataset_name: str = 'data',
                         mode: str = 'r') -> np.ndarray:
    """
    Reconnect to existing memory-mapped file.

    Parameters:
    - filepath: Path to existing HDF5 file
    - dataset_name: Name of dataset in HDF5 file
    - mode: Access mode ('r', 'r+', 'w')

    Returns:
    Memory-mapped array connected to file
    """

def array(shape: tuple, dtype=np.float64, **kwargs) -> np.ndarray:
    """
    Create temporary memory-mapped array.

    Parameters:
    - shape: Array shape
    - dtype: Data type (default: float64)
    - **kwargs: Additional numpy.memmap options

    Returns:
    Memory-mapped numpy array
    """

def zeros(shape: tuple, dtype=np.float64, **kwargs) -> np.ndarray:
    """
    Create zero-filled temporary memory-mapped array.

    Parameters:
    - shape: Array shape
    - dtype: Data type (default: float64)
    - **kwargs: Additional options

    Returns:
    Zero-filled memory-mapped array
    """

def ones(shape: tuple, dtype=np.float64, **kwargs) -> np.ndarray:
    """
    Create ones-filled temporary memory-mapped array.

    Parameters:
    - shape: Array shape
    - dtype: Data type (default: float64)
    - **kwargs: Additional options

    Returns:
    Ones-filled memory-mapped array
    """

def clear() -> None:
    """
    Clear temporary memory-mapped file directory.
    Removes all temporary files created by this session.
    """

def get_temp_dir() -> str:
    """
    Get current temporary directory location.

    Returns:
    Path to temporary directory
    """

def get_available_memory() -> int:
    """
    Get available system memory in bytes.

    Returns:
    Available memory in bytes
    """

def estimate_memory_usage(shape: tuple, dtype=np.float64) -> int:
    """
    Estimate memory usage for array with given shape and dtype.

    Parameters:
    - shape: Array shape
    - dtype: Data type

    Returns:
    Estimated memory usage in bytes
    """
```
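
The temporary helpers (`array`, `zeros`, `ones`, `clear`) are demonstrated under Usage Examples below; the sketch here covers the persistent path instead, assuming the signatures documented above. The scratch directory, file name, and dataset name are placeholders.

```python
import numpy as np
from alphabase.io.tempmmap import (
    redefine_temp_location, create_empty_mmap, mmap_array_from_path
)

# Point temporary backing files at a large scratch disk (placeholder path).
redefine_temp_location('/scratch/mmap_tmp')

# Allocate an on-disk array once, then reconnect to it in a later session.
create_empty_mmap('scores.hdf', shape=(2_000_000, 32), dtype=np.float32)
scores = mmap_array_from_path('scores.hdf', dataset_name='data', mode='r+')

scores[:1000] = np.random.rand(1000, 32)  # writes go straight to the backing file
```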

### Utility Functions

Additional I/O utility functions for data processing and file management.

```python { .api }
def save_dataframe_hdf(df: pd.DataFrame, filepath: str, key: str = 'data',
                       **kwargs) -> None:
    """
    Save pandas DataFrame to HDF5 format with optimization.

    Parameters:
    - df: DataFrame to save
    - filepath: Output HDF5 file path
    - key: Dataset key in HDF5 file
    - **kwargs: Additional pandas.to_hdf options
    """

def load_dataframe_hdf(filepath: str, key: str = 'data',
                       **kwargs) -> pd.DataFrame:
    """
    Load pandas DataFrame from HDF5 format.

    Parameters:
    - filepath: Input HDF5 file path
    - key: Dataset key in HDF5 file
    - **kwargs: Additional pandas.read_hdf options

    Returns:
    Loaded pandas DataFrame
    """

def get_hdf_info(filepath: str) -> dict:
    """
    Get comprehensive information about HDF5 file contents.

    Parameters:
    - filepath: Path to HDF5 file

    Returns:
    Dictionary with file structure and metadata
    """

def compress_hdf_file(input_path: str, output_path: str,
                      compression: str = 'gzip') -> None:
    """
    Compress HDF5 file to reduce size.

    Parameters:
    - input_path: Input HDF5 file
    - output_path: Output compressed HDF5 file
    - compression: Compression algorithm ('gzip', 'lzf', 'szip')
    """

def merge_hdf_files(file_paths: List[str], output_path: str) -> None:
    """
    Merge multiple HDF5 files into single file.

    Parameters:
    - file_paths: List of HDF5 files to merge
    - output_path: Output merged HDF5 file path
    """
```
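
Assuming these helpers are importable from `alphabase.io.hdf` alongside `get_hdf_info` (the file names below are placeholders), a round trip and merge might look like this:

```python
import pandas as pd
from alphabase.io.hdf import (
    save_dataframe_hdf, load_dataframe_hdf, compress_hdf_file, merge_hdf_files
)

# Round-trip a DataFrame through HDF5.
psm_df = pd.DataFrame({'sequence': ['PEPTIDE', 'SEQUENCE'], 'score': [0.98, 0.87]})
save_dataframe_hdf(psm_df, 'run_01.h5', key='psms')
restored = load_dataframe_hdf('run_01.h5', key='psms')

# Combine several runs and write a compressed copy.
merge_hdf_files(['run_01.h5', 'run_02.h5'], 'all_runs.h5')
compress_hdf_file('all_runs.h5', 'all_runs_gzip.h5', compression='gzip')
```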

## Usage Examples

### Basic HDF5 Operations

```python
from alphabase.io.hdf import HDF_File
import pandas as pd
import numpy as np

# Create or open HDF5 file
with HDF_File('data.h5', mode='w') as hf:
    # Store numpy array
    data_array = np.random.randn(1000, 50)
    hf['array_data'] = data_array

    # Store pandas DataFrame
    df = pd.DataFrame({
        'sequence': ['PEPTIDE', 'SEQUENCE'],
        'charge': [2, 3],
        'mz': [123.45, 234.56]
    })
    hf['precursors'] = df

    # Create groups for organization
    group = hf.create_group('experiments')
    group['exp1'] = np.random.randn(500, 10)
    group['exp2'] = np.random.randn(300, 15)

    # Set attributes
    hf.set_attr('version', '1.0')
    hf.set_attr('created_by', 'alphabase')

# Read data back
with HDF_File('data.h5', mode='r') as hf:
    # Access using dictionary syntax
    array_data = hf['array_data'][:]  # Load full array
    precursor_df = hf['precursors'].to_pandas()

    # Access using attribute syntax
    exp1_data = hf.experiments.exp1[:]

    # Check file contents
    print(f"Keys: {hf.keys()}")
    print(f"Version: {hf.get_attr('version')}")
```

### Memory-Mapped Arrays for Large Data

```python
import numpy as np
from alphabase.io.tempmmap import array, zeros, ones, clear

# Create large memory-mapped arrays that don't fit in RAM
large_shape = (1000000, 100)  # 1M rows x 100 cols = 100M elements

# Create zero-filled memory-mapped array
large_zeros = zeros(large_shape, dtype=np.float32)
print(f"Created array shape: {large_zeros.shape}")

# Create ones-filled array
large_ones = ones((500000, 200), dtype=np.float64)

# Create empty array for computation
workspace = array((100000, 500), dtype=np.float32)

# Use arrays in computations without loading all data into memory
for i in range(0, large_shape[0], 10000):
    # Process in chunks
    chunk = large_zeros[i:i + 10000]
    # Perform operations on chunk
    chunk[:] = np.random.randn(chunk.shape[0], chunk.shape[1])

# Clean up temporary files when done
clear()
```

### Advanced HDF5 Operations

```python
import pandas as pd
from alphabase.io.hdf import HDF_File
from alphabase.spectral_library.base import SpecLibBase

# Save spectral library to HDF5
spec_lib = SpecLibBase()
# ... populate library ...

with HDF_File('spectral_library.h5', mode='w') as hf:
    # Save each DataFrame to separate group
    lib_group = hf.create_group('spectral_library')
    lib_group['precursors'] = spec_lib.precursor_df
    lib_group['fragments_mz'] = spec_lib.fragment_mz_df
    lib_group['fragments_intensity'] = spec_lib.fragment_intensity_df

    # Add metadata
    lib_group.set_attr('num_precursors', len(spec_lib.precursor_df))
    lib_group.set_attr('format_version', '2.0')
    lib_group.set_attr('creation_date', str(pd.Timestamp.now()))

# Load spectral library from HDF5
new_lib = SpecLibBase()
with HDF_File('spectral_library.h5', mode='r') as hf:
    lib_group = hf['spectral_library']
    new_lib.precursor_df = lib_group['precursors'].to_pandas()
    new_lib.fragment_mz_df = lib_group['fragments_mz'].to_pandas()
    new_lib.fragment_intensity_df = lib_group['fragments_intensity'].to_pandas()

    # Read metadata
    num_precursors = lib_group.get_attr('num_precursors')
    print(f"Loaded library with {num_precursors} precursors")
```

### Efficient Data Processing Workflows

```python
from alphabase.io.hdf import HDF_File
from alphabase.io.tempmmap import array
import numpy as np

# Process large dataset in chunks using HDF5 and memory mapping
input_file = 'large_dataset.h5'
output_file = 'processed_dataset.h5'

with HDF_File(input_file, 'r') as input_hf, \
     HDF_File(output_file, 'w') as output_hf:

    # Get input data info
    input_data = input_hf['raw_data']
    total_rows = input_data.shape[0]
    chunk_size = 10000

    # Create output dataset
    output_hf.create_dataset('processed_data',
                             shape=input_data.shape,
                             dtype=np.float32)

    # Create temporary workspace
    workspace = array((chunk_size, input_data.shape[1]), dtype=np.float32)

    # Process in chunks
    for i in range(0, total_rows, chunk_size):
        end_idx = min(i + chunk_size, total_rows)

        # Load chunk
        chunk = input_data[i:end_idx]

        # Process data (example: mean-center each row)
        workspace[:chunk.shape[0]] = chunk
        workspace[:chunk.shape[0]] = (
            workspace[:chunk.shape[0]]
            - workspace[:chunk.shape[0]].mean(axis=1, keepdims=True)
        )

        # Save processed chunk
        output_hf['processed_data'][i:end_idx] = workspace[:chunk.shape[0]]

        print(f"Processed {end_idx}/{total_rows} rows")

print("Processing complete!")
```

### File Management and Utilities

```python
import numpy as np
from alphabase.io.hdf import get_hdf_info

# Get information about HDF5 file structure
file_info = get_hdf_info('spectral_library.h5')
print(f"File info: {file_info}")

# Check available memory before creating large arrays
from alphabase.io.tempmmap import array, get_available_memory, estimate_memory_usage

available = get_available_memory()
required = estimate_memory_usage((1000000, 100), dtype=np.float64)

print(f"Available memory: {available / 1e9:.1f} GB")
print(f"Required memory: {required / 1e9:.1f} GB")

if required < available * 0.8:  # Use max 80% of available memory
    large_array = array((1000000, 100), dtype=np.float64)
    print("Array created successfully")
else:
    print("Not enough memory, using smaller chunks")
```