# I/O Utilities

Advanced I/O utilities, including an HDF5 wrapper with attribute-style access and memory-mapped arrays, for efficient handling of large proteomics datasets. Optimized for high-throughput workflows, memory efficiency, and seamless integration with pandas and numpy.

## Capabilities

### HDF5 File Interface

Comprehensive HDF5 wrapper providing attribute-style access and pandas integration for proteomics data storage.

```python { .api }
class HDF_File:
    """Main HDF5 file wrapper with comprehensive read/write functionality."""

    def __init__(self, filepath: str, mode: str = 'r', **kwargs):
        """
        Initialize HDF5 file wrapper.

        Parameters:
        - filepath: Path to HDF5 file
        - mode: File access mode ('r', 'w', 'a', 'r+')
        - **kwargs: Additional h5py.File options
        """

    def __getitem__(self, key: str):
        """
        Access datasets and groups using dictionary-style syntax.

        Parameters:
        - key: Dataset or group path

        Returns:
        HDF_Dataset, HDF_Group, or HDF_Dataframe object
        """

    def __setitem__(self, key: str, value):
        """
        Create or update datasets using dictionary-style syntax.

        Parameters:
        - key: Dataset path
        - value: Data to store (numpy array, pandas DataFrame, etc.)
        """

    def __contains__(self, key: str) -> bool:
        """Check if dataset or group exists in file."""

    def __enter__(self):
        """Context manager entry."""
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        """Context manager exit with automatic file closing."""

    def close(self) -> None:
        """Close HDF5 file."""

    def keys(self) -> list:
        """Get list of top-level datasets and groups."""

    def create_group(self, name: str) -> 'HDF_Group':
        """
        Create new HDF5 group.

        Parameters:
        - name: Group name/path

        Returns:
        HDF_Group wrapper object
        """

    def require_group(self, name: str) -> 'HDF_Group':
        """
        Get existing group or create it if it doesn't exist.

        Parameters:
        - name: Group name/path

        Returns:
        HDF_Group wrapper object
        """

class HDF_Group:
    """HDF group wrapper with attribute-style access."""

    def __init__(self, hdf_group):
        """Initialize from h5py Group object."""

    def __getitem__(self, key: str):
        """Access group contents using dictionary-style syntax."""

    def __setitem__(self, key: str, value):
        """Create datasets in group using dictionary-style syntax."""

    def __getattr__(self, name: str):
        """Access group contents using attribute-style syntax."""

    def __setattr__(self, name: str, value):
        """Create datasets using attribute-style syntax."""

    def keys(self) -> list:
        """Get list of datasets and subgroups."""

    def create_dataset(self, name: str, data=None, **kwargs):
        """
        Create dataset in group.

        Parameters:
        - name: Dataset name
        - data: Data to store
        - **kwargs: Dataset creation options
        """

class HDF_Dataset:
    """HDF dataset wrapper with NumPy-like interface."""

    def __init__(self, hdf_dataset):
        """Initialize from h5py Dataset object."""

    def __getitem__(self, key):
        """NumPy-style array indexing."""

    def __setitem__(self, key, value):
        """NumPy-style array assignment."""

    def __array__(self) -> np.ndarray:
        """Convert to numpy array."""

    @property
    def shape(self) -> tuple:
        """Dataset shape."""

    @property
    def dtype(self):
        """Dataset data type."""

    @property
    def size(self) -> int:
        """Total number of elements."""

    def resize(self, size: tuple) -> None:
        """
        Resize dataset.

        Parameters:
        - size: New dataset shape
        """

class HDF_Dataframe:
    """HDF DataFrame wrapper with pandas-like interface."""

    def __init__(self, hdf_group):
        """Initialize from HDF group containing DataFrame data."""

    def to_pandas(self) -> pd.DataFrame:
        """
        Convert to pandas DataFrame.

        Returns:
        pandas DataFrame with all data loaded into memory
        """

    def __getitem__(self, key) -> pd.Series:
        """
        Access DataFrame columns.

        Parameters:
        - key: Column name

        Returns:
        pandas Series with column data
        """

    def __setitem__(self, key: str, value):
        """
        Set DataFrame column.

        Parameters:
        - key: Column name
        - value: Column data
        """

    @property
    def columns(self) -> list:
        """Get DataFrame column names."""

    @property
    def shape(self) -> tuple:
        """Get DataFrame shape."""

    def head(self, n: int = 5) -> pd.DataFrame:
        """
        Get first n rows as pandas DataFrame.

        Parameters:
        - n: Number of rows to return

        Returns:
        pandas DataFrame with first n rows
        """

    def tail(self, n: int = 5) -> pd.DataFrame:
        """
        Get last n rows as pandas DataFrame.

        Parameters:
        - n: Number of rows to return

        Returns:
        pandas DataFrame with last n rows
        """

class HDF_Object:
    """Base class for HDF components with common functionality."""

    def __init__(self, hdf_obj):
        """Initialize from h5py object."""

    @property
    def attrs(self) -> dict:
        """Access HDF5 attributes as dictionary."""

    def set_attr(self, name: str, value) -> None:
        """
        Set HDF5 attribute.

        Parameters:
        - name: Attribute name
        - value: Attribute value
        """

    def get_attr(self, name: str, default=None):
        """
        Get HDF5 attribute.

        Parameters:
        - name: Attribute name
        - default: Default value if attribute doesn't exist

        Returns:
        Attribute value or default
        """
```
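
For orientation, here is a minimal sketch of the lazy, column-wise access the DataFrame wrapper is intended to provide; the file name and group layout are hypothetical, and the calls assume the classes behave as documented above (complete workflows follow under Usage Examples).

```python
from alphabase.io.hdf import HDF_File

# Hypothetical file containing a DataFrame stored under the key 'precursors'.
with HDF_File('library.h5', mode='r') as hf:
    precursors = hf['precursors']      # HDF_Dataframe wrapper; data stays on disk
    print(precursors.shape)            # dimensions without loading values
    print(precursors.columns)          # column names only
    charges = precursors['charge']     # one column as a pandas Series
    preview = precursors.head(10)      # first 10 rows as a pandas DataFrame
```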

### Memory-Mapped Arrays

High-performance memory-mapped array operations for handling large datasets that don't fit in memory.

```python { .api }
def redefine_temp_location(temp_dir: str) -> None:
    """
    Change temporary file storage location.

    Parameters:
    - temp_dir: New directory for temporary files
    """

def create_empty_mmap(filepath: str, shape: tuple, dtype=np.float64) -> None:
    """
    Initialize empty HDF5 file for memory mapping.

    Parameters:
    - filepath: Path for new HDF5 file
    - shape: Array shape to create
    - dtype: Data type
    """

def mmap_array_from_path(filepath: str, dataset_name: str = 'data',
                         mode: str = 'r') -> np.ndarray:
    """
    Reconnect to existing memory-mapped file.

    Parameters:
    - filepath: Path to existing HDF5 file
    - dataset_name: Name of dataset in HDF5 file
    - mode: Access mode ('r', 'r+', 'w')

    Returns:
    Memory-mapped array connected to file
    """

def array(shape: tuple, dtype=np.float64, **kwargs) -> np.ndarray:
    """
    Create temporary memory-mapped array.

    Parameters:
    - shape: Array shape
    - dtype: Data type (default: float64)
    - **kwargs: Additional numpy.memmap options

    Returns:
    Memory-mapped numpy array
    """

def zeros(shape: tuple, dtype=np.float64, **kwargs) -> np.ndarray:
    """
    Create zero-filled temporary memory-mapped array.

    Parameters:
    - shape: Array shape
    - dtype: Data type (default: float64)
    - **kwargs: Additional options

    Returns:
    Zero-filled memory-mapped array
    """

def ones(shape: tuple, dtype=np.float64, **kwargs) -> np.ndarray:
    """
    Create ones-filled temporary memory-mapped array.

    Parameters:
    - shape: Array shape
    - dtype: Data type (default: float64)
    - **kwargs: Additional options

    Returns:
    Ones-filled memory-mapped array
    """

def clear() -> None:
    """
    Clear temporary memory-mapped file directory.
    Removes all temporary files created by this session.
    """

def get_temp_dir() -> str:
    """
    Get current temporary directory location.

    Returns:
    Path to temporary directory
    """

def get_available_memory() -> int:
    """
    Get available system memory in bytes.

    Returns:
    Available memory in bytes
    """

def estimate_memory_usage(shape: tuple, dtype=np.float64) -> int:
    """
    Estimate memory usage for array with given shape and dtype.

    Parameters:
    - shape: Array shape
    - dtype: Data type

    Returns:
    Estimated memory usage in bytes
    """
```
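
The temporary helpers (`array`, `zeros`, `ones`, `clear`) are demonstrated under Usage Examples below; the sketch here covers the persistent path instead, assuming the signatures documented above. The scratch directory, file name, and dataset name are placeholders.

```python
import numpy as np
from alphabase.io.tempmmap import (
    redefine_temp_location, create_empty_mmap, mmap_array_from_path
)

# Point temporary backing files at a large scratch disk (placeholder path).
redefine_temp_location('/scratch/mmap_tmp')

# Allocate an on-disk array once, then reconnect to it in a later session.
create_empty_mmap('scores.hdf', shape=(2_000_000, 32), dtype=np.float32)
scores = mmap_array_from_path('scores.hdf', dataset_name='data', mode='r+')

scores[:1000] = np.random.rand(1000, 32)  # writes go straight to the backing file
```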

### Utility Functions

Additional I/O utility functions for data processing and file management.

```python { .api }
def save_dataframe_hdf(df: pd.DataFrame, filepath: str, key: str = 'data',
                       **kwargs) -> None:
    """
    Save pandas DataFrame to HDF5 format with optimization.

    Parameters:
    - df: DataFrame to save
    - filepath: Output HDF5 file path
    - key: Dataset key in HDF5 file
    - **kwargs: Additional pandas.to_hdf options
    """

def load_dataframe_hdf(filepath: str, key: str = 'data',
                       **kwargs) -> pd.DataFrame:
    """
    Load pandas DataFrame from HDF5 format.

    Parameters:
    - filepath: Input HDF5 file path
    - key: Dataset key in HDF5 file
    - **kwargs: Additional pandas.read_hdf options

    Returns:
    Loaded pandas DataFrame
    """

def get_hdf_info(filepath: str) -> dict:
    """
    Get comprehensive information about HDF5 file contents.

    Parameters:
    - filepath: Path to HDF5 file

    Returns:
    Dictionary with file structure and metadata
    """

def compress_hdf_file(input_path: str, output_path: str,
                      compression: str = 'gzip') -> None:
    """
    Compress HDF5 file to reduce size.

    Parameters:
    - input_path: Input HDF5 file
    - output_path: Output compressed HDF5 file
    - compression: Compression algorithm ('gzip', 'lzf', 'szip')
    """

def merge_hdf_files(file_paths: List[str], output_path: str) -> None:
    """
    Merge multiple HDF5 files into single file.

    Parameters:
    - file_paths: List of HDF5 files to merge
    - output_path: Output merged HDF5 file path
    """
```
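
Assuming these helpers are importable from `alphabase.io.hdf` alongside `get_hdf_info` (the file names below are placeholders), a round trip and merge might look like this:

```python
import pandas as pd
from alphabase.io.hdf import (
    save_dataframe_hdf, load_dataframe_hdf, compress_hdf_file, merge_hdf_files
)

# Round-trip a DataFrame through HDF5.
psm_df = pd.DataFrame({'sequence': ['PEPTIDE', 'SEQUENCE'], 'score': [0.98, 0.87]})
save_dataframe_hdf(psm_df, 'run_01.h5', key='psms')
restored = load_dataframe_hdf('run_01.h5', key='psms')

# Combine several runs and write a compressed copy.
merge_hdf_files(['run_01.h5', 'run_02.h5'], 'all_runs.h5')
compress_hdf_file('all_runs.h5', 'all_runs_gzip.h5', compression='gzip')
```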

## Usage Examples

### Basic HDF5 Operations

```python
from alphabase.io.hdf import HDF_File
import pandas as pd
import numpy as np

# Create or open HDF5 file
with HDF_File('data.h5', mode='w') as hf:
    # Store numpy array
    data_array = np.random.randn(1000, 50)
    hf['array_data'] = data_array

    # Store pandas DataFrame
    df = pd.DataFrame({
        'sequence': ['PEPTIDE', 'SEQUENCE'],
        'charge': [2, 3],
        'mz': [123.45, 234.56]
    })
    hf['precursors'] = df

    # Create groups for organization
    group = hf.create_group('experiments')
    group['exp1'] = np.random.randn(500, 10)
    group['exp2'] = np.random.randn(300, 15)

    # Set attributes
    hf.set_attr('version', '1.0')
    hf.set_attr('created_by', 'alphabase')

# Read data back
with HDF_File('data.h5', mode='r') as hf:
    # Access using dictionary syntax
    array_data = hf['array_data'][:]  # Load full array
    precursor_df = hf['precursors'].to_pandas()

    # Access using attribute syntax
    exp1_data = hf.experiments.exp1[:]

    # Check file contents
    print(f"Keys: {hf.keys()}")
    print(f"Version: {hf.get_attr('version')}")
```

### Memory-Mapped Arrays for Large Data

```python
import numpy as np
from alphabase.io.tempmmap import array, zeros, ones, clear

# Create large memory-mapped arrays that don't fit in RAM
large_shape = (1000000, 100)  # 1M rows x 100 cols = 100M elements

# Create zero-filled memory-mapped array
large_zeros = zeros(large_shape, dtype=np.float32)
print(f"Created array shape: {large_zeros.shape}")

# Create ones-filled array
large_ones = ones((500000, 200), dtype=np.float64)

# Create empty array for computation
workspace = array((100000, 500), dtype=np.float32)

# Use arrays in computations without loading all data into memory
for i in range(0, large_shape[0], 10000):
    # Process in chunks
    chunk = large_zeros[i:i + 10000]
    # Perform operations on chunk
    chunk[:] = np.random.randn(chunk.shape[0], chunk.shape[1])

# Clean up temporary files when done
clear()
```

### Advanced HDF5 Operations

```python
import pandas as pd
from alphabase.io.hdf import HDF_File
from alphabase.spectral_library.base import SpecLibBase

# Save spectral library to HDF5
spec_lib = SpecLibBase()
# ... populate library ...

with HDF_File('spectral_library.h5', mode='w') as hf:
    # Save each DataFrame to separate group
    lib_group = hf.create_group('spectral_library')
    lib_group['precursors'] = spec_lib.precursor_df
    lib_group['fragments_mz'] = spec_lib.fragment_mz_df
    lib_group['fragments_intensity'] = spec_lib.fragment_intensity_df

    # Add metadata
    lib_group.set_attr('num_precursors', len(spec_lib.precursor_df))
    lib_group.set_attr('format_version', '2.0')
    lib_group.set_attr('creation_date', str(pd.Timestamp.now()))

# Load spectral library from HDF5
new_lib = SpecLibBase()
with HDF_File('spectral_library.h5', mode='r') as hf:
    lib_group = hf['spectral_library']
    new_lib.precursor_df = lib_group['precursors'].to_pandas()
    new_lib.fragment_mz_df = lib_group['fragments_mz'].to_pandas()
    new_lib.fragment_intensity_df = lib_group['fragments_intensity'].to_pandas()

    # Read metadata
    num_precursors = lib_group.get_attr('num_precursors')
    print(f"Loaded library with {num_precursors} precursors")
```

### Efficient Data Processing Workflows

```python
from alphabase.io.hdf import HDF_File
from alphabase.io.tempmmap import array
import numpy as np

# Process large dataset in chunks using HDF5 and memory mapping
input_file = 'large_dataset.h5'
output_file = 'processed_dataset.h5'

with HDF_File(input_file, 'r') as input_hf, \
     HDF_File(output_file, 'w') as output_hf:

    # Get input data info
    input_data = input_hf['raw_data']
    total_rows = input_data.shape[0]
    chunk_size = 10000

    # Create output dataset
    output_hf.create_dataset('processed_data',
                             shape=input_data.shape,
                             dtype=np.float32)

    # Create temporary workspace
    workspace = array((chunk_size, input_data.shape[1]), dtype=np.float32)

    # Process in chunks
    for i in range(0, total_rows, chunk_size):
        end_idx = min(i + chunk_size, total_rows)

        # Load chunk
        chunk = input_data[i:end_idx]

        # Process data (example: mean-center each row)
        workspace[:chunk.shape[0]] = chunk
        workspace[:chunk.shape[0]] = (
            workspace[:chunk.shape[0]]
            - workspace[:chunk.shape[0]].mean(axis=1, keepdims=True)
        )

        # Save processed chunk
        output_hf['processed_data'][i:end_idx] = workspace[:chunk.shape[0]]

        print(f"Processed {end_idx}/{total_rows} rows")

print("Processing complete!")
```

### File Management and Utilities

```python
import numpy as np
from alphabase.io.hdf import get_hdf_info

# Get information about HDF5 file structure
file_info = get_hdf_info('spectral_library.h5')
print(f"File info: {file_info}")

# Check available memory before creating large arrays
from alphabase.io.tempmmap import array, get_available_memory, estimate_memory_usage

available = get_available_memory()
required = estimate_memory_usage((1000000, 100), dtype=np.float64)

print(f"Available memory: {available / 1e9:.1f} GB")
print(f"Required memory: {required / 1e9:.1f} GB")

if required < available * 0.8:  # Use max 80% of available memory
    large_array = array((1000000, 100), dtype=np.float64)
    print("Array created successfully")
else:
    print("Not enough memory, using smaller chunks")
```