An infrastructure Python package of the AlphaX ecosystem for MS proteomics
—
Quality
Pending
Does it follow best practices?
Impact
Pending
No eval scenarios have been run
Advanced I/O utilities including HDF5 wrapper with attribute-style access and memory-mapped arrays for efficient handling of large proteomics datasets. Optimized for high-throughput workflows, memory efficiency, and seamless integration with pandas and numpy operations.
Comprehensive HDF5 wrapper providing attribute-style access and pandas integration for proteomics data storage.
class HDF_File:
"""Main HDF5 file wrapper with comprehensive read/write functionality."""
def __init__(self, filepath: str, mode: str = 'r', **kwargs):
"""
Initialize HDF5 file wrapper.
Parameters:
- filepath: Path to HDF5 file
- mode: File access mode ('r', 'w', 'a', 'r+')
- **kwargs: Additional h5py.File options
"""
def __getitem__(self, key: str):
"""
Access datasets and groups using dictionary-style syntax.
Parameters:
- key: Dataset or group path
Returns:
HDF_Dataset, HDF_Group, or HDF_Dataframe object
"""
def __setitem__(self, key: str, value):
"""
Create or update datasets using dictionary-style syntax.
Parameters:
- key: Dataset path
- value: Data to store (numpy array, pandas DataFrame, etc.)
"""
def __contains__(self, key: str) -> bool:
"""Check if dataset or group exists in file."""
def __enter__(self):
"""Context manager entry."""
return self
def __exit__(self, exc_type, exc_val, exc_tb):
"""Context manager exit with automatic file closing."""
def close(self) -> None:
"""Close HDF5 file."""
def keys(self) -> list:
"""Get list of top-level datasets and groups."""
def create_group(self, name: str) -> 'HDF_Group':
"""
Create new HDF5 group.
Parameters:
- name: Group name/path
Returns:
HDF_Group wrapper object
"""
def require_group(self, name: str) -> 'HDF_Group':
"""
Get existing group or create if it doesn't exist.
Parameters:
- name: Group name/path
Returns:
HDF_Group wrapper object
"""
class HDF_Group:
"""HDF group wrapper with attribute-style access."""
def __init__(self, hdf_group):
"""Initialize from h5py Group object."""
def __getitem__(self, key: str):
"""Access group contents using dictionary-style syntax."""
def __setitem__(self, key: str, value):
"""Create datasets in group using dictionary-style syntax."""
def __getattr__(self, name: str):
"""Access group contents using attribute-style syntax."""
def __setattr__(self, name: str, value):
"""Create datasets using attribute-style syntax."""
def keys(self) -> list:
"""Get list of datasets and subgroups."""
def create_dataset(self, name: str, data=None, **kwargs):
"""
Create dataset in group.
Parameters:
- name: Dataset name
- data: Data to store
- **kwargs: Dataset creation options
"""
class HDF_Dataset:
"""HDF dataset wrapper with NumPy-like interface."""
def __init__(self, hdf_dataset):
"""Initialize from h5py Dataset object."""
def __getitem__(self, key):
"""NumPy-style array indexing."""
def __setitem__(self, key, value):
"""NumPy-style array assignment."""
def __array__(self) -> np.ndarray:
"""Convert to numpy array."""
@property
def shape(self) -> tuple:
"""Dataset shape."""
@property
def dtype(self):
"""Dataset data type."""
@property
def size(self) -> int:
"""Total number of elements."""
def resize(self, size: tuple) -> None:
"""
Resize dataset.
Parameters:
- size: New dataset shape
"""
class HDF_Dataframe:
"""HDF DataFrame wrapper with pandas-like interface."""
def __init__(self, hdf_group):
"""Initialize from HDF group containing DataFrame data."""
def to_pandas(self) -> pd.DataFrame:
"""
Convert to pandas DataFrame.
Returns:
pandas DataFrame with all data loaded into memory
"""
def __getitem__(self, key) -> pd.Series:
"""
Access DataFrame columns.
Parameters:
- key: Column name
Returns:
pandas Series with column data
"""
def __setitem__(self, key: str, value):
"""
Set DataFrame column.
Parameters:
- key: Column name
- value: Column data
"""
@property
def columns(self) -> list:
"""Get DataFrame column names."""
@property
def shape(self) -> tuple:
"""Get DataFrame shape."""
def head(self, n: int = 5) -> pd.DataFrame:
"""
Get first n rows as pandas DataFrame.
Parameters:
- n: Number of rows to return
Returns:
pandas DataFrame with first n rows
"""
def tail(self, n: int = 5) -> pd.DataFrame:
"""
Get last n rows as pandas DataFrame.
Parameters:
- n: Number of rows to return
Returns:
pandas DataFrame with last n rows
"""
class HDF_Object:
"""Base class for HDF components with common functionality."""
def __init__(self, hdf_obj):
"""Initialize from h5py object."""
@property
def attrs(self) -> dict:
"""Access HDF5 attributes as dictionary."""
def set_attr(self, name: str, value) -> None:
"""
Set HDF5 attribute.
Parameters:
- name: Attribute name
- value: Attribute value
"""
def get_attr(self, name: str, default=None):
"""
Get HDF5 attribute.
Parameters:
- name: Attribute name
- default: Default value if attribute doesn't exist
Returns:
Attribute value or default
"""High-performance memory-mapped array operations for handling large datasets that don't fit in memory.
def redefine_temp_location(temp_dir: str) -> None:
"""
Change temporary file storage location.
Parameters:
- temp_dir: New directory for temporary files
"""
def create_empty_mmap(filepath: str, shape: tuple, dtype=np.float64) -> None:
"""
Initialize empty HDF5 file for memory mapping.
Parameters:
- filepath: Path for new HDF5 file
- shape: Array shape to create
- dtype: Data type
"""
def mmap_array_from_path(filepath: str, dataset_name: str = 'data',
mode: str = 'r') -> np.ndarray:
"""
Reconnect to existing memory-mapped file.
Parameters:
- filepath: Path to existing HDF5 file
- dataset_name: Name of dataset in HDF5 file
- mode: Access mode ('r', 'r+', 'w')
Returns:
Memory-mapped array connected to file
"""
def array(shape: tuple, dtype=np.float64, **kwargs) -> np.ndarray:
"""
Create temporary memory-mapped array.
Parameters:
- shape: Array shape
- dtype: Data type (default: float64)
- **kwargs: Additional numpy.memmap options
Returns:
Memory-mapped numpy array
"""
def create_empty_mmap(filepath: str, shape: tuple, dtype=np.float64) -> None:
"""
Initialize empty HDF5 file for memory mapping.
Parameters:
- filepath: Path for new HDF5 file
- shape: Array shape to create
- dtype: Data type
"""
def mmap_array_from_path(filepath: str, dataset_name: str = 'data',
mode: str = 'r') -> np.ndarray:
"""
Reconnect to existing memory-mapped file.
Parameters:
- filepath: Path to existing HDF5 file
- dataset_name: Name of dataset in HDF5 file
- mode: Access mode ('r', 'r+', 'w')
Returns:
Memory-mapped array connected to file
"""
def zeros(shape: tuple, dtype=np.float64, **kwargs) -> np.ndarray:
"""
Create zero-filled temporary memory-mapped array.
Parameters:
- shape: Array shape
- dtype: Data type (default: float64)
- **kwargs: Additional options
Returns:
Zero-filled memory-mapped array
"""
def ones(shape: tuple, dtype=np.float64, **kwargs) -> np.ndarray:
"""
Create ones-filled temporary memory-mapped array.
Parameters:
- shape: Array shape
- dtype: Data type (default: float64)
- **kwargs: Additional options
Returns:
Ones-filled memory-mapped array
"""
def clear() -> None:
"""
Clear temporary memory-mapped file directory.
Removes all temporary files created by this session.
"""
def get_temp_dir() -> str:
"""
Get current temporary directory location.
Returns:
Path to temporary directory
"""
def get_available_memory() -> int:
"""
Get available system memory in bytes.
Returns:
Available memory in bytes
"""
def estimate_memory_usage(shape: tuple, dtype=np.float64) -> int:
"""
Estimate memory usage for array with given shape and dtype.
Parameters:
- shape: Array shape
- dtype: Data type
Returns:
Estimated memory usage in bytes
"""Additional I/O utility functions for data processing and file management.
def save_dataframe_hdf(df: pd.DataFrame, filepath: str, key: str = 'data',
**kwargs) -> None:
"""
Save pandas DataFrame to HDF5 format with optimization.
Parameters:
- df: DataFrame to save
- filepath: Output HDF5 file path
- key: Dataset key in HDF5 file
- **kwargs: Additional pandas.to_hdf options
"""
def load_dataframe_hdf(filepath: str, key: str = 'data',
**kwargs) -> pd.DataFrame:
"""
Load pandas DataFrame from HDF5 format.
Parameters:
- filepath: Input HDF5 file path
- key: Dataset key in HDF5 file
- **kwargs: Additional pandas.read_hdf options
Returns:
Loaded pandas DataFrame
"""
def get_hdf_info(filepath: str) -> dict:
"""
Get comprehensive information about HDF5 file contents.
Parameters:
- filepath: Path to HDF5 file
Returns:
Dictionary with file structure and metadata
"""
def compress_hdf_file(input_path: str, output_path: str,
compression: str = 'gzip') -> None:
"""
Compress HDF5 file to reduce size.
Parameters:
- input_path: Input HDF5 file
- output_path: Output compressed HDF5 file
- compression: Compression algorithm ('gzip', 'lzf', 'szip')
"""
def merge_hdf_files(file_paths: List[str], output_path: str) -> None:
"""
Merge multiple HDF5 files into single file.
Parameters:
- file_paths: List of HDF5 files to merge
- output_path: Output merged HDF5 file path
"""from alphabase.io.hdf import HDF_File
import pandas as pd
import numpy as np
# Create or open HDF5 file
with HDF_File('data.h5', mode='w') as hf:
# Store numpy array
data_array = np.random.randn(1000, 50)
hf['array_data'] = data_array
# Store pandas DataFrame
df = pd.DataFrame({
'sequence': ['PEPTIDE', 'SEQUENCE'],
'charge': [2, 3],
'mz': [123.45, 234.56]
})
hf['precursors'] = df
# Create groups for organization
group = hf.create_group('experiments')
group['exp1'] = np.random.randn(500, 10)
group['exp2'] = np.random.randn(300, 15)
# Set attributes
hf.set_attr('version', '1.0')
hf.set_attr('created_by', 'alphabase')
# Read data back
with HDF_File('data.h5', mode='r') as hf:
# Access using dictionary syntax
array_data = hf['array_data'][:] # Load full array
precursor_df = hf['precursors'].to_pandas()
# Access using attribute syntax
exp1_data = hf.experiments.exp1[:]
# Check file contents
print(f"Keys: {hf.keys()}")
print(f"Version: {hf.get_attr('version')}")from alphabase.io.tempmmap import array, zeros, ones, clear
# Create large memory-mapped arrays that don't fit in RAM
large_shape = (1000000, 100)  # 1M x 100 = 100M elements
# Create zero-filled memory-mapped array
large_zeros = zeros(large_shape, dtype=np.float32)
print(f"Created array shape: {large_zeros.shape}")
# Create ones-filled array
large_ones = ones((500000, 200), dtype=np.float64)
# Create empty array for computation
workspace = array((100000, 500), dtype=np.float32)
# Use arrays in computations without loading all data into memory
for i in range(0, large_shape[0], 10000):
# Process in chunks
chunk = large_zeros[i:i+10000]
# Perform operations on chunk
chunk[:] = np.random.randn(chunk.shape[0], chunk.shape[1])
# Clean up temporary files when done
clear()

from alphabase.io.hdf import HDF_File
from alphabase.spectral_library.base import SpecLibBase
# Save spectral library to HDF5
spec_lib = SpecLibBase()
# ... populate library ...
with HDF_File('spectral_library.h5', mode='w') as hf:
# Save each DataFrame to separate group
lib_group = hf.create_group('spectral_library')
lib_group['precursors'] = spec_lib.precursor_df
lib_group['fragments_mz'] = spec_lib.fragment_mz_df
lib_group['fragments_intensity'] = spec_lib.fragment_intensity_df
# Add metadata
lib_group.set_attr('num_precursors', len(spec_lib.precursor_df))
lib_group.set_attr('format_version', '2.0')
lib_group.set_attr('creation_date', str(pd.Timestamp.now()))
# Load spectral library from HDF5
new_lib = SpecLibBase()
with HDF_File('spectral_library.h5', mode='r') as hf:
lib_group = hf['spectral_library']
new_lib.precursor_df = lib_group['precursors'].to_pandas()
new_lib.fragment_mz_df = lib_group['fragments_mz'].to_pandas()
new_lib.fragment_intensity_df = lib_group['fragments_intensity'].to_pandas()
# Read metadata
num_precursors = lib_group.get_attr('num_precursors')
print(f"Loaded library with {num_precursors} precursors")from alphabase.io.hdf import HDF_File
from alphabase.io.tempmmap import array
import numpy as np
# Process large dataset in chunks using HDF5 and memory mapping
input_file = 'large_dataset.h5'
output_file = 'processed_dataset.h5'
with HDF_File(input_file, 'r') as input_hf, \
HDF_File(output_file, 'w') as output_hf:
# Get input data info
input_data = input_hf['raw_data']
total_rows = input_data.shape[0]
chunk_size = 10000
# Create output dataset
output_hf.create_dataset('processed_data',
shape=input_data.shape,
dtype=np.float32)
# Create temporary workspace
workspace = array((chunk_size, input_data.shape[1]), dtype=np.float32)
# Process in chunks
for i in range(0, total_rows, chunk_size):
end_idx = min(i + chunk_size, total_rows)
# Load chunk
chunk = input_data[i:end_idx]
# Process data (example: normalize)
workspace[:chunk.shape[0]] = chunk
workspace[:chunk.shape[0]] = (workspace[:chunk.shape[0]] -
workspace[:chunk.shape[0]].mean(axis=1, keepdims=True))
# Save processed chunk
output_hf['processed_data'][i:end_idx] = workspace[:chunk.shape[0]]
print(f"Processed {end_idx}/{total_rows} rows")
print("Processing complete!")from alphabase.io.hdf import get_hdf_info
# Get information about HDF5 file structure
file_info = get_hdf_info('spectral_library.h5')
print(f"File info: {file_info}")
# Check available memory before creating large arrays
from alphabase.io.tempmmap import get_available_memory, estimate_memory_usage
available = get_available_memory()
required = estimate_memory_usage((1000000, 100), dtype=np.float64)
print(f"Available memory: {available / 1e9:.1f} GB")
print(f"Required memory: {required / 1e9:.1f} GB")
if required < available * 0.8: # Use max 80% of available memory
large_array = array((1000000, 100), dtype=np.float64)
print("Array created successfully")
else:
print("Not enough memory, using smaller chunks")Install with Tessl CLI
npx tessl i tessl/pypi-alphabase