An infrastructure Python package of the AlphaX ecosystem for MS proteomics
—
Quality
Pending
Does it follow best practices?
Impact
Pending
No eval scenarios have been run
Advanced I/O utilities including HDF5 wrapper with attribute-style access and memory-mapped arrays for efficient handling of large proteomics datasets. Optimized for high-throughput workflows, memory efficiency, and seamless integration with pandas and numpy operations.
Comprehensive HDF5 wrapper providing attribute-style access and pandas integration for proteomics data storage.
class HDF_File:
"""Main HDF5 file wrapper with comprehensive read/write functionality."""
def __init__(self, filepath: str, mode: str = 'r', **kwargs):
"""
Initialize HDF5 file wrapper.
Parameters:
- filepath: Path to HDF5 file
- mode: File access mode ('r', 'w', 'a', 'r+')
- **kwargs: Additional h5py.File options
"""
def __getitem__(self, key: str):
"""
Access datasets and groups using dictionary-style syntax.
Parameters:
- key: Dataset or group path
Returns:
HDF_Dataset, HDF_Group, or HDF_Dataframe object
"""
def __setitem__(self, key: str, value):
"""
Create or update datasets using dictionary-style syntax.
Parameters:
- key: Dataset path
- value: Data to store (numpy array, pandas DataFrame, etc.)
"""
def __contains__(self, key: str) -> bool:
"""Check if dataset or group exists in file."""
def __enter__(self):
"""Context manager entry."""
return self
def __exit__(self, exc_type, exc_val, exc_tb):
"""Context manager exit with automatic file closing."""
def close(self) -> None:
"""Close HDF5 file."""
def keys(self) -> list:
"""Get list of top-level datasets and groups."""
def create_group(self, name: str) -> 'HDF_Group':
"""
Create new HDF5 group.
Parameters:
- name: Group name/path
Returns:
HDF_Group wrapper object
"""
def require_group(self, name: str) -> 'HDF_Group':
"""
Get existing group or create if it doesn't exist.
Parameters:
- name: Group name/path
Returns:
HDF_Group wrapper object
"""
class HDF_Group:
"""HDF group wrapper with attribute-style access."""
def __init__(self, hdf_group):
"""Initialize from h5py Group object."""
def __getitem__(self, key: str):
"""Access group contents using dictionary-style syntax."""
def __setitem__(self, key: str, value):
"""Create datasets in group using dictionary-style syntax."""
def __getattr__(self, name: str):
"""Access group contents using attribute-style syntax."""
def __setattr__(self, name: str, value):
"""Create datasets using attribute-style syntax."""
def keys(self) -> list:
"""Get list of datasets and subgroups."""
def create_dataset(self, name: str, data=None, **kwargs):
"""
Create dataset in group.
Parameters:
- name: Dataset name
- data: Data to store
- **kwargs: Dataset creation options
"""
class HDF_Dataset:
"""HDF dataset wrapper with NumPy-like interface."""
def __init__(self, hdf_dataset):
"""Initialize from h5py Dataset object."""
def __getitem__(self, key):
"""NumPy-style array indexing."""
def __setitem__(self, key, value):
"""NumPy-style array assignment."""
def __array__(self) -> np.ndarray:
"""Convert to numpy array."""
@property
def shape(self) -> tuple:
"""Dataset shape."""
@property
def dtype(self):
"""Dataset data type."""
@property
def size(self) -> int:
"""Total number of elements."""
def resize(self, size: tuple) -> None:
"""
Resize dataset.
Parameters:
- size: New dataset shape
"""
class HDF_Dataframe:
"""HDF DataFrame wrapper with pandas-like interface."""
def __init__(self, hdf_group):
"""Initialize from HDF group containing DataFrame data."""
def to_pandas(self) -> pd.DataFrame:
"""
Convert to pandas DataFrame.
Returns:
pandas DataFrame with all data loaded into memory
"""
def __getitem__(self, key) -> pd.Series:
"""
Access DataFrame columns.
Parameters:
- key: Column name
Returns:
pandas Series with column data
"""
def __setitem__(self, key: str, value):
"""
Set DataFrame column.
Parameters:
- key: Column name
- value: Column data
"""
@property
def columns(self) -> list:
"""Get DataFrame column names."""
@property
def shape(self) -> tuple:
"""Get DataFrame shape."""
def head(self, n: int = 5) -> pd.DataFrame:
"""
Get first n rows as pandas DataFrame.
Parameters:
- n: Number of rows to return
Returns:
pandas DataFrame with first n rows
"""
def tail(self, n: int = 5) -> pd.DataFrame:
"""
Get last n rows as pandas DataFrame.
Parameters:
- n: Number of rows to return
Returns:
pandas DataFrame with last n rows
"""
class HDF_Object:
"""Base class for HDF components with common functionality."""
def __init__(self, hdf_obj):
"""Initialize from h5py object."""
@property
def attrs(self) -> dict:
"""Access HDF5 attributes as dictionary."""
def set_attr(self, name: str, value) -> None:
"""
Set HDF5 attribute.
Parameters:
- name: Attribute name
- value: Attribute value
"""
def get_attr(self, name: str, default=None):
"""
Get HDF5 attribute.
Parameters:
- name: Attribute name
- default: Default value if attribute doesn't exist
Returns:
Attribute value or default
"""High-performance memory-mapped array operations for handling large datasets that don't fit in memory.
def redefine_temp_location(temp_dir: str) -> None:
"""
Change temporary file storage location.
Parameters:
- temp_dir: New directory for temporary files
"""
def create_empty_mmap(filepath: str, shape: tuple, dtype=np.float64) -> None:
"""
Initialize empty HDF5 file for memory mapping.
Parameters:
- filepath: Path for new HDF5 file
- shape: Array shape to create
- dtype: Data type
"""
def mmap_array_from_path(filepath: str, dataset_name: str = 'data',
mode: str = 'r') -> np.ndarray:
"""
Reconnect to existing memory-mapped file.
Parameters:
- filepath: Path to existing HDF5 file
- dataset_name: Name of dataset in HDF5 file
- mode: Access mode ('r', 'r+', 'w')
Returns:
Memory-mapped array connected to file
"""
def array(shape: tuple, dtype=np.float64, **kwargs) -> np.ndarray:
"""
Create temporary memory-mapped array.
Parameters:
- shape: Array shape
- dtype: Data type (default: float64)
- **kwargs: Additional numpy.memmap options
Returns:
Memory-mapped numpy array
"""
def create_empty_mmap(filepath: str, shape: tuple, dtype=np.float64) -> None:
"""
Initialize empty HDF5 file for memory mapping.
Parameters:
- filepath: Path for new HDF5 file
- shape: Array shape to create
- dtype: Data type
"""
def mmap_array_from_path(filepath: str, dataset_name: str = 'data',
mode: str = 'r') -> np.ndarray:
"""
Reconnect to existing memory-mapped file.
Parameters:
- filepath: Path to existing HDF5 file
- dataset_name: Name of dataset in HDF5 file
- mode: Access mode ('r', 'r+', 'w')
Returns:
Memory-mapped array connected to file
"""
def zeros(shape: tuple, dtype=np.float64, **kwargs) -> np.ndarray:
"""
Create zero-filled temporary memory-mapped array.
Parameters:
- shape: Array shape
- dtype: Data type (default: float64)
- **kwargs: Additional options
Returns:
Zero-filled memory-mapped array
"""
def ones(shape: tuple, dtype=np.float64, **kwargs) -> np.ndarray:
"""
Create ones-filled temporary memory-mapped array.
Parameters:
- shape: Array shape
- dtype: Data type (default: float64)
- **kwargs: Additional options
Returns:
Ones-filled memory-mapped array
"""
def clear() -> None:
"""
Clear temporary memory-mapped file directory.
Removes all temporary files created by this session.
"""
def get_temp_dir() -> str:
"""
Get current temporary directory location.
Returns:
Path to temporary directory
"""
def get_available_memory() -> int:
"""
Get available system memory in bytes.
Returns:
Available memory in bytes
"""
def estimate_memory_usage(shape: tuple, dtype=np.float64) -> int:
"""
Estimate memory usage for array with given shape and dtype.
Parameters:
- shape: Array shape
- dtype: Data type
Returns:
Estimated memory usage in bytes
"""Additional I/O utility functions for data processing and file management.
def save_dataframe_hdf(df: pd.DataFrame, filepath: str, key: str = 'data',
**kwargs) -> None:
"""
Save pandas DataFrame to HDF5 format with optimization.
Parameters:
- df: DataFrame to save
- filepath: Output HDF5 file path
- key: Dataset key in HDF5 file
- **kwargs: Additional pandas.to_hdf options
"""
def load_dataframe_hdf(filepath: str, key: str = 'data',
**kwargs) -> pd.DataFrame:
"""
Load pandas DataFrame from HDF5 format.
Parameters:
- filepath: Input HDF5 file path
- key: Dataset key in HDF5 file
- **kwargs: Additional pandas.read_hdf options
Returns:
Loaded pandas DataFrame
"""
def get_hdf_info(filepath: str) -> dict:
"""
Get comprehensive information about HDF5 file contents.
Parameters:
- filepath: Path to HDF5 file
Returns:
Dictionary with file structure and metadata
"""
def compress_hdf_file(input_path: str, output_path: str,
compression: str = 'gzip') -> None:
"""
Compress HDF5 file to reduce size.
Parameters:
- input_path: Input HDF5 file
- output_path: Output compressed HDF5 file
- compression: Compression algorithm ('gzip', 'lzf', 'szip')
"""
def merge_hdf_files(file_paths: List[str], output_path: str) -> None:
"""
Merge multiple HDF5 files into single file.
Parameters:
- file_paths: List of HDF5 files to merge
- output_path: Output merged HDF5 file path
"""from alphabase.io.hdf import HDF_File
import pandas as pd
import numpy as np
# Create or open HDF5 file
with HDF_File('data.h5', mode='w') as hf:
# Store numpy array
data_array = np.random.randn(1000, 50)
hf['array_data'] = data_array
# Store pandas DataFrame
df = pd.DataFrame({
'sequence': ['PEPTIDE', 'SEQUENCE'],
'charge': [2, 3],
'mz': [123.45, 234.56]
})
hf['precursors'] = df
# Create groups for organization
group = hf.create_group('experiments')
group['exp1'] = np.random.randn(500, 10)
group['exp2'] = np.random.randn(300, 15)
# Set attributes
hf.set_attr('version', '1.0')
hf.set_attr('created_by', 'alphabase')
# Read data back
with HDF_File('data.h5', mode='r') as hf:
# Access using dictionary syntax
array_data = hf['array_data'][:] # Load full array
precursor_df = hf['precursors'].to_pandas()
# Access using attribute syntax
exp1_data = hf.experiments.exp1[:]
# Check file contents
print(f"Keys: {hf.keys()}")
print(f"Version: {hf.get_attr('version')}")from alphabase.io.tempmmap import array, zeros, ones, clear
# Create large memory-mapped arrays that don't fit in RAM
large_shape = (1000000, 100)  # 1M x 100 = 100M elements
# Create zero-filled memory-mapped array
large_zeros = zeros(large_shape, dtype=np.float32)
print(f"Created array shape: {large_zeros.shape}")
# Create ones-filled array
large_ones = ones((500000, 200), dtype=np.float64)
# Create empty array for computation
workspace = array((100000, 500), dtype=np.float32)
# Use arrays in computations without loading all data into memory
for i in range(0, large_shape[0], 10000):
# Process in chunks
chunk = large_zeros[i:i+10000]
# Perform operations on chunk
chunk[:] = np.random.randn(chunk.shape[0], chunk.shape[1])
# Clean up temporary files when done
clear()

from alphabase.io.hdf import HDF_File
from alphabase.spectral_library.base import SpecLibBase
# Save spectral library to HDF5
spec_lib = SpecLibBase()
# ... populate library ...
with HDF_File('spectral_library.h5', mode='w') as hf:
# Save each DataFrame to separate group
lib_group = hf.create_group('spectral_library')
lib_group['precursors'] = spec_lib.precursor_df
lib_group['fragments_mz'] = spec_lib.fragment_mz_df
lib_group['fragments_intensity'] = spec_lib.fragment_intensity_df
# Add metadata
lib_group.set_attr('num_precursors', len(spec_lib.precursor_df))
lib_group.set_attr('format_version', '2.0')
lib_group.set_attr('creation_date', str(pd.Timestamp.now()))
# Load spectral library from HDF5
new_lib = SpecLibBase()
with HDF_File('spectral_library.h5', mode='r') as hf:
lib_group = hf['spectral_library']
new_lib.precursor_df = lib_group['precursors'].to_pandas()
new_lib.fragment_mz_df = lib_group['fragments_mz'].to_pandas()
new_lib.fragment_intensity_df = lib_group['fragments_intensity'].to_pandas()
# Read metadata
num_precursors = lib_group.get_attr('num_precursors')
print(f"Loaded library with {num_precursors} precursors")from alphabase.io.hdf import HDF_File
from alphabase.io.tempmmap import array
import numpy as np
# Process large dataset in chunks using HDF5 and memory mapping
input_file = 'large_dataset.h5'
output_file = 'processed_dataset.h5'
with HDF_File(input_file, 'r') as input_hf, \
HDF_File(output_file, 'w') as output_hf:
# Get input data info
input_data = input_hf['raw_data']
total_rows = input_data.shape[0]
chunk_size = 10000
# Create output dataset
output_hf.create_dataset('processed_data',
shape=input_data.shape,
dtype=np.float32)
# Create temporary workspace
workspace = array((chunk_size, input_data.shape[1]), dtype=np.float32)
# Process in chunks
for i in range(0, total_rows, chunk_size):
end_idx = min(i + chunk_size, total_rows)
# Load chunk
chunk = input_data[i:end_idx]
# Process data (example: normalize)
workspace[:chunk.shape[0]] = chunk
workspace[:chunk.shape[0]] = (workspace[:chunk.shape[0]] -
workspace[:chunk.shape[0]].mean(axis=1, keepdims=True))
# Save processed chunk
output_hf['processed_data'][i:end_idx] = workspace[:chunk.shape[0]]
print(f"Processed {end_idx}/{total_rows} rows")
print("Processing complete!")from alphabase.io.hdf import get_hdf_info
# Get information about HDF5 file structure
file_info = get_hdf_info('spectral_library.h5')
print(f"File info: {file_info}")
# Check available memory before creating large arrays
from alphabase.io.tempmmap import get_available_memory, estimate_memory_usage
available = get_available_memory()
required = estimate_memory_usage((1000000, 100), dtype=np.float64)
print(f"Available memory: {available / 1e9:.1f} GB")
print(f"Required memory: {required / 1e9:.1f} GB")
if required < available * 0.8: # Use max 80% of available memory
large_array = array((1000000, 100), dtype=np.float64)
print("Array created successfully")
else:
print("Not enough memory, using smaller chunks")Install with Tessl CLI
npx tessl i tessl/pypi-alphabase