CtrlK
BlogDocsLog inGet started
Tessl Logo

tessl/pypi-alphabase

An infrastructure Python package of the AlphaX ecosystem for MS proteomics

Pending

Quality

Pending

Does it follow best practices?

Impact

Pending

No eval scenarios have been run

Overview
Eval results
Files

io-utilities.mddocs/

I/O Utilities

Advanced I/O utilities including HDF5 wrapper with attribute-style access and memory-mapped arrays for efficient handling of large proteomics datasets. Optimized for high-throughput workflows, memory efficiency, and seamless integration with pandas and numpy operations.

Capabilities

HDF5 File Interface

Comprehensive HDF5 wrapper providing attribute-style access and pandas integration for proteomics data storage.

class HDF_File:
    """Top-level HDF5 file wrapper with dict-style and attribute-style access.

    Stub interface: method bodies are documentation-only.
    """

    def __init__(self, filepath: str, mode: str = 'r', **kwargs):
        """Open *filepath* with access *mode* ('r', 'w', 'a', 'r+').

        Any extra keyword arguments are forwarded to h5py.File.
        """

    def __getitem__(self, key: str):
        """Return the object at path *key* as an HDF_Dataset, HDF_Group, or HDF_Dataframe."""

    def __setitem__(self, key: str, value):
        """Create or overwrite the dataset at path *key* with *value* (array, DataFrame, ...)."""

    def __contains__(self, key: str) -> bool:
        """Return True when *key* names an existing dataset or group."""

    def __enter__(self):
        """Enter a with-block; yields this wrapper."""
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        """Leave a with-block, closing the underlying file automatically."""

    def close(self) -> None:
        """Close the underlying HDF5 file handle."""

    def keys(self) -> list:
        """Return the names of all top-level datasets and groups."""

    def create_group(self, name: str) -> 'HDF_Group':
        """Create a new group at *name* and return its HDF_Group wrapper."""

    def require_group(self, name: str) -> 'HDF_Group':
        """Return the group at *name*, creating it first when it does not exist."""

class HDF_Group:
    """Wrapper around an h5py Group supporting item- and attribute-style access.

    Stub interface: method bodies are documentation-only.
    """

    def __init__(self, hdf_group):
        """Wrap the given h5py Group object."""

    def __getitem__(self, key: str):
        """Return the child dataset or subgroup named *key*."""

    def __setitem__(self, key: str, value):
        """Create or overwrite the child dataset *key* with *value*."""

    def __getattr__(self, name: str):
        """Resolve *name* as a child dataset/subgroup (attribute-style read)."""

    def __setattr__(self, name: str, value):
        """Create a child dataset *name* from *value* (attribute-style write)."""

    def keys(self) -> list:
        """Return the names of contained datasets and subgroups."""

    def create_dataset(self, name: str, data=None, **kwargs):
        """Create dataset *name* holding *data*; kwargs are h5py creation options."""

class HDF_Dataset:
    """Wrapper around an h5py Dataset exposing a NumPy-like interface.

    Stub interface: method bodies are documentation-only.
    """

    def __init__(self, hdf_dataset):
        """Wrap the given h5py Dataset object."""

    def __getitem__(self, key):
        """Read a slice/index of the dataset, NumPy-style."""

    def __setitem__(self, key, value):
        """Write *value* into the sliced/indexed region, NumPy-style."""

    def __array__(self) -> np.ndarray:
        """Materialize the entire dataset as a numpy array."""

    @property
    def shape(self) -> tuple:
        """Tuple of dataset dimensions."""

    @property
    def dtype(self):
        """Element data type of the dataset."""

    @property
    def size(self) -> int:
        """Total number of elements in the dataset."""

    def resize(self, size: tuple) -> None:
        """Resize the dataset to the new shape *size*."""

class HDF_Dataframe:
    """Wrapper exposing an HDF-stored table through a pandas-like interface.

    Stub interface: method bodies are documentation-only.
    """

    def __init__(self, hdf_group):
        """Wrap an HDF group that holds DataFrame column data."""

    def to_pandas(self) -> pd.DataFrame:
        """Load the whole table into memory and return it as a pandas DataFrame."""

    def __getitem__(self, key) -> pd.Series:
        """Return column *key* as a pandas Series."""

    def __setitem__(self, key: str, value):
        """Store *value* as column *key*."""

    @property
    def columns(self) -> list:
        """Column names of the stored table."""

    @property
    def shape(self) -> tuple:
        """(rows, columns) of the stored table."""

    def head(self, n: int = 5) -> pd.DataFrame:
        """Return the first *n* rows as a pandas DataFrame."""

    def tail(self, n: int = 5) -> pd.DataFrame:
        """Return the last *n* rows as a pandas DataFrame."""

class HDF_Object:
    """Shared base for the HDF wrappers: HDF5 attribute (metadata) handling.

    Stub interface: method bodies are documentation-only.
    """

    def __init__(self, hdf_obj):
        """Wrap the given h5py object."""

    @property
    def attrs(self) -> dict:
        """All HDF5 attributes of this object, exposed as a plain dict."""

    def set_attr(self, name: str, value) -> None:
        """Store the HDF5 attribute *name* = *value* on this object."""

    def get_attr(self, name: str, default=None):
        """Return the HDF5 attribute *name*, or *default* when it is absent."""

Memory-Mapped Arrays

High-performance memory-mapped array operations for handling large datasets that don't fit in memory.

def redefine_temp_location(temp_dir: str) -> None:
    """Point temporary memory-mapped file storage at directory *temp_dir*."""

def create_empty_mmap(filepath: str, shape: tuple, dtype=np.float64) -> None:
    """Create a fresh HDF5 file at *filepath* with an empty array of the
    given *shape* and *dtype*, ready for memory mapping."""

def mmap_array_from_path(filepath: str, dataset_name: str = 'data',
                        mode: str = 'r') -> np.ndarray:
    """Re-attach to an existing memory-mapped HDF5 file.

    *dataset_name* selects the dataset inside *filepath*; *mode* is the
    access mode ('r', 'r+', 'w'). Returns the array mapped onto the file.
    """

def array(shape: tuple, dtype=np.float64, **kwargs) -> np.ndarray:
    """Allocate a temporary memory-mapped array of *shape* and *dtype*.

    Extra keyword arguments are forwarded to numpy.memmap.
    """

# NOTE(review): exact duplicate of the create_empty_mmap definition earlier in this
# document — in Python this silently re-binds the same name. Likely a copy-paste
# error in the page; consider removing this repeated block.
def create_empty_mmap(filepath: str, shape: tuple, dtype=np.float64) -> None:
    """
    Initialize empty HDF5 file for memory mapping.
    
    Parameters:
    - filepath: Path for new HDF5 file
    - shape: Array shape to create
    - dtype: Data type
    """

# NOTE(review): exact duplicate of the mmap_array_from_path definition earlier in
# this document — silently re-binds the same name. Likely a copy-paste error in the
# page; consider removing this repeated block.
def mmap_array_from_path(filepath: str, dataset_name: str = 'data', 
                        mode: str = 'r') -> np.ndarray:
    """
    Reconnect to existing memory-mapped file.
    
    Parameters:
    - filepath: Path to existing HDF5 file
    - dataset_name: Name of dataset in HDF5 file
    - mode: Access mode ('r', 'r+', 'w')
    
    Returns:
    Memory-mapped array connected to file
    """

def zeros(shape: tuple, dtype=np.float64, **kwargs) -> np.ndarray:
    """Allocate a zero-filled temporary memory-mapped array.

    *shape* and *dtype* define the array; extra keyword arguments are
    passed through to the underlying allocator.
    """

def ones(shape: tuple, dtype=np.float64, **kwargs) -> np.ndarray:
    """Allocate a ones-filled temporary memory-mapped array.

    *shape* and *dtype* define the array; extra keyword arguments are
    passed through to the underlying allocator.
    """

def clear() -> None:
    """Delete every temporary memory-mapped file created by this session,
    emptying the temporary directory."""

def get_temp_dir() -> str:
    """Return the path of the directory currently used for temporary
    memory-mapped files."""

def get_available_memory() -> int:
    """Return the amount of system memory currently available, in bytes."""

def estimate_memory_usage(shape: tuple, dtype=np.float64) -> int:
    """
    Estimate memory usage for an array with given shape and dtype.

    Parameters:
    - shape: Array shape (an empty tuple denotes a scalar)
    - dtype: Data type (anything accepted by numpy.dtype)

    Returns:
    Estimated memory usage in bytes: product of the dimensions
    multiplied by the per-element item size.
    """
    # Previously a documentation stub that returned None despite the
    # declared `-> int`; this is the straightforward reference computation.
    n_items = 1
    for dim in shape:
        n_items *= int(dim)
    # np.dtype normalizes Python types / dtype strings to a concrete dtype.
    return n_items * np.dtype(dtype).itemsize

Utility Functions

Additional I/O utility functions for data processing and file management.

def save_dataframe_hdf(df: pd.DataFrame, filepath: str, key: str = 'data',
                      **kwargs) -> None:
    """Write *df* to the HDF5 file at *filepath* under dataset *key*.

    Extra keyword arguments are forwarded to pandas.DataFrame.to_hdf.
    """

def load_dataframe_hdf(filepath: str, key: str = 'data',
                      **kwargs) -> pd.DataFrame:
    """Read the DataFrame stored under *key* in the HDF5 file *filepath*.

    Extra keyword arguments are forwarded to pandas.read_hdf.
    Returns the loaded pandas DataFrame.
    """

def get_hdf_info(filepath: str) -> dict:
    """Return a dictionary describing the structure and metadata of the
    HDF5 file at *filepath*."""

def compress_hdf_file(input_path: str, output_path: str,
                     compression: str = 'gzip') -> None:
    """Rewrite the HDF5 file *input_path* into *output_path* using the
    given *compression* algorithm ('gzip', 'lzf', 'szip')."""

def merge_hdf_files(file_paths: List[str], output_path: str) -> None:
    """Combine the HDF5 files listed in *file_paths* into a single file
    written to *output_path*."""

Usage Examples

Basic HDF5 Operations

# Example script: create an HDF5 file, store arrays / DataFrames / groups,
# then read everything back. Requires the alphabase package.
from alphabase.io.hdf import HDF_File
import pandas as pd
import numpy as np

# Create or open HDF5 file
with HDF_File('data.h5', mode='w') as hf:
    # Store numpy array
    data_array = np.random.randn(1000, 50)
    hf['array_data'] = data_array
    
    # Store pandas DataFrame
    df = pd.DataFrame({
        'sequence': ['PEPTIDE', 'SEQUENCE'],
        'charge': [2, 3],
        'mz': [123.45, 234.56]
    })
    hf['precursors'] = df
    
    # Create groups for organization
    group = hf.create_group('experiments')
    group['exp1'] = np.random.randn(500, 10)
    group['exp2'] = np.random.randn(300, 15)
    
    # Set attributes
    # NOTE(review): set_attr/get_attr are documented on HDF_Object above, not on
    # HDF_File — presumably HDF_File inherits from HDF_Object; confirm in the package.
    hf.set_attr('version', '1.0')
    hf.set_attr('created_by', 'alphabase')

# Read data back
with HDF_File('data.h5', mode='r') as hf:
    # Access using dictionary syntax
    array_data = hf['array_data'][:]  # Load full array
    precursor_df = hf['precursors'].to_pandas()
    
    # Access using attribute syntax  
    exp1_data = hf.experiments.exp1[:]
    
    # Check file contents
    print(f"Keys: {hf.keys()}")
    print(f"Version: {hf.get_attr('version')}")

Memory-Mapped Arrays for Large Data

# Example script: allocate large temporary memory-mapped arrays and process
# them chunk-wise. Requires the alphabase package.
# NOTE(review): this snippet also uses np (np.float32 etc.) without importing
# numpy — it assumes `import numpy as np` from an earlier example.
from alphabase.io.tempmmap import array, zeros, ones, clear

# Create large memory-mapped arrays that don't fit in RAM
large_shape = (1000000, 100)  # 1M rows x 100 cols = 100M elements

# Create zero-filled memory-mapped array
large_zeros = zeros(large_shape, dtype=np.float32)
print(f"Created array shape: {large_zeros.shape}")

# Create ones-filled array
large_ones = ones((500000, 200), dtype=np.float64)

# Create empty array for computation
workspace = array((100000, 500), dtype=np.float32)

# Use arrays in computations without loading all data into memory
for i in range(0, large_shape[0], 10000):
    # Process one 10k-row chunk at a time; only the touched pages are resident
    chunk = large_zeros[i:i+10000]
    # Perform operations on chunk (writes go back to the mapped file)
    chunk[:] = np.random.randn(chunk.shape[0], chunk.shape[1])

# Clean up temporary files when done
clear()

Advanced HDF5 Operations

# Example script: round-trip a spectral library (several DataFrames plus
# metadata) through one HDF5 file. Requires the alphabase package.
from alphabase.io.hdf import HDF_File
from alphabase.spectral_library.base import SpecLibBase

# Save spectral library to HDF5
spec_lib = SpecLibBase()
# ... populate library ...

with HDF_File('spectral_library.h5', mode='w') as hf:
    # Save each DataFrame to separate group
    lib_group = hf.create_group('spectral_library')
    lib_group['precursors'] = spec_lib.precursor_df
    lib_group['fragments_mz'] = spec_lib.fragment_mz_df
    lib_group['fragments_intensity'] = spec_lib.fragment_intensity_df
    
    # Add metadata
    # NOTE(review): set_attr is documented on HDF_Object, not HDF_Group —
    # presumably the wrappers share that base class; confirm in the package.
    lib_group.set_attr('num_precursors', len(spec_lib.precursor_df))
    lib_group.set_attr('format_version', '2.0')
    lib_group.set_attr('creation_date', str(pd.Timestamp.now()))

# Load spectral library from HDF5
new_lib = SpecLibBase()
with HDF_File('spectral_library.h5', mode='r') as hf:
    lib_group = hf['spectral_library']
    new_lib.precursor_df = lib_group['precursors'].to_pandas()
    new_lib.fragment_mz_df = lib_group['fragments_mz'].to_pandas()
    new_lib.fragment_intensity_df = lib_group['fragments_intensity'].to_pandas()
    
    # Read metadata
    num_precursors = lib_group.get_attr('num_precursors')
    print(f"Loaded library with {num_precursors} precursors")

Efficient Data Processing Workflows

# Example script: stream a large HDF5 dataset through a fixed-size mmap
# workspace, normalizing row means chunk by chunk. Requires alphabase.
from alphabase.io.hdf import HDF_File
from alphabase.io.tempmmap import array
import numpy as np

# Process large dataset in chunks using HDF5 and memory mapping
input_file = 'large_dataset.h5'
output_file = 'processed_dataset.h5'

with HDF_File(input_file, 'r') as input_hf, \
     HDF_File(output_file, 'w') as output_hf:
    
    # Get input data info
    input_data = input_hf['raw_data']
    total_rows = input_data.shape[0]
    chunk_size = 10000
    
    # Create output dataset
    # NOTE(review): create_dataset is documented on HDF_Group above, not on
    # HDF_File — confirm that HDF_File exposes it directly.
    output_hf.create_dataset('processed_data', 
                           shape=input_data.shape, 
                           dtype=np.float32)
    
    # Create temporary workspace (one reusable chunk-sized buffer)
    workspace = array((chunk_size, input_data.shape[1]), dtype=np.float32)
    
    # Process in chunks
    for i in range(0, total_rows, chunk_size):
        # end_idx clamps the final (possibly short) chunk to total_rows
        end_idx = min(i + chunk_size, total_rows)
        
        # Load chunk
        chunk = input_data[i:end_idx]
        
        # Process data (example: subtract per-row mean)
        workspace[:chunk.shape[0]] = chunk
        workspace[:chunk.shape[0]] = (workspace[:chunk.shape[0]] - 
                                    workspace[:chunk.shape[0]].mean(axis=1, keepdims=True))
        
        # Save processed chunk
        output_hf['processed_data'][i:end_idx] = workspace[:chunk.shape[0]]
        
        print(f"Processed {end_idx}/{total_rows} rows")

print("Processing complete!")

File Management and Utilities

# Example script: inspect an HDF5 file and check memory headroom before
# allocating a large array. Requires the alphabase package.
# NOTE(review): this snippet uses np.float64 and array(...) without importing
# numpy or the tempmmap array helper here — it assumes earlier example imports.
from alphabase.io.hdf import get_hdf_info

# Get information about HDF5 file structure
file_info = get_hdf_info('spectral_library.h5')
print(f"File info: {file_info}")

# Check available memory before creating large arrays
from alphabase.io.tempmmap import get_available_memory, estimate_memory_usage

available = get_available_memory()
required = estimate_memory_usage((1000000, 100), dtype=np.float64)

print(f"Available memory: {available / 1e9:.1f} GB")
print(f"Required memory: {required / 1e9:.1f} GB")

if required < available * 0.8:  # Use max 80% of available memory
    large_array = array((1000000, 100), dtype=np.float64)
    print("Array created successfully")
else:
    print("Not enough memory, using smaller chunks")

Install with Tessl CLI

npx tessl i tessl/pypi-alphabase

docs

advanced-peptide-operations.md

advanced-spectral-libraries.md

chemical-constants.md

fragment-ions.md

index.md

io-utilities.md

protein-analysis.md

psm-readers.md

quantification.md

smiles-chemistry.md

spectral-libraries.md

tile.json