tessl/pypi-alphabase

An infrastructure Python package of the AlphaX ecosystem for MS proteomics

—

Pending

Quality

Pending

Does it follow best practices?

Impact

Pending

No eval scenarios have been run

Overview

Eval results

Files

Spectral Library Management

Name: tessl/pypi-alphabase
Author: tessl

Full-featured spectral library management with functionality for loading, processing, filtering, and exporting spectral libraries. It supports multiple formats, advanced operations such as decoy generation and isotope calculations, and integration with various mass spectrometry workflows.

Capabilities

Core Spectral Library Class

The main SpecLibBase class provides comprehensive spectral library functionality with integrated DataFrame management and processing capabilities.

class SpecLibBase:
    """
    Spectral library container built around four pandas DataFrames.
    
    Attributes:
    - precursor_df: DataFrame with precursor information (sequence, mods, charge, proteins)
    - peptide_df: DataFrame with unique peptide information
    - fragment_mz_df: DataFrame with fragment m/z values
    - fragment_intensity_df: DataFrame with fragment intensities
    """
    
    # Core DataFrames (each one is described in the class docstring)
    precursor_df: pd.DataFrame
    peptide_df: pd.DataFrame  
    fragment_mz_df: pd.DataFrame
    fragment_intensity_df: pd.DataFrame
    
    def __init__(self) -> None:
        """Initialize an empty spectral library."""
    
    def copy(self) -> 'SpecLibBase':
        """
        Create a deep copy of the spectral library.
        
        Returns:
        New SpecLibBase instance with copied data
        """
    
    def append(self, other: 'SpecLibBase') -> None:
        """
        Append another spectral library to this one.
        
        Parameters:
        - other: Another SpecLibBase instance whose content is appended
        """
    
    def refine_df(self) -> None:
        """
        Sort and optimize all DataFrames for performance.
        Sets proper indexing and memory layout.
        """
    
    def append_decoy_sequence(self, decoy_sequence: str, 
                            decoy_proteins: str = "decoy") -> None:
        """
        Add one decoy sequence to the library.
        
        Parameters:
        - decoy_sequence: Decoy sequence string
        - decoy_proteins: Protein identifier assigned to the decoy
                          (defaults to "decoy")
        """

Mass and M/Z Calculations

Methods for calculating precursor and fragment m/z values with support for modifications and charge states.

class SpecLibBase:
    def calc_precursor_mz(self) -> None:
        """
        Calculate precursor m/z values from mass and charge.
        Updates precursor_df with 'mz' column.
        """
    
    def calc_fragment_mz_df(self, frag_types: Optional[List[str]] = None) -> None:
        """
        Generate the fragment m/z DataFrame for all precursors.
        
        Parameters:
        - frag_types: List of fragment types like ['b+', 'y+', 'b++', 'y++'].
                     If None (the default), the default fragment types are used.
        """
    
    def update_precursor_mz(self) -> None:
        """
        Update precursor m/z values after modifications.
        Alias for calc_precursor_mz(), kept for backwards compatibility.
        """

Hashing and Identification

Methods for generating hash codes for fast precursor lookup and deduplication.

class SpecLibBase:
    def hash_precursor_df(self) -> None:
        """
        Add hash columns to the precursor DataFrame.
        Adds the 'seq_hash' and 'prec_hash' columns for fast lookup.
        """
    
    def get_mod_seq_hash(self) -> pd.Series:
        """
        Generate hash codes for modified peptide sequences.
        
        Returns:
        Series with one hash code per sequence
        """
    
    def get_mod_seq_charge_hash(self) -> pd.Series:
        """
        Generate hash codes for precursors (modified sequence + charge).
        
        Returns:
        Series with one hash code per precursor
        """

Isotope Calculations

Methods for calculating isotope patterns and intensities for precursors.

class SpecLibBase:
    def calc_precursor_isotope_info(self, max_isotope: int = 6) -> None:
        """
        Calculate isotope envelope information for precursors.
        
        Parameters:
        - max_isotope: Maximum number of isotope peaks to calculate (default 6)
        """
    
    def calc_precursor_isotope_info_mp(self, max_isotope: int = 6, 
                                      n_jobs: int = 8) -> None:
        """
        Multiprocessing variant of the isotope information calculation.
        
        Parameters:
        - max_isotope: Maximum number of isotope peaks to calculate (default 6)
        - n_jobs: Number of parallel processes (default 8)
        """
    
    def calc_precursor_isotope_intensity(self, max_isotope: int = 6) -> None:
        """
        Calculate isotope pattern intensities for precursors.
        
        Parameters:
        - max_isotope: Maximum number of isotope peaks to calculate (default 6)
        """
    
    def calc_precursor_isotope_intensity_mp(self, max_isotope: int = 6,
                                           n_jobs: int = 8) -> None:
        """
        Multiprocessing variant of the isotope intensity calculation.
        
        Parameters:
        - max_isotope: Maximum number of isotope peaks to calculate (default 6)
        - n_jobs: Number of parallel processes (default 8)
        """

Fragment Processing

Methods for processing and optimizing fragment data within the spectral library.

class SpecLibBase:
    def remove_unused_fragments(self) -> None:
        """
        Remove fragment entries with zero intensity across all precursors.
        Compresses the fragment DataFrames to save memory.
        """
    
    def calc_fragment_count(self) -> pd.Series:
        """
        Count the number of fragments per precursor.
        
        Returns:
        Series with fragment counts indexed by precursor
        """
    
    def filter_fragment_number(self, top_k: int = 100) -> None:
        """
        Keep only the top-k fragments per precursor, ranked by intensity.
        
        Parameters:
        - top_k: Number of top fragments to retain per precursor (default 100)
        """
    
    def sort_fragment_by_intensity(self, ascending: bool = False) -> None:
        """
        Sort fragments by intensity within each precursor.
        
        Parameters:
        - ascending: Sort order; the default False puts the highest
                     intensity first
        """

I/O Operations

I/O methods for saving and loading spectral libraries in HDF5 format; other formats may be available through external functions.

class SpecLibBase:
    def save_hdf(self, filepath: str, **kwargs) -> None:
        """
        Save the spectral library to HDF5 format.
        
        Parameters:
        - filepath: Output HDF5 file path
        - **kwargs: Additional HDF5 options
        """
    
    def load_hdf(self, filepath: str, **kwargs) -> None:
        """
        Load the spectral library from HDF5 format.
        
        Parameters:
        - filepath: Input HDF5 file path
        - **kwargs: Additional loading options
        """
    
    # Note: only HDF5 save/load is listed here. Additional export formats may be
    # available through external functions; check the alphabase.spectral_library
    # module for format-specific export utilities.

Library Statistics and Analysis

Analysis of spectral library content and quality metrics. No dedicated SpecLibBase methods are listed for this; see the note below.

class SpecLibBase:
    # Note: no statistics or validation methods are listed for this class here.
    # Such functionality may be available through external functions in the
    # alphabase.spectral_library module.
    pass

Utility Functions

Standalone functions for spectral library operations and annotations.

def annotate_fragments_from_speclib(target_lib: SpecLibBase,
                                   donor_lib: SpecLibBase,
                                   match_tolerance: float = 0.02) -> None:
    """
    Annotate the fragments of a target library using a donor spectral library.
    
    Parameters:
    - target_lib: Target library to annotate
    - donor_lib: Donor library with reference spectra
    - match_tolerance: Mass tolerance for matching, in Da (default 0.02)
    """

def get_available_columns(spec_lib: SpecLibBase) -> dict:
    """
    Get the available DataFrame columns across all library components.
    
    Parameters:
    - spec_lib: Spectral library instance to inspect
    
    Returns:
    Dictionary with the available columns for each DataFrame
    """

# Note: Additional utility functions for library merging and filtering
# may be available in the alphabase.spectral_library module

Usage Examples

Basic Library Creation and Processing

from alphabase.spectral_library.base import SpecLibBase
import pandas as pd

# Create a new, empty spectral library
spec_lib = SpecLibBase()

# Build the precursor table: one row per precursor
precursor_df = pd.DataFrame({
    'sequence': ['PEPTIDE', 'SEQUENCE', 'EXAMPLE'],
    'mods': ['', 'Phospho (STY)@2', 'Oxidation (M)@1'],
    'charge': [2, 3, 2],
    'proteins': ['P12345', 'P67890', 'P11111'],
    'rt': [25.5, 32.1, 28.7]  # retention times
})

# Attach the precursor table to the library
spec_lib.precursor_df = precursor_df

# Sort and optimize the DataFrame structure
spec_lib.refine_df()

# Calculate precursor m/z values
spec_lib.calc_precursor_mz()

# Generate fragment m/z values for the requested fragment types
frag_types = ['b+', 'y+', 'b++', 'y++']
spec_lib.calc_fragment_mz_df(frag_types)

# Report the size of the resulting library
print(f"Library contains {len(spec_lib.precursor_df)} precursors")
print(f"Generated {len(spec_lib.fragment_mz_df)} fragment entries")

Library I/O Operations

# Save the library in HDF5 format
spec_lib.save_hdf('my_library.hdf5')

# Load the saved file back into a fresh, empty library instance
new_lib = SpecLibBase()
new_lib.load_hdf('my_library.hdf5')

# Additional export formats may be available through external functions
# Check alphabase.spectral_library module for format-specific exporters

Advanced Processing

# Add hash codes for fast lookup
spec_lib.hash_precursor_df()

# Calculate isotope envelope information (up to 6 isotope peaks)
spec_lib.calc_precursor_isotope_info(max_isotope=6)

# Keep only the 50 most intense fragments per precursor
# NOTE(review): this ranks by intensity, but the earlier example only generates
# fragment m/z values - presumably fragment intensities must be populated first; confirm.
spec_lib.filter_fragment_number(top_k=50)

# Remove unused fragment entries
spec_lib.remove_unused_fragments()

# Library statistics can be calculated manually:
print(f"Precursors: {len(spec_lib.precursor_df)}")
print(f"Fragments: {len(spec_lib.fragment_mz_df)}")

Library Merging and Filtering

# Merge multiple libraries using the append method
lib1 = SpecLibBase()
lib2 = SpecLibBase()
# ... populate libraries ...

# Merge into a copy so that lib1 itself is left untouched
merged_lib = lib1.copy()
merged_lib.append(lib2)

# Filter by specific proteins using pandas operations.
# isin() compares whole cell values, so only rows whose 'proteins' entry is
# exactly one of the targets are selected. The result is a separate DataFrame;
# merged_lib.precursor_df is not reassigned.
target_proteins = ['P12345', 'P67890']
filtered_precursors = merged_lib.precursor_df[
    merged_lib.precursor_df['proteins'].isin(target_proteins)
]

print(f"Merged library: {len(merged_lib.precursor_df)} precursors")
print(f"Filtered precursors: {len(filtered_precursors)} precursors")

Library Validation and Quality Control

# Manual validation and quality control: report the size of each DataFrame
print("Library integrity check:")
print(f"  Precursors: {len(spec_lib.precursor_df)}")
print(f"  Fragment m/z entries: {len(spec_lib.fragment_mz_df)}")
print(f"  Fragment intensity entries: {len(spec_lib.fragment_intensity_df)}")

# Get fragment count statistics (one count per precursor)
frag_counts = spec_lib.calc_fragment_count()
print(f"Average fragments per precursor: {frag_counts.mean():.1f}")
print(f"Min fragments: {frag_counts.min()}, Max fragments: {frag_counts.max()}")

# Check available columns
# NOTE(review): get_available_columns is not imported by any snippet shown here;
# import it from the module that defines it before running this example - confirm location.
available_cols = get_available_columns(spec_lib)
print(f"Available columns: {available_cols}")

Working with Decoys

# Create a copy that will receive the decoy entries
decoy_lib = spec_lib.copy()

# Add decoy sequences (typically done with specialized decoy generation).
# Only the 'sequence' and 'proteins' columns are read, so iterate over those two
# columns directly rather than building a Series per row with iterrows().
source_df = spec_lib.precursor_df
for sequence, proteins in zip(source_df['sequence'], source_df['proteins']):
    # Reverse sequence as simple decoy strategy
    decoy_seq = sequence[::-1]
    decoy_lib.append_decoy_sequence(decoy_seq, decoy_proteins="DECOY_" + proteins)

print(f"Original library: {len(spec_lib.precursor_df)} precursors")
print(f"With decoys: {len(decoy_lib.precursor_df)} precursors")

Install with Tessl CLI