An infrastructure Python package of the AlphaX ecosystem for MS proteomics.

Quality: Pending — it has not yet been assessed whether the package follows best practices.
Impact: Pending — no eval scenarios have been run.
Comprehensive peptide processing capabilities including precursor calculations, mass calculations, ion mobility transformations, and advanced algorithmic operations. Provides high-performance functions for large-scale peptide analysis, isotope modeling, and multi-dimensional separations integration.
Advanced functions for precursor-level calculations including m/z computation, hashing, and isotope pattern analysis.
def update_precursor_mz(precursor_df: pd.DataFrame,
batch_size: int = 100000) -> None:
"""
Calculate and update precursor m/z values in DataFrame.
Parameters:
- precursor_df: DataFrame with sequence, mods, charge columns
- batch_size: Batch size for memory-efficient processing
Modifies precursor_df in-place by adding 'mz' column
"""
def calc_precursor_mz(precursor_df: pd.DataFrame,
batch_size: int = 100000) -> np.ndarray:
"""
Calculate precursor m/z values from sequence and modifications.
Parameters:
- precursor_df: DataFrame with peptide information
- batch_size: Processing batch size
Returns:
Array of precursor m/z values
"""
def refine_precursor_df(precursor_df: pd.DataFrame,
drop_frag_idx: bool = True,
ensure_data_validity: bool = True) -> pd.DataFrame:
"""
Optimize and validate precursor DataFrame structure.
Parameters:
- precursor_df: Input precursor DataFrame
- drop_frag_idx: Whether to drop fragment indexing columns
- ensure_data_validity: Perform data validation checks
Returns:
Refined and optimized precursor DataFrame
"""
def is_precursor_refined(precursor_df: pd.DataFrame) -> bool:
"""
Check if precursor DataFrame has been refined/optimized.
Parameters:
- precursor_df: DataFrame to check
Returns:
True if DataFrame is in refined state
"""
def is_precursor_sorted(precursor_df: pd.DataFrame) -> bool:
"""
Check if precursor DataFrame is properly sorted.
Parameters:
- precursor_df: DataFrame to check
Returns:
True if DataFrame is sorted by precursor index
"""

Functions for generating hash codes for fast peptide lookup, deduplication, and comparison operations.
def get_mod_seq_hash(sequence: List[str],
mod_names: List[List[str]],
mod_sites: List[List[int]],
seed: int = 42) -> np.ndarray:
"""
Generate hash codes for modified peptide sequences.
Parameters:
- sequence: List of peptide sequences
- mod_names: List of modification names for each sequence
- mod_sites: List of modification sites for each sequence
- seed: Random seed for reproducible hashing
Returns:
Array of hash codes for each modified sequence
"""
def get_mod_seq_charge_hash(sequence: List[str],
mod_names: List[List[str]],
mod_sites: List[List[int]],
charge: List[int],
seed: int = 42) -> np.ndarray:
"""
Generate hash codes for precursors (sequence + charge).
Parameters:
- sequence: List of peptide sequences
- mod_names: List of modification names for each sequence
- mod_sites: List of modification sites for each sequence
- charge: List of precursor charges
- seed: Random seed for reproducible hashing
Returns:
Array of hash codes for each precursor
"""
def hash_mod_seq_df(precursor_df: pd.DataFrame,
seed: int = 42) -> pd.Series:
"""
Generate sequence hash codes for precursor DataFrame.
Parameters:
- precursor_df: DataFrame with sequence, mods, mod_sites
- seed: Random seed for hashing
Returns:
Series with hash codes indexed by DataFrame index
"""
def hash_mod_seq_charge_df(precursor_df: pd.DataFrame,
seed: int = 42) -> pd.Series:
"""
Generate precursor hash codes including charge state.
Parameters:
- precursor_df: DataFrame with sequence, mods, mod_sites, charge
- seed: Random seed for hashing
Returns:
Series with precursor hash codes
"""
def hash_precursor_df(precursor_df: pd.DataFrame,
seed: int = 42) -> None:
"""
Add hash columns to precursor DataFrame in-place.
Parameters:
- precursor_df: DataFrame to modify
- seed: Random seed for hashing
Adds 'seq_hash' and 'prec_hash' columns to DataFrame
"""

Advanced functions for calculating isotope patterns, intensities, and distributions for precursors.
def calc_precursor_isotope_info(precursor_df: pd.DataFrame,
max_isotope: int = 6) -> None:
"""
Calculate isotope envelope information for precursors.
Parameters:
- precursor_df: DataFrame with peptide sequences and modifications
- max_isotope: Maximum number of isotope peaks to calculate
Adds isotope-related columns to precursor_df in-place
"""
def calc_precursor_isotope_info_mp(precursor_df: pd.DataFrame,
max_isotope: int = 6,
n_jobs: int = 8) -> None:
"""
Multiprocessing isotope information calculation.
Parameters:
- precursor_df: DataFrame with peptide information
- max_isotope: Maximum isotope peaks to calculate
- n_jobs: Number of parallel processes
Adds isotope information using parallel processing
"""
def calc_precursor_isotope_intensity(precursor_df: pd.DataFrame,
max_isotope: int = 6) -> None:
"""
Calculate detailed isotope pattern intensities.
Parameters:
- precursor_df: DataFrame with peptide information
- max_isotope: Maximum isotope peaks for intensity calculation
Adds isotope intensity columns to DataFrame
"""
def calc_precursor_isotope_intensity_mp(precursor_df: pd.DataFrame,
max_isotope: int = 6,
n_jobs: int = 8) -> None:
"""
Multiprocessing isotope intensity calculation.
Parameters:
- precursor_df: DataFrame with peptide information
- max_isotope: Maximum isotope peaks
- n_jobs: Number of parallel processes
Parallel calculation of isotope intensities
"""
def get_mod_seq_formula(sequence: List[str],
mod_names: List[List[str]],
mod_sites: List[List[int]]) -> List[str]:
"""
Generate chemical formulas for modified peptide sequences.
Parameters:
- sequence: List of peptide sequences
- mod_names: List of modification names for each sequence
- mod_sites: List of modification sites for each sequence
Returns:
List of chemical formula strings for each modified sequence
"""

Efficient mass calculation functions optimized for batch processing and high-throughput analysis.
def calc_b_y_and_peptide_masses_for_same_len_seqs(sequences: List[str],
mod_names: List[List[str]] = None,
mod_sites: List[List[int]] = None,
aa_mass_diffs: List[List[float]] = None,
aa_mass_diff_sites: List[List[int]] = None) -> tuple[np.ndarray, np.ndarray, np.ndarray]:
"""
Batch calculate b/y fragments and peptide masses for equal-length sequences.
Parameters:
- sequences: List of equal-length peptide sequences
- mod_names: Optional modification names for each sequence
- mod_sites: Optional modification sites for each sequence
- aa_mass_diffs: Optional amino acid mass differences
- aa_mass_diff_sites: Optional sites for mass differences
Returns:
Tuple of (b_ion_masses, y_ion_masses, peptide_masses)
All arrays have optimized memory layout for equal-length sequences
"""
def calc_peptide_masses_for_same_len_seqs(sequences: List[str],
mod_list: List[tuple] = None,
mod_diff_list: List[tuple] = None) -> np.ndarray:
"""
Calculate peptide masses for equal-length sequences efficiently.
Parameters:
- sequences: List of equal-length peptide sequences
- mod_list: List of (mod_names, mod_sites) tuples
- mod_diff_list: List of (mass_diffs, mass_diff_sites) tuples
Returns:
1D array of peptide masses with optimized computation
"""
def calc_diff_modification_mass(pep_len: int,
mass_diffs: List[float],
mass_diff_sites: List[int]) -> np.ndarray:
"""
Calculate mass differences for open search workflows.
Parameters:
- pep_len: Peptide sequence length
- mass_diffs: List of mass differences to apply
- mass_diff_sites: List of sites where mass differences occur
Returns:
2D array with mass differences by position
"""
def calc_mod_diff_masses_for_same_len_seqs(nAA: int,
aa_mass_diffs_list: List[List[float]],
mod_sites_list: List[List[int]]) -> np.ndarray:
"""
Batch calculation of modification mass differences.
Parameters:
- nAA: Number of amino acids (sequence length)
- aa_mass_diffs_list: List of mass difference arrays
- mod_sites_list: List of modification site arrays
Returns:
3D array with mass differences for batch processing
"""

Functions for collision cross section (CCS) and ion mobility calculations across different instrument platforms.
def get_reduced_mass(precursor_mzs: np.ndarray,
charges: np.ndarray) -> np.ndarray:
"""
Calculate reduced mass for ion mobility calculations.
Parameters:
- precursor_mzs: Array of precursor m/z values
- charges: Array of precursor charges
Returns:
Array of reduced masses
"""
def ccs_to_mobility_bruker(ccs: np.ndarray,
mz: np.ndarray,
charge: np.ndarray,
mass_gas: float = 28.014,
temp: float = 273.15,
t_diff: float = 0.0) -> np.ndarray:
"""
Convert collision cross section to ion mobility (Bruker platform).
Parameters:
- ccs: Array of CCS values (Ų)
- mz: Array of m/z values
- charge: Array of charge states
- mass_gas: Mass of drift gas (default: N2)
- temp: Temperature in Kelvin
- t_diff: Temperature difference correction
Returns:
Array of ion mobility values (1/K0)
"""
def mobility_to_ccs_bruker(mobility: np.ndarray,
mz: np.ndarray,
charge: np.ndarray,
mass_gas: float = 28.014,
temp: float = 273.15,
t_diff: float = 0.0) -> np.ndarray:
"""
Convert ion mobility to collision cross section (Bruker platform).
Parameters:
- mobility: Array of ion mobility values (1/K0)
- mz: Array of m/z values
- charge: Array of charge states
- mass_gas: Mass of drift gas (default: N2)
- temp: Temperature in Kelvin
- t_diff: Temperature difference correction
Returns:
Array of CCS values (Ų)
"""
def ccs_to_mobility_waters(ccs: np.ndarray,
mz: np.ndarray,
charge: np.ndarray,
**kwargs) -> np.ndarray:
"""
Convert CCS to ion mobility for Waters instruments.
Parameters:
- ccs: Array of CCS values
- mz: Array of m/z values
- charge: Array of charge states
- **kwargs: Platform-specific parameters
Returns:
Array of ion mobility values
"""
def mobility_to_ccs_waters(mobility: np.ndarray,
mz: np.ndarray,
charge: np.ndarray,
**kwargs) -> np.ndarray:
"""
Convert ion mobility to CCS for Waters instruments.
Parameters:
- mobility: Array of ion mobility values
- mz: Array of m/z values
- charge: Array of charge states
- **kwargs: Platform-specific parameters
Returns:
Array of CCS values
"""
def ccs_to_mobility_for_df(precursor_df: pd.DataFrame,
vendor_type: str = 'bruker') -> None:
"""
Convert CCS to mobility values for precursor DataFrame.
Parameters:
- precursor_df: DataFrame with ccs, mz, charge columns
- vendor_type: Instrument vendor ('bruker', 'waters', 'agilent')
Adds 'mobility' column to DataFrame in-place
"""
def mobility_to_ccs_for_df(precursor_df: pd.DataFrame,
vendor_type: str = 'bruker') -> None:
"""
Convert mobility to CCS values for precursor DataFrame.
Parameters:
- precursor_df: DataFrame with mobility, mz, charge columns
- vendor_type: Instrument vendor ('bruker', 'waters', 'agilent')
Adds 'ccs' column to DataFrame in-place
"""

Functions optimized for high-throughput peptide processing with memory efficiency and parallel computation.
def process_precursors_in_batches(precursor_df: pd.DataFrame,
processing_func: callable,
batch_size: int = 100000,
n_jobs: int = 1,
**kwargs) -> pd.DataFrame:
"""
Process large precursor DataFrames in memory-efficient batches.
Parameters:
- precursor_df: Large precursor DataFrame
- processing_func: Function to apply to each batch
- batch_size: Number of precursors per batch
- n_jobs: Number of parallel processes
- **kwargs: Additional arguments for processing function
Returns:
Processed DataFrame with results from all batches
"""
def optimize_precursor_memory_layout(precursor_df: pd.DataFrame) -> pd.DataFrame:
"""
Optimize DataFrame memory layout for computational efficiency.
Parameters:
- precursor_df: Input precursor DataFrame
Returns:
DataFrame with optimized memory layout and data types
"""
def validate_precursor_data_integrity(precursor_df: pd.DataFrame) -> dict:
"""
Validate precursor data for completeness and consistency.
Parameters:
- precursor_df: Precursor DataFrame to validate
Returns:
Dictionary with validation results and any issues found
"""
def create_precursor_index_mapping(precursor_df: pd.DataFrame) -> dict:
"""
Create efficient index mappings for fast precursor lookup.
Parameters:
- precursor_df: Precursor DataFrame
Returns:
Dictionary with various index mappings for optimized access
"""

Functions for statistical analysis and quality assessment of peptide-level data.
def calculate_precursor_statistics(precursor_df: pd.DataFrame) -> pd.DataFrame:
"""
Calculate comprehensive statistics for precursor data.
Parameters:
- precursor_df: Precursor DataFrame
Returns:
DataFrame with statistical summaries
"""
def detect_precursor_outliers(precursor_df: pd.DataFrame,
method: str = 'zscore',
threshold: float = 3.0) -> pd.Series:
"""
Detect outlier precursors based on various metrics.
Parameters:
- precursor_df: Precursor DataFrame
- method: Outlier detection method ('zscore', 'iqr', 'isolation_forest')
- threshold: Threshold for outlier detection
Returns:
Boolean Series indicating outliers
"""
def analyze_modification_patterns(precursor_df: pd.DataFrame) -> dict:
"""
Analyze patterns in peptide modifications.
Parameters:
- precursor_df: DataFrame with modification information
Returns:
Dictionary with modification analysis results
"""
def assess_sequence_coverage(precursor_df: pd.DataFrame,
protein_sequences: dict) -> pd.DataFrame:
"""
Assess protein sequence coverage from identified precursors.
Parameters:
- precursor_df: DataFrame with precursor sequences and proteins
- protein_sequences: Dictionary mapping protein IDs to sequences
Returns:
DataFrame with coverage statistics per protein
"""

from alphabase.peptide.precursor import (
update_precursor_mz, refine_precursor_df, hash_precursor_df
)
import pandas as pd
# Create precursor DataFrame
precursor_df = pd.DataFrame({
'sequence': ['PEPTIDE', 'SEQUENCE', 'EXAMPLE'],
'mods': ['', 'Phospho (STY)@2', 'Oxidation (M)@1'],
'charge': [2, 3, 2],
'proteins': ['P12345', 'P67890', 'P11111']
})
# Refine DataFrame structure
refined_df = refine_precursor_df(precursor_df, ensure_data_validity=True)
# Calculate m/z values
update_precursor_mz(refined_df)
print(f"Added m/z values: {refined_df['mz'].tolist()}")
# Add hash codes for fast lookup
hash_precursor_df(refined_df)
print(f"Added hash columns: {refined_df.columns.tolist()}")

from alphabase.peptide.precursor import (
calc_precursor_isotope_info, calc_precursor_isotope_intensity
)
# Calculate isotope envelope information
calc_precursor_isotope_info(refined_df, max_isotope=6)
print(f"Isotope columns: {[col for col in refined_df.columns if 'isotope' in col]}")
# Calculate detailed isotope intensities
calc_precursor_isotope_intensity(refined_df, max_isotope=6)
print(f"Isotope intensity patterns calculated for {len(refined_df)} precursors")
# For large datasets, use multiprocessing
from alphabase.peptide.precursor import calc_precursor_isotope_info_mp
# (assumes large_precursor_df is a large precursor DataFrame prepared as above)
calc_precursor_isotope_info_mp(large_precursor_df, max_isotope=6, n_jobs=8)

from alphabase.peptide.mass_calc import (
calc_b_y_and_peptide_masses_for_same_len_seqs,
calc_peptide_masses_for_same_len_seqs
)
# Efficient batch processing for same-length sequences
same_len_sequences = ['PEPTIDE', 'EXAMPLE', 'TESTPEP'] # All length 7
mod_names = [[], ['Oxidation (M)'], []]
mod_sites = [[], [4], []]
# Calculate b/y fragments and peptide masses
b_masses, y_masses, peptide_masses = calc_b_y_and_peptide_masses_for_same_len_seqs(
sequences=same_len_sequences,
mod_names=mod_names,
mod_sites=mod_sites
)
print(f"B-ion masses shape: {b_masses.shape}")
print(f"Y-ion masses shape: {y_masses.shape}")
print(f"Peptide masses: {peptide_masses}")
# For peptide masses only
peptide_masses_only = calc_peptide_masses_for_same_len_seqs(
sequences=same_len_sequences,
mod_list=list(zip(mod_names, mod_sites))
)
print(f"Peptide masses: {peptide_masses_only}")

from alphabase.peptide.mobility import (
ccs_to_mobility_for_df, mobility_to_ccs_for_df,
ccs_to_mobility_bruker, mobility_to_ccs_bruker
)
# Add CCS values to DataFrame (example values)
mobility_df = refined_df.copy()
mobility_df['ccs'] = [150.5, 180.2, 165.8] # Example CCS values
# Convert CCS to mobility for Bruker platform
ccs_to_mobility_for_df(mobility_df, vendor_type='bruker')
print(f"Added mobility values: {mobility_df['mobility'].tolist()}")
# Convert back to CCS to verify
test_df = mobility_df[['mobility', 'mz', 'charge']].copy()
mobility_to_ccs_for_df(test_df, vendor_type='bruker')
print(f"Verified CCS values: {test_df['ccs'].tolist()}")
# Direct array calculations
import numpy as np
ccs_values = np.array([150.5, 180.2, 165.8])
mz_values = mobility_df['mz'].values
charge_values = mobility_df['charge'].values
mobility_values = ccs_to_mobility_bruker(ccs_values, mz_values, charge_values)
print(f"Direct mobility calculation: {mobility_values}")

from alphabase.peptide.precursor import process_precursors_in_batches
import numpy as np
# Create large dataset for demonstration
np.random.seed(42)
large_df = pd.DataFrame({
'sequence': ['PEPTIDE'] * 100000 + ['EXAMPLE'] * 100000,
'charge': np.random.choice([2, 3, 4], 200000),
'proteins': [f'P{i:05d}' for i in range(200000)]
})
# Define processing function
def add_theoretical_rt(batch_df):
"""Add theoretical retention time based on sequence properties."""
batch_df = batch_df.copy()
# Simple hydrophobicity-based RT prediction (example)
hydrophobic_aas = ['A', 'I', 'L', 'F', 'W', 'Y', 'V']
batch_df['theoretical_rt'] = [
sum(1 for aa in seq if aa in hydrophobic_aas) * 2.5 + 10
for seq in batch_df['sequence']
]
return batch_df
# Process in batches
processed_df = process_precursors_in_batches(
large_df,
processing_func=add_theoretical_rt,
batch_size=50000,
n_jobs=4
)
print(f"Processed {len(processed_df)} precursors with theoretical RT")
print(f"RT range: {processed_df['theoretical_rt'].min():.1f} - {processed_df['theoretical_rt'].max():.1f}")

from alphabase.peptide.precursor import (
validate_precursor_data_integrity,
detect_precursor_outliers,
calculate_precursor_statistics
)
# Validate data integrity
validation_results = validate_precursor_data_integrity(processed_df)
print(f"Validation results:")
for check, result in validation_results.items():
print(f" {check}: {result}")
# Calculate comprehensive statistics
stats_df = calculate_precursor_statistics(processed_df)
print(f"Precursor statistics:")
print(stats_df.head())
# Detect outliers
outliers = detect_precursor_outliers(
processed_df,
method='zscore',
threshold=3.0
)
print(f"Detected {outliers.sum()} outlier precursors ({outliers.mean()*100:.1f}%)")
# Remove outliers
clean_df = processed_df[~outliers].copy()
print(f"Clean dataset: {len(clean_df)} precursors")

from alphabase.peptide.precursor import analyze_modification_patterns
# Add more complex modifications for analysis
complex_df = refined_df.copy()
complex_df['mods'] = [
'Oxidation (M)@3;Acetyl (Protein N-term)@0',
'Phospho (STY)@2;Phospho (STY)@5',
'Carbamidomethyl (C)@2;Oxidation (M)@6'
]
# Analyze modification patterns
mod_analysis = analyze_modification_patterns(complex_df)
print(f"Modification analysis:")
print(f" Most common modifications: {mod_analysis['common_mods']}")
print(f" Co-occurring modifications: {mod_analysis['cooccurrence']}")
print(f"  Site preferences: {mod_analysis['site_preferences']}")

from alphabase.peptide.precursor import optimize_precursor_memory_layout
# Check memory usage before optimization
print(f"Memory usage before optimization: {processed_df.memory_usage(deep=True).sum() / 1e6:.1f} MB")
# Optimize memory layout
optimized_df = optimize_precursor_memory_layout(processed_df)
print(f"Memory usage after optimization: {optimized_df.memory_usage(deep=True).sum() / 1e6:.1f} MB")
# Compare data types
print("Data type changes:")
for col in processed_df.columns:
if col in optimized_df.columns:
old_dtype = processed_df[col].dtype
new_dtype = optimized_df[col].dtype
if old_dtype != new_dtype:
print(f"  {col}: {old_dtype} -> {new_dtype}")

Install with the Tessl CLI:

npx tessl i tessl/pypi-alphabase