tessl/pypi-alphabase

An infrastructure Python package of the AlphaX ecosystem for MS proteomics

—

Pending

Quality

Pending

Does it follow best practices?

Impact

Pending

No eval scenarios have been run

Overview

Eval results

Files

PSM Reading and Processing

Name: tessl/pypi-alphabase
Author: tessl

Unified interface for reading Peptide-Spectrum Match (PSM) files from multiple proteomics search engines. Standardizes column mappings and data formats across different tools for seamless data integration and downstream analysis workflows.

Capabilities

Base PSM Reader Class

Foundation class providing common functionality for all PSM readers, with a standardized interface and column mapping.

class PSMReaderBase:
    """
    Base class for all PSM readers with common functionality.

    Declares the interface shared by the search-engine specific readers:
    file import, column and modification mappings, and file format
    validation. Only signatures are listed here; implementations are not
    shown.
    """

    def __init__(self):
        """Initialize PSM reader with default settings."""

    def import_file(self, filepath: str) -> pd.DataFrame:
        """
        Import PSM file and return standardized DataFrame.

        Parameters:
        - filepath: Path to PSM file

        Returns:
        DataFrame with standardized column names and data types

        NOTE(review): the exact set of output columns is not stated here;
        presumably those listed in STANDARD_COLUMNS — confirm.
        """

    def get_modification_mapping(self) -> dict:
        """
        Get modification name mapping for this search engine.

        Returns:
        Dictionary mapping search engine mod names to standard names
        """

    def get_column_mapping(self) -> dict:
        """
        Get column name mapping for this search engine.

        Returns:
        Dictionary mapping search engine columns to standard names
        """

    def set_modification_mapping(self, mod_mapping: dict) -> None:
        """
        Set custom modification mapping.

        Parameters:
        - mod_mapping: Dictionary with modification name mappings

        NOTE(review): whether the given mapping replaces the current one or
        is merged into it is not specified here — confirm.
        """

    def validate_file_format(self, filepath: str) -> bool:
        """
        Validate if file format matches this reader.

        Parameters:
        - filepath: Path to file to validate

        Returns:
        True if file format is compatible (presumably False otherwise)
        """

Search Engine Specific Readers

Individual reader classes for different proteomics search engines, each inheriting from PSMReaderBase.

class MaxQuantReader(PSMReaderBase):
    """
    Reader for MaxQuant msms.txt and evidence.txt files.

    Redeclares import_file from PSMReaderBase; the remaining base-class
    methods are not relisted for this reader.
    """

    def __init__(self):
        """Initialize MaxQuant reader with specific column mappings."""

    def import_file(self, filepath: str) -> pd.DataFrame:
        """
        Import MaxQuant output file.

        Parameters:
        - filepath: Path to msms.txt or evidence.txt file

        Returns:
        Standardized DataFrame with MaxQuant PSM data
        """

class DiannReader(PSMReaderBase):
    """
    Reader for DIA-NN report.tsv files.

    Redeclares import_file from PSMReaderBase; the remaining base-class
    methods are not relisted for this reader.
    """

    def __init__(self):
        """Initialize DIA-NN reader with specific settings."""

    def import_file(self, filepath: str) -> pd.DataFrame:
        """
        Import DIA-NN report file.

        Parameters:
        - filepath: Path to DIA-NN report.tsv file

        Returns:
        Standardized DataFrame with DIA-NN results
        """

class SpectronautReader(PSMReaderBase):
    """
    Reader for Spectronaut export files.

    Redeclares import_file from PSMReaderBase; the remaining base-class
    methods are not relisted for this reader.
    """

    def __init__(self):
        """Initialize Spectronaut reader."""

    def import_file(self, filepath: str) -> pd.DataFrame:
        """
        Import Spectronaut export file.

        Parameters:
        - filepath: Path to Spectronaut export file

        Returns:
        Standardized DataFrame with Spectronaut data
        """

class SwathReader(PSMReaderBase):
    """
    Reader for SWATH output files.

    Only the constructor is listed for this reader; import_file is
    presumably inherited from PSMReaderBase — confirm.
    """

    def __init__(self):
        """Initialize SWATH reader."""

class SpectronautReportReader(PSMReaderBase):
    """
    Reader for Spectronaut report files.

    Only the constructor is listed for this reader; import_file is
    presumably inherited from PSMReaderBase — confirm.
    """

    def __init__(self):
        """Initialize Spectronaut report reader."""

class MSFragger_PSM_TSV_Reader(PSMReaderBase):
    """
    Reader for MSFragger PSM TSV files.

    Redeclares import_file from PSMReaderBase; the remaining base-class
    methods are not relisted for this reader.
    """

    def __init__(self):
        """Initialize MSFragger TSV reader."""

    def import_file(self, filepath: str) -> pd.DataFrame:
        """
        Import MSFragger PSM TSV file.

        Parameters:
        - filepath: Path to MSFragger psm.tsv file

        Returns:
        Standardized DataFrame with MSFragger PSM data
        """

class MSFraggerPepXMLReader(PSMReaderBase):
    """
    Reader for MSFragger pepXML files.

    Redeclares import_file from PSMReaderBase; the remaining base-class
    methods are not relisted for this reader.
    """

    def __init__(self):
        """Initialize MSFragger pepXML reader."""

    def import_file(self, filepath: str) -> pd.DataFrame:
        """
        Import MSFragger pepXML file.

        Parameters:
        - filepath: Path to pepXML file

        Returns:
        Standardized DataFrame with pepXML data
        """

class MSFraggerPepXML(MSFraggerPepXMLReader):
    """
    Alias for MSFraggerPepXMLReader for backwards compatibility.

    Declared as a subclass with no members of its own, so everything is
    inherited from MSFraggerPepXMLReader.
    """

class pFindReader(PSMReaderBase):
    """
    Reader for pFind output files.

    Only the constructor is listed for this reader; import_file is
    presumably inherited from PSMReaderBase — confirm.
    """

    def __init__(self):
        """Initialize pFind reader."""

class SageReaderTSV(PSMReaderBase):
    """
    Reader for Sage TSV output files.

    Redeclares import_file from PSMReaderBase; the remaining base-class
    methods are not relisted for this reader.
    """

    def __init__(self):
        """Initialize Sage TSV reader."""

    def import_file(self, filepath: str) -> pd.DataFrame:
        """
        Import Sage TSV file.

        Parameters:
        - filepath: Path to Sage results.sage.tsv file

        Returns:
        Standardized DataFrame with Sage results
        """

class SageReaderParquet(PSMReaderBase):
    """
    Reader for Sage Parquet output files.

    Redeclares import_file from PSMReaderBase; the remaining base-class
    methods are not relisted for this reader.
    """

    def __init__(self):
        """Initialize Sage Parquet reader."""

    def import_file(self, filepath: str) -> pd.DataFrame:
        """
        Import Sage Parquet file.

        Parameters:
        - filepath: Path to Sage .parquet file

        Returns:
        Standardized DataFrame with Sage results
        """

class AlphaPeptReader(PSMReaderBase):
    """
    Reader for AlphaPept output files.

    Only the constructor is listed for this reader; import_file is
    presumably inherited from PSMReaderBase — confirm.
    """

    def __init__(self):
        """Initialize AlphaPept reader."""

class AlphaDiaReaderTsv(PSMReaderBase):
    """
    Reader for AlphaDIA TSV output files.

    Redeclares import_file from PSMReaderBase; the remaining base-class
    methods are not relisted for this reader.
    """

    def __init__(self):
        """Initialize AlphaDIA TSV reader."""

    def import_file(self, filepath: str) -> pd.DataFrame:
        """
        Import AlphaDIA TSV file.

        Parameters:
        - filepath: Path to AlphaDIA output.tsv file

        Returns:
        Standardized DataFrame with AlphaDIA results
        """

class AlphaDiaReaderParquet(PSMReaderBase):
    """
    Reader for AlphaDIA Parquet output files.

    Redeclares import_file from PSMReaderBase; the remaining base-class
    methods are not relisted for this reader.
    """

    def __init__(self):
        """Initialize AlphaDIA Parquet reader."""

    def import_file(self, filepath: str) -> pd.DataFrame:
        """
        Import AlphaDIA Parquet file.

        Parameters:
        - filepath: Path to AlphaDIA .parquet file

        Returns:
        Standardized DataFrame with AlphaDIA results
        """

PSM Reader Provider System

Centralized system for managing and accessing PSM readers with automatic format detection.

# Provider object for accessing registered readers.
# The usage example indexes it by reader name and calls the result
# (psm_reader_provider['maxquant']()), so the values are presumably
# reader classes — confirm.
psm_reader_provider: dict  # Dictionary of all registered PSM readers

# YAML configuration for reader settings.
# NOTE(review): the key layout of this dictionary is not shown here.
psm_reader_yaml: dict  # Configuration settings for PSM readers

def get_reader_by_name(reader_name: str) -> PSMReaderBase:
    """
    Get PSM reader instance by name.

    Parameters:
    - reader_name: Name of the reader ('maxquant', 'diann', etc.)

    Returns:
    Instantiated PSM reader

    NOTE(review): behavior for a name that is not registered is not
    specified here — confirm whether it raises or returns None.
    """

def get_reader_by_file(filepath: str) -> PSMReaderBase:
    """
    Auto-detect and return appropriate reader for file.

    Parameters:
    - filepath: Path to PSM file

    Returns:
    Best matching PSM reader for the file format

    NOTE(review): the usage example guards the result with
    `if auto_reader:`, which suggests a falsy value (e.g. None) may come
    back when no reader matches — confirm against the implementation.
    """

def list_available_readers() -> List[str]:
    """
    List all available PSM reader names.

    Returns:
    List of registered reader names (presumably the names accepted by
    get_reader_by_name — confirm)
    """

def register_custom_reader(name: str, reader_class: type) -> None:
    """
    Register custom PSM reader.

    Parameters:
    - name: Name for the custom reader
    - reader_class: PSM reader class inheriting from PSMReaderBase
      (the class itself, not an instance, as in the usage example)

    NOTE(review): what happens when `name` is already registered is not
    specified here — confirm.
    """

Column Standardization

Standard column names and data types used across all PSM readers for consistent output.

# Standard column names used by all readers, each mapped to the Python
# type its values are expected to have.
# NOTE(review): units (e.g. for 'rt') and the 'mods' string format are
# not stated here — confirm before relying on them.
STANDARD_COLUMNS: dict = {
    'sequence': str,           # Peptide sequence
    'mods': str,              # Modification string
    'charge': int,            # Precursor charge
    'proteins': str,          # Protein identifiers
    'rt': float,              # Retention time
    'mz': float,              # Precursor m/z
    'mass': float,            # Precursor mass
    'score': float,           # Primary identification score
    'qvalue': float,          # Q-value (FDR)
    'pep': float,             # Posterior error probability
    'intensity': float,       # Precursor intensity
    'spec_idx': int,          # Spectrum index
    'run': str,               # Run/file identifier
    'scan': int,              # Scan number
}

def standardize_columns(df: pd.DataFrame, column_mapping: dict) -> pd.DataFrame:
    """
    Apply column standardization to DataFrame.

    Parameters:
    - df: Input DataFrame with search engine specific columns
    - column_mapping: Mapping from original to standard column names

    Returns:
    DataFrame with standardized column names and types

    NOTE(review): the target types are presumably those declared in
    STANDARD_COLUMNS, and it is not stated whether `df` is modified in
    place — confirm both.
    """

def validate_required_columns(df: pd.DataFrame, required: List[str] = None) -> bool:
    """
    Validate that DataFrame contains required columns.

    Parameters:
    - df: DataFrame to validate
    - required: List of required column names. Defaults to None;
      NOTE(review): which columns are checked in that case is not
      specified here — confirm.

    Returns:
    True if all required columns are present
    """

Usage Examples

Basic PSM File Reading

from alphabase.psm_reader import MaxQuantReader, DiannReader, SpectronautReader

# MaxQuant: load msms.txt through its dedicated reader
mq_df = MaxQuantReader().import_file('msms.txt')
print(f"MaxQuant PSMs: {len(mq_df)}")

# DIA-NN: load the report table
diann_df = DiannReader().import_file('report.tsv')
print(f"DIA-NN PSMs: {len(diann_df)}")

# Spectronaut: load the exported table
spec_df = SpectronautReader().import_file('spectronaut_export.tsv')
print(f"Spectronaut PSMs: {len(spec_df)}")

# Whatever the source, the resulting tables use the standardized column names
print(f"Columns: {mq_df.columns.tolist()}")

Using the Provider System

# The helper functions used below must be imported alongside the provider;
# without them the calls raise NameError.
# NOTE(review): confirm these helpers are exported by alphabase.psm_reader
# in the installed version.
from alphabase.psm_reader import (
    get_reader_by_file,
    list_available_readers,
    psm_reader_provider,
)

# Get reader by name: the provider entry is called to create an instance
reader = psm_reader_provider['maxquant']()
df = reader.import_file('msms.txt')

# Auto-detect file format (if supported); the result is checked because
# detection may not yield a usable reader
auto_reader = get_reader_by_file('unknown_format.tsv')
if auto_reader:
    df = auto_reader.import_file('unknown_format.tsv')

# List all available readers
available = list_available_readers()
print(f"Available readers: {available}")

Working with Multiple Search Engines

import pandas as pd
from alphabase.psm_reader import MaxQuantReader, DiannReader, SageReaderTSV

# One (reader, result file) pair per search engine
engine_outputs = (
    (MaxQuantReader(), 'maxquant/msms.txt'),
    (DiannReader(), 'diann/report.tsv'),
    (SageReaderTSV(), 'sage/results.sage.tsv'),
)

# Load each result file and tag its rows with the reader class that produced them
tagged_tables = []
for psm_reader, result_path in engine_outputs:
    psm_table = psm_reader.import_file(result_path)
    psm_table['search_engine'] = type(psm_reader).__name__
    tagged_tables.append(psm_table)

# Stack the per-engine tables; they share the standardized columns
combined_df = pd.concat(tagged_tables, ignore_index=True)
print(f"Total PSMs from all engines: {len(combined_df)}")
print(f"Search engines: {combined_df['search_engine'].unique()}")

Custom Modification Mappings

from alphabase.psm_reader import MaxQuantReader

# Start from a default MaxQuant reader
mq_reader = MaxQuantReader()

# Inspect the modification mapping the reader currently uses
current_mapping = mq_reader.get_modification_mapping()
print(f"Current mappings: {current_mapping}")

# Hand the reader a custom mapping (MaxQuant modification name -> desired name)
mq_reader.set_modification_mapping(
    {
        'Oxidation (M)': 'Oxidation',
        'Phospho (STY)': 'Phosphorylation',
        'Acetyl (Protein N-term)': 'Acetylation',
    }
)

# Files imported from here on go through the custom mapping
df = mq_reader.import_file('msms.txt')

Advanced Processing Workflows

from alphabase.psm_reader import DiannReader
import numpy as np

# Load the DIA-NN report into the standardized layout
df = DiannReader().import_file('report.tsv')

# Quality criteria, each expressed on a standardized column
passes_fdr = df['qvalue'] <= 0.01        # 1% FDR
high_confidence = df['score'] >= 0.99    # High confidence
valid_rt = df['rt'] > 0                  # Valid retention time

# Keep only rows meeting all three criteria; copy so later edits don't touch df
filtered_df = df[passes_fdr & high_confidence & valid_rt].copy()

print(f"Original PSMs: {len(df)}")
print(f"After filtering: {len(filtered_df)}")

# Collapse PSMs to one row per peptide sequence
per_peptide_aggregations = {
    'score': 'max',
    'intensity': 'sum',
    'proteins': 'first',
    'rt': 'mean',
}
grouped_by_sequence = filtered_df.groupby('sequence')
peptide_level = grouped_by_sequence.agg(per_peptide_aggregations).reset_index()

print(f"Unique peptides: {len(peptide_level)}")

Custom Reader Development

# standardize_columns and register_custom_reader are used below, so they are
# imported together with the base class.
# NOTE(review): confirm both helpers are exported by alphabase.psm_reader
# in the installed version.
from alphabase.psm_reader import (
    PSMReaderBase,
    register_custom_reader,
    standardize_columns,
)
import pandas as pd

class CustomReader(PSMReaderBase):
    """Custom reader for proprietary format."""

    def __init__(self):
        """Set up the base reader and this format's column mapping."""
        super().__init__()
        # Define column mappings specific to this format
        # (raw column name -> standard column name)
        self.column_mapping = {
            'peptide_seq': 'sequence',
            'precursor_charge': 'charge',
            'protein_ids': 'proteins',
            'retention_time': 'rt',
            'confidence_score': 'score'
        }

    def import_file(self, filepath: str) -> pd.DataFrame:
        """
        Import custom format file.

        Parameters:
        - filepath: Path to a tab-separated file in the custom format

        Returns:
        DataFrame produced by standardize_columns, with an added empty
        'mods' column
        """
        # Read raw file
        raw_df = pd.read_csv(filepath, sep='\t')

        # Apply column mapping. standardize_columns is documented as a
        # module-level function, not a PSMReaderBase method, so it is
        # called directly rather than through self.
        standardized_df = standardize_columns(raw_df, self.column_mapping)

        # Apply any format-specific processing
        standardized_df['mods'] = ''  # No modifications in this format

        return standardized_df

# Register custom reader
register_custom_reader('custom', CustomReader)

# Use custom reader
custom_reader = CustomReader()
df = custom_reader.import_file('custom_format.tsv')

Install with Tessl CLI