CtrlK
BlogDocsLog inGet started
Tessl Logo

tessl/pypi-alphabase

An infrastructure Python package of the AlphaX ecosystem for MS proteomics

Pending

Quality

Pending

Does it follow best practices?

Impact

Pending

No eval scenarios have been run

Overview
Eval results
Files

quantification.mddocs/

Quantification Data Processing

Comprehensive quantification data processing capabilities for handling multi-format quantified peptide and protein data from various proteomics platforms. Provides unified interfaces for reading, reformatting, and processing quantification results from DIA-NN, Spectronaut, MaxQuant, and other proteomics tools.

Capabilities

Quantification Reader Manager

Central management system for importing and processing quantified proteomics data from multiple sources with automatic format detection and standardization.

def import_data(data_path: str, 
               data_type: str | None = None,
               config_dict: dict | None = None,
               **kwargs) -> pd.DataFrame:
    """
    Import quantified proteomics data from various formats.
    
    Parameters:
    - data_path: Path to quantification data file
    - data_type: Format type ('spectronaut', 'diann', 'maxquant', etc.);
      None triggers automatic format detection from the file contents
    - config_dict: Configuration dictionary for import settings;
      None uses the format's default configuration
    - **kwargs: Additional format-specific options
    
    Returns:
    DataFrame with standardized quantification data
    """

def get_supported_formats() -> List[str]:
    """
    Get the list of quantification formats the manager can import.
    
    Returns:
    List of supported format names (e.g. 'diann', 'spectronaut', 'maxquant')
    """

def get_format_config(format_name: str) -> dict:
    """
    Get the default import configuration for a specific format.
    
    Parameters:
    - format_name: Name of the quantification format
    
    Returns:
    Configuration dictionary with the format's default settings
    """

def validate_quantification_data(df: pd.DataFrame, 
                                format_type: str | None = None) -> dict:
    """
    Validate quantification data integrity and completeness.
    
    Parameters:
    - df: Quantification DataFrame to validate
    - format_type: Expected format type for validation; None skips
      format-specific checks
    
    Returns:
    Dictionary with validation results and any issues found
    """

Long-Format Data Reader

Specialized reader for long-format quantification tables commonly produced by DIA-NN, Spectronaut, and other DIA search engines.

class LongFormatReader:
    """Reader for long-format quantification data tables.

    Long-format tables carry one row per (precursor, sample) observation
    and are the typical output of DIA search engines such as DIA-NN and
    Spectronaut.
    """
    
    def __init__(self, config_dict: dict | None = None):
        """
        Initialize long-format reader.
        
        Parameters:
        - config_dict: Configuration for column mappings and processing;
          None uses the default configuration
        """
    
    def read_file(self, filepath: str, **kwargs) -> pd.DataFrame:
        """
        Read a long-format quantification file.
        
        Parameters:
        - filepath: Path to quantification file
        - **kwargs: Additional reading options
        
        Returns:
        DataFrame with processed quantification data
        """
    
    def set_column_mapping(self, mapping: dict) -> None:
        """
        Set custom column name mappings.
        
        Parameters:
        - mapping: Dictionary mapping file columns to standard names
        """
    
    def filter_data(self, df: pd.DataFrame, 
                   min_confidence: float = 0.01,
                   remove_decoys: bool = True) -> pd.DataFrame:
        """
        Apply quality filters to quantification data.
        
        Parameters:
        - df: Input quantification DataFrame
        - min_confidence: Minimum confidence threshold (e.g. 0.01 for 1% FDR)
        - remove_decoys: Whether to remove decoy identifications
        
        Returns:
        Filtered DataFrame
        """
    
    def aggregate_to_protein_level(self, df: pd.DataFrame,
                                  method: str = 'sum') -> pd.DataFrame:
        """
        Aggregate peptide-level to protein-level quantification.
        
        Parameters:
        - df: Peptide-level quantification DataFrame
        - method: Aggregation method ('sum', 'mean', 'median', 'maxlfq')
        
        Returns:
        Protein-level quantification DataFrame
        """

def standardize_long_format_columns(df: pd.DataFrame, 
                                   source_format: str) -> pd.DataFrame:
    """
    Standardize column names for long-format data.
    
    Renames format-specific columns to the shared standard names so that
    downstream processing is format-agnostic.
    
    Parameters:
    - df: Input DataFrame with format-specific columns
    - source_format: Source format name ('diann', 'spectronaut', etc.)
    
    Returns:
    DataFrame with standardized column names
    """

Wide-Format Data Reader

Reader for wide-format quantification tables with samples as columns, commonly used in label-free quantification workflows.

class WideFormatReader:
    """Reader for wide-format quantification data tables.

    Wide-format tables carry one column per sample (e.g. MaxQuant
    proteinGroups.txt) and are common in label-free quantification
    workflows.
    """
    
    def __init__(self, config_dict: dict | None = None):
        """
        Initialize wide-format reader.
        
        Parameters:
        - config_dict: Configuration for processing settings; None uses
          the default configuration
        """
    
    def read_file(self, filepath: str, **kwargs) -> pd.DataFrame:
        """
        Read a wide-format quantification file.
        
        Parameters:
        - filepath: Path to quantification file
        - **kwargs: Additional reading options
        
        Returns:
        DataFrame with processed quantification data
        """
    
    def identify_sample_columns(self, df: pd.DataFrame) -> List[str]:
        """
        Automatically identify sample/intensity columns.
        
        Parameters:
        - df: Input DataFrame
        
        Returns:
        List of column names containing quantification values
        """
    
    def convert_to_long_format(self, df: pd.DataFrame,
                              sample_columns: List[str] | None = None) -> pd.DataFrame:
        """
        Convert a wide-format table to long format.
        
        Parameters:
        - df: Wide-format DataFrame
        - sample_columns: List of sample columns to melt; None melts the
          automatically identified sample columns
        
        Returns:
        Long-format DataFrame
        """
    
    def normalize_intensities(self, df: pd.DataFrame,
                             method: str = 'median') -> pd.DataFrame:
        """
        Normalize quantification intensities across samples.
        
        Parameters:
        - df: Quantification DataFrame
        - method: Normalization method ('median', 'mean', 'quantile')
        
        Returns:
        Normalized DataFrame
        """

def detect_wide_format_type(df: pd.DataFrame) -> str:
    """
    Detect the type of wide-format quantification data.
    
    Parameters:
    - df: Input DataFrame
    
    Returns:
    Format type string ('maxquant', 'proteomics_ruler', 'generic')
    """

Configuration Management

System for managing format-specific configurations and column mappings for different quantification platforms.

class ConfigDictLoader:
    """Configuration management for quantification data formats.

    Loads, saves, and updates format-specific configurations and column
    mappings used by the quantification readers.
    """
    
    def __init__(self, config_path: str | None = None):
        """
        Initialize configuration loader.
        
        Parameters:
        - config_path: Path to custom configuration file; None uses the
          built-in default configurations
        """
    
    def load_config(self, format_name: str) -> dict:
        """
        Load configuration for a specific format.
        
        Parameters:
        - format_name: Name of the quantification format
        
        Returns:
        Configuration dictionary
        """
    
    def save_config(self, config: dict, format_name: str) -> None:
        """
        Save a custom configuration for a format.
        
        Parameters:
        - config: Configuration dictionary to save
        - format_name: Name under which to register the configuration
        """
    
    def get_column_mapping(self, format_name: str) -> dict:
        """
        Get column name mappings for a format.
        
        Parameters:
        - format_name: Format name
        
        Returns:
        Dictionary mapping format columns to standard names
        """
    
    def update_column_mapping(self, format_name: str, 
                             mapping: dict) -> None:
        """
        Update column mappings for a format.
        
        Parameters:
        - format_name: Format name to update
        - mapping: New column mappings
        """

# Standard configuration constants
STANDARD_QUANTIFICATION_COLUMNS: dict = {
    'sequence': str,          # Peptide sequence
    'proteins': str,          # Protein identifiers
    'sample': str,            # Sample identifier
    'intensity': float,       # Quantification intensity
    'rt': float,              # Retention time
    'charge': int,            # Precursor charge
    'mz': float,              # Precursor m/z
    'qvalue': float,          # Identification confidence
    'run': str,               # LC-MS run identifier
    'channel': str,           # Labeling channel (for TMT/iTRAQ)
}

def get_default_config(format_name: str) -> dict:
    """
    Get the default configuration for a quantification format.
    
    Parameters:
    - format_name: Quantification format name
    
    Returns:
    Default configuration dictionary
    """

Data Reformatting and Processing

Utilities for reformatting and processing quantification data for downstream analysis workflows.

class TableReformatter:
    """Reformatter for quantification data tables.

    Converts standardized quantification data into the shapes required by
    downstream analysis workflows (design matrices, value matrices, etc.).
    """
    
    def __init__(self):
        """Initialize table reformatter."""
    
    def reformat_for_analysis(self, df: pd.DataFrame,
                             analysis_type: str = 'differential') -> pd.DataFrame:
        """
        Reformat data for a specific analysis workflow.
        
        Parameters:
        - df: Input quantification DataFrame
        - analysis_type: Type of analysis ('differential', 'network', 'timecourse')
        
        Returns:
        Reformatted DataFrame suitable for the requested analysis
        """
    
    def create_design_matrix(self, df: pd.DataFrame,
                           sample_info: pd.DataFrame) -> pd.DataFrame:
        """
        Create a design matrix for statistical analysis.
        
        Parameters:
        - df: Quantification DataFrame
        - sample_info: Sample metadata DataFrame (e.g. condition, batch)
        
        Returns:
        Design matrix DataFrame
        """
    
    def pivot_to_matrix(self, df: pd.DataFrame,
                       index_cols: List[str],
                       value_col: str = 'intensity') -> pd.DataFrame:
        """
        Pivot quantification data to matrix format.
        
        Parameters:
        - df: Long-format quantification DataFrame
        - index_cols: Columns to use as row identifiers
        - value_col: Column containing values to pivot
        
        Returns:
        Matrix-format DataFrame
        """
    
    def handle_missing_values(self, df: pd.DataFrame,
                            method: str = 'impute') -> pd.DataFrame:
        """
        Handle missing quantification values.
        
        Parameters:
        - df: Quantification DataFrame with missing values
        - method: Handling method ('impute', 'remove', 'flag')
        
        Returns:
        DataFrame with missing values handled
        """

class PlexDIAReformatter:
    """Specialized reformatter for plexDIA quantification data.

    Handles the multiplexed-channel structure of plexDIA output, where a
    single precursor carries intensities for several labeling channels.
    """
    
    def __init__(self):
        """Initialize plexDIA reformatter."""
    
    def process_plexdia_output(self, filepath: str) -> pd.DataFrame:
        """
        Process a plexDIA output file.
        
        Parameters:
        - filepath: Path to plexDIA output file
        
        Returns:
        Processed quantification DataFrame
        """
    
    def extract_channel_intensities(self, df: pd.DataFrame) -> pd.DataFrame:
        """
        Extract individual channel intensities from plexDIA data.
        
        Parameters:
        - df: Raw plexDIA DataFrame
        
        Returns:
        DataFrame with separated channel intensities
        """
    
    def normalize_channels(self, df: pd.DataFrame,
                          method: str = 'sum') -> pd.DataFrame:
        """
        Normalize intensities across plexDIA channels.
        
        Parameters:
        - df: plexDIA quantification DataFrame
        - method: Normalization method ('sum', 'median', 'reference')
        
        Returns:
        Channel-normalized DataFrame
        """

def merge_quantification_data(dataframes: List[pd.DataFrame],
                             merge_on: List[str] | None = None) -> pd.DataFrame:
    """
    Merge multiple quantification datasets.
    
    Parameters:
    - dataframes: List of quantification DataFrames to merge
    - merge_on: Columns to merge on; None uses the default keys
      ['sequence', 'proteins', 'charge']
    
    Returns:
    Merged quantification DataFrame
    """

def calculate_fold_changes(df: pd.DataFrame,
                          control_samples: List[str],
                          treatment_samples: List[str]) -> pd.DataFrame:
    """
    Calculate fold changes between sample groups.
    
    Parameters:
    - df: Quantification DataFrame
    - control_samples: List of control sample identifiers
    - treatment_samples: List of treatment sample identifiers
    
    Returns:
    DataFrame with fold changes and accompanying statistics
    """

Quality Control and Statistics

Functions for quality assessment and statistical analysis of quantification data.

def assess_data_quality(df: pd.DataFrame) -> dict:
    """
    Assess quantification data quality metrics.
    
    Parameters:
    - df: Quantification DataFrame
    
    Returns:
    Dictionary with quality metrics and statistics, including keys such
    as 'missing_percentage', 'cv_median', and 'dynamic_range'
    """

def calculate_cv_statistics(df: pd.DataFrame,
                           sample_groups: dict) -> pd.DataFrame:
    """
    Calculate coefficient of variation (CV) statistics per sample group.
    
    Parameters:
    - df: Quantification DataFrame
    - sample_groups: Dictionary mapping sample identifiers to group names
    
    Returns:
    DataFrame with CV statistics
    """

def identify_outlier_samples(df: pd.DataFrame,
                           method: str = 'pca') -> List[str]:
    """
    Identify outlier samples in quantification data.
    
    Parameters:
    - df: Quantification DataFrame
    - method: Outlier detection method ('pca', 'correlation', 'distance')
    
    Returns:
    List of outlier sample identifiers (empty if none detected)
    """

def generate_qa_report(df: pd.DataFrame, 
                      output_path: str | None = None) -> dict:
    """
    Generate a comprehensive quality assessment report.
    
    Parameters:
    - df: Quantification DataFrame
    - output_path: Optional path to save an HTML report; None skips
      writing to disk
    
    Returns:
    Dictionary with QA metrics and plots (the 'plots' key holds the
    generated figures)
    """

Usage Examples

Basic Quantification Data Import

from alphabase.quantification.quant_reader.quant_reader_manager import import_data

# Import DIA-NN quantification data
diann_df = import_data('report.tsv', data_type='diann')
print(f"Imported {len(diann_df)} quantification entries")

# Import Spectronaut data
spectronaut_df = import_data('spectronaut_export.tsv', data_type='spectronaut')
print(f"Imported {len(spectronaut_df)} quantification entries")

# Auto-detect format
unknown_df = import_data('unknown_quant.tsv')  # Auto-detects format
print(f"Auto-detected format: {unknown_df.attrs.get('format_type', 'unknown')}")

Processing Long-Format Data

from alphabase.quantification.quant_reader.longformat_reader import LongFormatReader
import pandas as pd

# Create reader with custom configuration
reader = LongFormatReader()

# Read and process DIA-NN data
df = reader.read_file('diann_report.tsv')

# Apply quality filters
filtered_df = reader.filter_data(
    df,
    min_confidence=0.01,  # 1% FDR
    remove_decoys=True
)

# Aggregate to protein level
protein_df = reader.aggregate_to_protein_level(
    filtered_df,
    method='sum'  # Sum peptide intensities
)

print(f"Peptide-level: {len(filtered_df)} entries")
print(f"Protein-level: {len(protein_df)} entries")

Working with Wide-Format Data

from alphabase.quantification.quant_reader.wideformat_reader import WideFormatReader

# Process MaxQuant proteinGroups.txt
reader = WideFormatReader()
df = reader.read_file('proteinGroups.txt')

# Auto-identify intensity columns
sample_cols = reader.identify_sample_columns(df)
print(f"Found {len(sample_cols)} sample columns: {sample_cols[:5]}...")

# Convert to long format for analysis
long_df = reader.convert_to_long_format(df, sample_columns=sample_cols)

# Normalize intensities
normalized_df = reader.normalize_intensities(long_df, method='median')
print(f"Converted to long format: {len(long_df)} entries")

Advanced Data Processing

from alphabase.quantification.quant_reader.quant_reader_manager import import_data
from alphabase.quantification.quant_reader.table_reformatter import TableReformatter
from alphabase.quantification.quant_reader.quantreader_utils import (
    merge_quantification_data, calculate_fold_changes
)
import pandas as pd

# Merge data from multiple experiments
experiment_dfs = [
    import_data('exp1_diann.tsv', data_type='diann'),
    import_data('exp2_diann.tsv', data_type='diann'),
    import_data('exp3_diann.tsv', data_type='diann')
]

merged_df = merge_quantification_data(
    experiment_dfs,
    merge_on=['sequence', 'proteins', 'charge']
)

# Create design matrix for statistical analysis
reformatter = TableReformatter()
sample_info = pd.DataFrame({
    'sample': ['exp1', 'exp2', 'exp3'],
    'condition': ['control', 'treatment', 'treatment'],
    'batch': [1, 1, 2]
})

design_matrix = reformatter.create_design_matrix(merged_df, sample_info)

# Calculate fold changes
fold_changes = calculate_fold_changes(
    merged_df,
    control_samples=['exp1'],
    treatment_samples=['exp2', 'exp3']
)

print(f"Calculated fold changes for {len(fold_changes)} proteins")

Quality Assessment

from alphabase.quantification.quant_reader.quantreader_utils import (
    assess_data_quality, generate_qa_report, identify_outlier_samples
)

# Assess data quality
quality_metrics = assess_data_quality(merged_df)
print(f"Quality metrics:")
print(f"  Missing values: {quality_metrics['missing_percentage']:.1f}%")
print(f"  CV median: {quality_metrics['cv_median']:.2f}")
print(f"  Dynamic range: {quality_metrics['dynamic_range']:.1f}")

# Identify outlier samples
outliers = identify_outlier_samples(merged_df, method='pca')
if outliers:
    print(f"Outlier samples detected: {outliers}")

# Generate comprehensive QA report
qa_report = generate_qa_report(merged_df, output_path='qa_report.html')
print(f"QA report saved with {len(qa_report['plots'])} plots")

Custom Configuration

from alphabase.quantification.quant_reader.config_dict_loader import ConfigDictLoader

# Create custom configuration
config_loader = ConfigDictLoader()

# Get default DIA-NN configuration
diann_config = config_loader.load_config('diann')
print(f"Default DIA-NN columns: {diann_config['column_mapping']}")

# Create custom configuration for new format
custom_config = {
    'column_mapping': {
        'peptide_sequence': 'sequence',
        'protein_id': 'proteins',
        'sample_name': 'sample',
        'peak_area': 'intensity',
        'retention_time': 'rt',
        'precursor_charge': 'charge'
    },
    'filters': {
        'min_confidence': 0.01,
        'remove_contaminants': True
    }
}

config_loader.save_config(custom_config, 'custom_format')

# Use custom configuration
custom_df = import_data('custom_data.tsv', 
                       data_type='custom_format',
                       config_dict=custom_config)

Install with Tessl CLI

npx tessl i tessl/pypi-alphabase

docs

advanced-peptide-operations.md

advanced-spectral-libraries.md

chemical-constants.md

fragment-ions.md

index.md

io-utilities.md

protein-analysis.md

psm-readers.md

quantification.md

smiles-chemistry.md

spectral-libraries.md

tile.json