An infrastructure Python package of the AlphaX ecosystem for MS proteomics.

Quality: Pending — a best-practices review has not yet been performed.
Impact: Pending — no eval scenarios have been run.
Comprehensive quantification data processing capabilities for handling multi-format quantified peptide and protein data from various proteomics platforms. Provides unified interfaces for reading, reformatting, and processing quantification results from DIA-NN, Spectronaut, MaxQuant, and other proteomics tools.
Central management system for importing and processing quantified proteomics data from multiple sources with automatic format detection and standardization.
def import_data(data_path: str,
                data_type: str = None,
                config_dict: dict = None,
                **kwargs) -> pd.DataFrame:
    """Read quantified proteomics results from a file into a standardized table.

    Args:
        data_path: Location of the quantification results file.
        data_type: Source format identifier such as 'spectronaut', 'diann'
            or 'maxquant'; when None the format is auto-detected.
        config_dict: Import configuration overriding the format defaults.
        **kwargs: Extra format-specific reader options.

    Returns:
        Quantification data with standardized columns.
    """
def get_supported_formats() -> List[str]:
    """Return the names of all quantification formats this module can read.

    Returns:
        List of supported format identifiers.
    """
def get_format_config(format_name: str) -> dict:
    """Look up the default import settings for a quantification format.

    Args:
        format_name: Identifier of the format.

    Returns:
        Dictionary holding the format's default configuration.
    """
def validate_quantification_data(df: pd.DataFrame,
                                 format_type: str = None) -> dict:
    """Check a quantification table for integrity and completeness problems.

    Args:
        df: Quantification table to check.
        format_type: Expected source format; enables format-specific checks.

    Returns:
        Dictionary describing the validation results and any detected issues.
    """


# Specialized reader for long-format quantification tables commonly produced
# by DIA-NN, Spectronaut, and other DIA search engines.
class LongFormatReader:
    """Reads long-format quantification tables (one row per measurement)."""

    def __init__(self, config_dict: dict = None):
        """Create a reader.

        Args:
            config_dict: Column-mapping and processing configuration.
        """

    def read_file(self, filepath: str, **kwargs) -> pd.DataFrame:
        """Load a long-format quantification file.

        Args:
            filepath: Location of the quantification file.
            **kwargs: Extra options passed to the underlying parser.

        Returns:
            Processed quantification table.
        """

    def set_column_mapping(self, mapping: dict) -> None:
        """Override the default column-name mappings.

        Args:
            mapping: Maps file column names to standard names.
        """

    def filter_data(self, df: pd.DataFrame,
                    min_confidence: float = 0.01,
                    remove_decoys: bool = True) -> pd.DataFrame:
        """Apply quality filters to a quantification table.

        Args:
            df: Input quantification table.
            min_confidence: Minimum confidence cutoff.
            remove_decoys: Drop decoy identifications when True.

        Returns:
            Filtered table.
        """

    def aggregate_to_protein_level(self, df: pd.DataFrame,
                                   method: str = 'sum') -> pd.DataFrame:
        """Roll peptide-level quantities up to protein level.

        Args:
            df: Peptide-level quantification table.
            method: Aggregation strategy: 'sum', 'mean', 'median' or 'maxlfq'.

        Returns:
            Protein-level quantification table.
        """
def standardize_long_format_columns(df: pd.DataFrame,
                                    source_format: str) -> pd.DataFrame:
    """Rename format-specific columns of a long-format table to standard names.

    Args:
        df: Table carrying source-format column names.
        source_format: Origin format identifier ('diann', 'spectronaut', ...).

    Returns:
        Table with standardized column names.
    """


# Reader for wide-format quantification tables with samples as columns,
# commonly used in label-free quantification workflows.
class WideFormatReader:
    """Reads wide-format quantification tables (one column per sample)."""

    def __init__(self, config_dict: dict = None):
        """Create a reader.

        Args:
            config_dict: Processing configuration.
        """

    def read_file(self, filepath: str, **kwargs) -> pd.DataFrame:
        """Load a wide-format quantification file.

        Args:
            filepath: Location of the quantification file.
            **kwargs: Extra options passed to the underlying parser.

        Returns:
            Processed quantification table.
        """

    def identify_sample_columns(self, df: pd.DataFrame) -> List[str]:
        """Detect which columns hold sample/intensity values.

        Args:
            df: Input table.

        Returns:
            Names of the columns containing quantification values.
        """

    def convert_to_long_format(self, df: pd.DataFrame,
                               sample_columns: List[str] = None) -> pd.DataFrame:
        """Melt a wide table into long format.

        Args:
            df: Wide-format table.
            sample_columns: Sample columns to melt.

        Returns:
            Long-format table.
        """

    def normalize_intensities(self, df: pd.DataFrame,
                              method: str = 'median') -> pd.DataFrame:
        """Normalize quantification intensities across samples.

        Args:
            df: Quantification table.
            method: 'median', 'mean' or 'quantile' normalization.

        Returns:
            Normalized table.
        """
def detect_wide_format_type(df: pd.DataFrame) -> str:
    """Classify the flavor of a wide-format quantification table.

    Args:
        df: Input table.

    Returns:
        One of 'maxquant', 'proteomics_ruler' or 'generic'.
    """


# System for managing format-specific configurations and column mappings
# for different quantification platforms.
class ConfigDictLoader:
    """Loads, saves, and edits per-format import configurations."""

    def __init__(self, config_path: str = None):
        """Create a loader.

        Args:
            config_path: Optional path to a custom configuration file.
        """

    def load_config(self, format_name: str) -> dict:
        """Fetch the configuration for a quantification format.

        Args:
            format_name: Format identifier.

        Returns:
            Configuration dictionary.
        """

    def save_config(self, config: dict, format_name: str) -> None:
        """Persist a custom configuration under a format name.

        Args:
            config: Configuration dictionary to store.
            format_name: Name to store the configuration under.
        """

    def get_column_mapping(self, format_name: str) -> dict:
        """Fetch the column-name mappings for a format.

        Args:
            format_name: Format identifier.

        Returns:
            Maps format-specific column names to standard names.
        """

    def update_column_mapping(self, format_name: str,
                              mapping: dict) -> None:
        """Replace the column mappings stored for a format.

        Args:
            format_name: Format to update.
            mapping: New column mappings.
        """
# Standard configuration constants.
# Canonical column names shared by the readers, each mapped to the Python
# type expected for that column's values.
STANDARD_QUANTIFICATION_COLUMNS: dict = {
    'sequence': str,     # Peptide sequence
    'proteins': str,     # Protein identifiers
    'sample': str,       # Sample identifier
    'intensity': float,  # Quantification intensity
    'rt': float,         # Retention time
    'charge': int,       # Precursor charge
    'mz': float,         # Precursor m/z
    'qvalue': float,     # Identification confidence
    'run': str,          # LC-MS run identifier
    'channel': str,      # Labeling channel (for TMT/iTRAQ)
}
def get_default_config(format_name: str) -> dict:
    """Build the default configuration for a quantification format.

    Args:
        format_name: Quantification format identifier.

    Returns:
        Default configuration dictionary.
    """


# Utilities for reformatting and processing quantification data for
# downstream analysis workflows.
class TableReformatter:
    """Reshapes quantification tables for downstream analyses."""

    def __init__(self):
        """Create a table reformatter."""

    def reformat_for_analysis(self, df: pd.DataFrame,
                              analysis_type: str = 'differential') -> pd.DataFrame:
        """Reshape data for a particular analysis workflow.

        Args:
            df: Input quantification table.
            analysis_type: 'differential', 'network' or 'timecourse'.

        Returns:
            Table reshaped for the requested analysis.
        """

    def create_design_matrix(self, df: pd.DataFrame,
                             sample_info: pd.DataFrame) -> pd.DataFrame:
        """Build a design matrix for statistical analysis.

        Args:
            df: Quantification table.
            sample_info: Per-sample metadata table.

        Returns:
            Design matrix table.
        """

    def pivot_to_matrix(self, df: pd.DataFrame,
                        index_cols: List[str],
                        value_col: str = 'intensity') -> pd.DataFrame:
        """Pivot long-format quantification data into a matrix.

        Args:
            df: Long-format quantification table.
            index_cols: Columns to use as row identifiers.
            value_col: Column whose values fill the matrix.

        Returns:
            Matrix-format table.
        """

    def handle_missing_values(self, df: pd.DataFrame,
                              method: str = 'impute') -> pd.DataFrame:
        """Deal with missing quantification values.

        Args:
            df: Table containing missing values.
            method: 'impute', 'remove' or 'flag'.

        Returns:
            Table with missing values handled.
        """
class PlexDIAReformatter:
    """Reshapes multiplexed plexDIA quantification output."""

    def __init__(self):
        """Create a plexDIA reformatter."""

    def process_plexdia_output(self, filepath: str) -> pd.DataFrame:
        """Load and process a plexDIA output file.

        Args:
            filepath: Location of the plexDIA output file.

        Returns:
            Processed quantification table.
        """

    def extract_channel_intensities(self, df: pd.DataFrame) -> pd.DataFrame:
        """Separate the individual channel intensities in plexDIA data.

        Args:
            df: Raw plexDIA table.

        Returns:
            Table with separated channel intensities.
        """

    def normalize_channels(self, df: pd.DataFrame,
                           method: str = 'sum') -> pd.DataFrame:
        """Normalize intensities across plexDIA channels.

        Args:
            df: plexDIA quantification table.
            method: 'sum', 'median' or 'reference' normalization.

        Returns:
            Channel-normalized table.
        """
def merge_quantification_data(dataframes: List[pd.DataFrame],
                              merge_on: List[str] = None) -> pd.DataFrame:
    """Combine several quantification tables into one.

    Args:
        dataframes: Quantification tables to merge.
        merge_on: Join columns; defaults to sequence, proteins and charge.

    Returns:
        Merged quantification table.
    """
def calculate_fold_changes(df: pd.DataFrame,
                           control_samples: List[str],
                           treatment_samples: List[str]) -> pd.DataFrame:
    """Compute fold changes between control and treatment sample groups.

    Args:
        df: Quantification table.
        control_samples: Identifiers of the control samples.
        treatment_samples: Identifiers of the treatment samples.

    Returns:
        Table of fold changes and accompanying statistics.
    """


# Functions for quality assessment and statistical analysis of
# quantification data.
def assess_data_quality(df: pd.DataFrame) -> dict:
    """Compute quality metrics for a quantification table.

    Args:
        df: Quantification table.

    Returns:
        Dictionary of quality metrics and statistics.
    """
def calculate_cv_statistics(df: pd.DataFrame,
                            sample_groups: dict) -> pd.DataFrame:
    """Compute coefficient-of-variation statistics per sample group.

    Args:
        df: Quantification table.
        sample_groups: Maps sample identifiers to group labels.

    Returns:
        Table of CV statistics.
    """
def identify_outlier_samples(df: pd.DataFrame,
                             method: str = 'pca') -> List[str]:
    """Flag samples that look like outliers in the quantification data.

    Args:
        df: Quantification table.
        method: Detection strategy: 'pca', 'correlation' or 'distance'.

    Returns:
        Identifiers of the outlier samples.
    """
def generate_qa_report(df: pd.DataFrame,
                       output_path: str = None) -> dict:
    """Produce a comprehensive quality-assessment report.

    Args:
        df: Quantification table.
        output_path: Optional destination for an HTML report.

    Returns:
        Dictionary of QA metrics and plots.
    """


# --- Usage examples ---
from alphabase.quantification.quant_reader.quant_reader_manager import import_data
# Import DIA-NN quantification data
diann_df = import_data('report.tsv', data_type='diann')
print(f"Imported {len(diann_df)} quantification entries")
# Import Spectronaut data
spectronaut_df = import_data('spectronaut_export.tsv', data_type='spectronaut')
print(f"Imported {len(spectronaut_df)} quantification entries")
# Auto-detect format
unknown_df = import_data('unknown_quant.tsv')  # Auto-detects format
print(f"Auto-detected format: {unknown_df.attrs.get('format_type', 'unknown')}")

from alphabase.quantification.quant_reader.longformat_reader import LongFormatReader
import pandas as pd

# Create reader with custom configuration
reader = LongFormatReader()
# Read and process DIA-NN data
df = reader.read_file('diann_report.tsv')
# Apply quality filters
filtered_df = reader.filter_data(
    df,
    min_confidence=0.01,  # 1% FDR
    remove_decoys=True
)
# Aggregate to protein level
protein_df = reader.aggregate_to_protein_level(
    filtered_df,
    method='sum'  # Sum peptide intensities
)
print(f"Peptide-level: {len(filtered_df)} entries")
print(f"Protein-level: {len(protein_df)} entries")

from alphabase.quantification.quant_reader.wideformat_reader import WideFormatReader
# Process MaxQuant proteinGroups.txt
reader = WideFormatReader()
df = reader.read_file('proteinGroups.txt')
# Auto-identify intensity columns
sample_cols = reader.identify_sample_columns(df)
print(f"Found {len(sample_cols)} sample columns: {sample_cols[:5]}...")
# Convert to long format for analysis
long_df = reader.convert_to_long_format(df, sample_columns=sample_cols)
# Normalize intensities
normalized_df = reader.normalize_intensities(long_df, method='median')
print(f"Converted to long format: {len(long_df)} entries")

from alphabase.quantification.quant_reader.table_reformatter import TableReformatter
from alphabase.quantification.quant_reader.quantreader_utils import (
    merge_quantification_data, calculate_fold_changes
)

# Merge data from multiple experiments
experiment_dfs = [
    import_data('exp1_diann.tsv', data_type='diann'),
    import_data('exp2_diann.tsv', data_type='diann'),
    import_data('exp3_diann.tsv', data_type='diann')
]
merged_df = merge_quantification_data(
    experiment_dfs,
    merge_on=['sequence', 'proteins', 'charge']
)
# Create design matrix for statistical analysis
reformatter = TableReformatter()
sample_info = pd.DataFrame({
    'sample': ['exp1', 'exp2', 'exp3'],
    'condition': ['control', 'treatment', 'treatment'],
    'batch': [1, 1, 2]
})
design_matrix = reformatter.create_design_matrix(merged_df, sample_info)
# Calculate fold changes
fold_changes = calculate_fold_changes(
    merged_df,
    control_samples=['exp1'],
    treatment_samples=['exp2', 'exp3']
)
print(f"Calculated fold changes for {len(fold_changes)} proteins")

from alphabase.quantification.quant_reader.quantreader_utils import (
    assess_data_quality, generate_qa_report, identify_outlier_samples
)

# Assess data quality
quality_metrics = assess_data_quality(merged_df)
print(f"Quality metrics:")
print(f" Missing values: {quality_metrics['missing_percentage']:.1f}%")
print(f" CV median: {quality_metrics['cv_median']:.2f}")
print(f" Dynamic range: {quality_metrics['dynamic_range']:.1f}")
# Identify outlier samples
outliers = identify_outlier_samples(merged_df, method='pca')
if outliers:
    print(f"Outlier samples detected: {outliers}")
# Generate comprehensive QA report
qa_report = generate_qa_report(merged_df, output_path='qa_report.html')
print(f"QA report saved with {len(qa_report['plots'])} plots")

from alphabase.quantification.quant_reader.config_dict_loader import ConfigDictLoader
# Create custom configuration
config_loader = ConfigDictLoader()
# Get default DIA-NN configuration
diann_config = config_loader.load_config('diann')
print(f"Default DIA-NN columns: {diann_config['column_mapping']}")
# Create custom configuration for new format
custom_config = {
    'column_mapping': {
        'peptide_sequence': 'sequence',
        'protein_id': 'proteins',
        'sample_name': 'sample',
        'peak_area': 'intensity',
        'retention_time': 'rt',
        'precursor_charge': 'charge'
    },
    'filters': {
        'min_confidence': 0.01,
        'remove_contaminants': True
    }
}
config_loader.save_config(custom_config, 'custom_format')
# Use custom configuration
custom_df = import_data('custom_data.tsv',
                        data_type='custom_format',
                        config_dict=custom_config)

Install with the Tessl CLI:
npx tessl i tessl/pypi-alphabase