tessl/pypi-alphabase

An infrastructure Python package of the AlphaX ecosystem for MS proteomics

—

Pending

Quality

Pending

Does it follow best practices?

Impact

Pending

No eval scenarios have been run

Overview

Eval results

Files

Protein Analysis

Name: tessl/pypi-alphabase
Author: tessl

FASTA file processing, protein sequence analysis, and enzymatic digestion utilities. Supports protein inference workflows, sequence manipulation, and integration with proteomics identification pipelines for comprehensive protein-level analysis.

Capabilities

FASTA File Processing

Core functionality for reading and processing FASTA files with support for various protein database formats.

def read_fasta_file(
    filepath: str,
) -> Iterator[tuple[str, str]]:
    """
    Lazily iterate over the protein entries of a FASTA file.

    The file is consumed line by line through a generator, which keeps
    memory usage low.

    Parameters:
    - filepath: Location of the FASTA file to read

    Yields:
    One (header, sequence) tuple per protein entry

    Usage:
    for header, sequence in read_fasta_file('proteins.fasta'):
        process_protein(header, sequence)
    """

def get_uniprot_gene_name(
    description: str,
) -> str:
    """
    Pull the gene name out of a UniProt protein description.

    Parameters:
    - description: Description line of a UniProt protein entry

    Returns:
    The gene name when one is present, otherwise an empty string

    Example:
    desc = "sp|P12345|EXAMPLE_HUMAN Example protein GN=GENE1 PE=1 SV=2"
    gene = get_uniprot_gene_name(desc)  # Returns "GENE1"
    """

def parse_fasta_header(
    header: str,
) -> dict:
    """
    Break a FASTA header into structured information.

    Parameters:
    - header: Header line of a FASTA entry, given without the leading '>'

    Returns:
    Dictionary holding the information parsed from the header
    (NOTE(review): the dictionary keys are not specified here - confirm)
    """

def validate_protein_sequence(
    sequence: str,
) -> bool:
    """
    Check that a protein sequence consists solely of valid amino acids.

    Parameters:
    - sequence: Protein sequence given as a string

    Returns:
    True when the sequence is valid
    (NOTE(review): the accepted amino-acid alphabet is not stated - confirm)
    """

def clean_protein_sequence(
    sequence: str,
) -> str:
    """
    Strip invalid characters from a protein sequence.

    Parameters:
    - sequence: Protein sequence as read, before any clean-up

    Returns:
    The sequence reduced to valid amino acids only
    """

Protein Sequence Analysis

Functions for analyzing protein sequences, calculating properties, and extracting features.

def calculate_protein_mass(
    sequence: str,
) -> float:
    """
    Compute the molecular weight of a protein from its sequence.

    Parameters:
    - sequence: Protein sequence given as a string

    Returns:
    Molecular weight, expressed in Daltons
    (NOTE(review): monoisotopic vs. average mass is not stated - confirm)
    """

def calculate_protein_properties(
    sequence: str,
) -> dict:
    """
    Compute a comprehensive set of properties for a protein.

    Parameters:
    - sequence: Protein sequence given as a string

    Returns:
    Dictionary containing molecular weight, pI, charge, etc.
    """

def get_amino_acid_composition(
    sequence: str,
) -> dict:
    """
    Report the amino acid composition of a protein sequence.

    Parameters:
    - sequence: Protein sequence given as a string

    Returns:
    Dictionary holding a count for each amino acid
    """

def find_signal_peptide(
    sequence: str,
) -> tuple[bool, int]:
    """
    Predict whether a signal peptide is present and where it is cleaved.

    Parameters:
    - sequence: Protein sequence given as a string

    Returns:
    A (has_signal_peptide, cleavage_position) tuple
    (NOTE(review): the position reported when no signal peptide is found,
    and its index base, are not stated - confirm)
    """

def find_transmembrane_domains(
    sequence: str,
) -> List[tuple[int, int]]:
    """
    Predict where transmembrane domains lie in a protein sequence.

    Parameters:
    - sequence: Protein sequence given as a string

    Returns:
    List with one (start, end) position pair per transmembrane domain
    (NOTE(review): index base and end inclusivity are not stated - confirm)
    """

Enzymatic Digestion

Functions for simulating enzymatic digestion of proteins with various proteases.

def digest_protein(
    sequence: str,
    enzyme: str = 'trypsin',
    missed_cleavages: int = 2,
    min_length: int = 6,
    max_length: int = 30,
) -> List[str]:
    """
    Digest a protein sequence with the chosen enzyme.

    Parameters:
    - sequence: Protein sequence that is to be digested
    - enzyme: Name of the enzyme ('trypsin', 'chymotrypsin', 'lysc', etc.)
    - missed_cleavages: Upper bound on the number of missed cleavages
    - min_length: Shortest peptide length to keep
    - max_length: Longest peptide length to keep

    Returns:
    List of the peptide sequences produced by the digestion
    """

def get_enzyme_specificity(
    enzyme: str,
) -> dict:
    """
    Look up the cleavage specificity of a proteolytic enzyme.

    Parameters:
    - enzyme: Name of the enzyme

    Returns:
    Dictionary describing the cleavage rules and specificity
    """

def find_cleavage_sites(
    sequence: str,
    enzyme: str = 'trypsin',
) -> List[int]:
    """
    Locate every potential cleavage site of an enzyme in a sequence.

    Parameters:
    - sequence: Protein sequence to scan
    - enzyme: Name of the enzyme

    Returns:
    List of the cleavage positions within the sequence
    (NOTE(review): the index base of the positions is not stated - confirm)
    """

def generate_peptides_with_modifications(
    sequence: str,
    modifications: List[str] = None,
    enzyme: str = 'trypsin',
) -> List[dict]:
    """
    Produce peptides carrying variable modifications.

    Parameters:
    - sequence: Protein sequence to start from
    - modifications: Names of the modifications to consider; defaults to None
      (NOTE(review): the meaning of None is not stated - confirm)
    - enzyme: Enzyme used for the digestion

    Returns:
    List of dictionaries, each with peptide info and its modifications
    """

Protein Database Processing

Functions for processing and analyzing protein databases at scale.

def load_protein_database(
    fasta_path: str,
    include_decoys: bool = False,
) -> pd.DataFrame:
    """
    Read a protein database into a DataFrame.

    Parameters:
    - fasta_path: Location of the FASTA database file
    - include_decoys: Set to True to include decoy proteins

    Returns:
    DataFrame holding the protein information
    (NOTE(review): the column layout is not specified here - confirm)
    """

def create_decoy_database(
    fasta_path: str,
    output_path: str,
    decoy_prefix: str = 'DECOY_',
    method: str = 'reverse',
) -> None:
    """
    Build a decoy protein database for use in FDR calculation.

    Parameters:
    - fasta_path: FASTA file to read from
    - output_path: FASTA file to write, containing the decoys
    - decoy_prefix: Prefix put in front of decoy protein IDs
    - method: How decoys are generated ('reverse', 'shuffle')
    """

def filter_database_by_taxa(
    fasta_path: str,
    output_path: str,
    taxa_ids: List[int],
) -> None:
    """
    Restrict a protein database to selected taxonomic IDs.

    Parameters:
    - fasta_path: FASTA file to read from
    - output_path: FASTA file to write the filtered entries to
    - taxa_ids: NCBI taxonomy IDs whose proteins are kept
    """

def merge_protein_databases(
    fasta_paths: List[str],
    output_path: str,
) -> None:
    """
    Combine several protein databases into one file.

    Parameters:
    - fasta_paths: FASTA files to read from
    - output_path: FASTA file that receives the merged result
    """

def deduplicate_proteins(
    fasta_path: str,
    output_path: str,
    by_sequence: bool = True,
) -> None:
    """
    Drop duplicate proteins from a database.

    Parameters:
    - fasta_path: FASTA file to read from
    - output_path: FASTA file that receives the deduplicated entries
    - by_sequence: Detect duplicates by sequence (True) or by ID (False)
    """

Protein Inference

Functions for protein inference from peptide identifications, handling shared peptides and protein groups.

def map_peptides_to_proteins(
    peptides: List[str],
    protein_db: pd.DataFrame,
) -> dict:
    """
    Associate peptide sequences with the proteins they come from.

    Parameters:
    - peptides: Peptide sequences to look up
    - protein_db: DataFrame containing the protein sequences

    Returns:
    Dictionary that maps each peptide to a list of protein IDs
    """

def perform_protein_inference(
    psm_df: pd.DataFrame,
    protein_db: pd.DataFrame,
    method: str = 'parsimony',
) -> pd.DataFrame:
    """
    Infer proteins from PSM identifications.

    Parameters:
    - psm_df: DataFrame holding the PSM identifications
    - protein_db: DataFrame holding the protein database
    - method: Inference strategy ('parsimony', 'maxquant', 'simple')

    Returns:
    DataFrame of protein-level results
    """

def create_protein_groups(
    protein_matches: dict,
    method: str = 'maxquant',
) -> List[List[str]]:
    """
    Assemble protein groups from peptide-protein mappings.

    Parameters:
    - protein_matches: Dictionary mapping peptides to proteins
    - method: Grouping strategy

    Returns:
    List of protein groups, each group being a list of protein IDs
    """

def calculate_protein_coverage(
    protein_id: str,
    peptides: List[str],
    protein_sequence: str,
) -> float:
    """
    Compute the sequence coverage of a protein.

    Parameters:
    - protein_id: Identifier of the protein
    - peptides: Peptides that were identified
    - protein_sequence: Complete sequence of the protein

    Returns:
    Sequence coverage expressed as a fraction (0-1)
    """

def filter_proteins_by_evidence(
    protein_df: pd.DataFrame,
    min_peptides: int = 2,
    min_unique_peptides: int = 1,
) -> pd.DataFrame:
    """
    Keep only proteins backed by sufficient identification evidence.

    Parameters:
    - protein_df: DataFrame holding the protein identifications
    - min_peptides: Minimum total number of peptides a protein must have
    - min_unique_peptides: Minimum number of unique peptides a protein must have

    Returns:
    The filtered protein DataFrame
    """

Sequence Utilities

Additional utilities for protein sequence manipulation and analysis.

def reverse_protein_sequence(
    sequence: str,
) -> str:
    """
    Reverse a protein sequence, as used for decoy generation.

    Parameters:
    - sequence: Protein sequence in its original order

    Returns:
    The sequence reversed
    """

def shuffle_protein_sequence(
    sequence: str,
    seed: int = None,
) -> str:
    """
    Shuffle a protein sequence, keeping its amino acid composition intact.

    Parameters:
    - sequence: Protein sequence in its original order
    - seed: Random seed that makes the shuffle reproducible; defaults to None
      (NOTE(review): presumably None leaves the shuffle unseeded - confirm)

    Returns:
    The shuffled sequence
    """

def translate_dna_to_protein(
    dna_sequence: str,
    frame: int = 0,
) -> str:
    """
    Translate a DNA sequence into a protein sequence.

    Parameters:
    - dna_sequence: Nucleotide sequence of the DNA
    - frame: Reading frame to translate in (0, 1, or 2)

    Returns:
    The translated protein sequence
    (NOTE(review): handling of stop codons and trailing partial codons is
    not stated - confirm)
    """

def find_open_reading_frames(
    dna_sequence: str,
    min_length: int = 100,
) -> List[dict]:
    """
    Locate open reading frames (ORFs) in a DNA sequence.

    Parameters:
    - dna_sequence: Nucleotide sequence of the DNA
    - min_length: Shortest ORF to report, measured in nucleotides

    Returns:
    List of dictionaries, each describing one ORF
    (NOTE(review): the dictionary keys are not specified here - confirm)
    """

def convert_sequence_format(
    sequence: str,
    input_format: str,
    output_format: str,
) -> str:
    """
    Convert a sequence from one format to another.

    Parameters:
    - sequence: Sequence to convert
    - input_format: Format of the given sequence ('dna', 'rna', 'protein')
    - output_format: Format to convert the sequence into

    Returns:
    The converted sequence
    """

Usage Examples

Basic FASTA Processing

from alphabase.protein.fasta import read_fasta_file, get_uniprot_gene_name

# Stream the FASTA file entry by entry rather than loading it whole
protein_count = 0
for header, sequence in read_fasta_file('uniprot_human.fasta'):
    protein_count += 1

    # The gene name is taken from the UniProt header
    gene_name = get_uniprot_gene_name(header)

    # Report only proteins longer than 100 residues
    is_long_protein = len(sequence) > 100
    if is_long_protein:
        print(f"Protein {protein_count}: {gene_name}, Length: {len(sequence)}")

    # Stop once the first 10 proteins have been seen
    reached_limit = protein_count >= 10
    if reached_limit:
        break

print(f"Processed {protein_count} proteins")

Protein Digestion and Analysis

from alphabase.protein.fasta import digest_protein, calculate_protein_properties

# Example protein sequence
protein_seq = "MKTVRQERLKSIVRILERSKEPVSGAQLAEELSVSRQVIVQDIAYLRSLGYNIVATPRGYVLAGGYKWENQPWLNGIPVEELENLTQHLPDLVDQAIGVGRQGKVFVLVPKGEAPGDYVNLNRVLPWLLPSLIHNMHSTPDFFKTGIPVLYLSRRILNQHGQNVEILGQKQSGEAGTMEVLDEAFLKGQRRSQKKSKKNSQGGSQIRKTCVSLNRLRREVSQYFISDRPLVLDMKIPEESRQSLAQVIRRQRGEKRGFTWVPVRDGNGIIDQTVLIARGKKRSSEDGGNNLLISRFGSIGGDGLSRFGDATLSSFGGDSGLMRGDQETVTFVPLSFSGNQGMSQGTFSPKQSLNLLDPGSMGGTSFMSQRRSQKASQGNNYSQSRKKLMSGQFCGQASGEAMRYKVKPEDFSYILRRRKLASQQKQSFDLIPVHNGKMKGSHGKMTPEMQGSQRQKMPLRNLLDFTEGQMGR"

# Compute the properties of the example protein and show them
properties = calculate_protein_properties(protein_seq)
print(f"Protein properties: {properties}")

# Tryptic digest: up to 2 missed cleavages, peptides of 6 to 30 residues
peptides = digest_protein(
    protein_seq,
    enzyme='trypsin',
    missed_cleavages=2,
    min_length=6,
    max_length=30,
)

print(f"Generated {len(peptides)} tryptic peptides")

# Print only the first 5 peptides
first_peptides = peptides[:5]
for i, peptide in enumerate(first_peptides):
    print(f"Peptide {i+1}: {peptide}")

Protein Database Processing

import pandas as pd
from alphabase.protein.fasta import (
    create_decoy_database,
    filter_database_by_taxa,  # used below; missing from the original import
    load_protein_database,
)

# Load protein database
protein_db = load_protein_database('human_proteome.fasta')
print(f"Loaded {len(protein_db)} proteins")

# Create decoy database
create_decoy_database(
    fasta_path='human_proteome.fasta',
    output_path='human_proteome_with_decoys.fasta',
    decoy_prefix='DECOY_',
    method='reverse'
)

# Filter database by taxonomy (example: human proteins only)
filter_database_by_taxa(
    fasta_path='uniprot_all.fasta',
    output_path='uniprot_human.fasta',
    taxa_ids=[9606]  # Human NCBI taxonomy ID
)

Protein Inference Workflow

# load_protein_database and filter_proteins_by_evidence are used below, so
# they are imported here to keep the snippet runnable on its own.
from alphabase.protein.fasta import (
    filter_proteins_by_evidence,
    load_protein_database,
    map_peptides_to_proteins,
    perform_protein_inference,
)
from alphabase.psm_reader import MaxQuantReader

# Load PSM identifications
mq_reader = MaxQuantReader()
psm_df = mq_reader.import_file('msms.txt')

# Load protein database
protein_db = load_protein_database('proteome.fasta')

# Map peptides to proteins
peptides = psm_df['sequence'].unique().tolist()
peptide_protein_map = map_peptides_to_proteins(peptides, protein_db)

print(f"Mapped {len(peptides)} peptides to proteins")

# Perform protein inference
protein_results = perform_protein_inference(
    psm_df=psm_df,
    protein_db=protein_db,
    method='parsimony'
)

print(f"Identified {len(protein_results)} protein groups")

# Filter by evidence
high_confidence_proteins = filter_proteins_by_evidence(
    protein_results,
    min_peptides=2,
    min_unique_peptides=1
)

print(f"High confidence proteins: {len(high_confidence_proteins)}")

Advanced Sequence Analysis

from alphabase.protein.fasta import (
    find_signal_peptide, find_transmembrane_domains,
    get_amino_acid_composition, translate_dna_to_protein
)

# Analyze protein sequence features
protein_seq = "MKWVTFISLLLLFSSAYSRGVFRRDTHKSEIAHRFKDLGEEHFKGLVLIAFSQYLQQCPFDEHVKLVNELTEFAKTCVADESHAGCEKSLHTLFGDELCKVASLRETYGDMADCCEKQEPERNECFLSHKDDSPDLPKLKPDPNTLCDEFKADEKKFWGKYLYEIARRHPYFYAPELLYYANKYNGVFQECCQAEDKGACLLPKIETMREKVLASSARQRLRCASIQKFGERALKAWSVARLSQKFPKAEFVEVTKLVTDLTKVHKECCHGDLLECADDRADLAKYICDNQDTISSKLKECCDKPVNGFNSDYNWPLEKSPPDPNTPVDDEALEKFLPTTGIIVDMHRVLNKLLEKRHPVEAYHQIRSMSSAELFKHAAKSSLLHYVPASAQHVTLGYGYPYDAHLADAIYLKLLTKDTAELPKVAQGPGGKGQMRVAFLKDTPTDMHRVAFLRELHRRQHRGADELLSEKLLQSLMQRQVQLQIQAQEQRGRSQKLQRIEEALRKLAEVHTQNMEKFQFSLQMQLVQMQQQTVLLMQVQNLAHLQQQIQNQQMQMDLDTQVLDMLRNSPSLTEKLTEYAEDRMNHSDMSQDFHFPGLQCDRFMSPKFLEGLQSSLSEVNLPAQVKMVTKMFQKLDLDVLLQMQAQRQGRDQADKMIEKLAEMDDEQRAATDQKLAEERVRQLQADMRKCQTRQNQLSAARDLLKQKMNLMQQQVQMHQQHLQIAQQKRQFKAMQHVDHQTMIDRFLNDVQKLQRLQRQKRQQQQQQHQHQQMHQRQRRQHQQHHHQRQIAQQQLMQNQLPSFRSVHQMDLQKNQKQRRQRQKQKQMQKQKLLQRQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKGADQADKMIEKLMEMDDEQRAATDQKLAEERVRQLQADMRKCQTRQNQLSAARDLLKKMNLMQQQVQMHQQHLQIAQQKRQFKAMQHVDHQTMIDRFLNDVQKLQRLQRQKRQHQHQQHQQRQRKQKAQQKIAQHQMQKVQAEIHMQAKMKNQGQSRQKLRAIKGRPKRFQPSEPVLDVDPVVEKLMKKLSESVLEKGTVNTSSLMDNKFLLQRQAKILESLLRRQVNHRKLQMEMQARHTQRQKVNELQRRQQMHQRMHVSGHRGKLQKRNNSQKMAQHVMQAEKQRLSSLQNMQRQAIQMNQRQRDQLLRSRLRQQRSYRDKQFSQKIKMEERRSSRKRLVHAVRRHRIRRRASRSRSRS"

# Signal peptide prediction yields a (present, cleavage position) pair
signal_prediction = find_signal_peptide(protein_seq)
has_signal, cleavage_pos = signal_prediction
print(f"Signal peptide: {has_signal}, Cleavage at: {cleavage_pos}")

# Predicted transmembrane domains
tm_domains = find_transmembrane_domains(protein_seq)
print(f"Transmembrane domains: {tm_domains}")

# Amino acid composition of the protein
aa_comp = get_amino_acid_composition(protein_seq)
print(f"AA composition: {aa_comp}")

# Translate a short example DNA sequence to protein
dna_seq = "ATGAAGTGGGTAACATTTAT"
protein_from_dna = translate_dna_to_protein(dna_seq)
print(f"Translated protein: {protein_from_dna}")

Install with Tessl CLI