CtrlK
Blog | Docs | Log in | Get started
Tessl Logo

tessl/pypi-alphabase

An infrastructure Python package of the AlphaX ecosystem for MS proteomics

Pending

Quality

Pending

Does it follow best practices?

Impact

Pending

No eval scenarios have been run

Overview
Eval results
Files

docs/protein-analysis.md

Protein Analysis

This module provides utilities for FASTA file processing, protein sequence analysis, and enzymatic digestion. It supports protein inference workflows and sequence manipulation, and integrates with proteomics identification pipelines for comprehensive protein-level analysis.

Capabilities

FASTA File Processing

Core functionality for reading and processing FASTA files with support for various protein database formats.

def read_fasta_file(filepath: str) -> Iterator[tuple[str, str]]:
    """
    Read a FASTA file line by line, using a generator for memory efficiency.

    Parameters:
    - filepath: Path to the FASTA file

    Yields:
    One (header, sequence) tuple per protein entry

    NOTE(review): whether the yielded header still carries the leading '>'
    is not stated here -- confirm against the implementation.

    Usage:
    for header, sequence in read_fasta_file('proteins.fasta'):
        process_protein(header, sequence)
    """

def get_uniprot_gene_name(description: str) -> str:
    """
    Extract the gene name from a UniProt protein description.

    Parameters:
    - description: UniProt protein description line

    Returns:
    The gene name if one is found, otherwise an empty string

    Example (the value of the GN= field is returned):
    desc = "sp|P12345|EXAMPLE_HUMAN Example protein GN=GENE1 PE=1 SV=2"
    gene = get_uniprot_gene_name(desc)  # Returns "GENE1"
    """

def parse_fasta_header(header: str) -> dict:
    """
    Parse a FASTA header into structured information.

    Parameters:
    - header: FASTA header line, without the leading '>'

    Returns:
    Dictionary with the parsed header information

    NOTE(review): the keys of the returned dictionary are not specified
    here -- confirm against the implementation before relying on them.
    """

def validate_protein_sequence(sequence: str) -> bool:
    """
    Check that a protein sequence contains only valid amino acids.

    Parameters:
    - sequence: Protein sequence string

    Returns:
    True if the sequence is valid

    NOTE(review): the accepted amino acid alphabet (e.g. whether ambiguity
    codes or lowercase letters count as valid) is not stated here -- confirm.
    """

def clean_protein_sequence(sequence: str) -> str:
    """
    Clean a protein sequence by removing invalid characters.

    Parameters:
    - sequence: Raw protein sequence

    Returns:
    Cleaned protein sequence containing only valid amino acids

    NOTE(review): which characters count as valid is not stated here --
    presumably the same alphabet as validate_protein_sequence; confirm.
    """

Protein Sequence Analysis

Functions for analyzing protein sequences, calculating properties, and extracting features.

def calculate_protein_mass(sequence: str) -> float:
    """
    Calculate the molecular weight of a protein from its sequence.

    Parameters:
    - sequence: Protein sequence string

    Returns:
    Molecular weight in Daltons

    NOTE(review): whether this is the monoisotopic or the average mass is
    not stated here -- confirm before comparing against measured masses.
    """

def calculate_protein_properties(sequence: str) -> dict:
    """
    Calculate a comprehensive set of protein properties.

    Parameters:
    - sequence: Protein sequence string

    Returns:
    Dictionary with molecular weight, pI, charge, etc.

    NOTE(review): the exact dictionary keys and their units are not listed
    here -- confirm against the implementation.
    """

def get_amino_acid_composition(sequence: str) -> dict:
    """
    Get the amino acid composition of a protein sequence.

    Parameters:
    - sequence: Protein sequence string

    Returns:
    Dictionary with the count of each amino acid

    NOTE(review): whether amino acids absent from the sequence appear with
    a count of 0 or are omitted is not stated here -- confirm.
    """

def find_signal_peptide(sequence: str) -> tuple[bool, int]:
    """
    Predict signal peptide presence and cleavage site.

    Parameters:
    - sequence: Protein sequence string

    Returns:
    Tuple of (has_signal_peptide, cleavage_position)

    NOTE(review): the value of cleavage_position when no signal peptide is
    predicted, and whether positions are 0- or 1-based, are not stated
    here -- confirm.
    """

def find_transmembrane_domains(sequence: str) -> List[tuple[int, int]]:
    """
    Predict transmembrane domain locations.

    Parameters:
    - sequence: Protein sequence string

    Returns:
    List of (start, end) positions, one tuple per transmembrane domain

    NOTE(review): whether positions are 0- or 1-based and whether `end` is
    inclusive are not stated here -- confirm.
    """

Enzymatic Digestion

Functions for simulating enzymatic digestion of proteins with various proteases.

def digest_protein(sequence: str, enzyme: str = 'trypsin', 
                  missed_cleavages: int = 2, min_length: int = 6, 
                  max_length: int = 30) -> List[str]:
    """
    Digest a protein sequence with the specified enzyme.

    Parameters:
    - sequence: Protein sequence to digest
    - enzyme: Enzyme name ('trypsin', 'chymotrypsin', 'lysc', etc.);
      defaults to 'trypsin'
    - missed_cleavages: Maximum number of missed cleavages (default 2)
    - min_length: Minimum peptide length (default 6)
    - max_length: Maximum peptide length (default 30)

    Returns:
    List of peptide sequences from the digestion

    NOTE(review): whether the length bounds are inclusive and whether
    duplicate peptides are removed is not stated here -- confirm.
    """

def get_enzyme_specificity(enzyme: str) -> dict:
    """
    Get the cleavage specificity of a proteolytic enzyme.

    Parameters:
    - enzyme: Enzyme name

    Returns:
    Dictionary with cleavage rules and specificity

    NOTE(review): the dictionary keys and the behavior for an unknown
    enzyme name are not stated here -- confirm.
    """

def find_cleavage_sites(sequence: str, enzyme: str = 'trypsin') -> List[int]:
    """
    Find all potential cleavage sites for an enzyme.

    Parameters:
    - sequence: Protein sequence
    - enzyme: Enzyme name; defaults to 'trypsin'

    Returns:
    List of cleavage positions in the sequence

    NOTE(review): the position convention (0- or 1-based, before or after
    the cleaved residue) is not stated here -- confirm.
    """

def generate_peptides_with_modifications(sequence: str, 
                                       modifications: List[str] = None,
                                       enzyme: str = 'trypsin') -> List[dict]:
    """
    Generate peptides with variable modifications.

    Parameters:
    - sequence: Protein sequence
    - modifications: List of modification names to consider; defaults to
      None
    - enzyme: Digestion enzyme; defaults to 'trypsin'

    Returns:
    List of dictionaries with peptide info and modifications

    NOTE(review): what modifications=None means (presumably "no variable
    modifications") and the keys of the returned dictionaries are not
    stated here -- confirm.
    """

Protein Database Processing

Functions for processing and analyzing protein databases at scale.

def load_protein_database(fasta_path: str, 
                         include_decoys: bool = False) -> pd.DataFrame:
    """
    Load a protein database into DataFrame format.

    Parameters:
    - fasta_path: Path to the FASTA database file
    - include_decoys: Whether to include decoy proteins (default False)

    Returns:
    DataFrame with protein information

    NOTE(review): the DataFrame columns and how decoy entries are
    recognized are not stated here -- confirm.
    """

def create_decoy_database(fasta_path: str, output_path: str,
                         decoy_prefix: str = 'DECOY_',
                         method: str = 'reverse') -> None:
    """
    Create a decoy protein database for FDR calculation.

    Parameters:
    - fasta_path: Input FASTA file
    - output_path: Output FASTA file with decoys
    - decoy_prefix: Prefix for decoy protein IDs (default 'DECOY_')
    - method: Decoy generation method ('reverse', 'shuffle');
      defaults to 'reverse'

    Returns:
    None; the result is written to output_path

    NOTE(review): whether the output also contains the original target
    entries alongside the decoys is not stated here -- confirm.
    """

def filter_database_by_taxa(fasta_path: str, output_path: str,
                           taxa_ids: List[int]) -> None:
    """
    Filter a protein database by taxonomic IDs.

    Parameters:
    - fasta_path: Input FASTA file
    - output_path: Output filtered FASTA file
    - taxa_ids: List of NCBI taxonomy IDs to keep

    Returns:
    None; the filtered database is written to output_path

    NOTE(review): how the taxonomy ID is read from each entry (presumably
    from the header) is not stated here -- confirm.
    """

def merge_protein_databases(fasta_paths: List[str], 
                           output_path: str) -> None:
    """
    Merge multiple protein databases into a single file.

    Parameters:
    - fasta_paths: List of input FASTA files
    - output_path: Output merged FASTA file

    Returns:
    None; the merged database is written to output_path

    NOTE(review): whether entries duplicated across inputs are removed
    during the merge is not stated here -- confirm.
    """

def deduplicate_proteins(fasta_path: str, output_path: str,
                        by_sequence: bool = True) -> None:
    """
    Remove duplicate proteins from a database.

    Parameters:
    - fasta_path: Input FASTA file
    - output_path: Output deduplicated FASTA file
    - by_sequence: Remove duplicates by sequence (True, the default) or
      by ID (False)

    Returns:
    None; the deduplicated database is written to output_path

    NOTE(review): which of several duplicate entries is kept is not
    stated here -- confirm.
    """

Protein Inference

Functions for protein inference from peptide identifications, handling shared peptides and protein groups.

def map_peptides_to_proteins(peptides: List[str], 
                           protein_db: pd.DataFrame) -> dict:
    """
    Map peptide sequences to their source proteins.

    Parameters:
    - peptides: List of peptide sequences
    - protein_db: DataFrame with protein sequences

    Returns:
    Dictionary mapping each peptide to a list of protein IDs

    NOTE(review): the columns required in protein_db, and whether peptides
    without any match appear in the result, are not stated here -- confirm.
    """

def perform_protein_inference(psm_df: pd.DataFrame,
                            protein_db: pd.DataFrame,
                            method: str = 'parsimony') -> pd.DataFrame:
    """
    Perform protein inference from PSM identifications.

    Parameters:
    - psm_df: DataFrame with PSM identifications
    - protein_db: Protein database DataFrame
    - method: Inference method ('parsimony', 'maxquant', 'simple');
      defaults to 'parsimony'

    Returns:
    DataFrame with protein-level results

    NOTE(review): the columns required in psm_df and protein_db, and the
    columns of the result, are not stated here -- confirm.
    """

def create_protein_groups(protein_matches: dict,
                         method: str = 'maxquant') -> List[List[str]]:
    """
    Create protein groups from peptide-protein mappings.

    Parameters:
    - protein_matches: Dictionary of peptide to protein mappings
    - method: Grouping method; defaults to 'maxquant'

    Returns:
    List of protein groups, each a list of protein IDs

    NOTE(review): the accepted values of `method` other than the default
    are not listed here -- confirm.
    """

def calculate_protein_coverage(protein_id: str, peptides: List[str],
                              protein_sequence: str) -> float:
    """
    Calculate the sequence coverage of a protein.

    Parameters:
    - protein_id: Protein identifier
    - peptides: List of identified peptides
    - protein_sequence: Full protein sequence

    Returns:
    Sequence coverage as a fraction between 0 and 1

    NOTE(review): how overlapping peptides and peptides that do not occur
    in protein_sequence are handled is not stated here -- confirm.
    """

def filter_proteins_by_evidence(protein_df: pd.DataFrame,
                               min_peptides: int = 2,
                               min_unique_peptides: int = 1) -> pd.DataFrame:
    """
    Filter proteins by identification evidence.

    Parameters:
    - protein_df: DataFrame with protein identifications
    - min_peptides: Minimum total peptides required (default 2)
    - min_unique_peptides: Minimum unique peptides required (default 1)

    Returns:
    Filtered protein DataFrame

    NOTE(review): the column names read from protein_df are not stated
    here -- confirm.
    """

Sequence Utilities

Additional utilities for protein sequence manipulation and analysis.

def reverse_protein_sequence(sequence: str) -> str:
    """
    Reverse a protein sequence for decoy generation.

    Parameters:
    - sequence: Original protein sequence

    Returns:
    The reversed sequence

    NOTE(review): presumably a plain full-string reversal with no residue
    held in place -- confirm against the implementation.
    """

def shuffle_protein_sequence(sequence: str, seed: int = None) -> str:
    """
    Shuffle a protein sequence while maintaining its amino acid composition.

    Parameters:
    - sequence: Original protein sequence
    - seed: Random seed for reproducible shuffling; defaults to None

    Returns:
    The shuffled sequence

    NOTE(review): presumably seed=None gives a non-reproducible shuffle --
    confirm.
    """

def translate_dna_to_protein(dna_sequence: str, frame: int = 0) -> str:
    """
    Translate a DNA sequence to a protein sequence.

    Parameters:
    - dna_sequence: DNA nucleotide sequence
    - frame: Reading frame (0, 1, or 2); defaults to 0

    Returns:
    The translated protein sequence

    NOTE(review): the handling of stop codons and of a trailing incomplete
    codon is not stated here -- confirm.
    """

def find_open_reading_frames(dna_sequence: str, 
                           min_length: int = 100) -> List[dict]:
    """
    Find open reading frames in a DNA sequence.

    Parameters:
    - dna_sequence: DNA nucleotide sequence
    - min_length: Minimum ORF length in nucleotides (default 100)

    Returns:
    List of ORF information dictionaries

    NOTE(review): the dictionary keys and whether the reverse strand is
    searched are not stated here -- confirm.
    """

def convert_sequence_format(sequence: str, input_format: str,
                          output_format: str) -> str:
    """
    Convert a sequence between different formats.

    Parameters:
    - sequence: Input sequence
    - input_format: Input format ('dna', 'rna', 'protein')
    - output_format: Output format

    Returns:
    The converted sequence

    NOTE(review): output_format presumably accepts the same values as
    input_format; which conversions are supported is not stated here --
    confirm.
    """

Usage Examples

Basic FASTA Processing

from alphabase.protein.fasta import read_fasta_file, get_uniprot_gene_name

# Stream the FASTA entries lazily; enumerate keeps a 1-based running count.
protein_count = 0  # stays 0 if the file yields no entries
for protein_count, (header, sequence) in enumerate(
    read_fasta_file('uniprot_human.fasta'), start=1
):
    # Gene name as extracted from the UniProt header
    gene_name = get_uniprot_gene_name(header)

    # Report only proteins longer than 100 residues
    if len(sequence) > 100:
        print(f"Protein {protein_count}: {gene_name}, Length: {len(sequence)}")

    # Stop once the first 10 proteins have been handled
    if protein_count >= 10:
        break

print(f"Processed {protein_count} proteins")

Protein Digestion and Analysis

from alphabase.protein.fasta import digest_protein, calculate_protein_properties

# Example protein sequence
protein_seq = "MKTVRQERLKSIVRILERSKEPVSGAQLAEELSVSRQVIVQDIAYLRSLGYNIVATPRGYVLAGGYKWENQPWLNGIPVEELENLTQHLPDLVDQAIGVGRQGKVFVLVPKGEAPGDYVNLNRVLPWLLPSLIHNMHSTPDFFKTGIPVLYLSRRILNQHGQNVEILGQKQSGEAGTMEVLDEAFLKGQRRSQKKSKKNSQGGSQIRKTCVSLNRLRREVSQYFISDRPLVLDMKIPEESRQSLAQVIRRQRGEKRGFTWVPVRDGNGIIDQTVLIARGKKRSSEDGGNNLLISRFGSIGGDGLSRFGDATLSSFGGDSGLMRGDQETVTFVPLSFSGNQGMSQGTFSPKQSLNLLDPGSMGGTSFMSQRRSQKASQGNNYSQSRKKLMSGQFCGQASGEAMRYKVKPEDFSYILRRRKLASQQKQSFDLIPVHNGKMKGSHGKMTPEMQGSQRQKMPLRNLLDFTEGQMGR"

# Summarize the physico-chemical properties of the example protein
properties = calculate_protein_properties(protein_seq)
print(f"Protein properties: {properties}")

# Tryptic digestion settings, passed on as keyword arguments
digestion_settings = {
    'enzyme': 'trypsin',
    'missed_cleavages': 2,
    'min_length': 6,
    'max_length': 30,
}
peptides = digest_protein(sequence=protein_seq, **digestion_settings)

print(f"Generated {len(peptides)} tryptic peptides")
# Preview the first 5 peptides, numbered from 1
for rank, peptide in enumerate(peptides[:5], start=1):
    print(f"Peptide {rank}: {peptide}")

Protein Database Processing

import pandas as pd
# filter_database_by_taxa is called below, so it must be imported too;
# without it the last step of this example raises NameError.
from alphabase.protein.fasta import (
    create_decoy_database,
    filter_database_by_taxa,
    load_protein_database,
)

# Load protein database
protein_db = load_protein_database('human_proteome.fasta')
print(f"Loaded {len(protein_db)} proteins")

# Create decoy database
create_decoy_database(
    fasta_path='human_proteome.fasta',
    output_path='human_proteome_with_decoys.fasta',
    decoy_prefix='DECOY_',
    method='reverse'
)

# Filter database by taxonomy (example: human proteins only)
filter_database_by_taxa(
    fasta_path='uniprot_all.fasta',
    output_path='uniprot_human.fasta',
    taxa_ids=[9606]  # Human NCBI taxonomy ID
)

Protein Inference Workflow

# load_protein_database and filter_proteins_by_evidence are both called
# below, so they are imported here as well; without them this example
# raises NameError.
from alphabase.protein.fasta import (
    filter_proteins_by_evidence,
    load_protein_database,
    map_peptides_to_proteins,
    perform_protein_inference,
)
from alphabase.psm_reader import MaxQuantReader

# Load PSM identifications
mq_reader = MaxQuantReader()
psm_df = mq_reader.import_file('msms.txt')

# Load protein database
protein_db = load_protein_database('proteome.fasta')

# Map peptides to proteins
peptides = psm_df['sequence'].unique().tolist()
peptide_protein_map = map_peptides_to_proteins(peptides, protein_db)

print(f"Mapped {len(peptides)} peptides to proteins")

# Perform protein inference
protein_results = perform_protein_inference(
    psm_df=psm_df,
    protein_db=protein_db,
    method='parsimony'
)

print(f"Identified {len(protein_results)} protein groups")

# Filter by evidence
high_confidence_proteins = filter_proteins_by_evidence(
    protein_results,
    min_peptides=2,
    min_unique_peptides=1
)

print(f"High confidence proteins: {len(high_confidence_proteins)}")

Advanced Sequence Analysis

from alphabase.protein.fasta import (
    find_signal_peptide, find_transmembrane_domains,
    get_amino_acid_composition, translate_dna_to_protein
)

# Analyze protein sequence features
protein_seq = "MKWVTFISLLLLFSSAYSRGVFRRDTHKSEIAHRFKDLGEEHFKGLVLIAFSQYLQQCPFDEHVKLVNELTEFAKTCVADESHAGCEKSLHTLFGDELCKVASLRETYGDMADCCEKQEPERNECFLSHKDDSPDLPKLKPDPNTLCDEFKADEKKFWGKYLYEIARRHPYFYAPELLYYANKYNGVFQECCQAEDKGACLLPKIETMREKVLASSARQRLRCASIQKFGERALKAWSVARLSQKFPKAEFVEVTKLVTDLTKVHKECCHGDLLECADDRADLAKYICDNQDTISSKLKECCDKPVNGFNSDYNWPLEKSPPDPNTPVDDEALEKFLPTTGIIVDMHRVLNKLLEKRHPVEAYHQIRSMSSAELFKHAAKSSLLHYVPASAQHVTLGYGYPYDAHLADAIYLKLLTKDTAELPKVAQGPGGKGQMRVAFLKDTPTDMHRVAFLRELHRRQHRGADELLSEKLLQSLMQRQVQLQIQAQEQRGRSQKLQRIEEALRKLAEVHTQNMEKFQFSLQMQLVQMQQQTVLLMQVQNLAHLQQQIQNQQMQMDLDTQVLDMLRNSPSLTEKLTEYAEDRMNHSDMSQDFHFPGLQCDRFMSPKFLEGLQSSLSEVNLPAQVKMVTKMFQKLDLDVLLQMQAQRQGRDQADKMIEKLAEMDDEQRAATDQKLAEERVRQLQADMRKCQTRQNQLSAARDLLKQKMNLMQQQVQMHQQHLQIAQQKRQFKAMQHVDHQTMIDRFLNDVQKLQRLQRQKRQQQQQQHQHQQMHQRQRRQHQQHHHQRQIAQQQLMQNQLPSFRSVHQMDLQKNQKQRRQRQKQKQMQKQKLLQRQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKGADQADKMIEKLMEMDDEQRAATDQKLAEERVRQLQADMRKCQTRQNQLSAARDLLKKMNLMQQQVQMHQQHLQIAQQKRQFKAMQHVDHQTMIDRFLNDVQKLQRLQRQKRQHQHQQHQQRQRKQKAQQKIAQHQMQKVQAEIHMQAKMKNQGQSRQKLRAIKGRPKRFQPSEPVLDVDPVVEKLMKKLSESVLEKGTVNTSSLMDNKFLLQRQAKILESLLRRQVNHRKLQMEMQARHTQRQKVNELQRRQQMHQRMHVSGHRGKLQKRNNSQKMAQHVMQAEKQRLSSLQNMQRQAIQMNQRQRDQLLRSRLRQQRSYRDKQFSQKIKMEERRSSRKRLVHAVRRHRIRRRASRSRSRS"

# Check for signal peptide
has_signal, cleavage_pos = find_signal_peptide(protein_seq)
print(f"Signal peptide: {has_signal}, Cleavage at: {cleavage_pos}")

# Find transmembrane domains
tm_domains = find_transmembrane_domains(protein_seq)
print(f"Transmembrane domains: {tm_domains}")

# Get amino acid composition
aa_comp = get_amino_acid_composition(protein_seq)
print(f"AA composition: {aa_comp}")

# Translate DNA to protein
# The example uses 21 nucleotides (7 whole codons: ATG AAG TGG GTA ACA TTT
# ATT -> MKWVTFI) so the result does not depend on how a trailing
# incomplete codon is handled.
dna_seq = "ATGAAGTGGGTAACATTTATT"  # Example DNA sequence
protein_from_dna = translate_dna_to_protein(dna_seq)
print(f"Translated protein: {protein_from_dna}")

Install with Tessl CLI

npx tessl i tessl/pypi-alphabase

docs

advanced-peptide-operations.md

advanced-spectral-libraries.md

chemical-constants.md

fragment-ions.md

index.md

io-utilities.md

protein-analysis.md

psm-readers.md

quantification.md

smiles-chemistry.md

spectral-libraries.md

tile.json