0
# Protein Analysis
1
2
Utilities for FASTA file processing, protein sequence analysis, and enzymatic digestion. They support protein inference workflows, sequence manipulation, and integration with proteomics identification pipelines for comprehensive protein-level analysis.
3
4
## Capabilities
5
6
### FASTA File Processing
7
8
Core functionality for reading and processing FASTA files with support for various protein database formats.
9
10
```python { .api }
11
def read_fasta_file(filepath: str) -> Iterator[tuple[str, str]]:
12
"""
13
Read FASTA file line by line using generator for memory efficiency.
14
15
Parameters:
16
- filepath: Path to FASTA file
17
18
Yields:
19
Tuples of (header, sequence) for each protein entry
20
21
Usage:
22
for header, sequence in read_fasta_file('proteins.fasta'):
23
process_protein(header, sequence)
24
"""
25
26
def get_uniprot_gene_name(description: str) -> str:
27
"""
28
Extract gene name from UniProt protein description.
29
30
Parameters:
31
- description: UniProt protein description line
32
33
Returns:
34
Gene name if found, empty string otherwise
35
36
Example:
37
desc = "sp|P12345|EXAMPLE_HUMAN Example protein GN=GENE1 PE=1 SV=2"
38
gene = get_uniprot_gene_name(desc) # Returns "GENE1"
39
"""
40
41
def parse_fasta_header(header: str) -> dict:
42
"""
43
Parse FASTA header into structured information.
44
45
Parameters:
46
- header: FASTA header line (without >)
47
48
Returns:
49
Dictionary with parsed header information
50
"""
51
52
def validate_protein_sequence(sequence: str) -> bool:
53
"""
54
Validate protein sequence contains only valid amino acids.
55
56
Parameters:
57
- sequence: Protein sequence string
58
59
Returns:
60
True if sequence is valid
61
"""
62
63
def clean_protein_sequence(sequence: str) -> str:
64
"""
65
Clean protein sequence by removing invalid characters.
66
67
Parameters:
68
- sequence: Raw protein sequence
69
70
Returns:
71
Cleaned protein sequence with only valid amino acids
72
"""
73
```
74
75
### Protein Sequence Analysis
76
77
Functions for analyzing protein sequences, calculating properties, and extracting features.
78
79
```python { .api }
80
def calculate_protein_mass(sequence: str) -> float:
81
"""
82
Calculate protein molecular weight from sequence.
83
84
Parameters:
85
- sequence: Protein sequence string
86
87
Returns:
88
Molecular weight in Daltons
89
"""
90
91
def calculate_protein_properties(sequence: str) -> dict:
92
"""
93
Calculate comprehensive protein properties.
94
95
Parameters:
96
- sequence: Protein sequence string
97
98
Returns:
99
Dictionary with molecular weight, pI, charge, etc.
100
"""
101
102
def get_amino_acid_composition(sequence: str) -> dict:
103
"""
104
Get amino acid composition for protein sequence.
105
106
Parameters:
107
- sequence: Protein sequence string
108
109
Returns:
110
Dictionary with counts for each amino acid
111
"""
112
113
def find_signal_peptide(sequence: str) -> tuple[bool, int]:
114
"""
115
Predict signal peptide presence and cleavage site.
116
117
Parameters:
118
- sequence: Protein sequence string
119
120
Returns:
121
Tuple of (has_signal_peptide, cleavage_position)
122
"""
123
124
def find_transmembrane_domains(sequence: str) -> List[tuple[int, int]]:
125
"""
126
Predict transmembrane domain locations.
127
128
Parameters:
129
- sequence: Protein sequence string
130
131
Returns:
132
List of (start, end) positions for transmembrane domains
133
"""
134
```
135
136
### Enzymatic Digestion
137
138
Functions for simulating enzymatic digestion of proteins with various proteases.
139
140
```python { .api }
141
def digest_protein(sequence: str, enzyme: str = 'trypsin',
142
missed_cleavages: int = 2, min_length: int = 6,
143
max_length: int = 30) -> List[str]:
144
"""
145
Digest protein sequence with specified enzyme.
146
147
Parameters:
148
- sequence: Protein sequence to digest
149
- enzyme: Enzyme name ('trypsin', 'chymotrypsin', 'lysc', etc.)
150
- missed_cleavages: Maximum number of missed cleavages
151
- min_length: Minimum peptide length
152
- max_length: Maximum peptide length
153
154
Returns:
155
List of peptide sequences from digestion
156
"""
157
158
def get_enzyme_specificity(enzyme: str) -> dict:
159
"""
160
Get cleavage specificity for proteolytic enzyme.
161
162
Parameters:
163
- enzyme: Enzyme name
164
165
Returns:
166
Dictionary with cleavage rules and specificity
167
"""
168
169
def find_cleavage_sites(sequence: str, enzyme: str = 'trypsin') -> List[int]:
170
"""
171
Find all potential cleavage sites for enzyme.
172
173
Parameters:
174
- sequence: Protein sequence
175
- enzyme: Enzyme name
176
177
Returns:
178
List of cleavage positions in sequence
179
"""
180
181
def generate_peptides_with_modifications(sequence: str,
182
modifications: List[str] = None,
183
enzyme: str = 'trypsin') -> List[dict]:
184
"""
185
Generate peptides with variable modifications.
186
187
Parameters:
188
- sequence: Protein sequence
189
- modifications: List of modification names to consider
190
- enzyme: Digestion enzyme
191
192
Returns:
193
List of dictionaries with peptide info and modifications
194
"""
195
```
196
197
### Protein Database Processing
198
199
Functions for processing and analyzing protein databases at scale.
200
201
```python { .api }
202
def load_protein_database(fasta_path: str,
203
include_decoys: bool = False) -> pd.DataFrame:
204
"""
205
Load protein database into DataFrame format.
206
207
Parameters:
208
- fasta_path: Path to FASTA database file
209
- include_decoys: Whether to include decoy proteins
210
211
Returns:
212
DataFrame with protein information
213
"""
214
215
def create_decoy_database(fasta_path: str, output_path: str,
216
decoy_prefix: str = 'DECOY_',
217
method: str = 'reverse') -> None:
218
"""
219
Create decoy protein database for FDR calculation.
220
221
Parameters:
222
- fasta_path: Input FASTA file
223
- output_path: Output FASTA file with decoys
224
- decoy_prefix: Prefix for decoy protein IDs
225
- method: Decoy generation method ('reverse', 'shuffle')
226
"""
227
228
def filter_database_by_taxa(fasta_path: str, output_path: str,
229
taxa_ids: List[int]) -> None:
230
"""
231
Filter protein database by taxonomic IDs.
232
233
Parameters:
234
- fasta_path: Input FASTA file
235
- output_path: Output filtered FASTA file
236
- taxa_ids: List of NCBI taxonomy IDs to keep
237
"""
238
239
def merge_protein_databases(fasta_paths: List[str],
240
output_path: str) -> None:
241
"""
242
Merge multiple protein databases into single file.
243
244
Parameters:
245
- fasta_paths: List of input FASTA files
246
- output_path: Output merged FASTA file
247
"""
248
249
def deduplicate_proteins(fasta_path: str, output_path: str,
250
by_sequence: bool = True) -> None:
251
"""
252
Remove duplicate proteins from database.
253
254
Parameters:
255
- fasta_path: Input FASTA file
256
- output_path: Output deduplicated FASTA file
257
- by_sequence: Remove duplicates by sequence (True) or ID (False)
258
"""
259
```
260
261
### Protein Inference
262
263
Functions for protein inference from peptide identifications, handling shared peptides and protein groups.
264
265
```python { .api }
266
def map_peptides_to_proteins(peptides: List[str],
267
protein_db: pd.DataFrame) -> dict:
268
"""
269
Map peptide sequences to their source proteins.
270
271
Parameters:
272
- peptides: List of peptide sequences
273
- protein_db: DataFrame with protein sequences
274
275
Returns:
276
Dictionary mapping peptides to lists of protein IDs
277
"""
278
279
def perform_protein_inference(psm_df: pd.DataFrame,
280
protein_db: pd.DataFrame,
281
method: str = 'parsimony') -> pd.DataFrame:
282
"""
283
Perform protein inference from PSM identifications.
284
285
Parameters:
286
- psm_df: DataFrame with PSM identifications
287
- protein_db: Protein database DataFrame
288
- method: Inference method ('parsimony', 'maxquant', 'simple')
289
290
Returns:
291
DataFrame with protein-level results
292
"""
293
294
def create_protein_groups(protein_matches: dict,
295
method: str = 'maxquant') -> List[List[str]]:
296
"""
297
Create protein groups from peptide-protein mappings.
298
299
Parameters:
300
- protein_matches: Dictionary of peptide to protein mappings
301
- method: Grouping method
302
303
Returns:
304
List of protein groups (lists of protein IDs)
305
"""
306
307
def calculate_protein_coverage(protein_id: str, peptides: List[str],
308
protein_sequence: str) -> float:
309
"""
310
Calculate sequence coverage for protein.
311
312
Parameters:
313
- protein_id: Protein identifier
314
- peptides: List of identified peptides
315
- protein_sequence: Full protein sequence
316
317
Returns:
318
Sequence coverage as fraction (0-1)
319
"""
320
321
def filter_proteins_by_evidence(protein_df: pd.DataFrame,
322
min_peptides: int = 2,
323
min_unique_peptides: int = 1) -> pd.DataFrame:
324
"""
325
Filter proteins by identification evidence.
326
327
Parameters:
328
- protein_df: DataFrame with protein identifications
329
- min_peptides: Minimum total peptides required
330
- min_unique_peptides: Minimum unique peptides required
331
332
Returns:
333
Filtered protein DataFrame
334
"""
335
```
336
337
### Sequence Utilities
338
339
Additional utilities for protein sequence manipulation and analysis.
340
341
```python { .api }
342
def reverse_protein_sequence(sequence: str) -> str:
343
"""
344
Reverse protein sequence for decoy generation.
345
346
Parameters:
347
- sequence: Original protein sequence
348
349
Returns:
350
Reversed sequence
351
"""
352
353
def shuffle_protein_sequence(sequence: str, seed: int = None) -> str:
354
"""
355
Shuffle protein sequence while maintaining amino acid composition.
356
357
Parameters:
358
- sequence: Original protein sequence
359
- seed: Random seed for reproducible shuffling
360
361
Returns:
362
Shuffled sequence
363
"""
364
365
def translate_dna_to_protein(dna_sequence: str, frame: int = 0) -> str:
366
"""
367
Translate DNA sequence to protein sequence.
368
369
Parameters:
370
- dna_sequence: DNA nucleotide sequence
371
- frame: Reading frame (0, 1, or 2)
372
373
Returns:
374
Translated protein sequence
375
"""
376
377
def find_open_reading_frames(dna_sequence: str,
378
min_length: int = 100) -> List[dict]:
379
"""
380
Find open reading frames in DNA sequence.
381
382
Parameters:
383
- dna_sequence: DNA nucleotide sequence
384
- min_length: Minimum ORF length in nucleotides
385
386
Returns:
387
List of ORF information dictionaries
388
"""
389
390
def convert_sequence_format(sequence: str, input_format: str,
391
output_format: str) -> str:
392
"""
393
Convert between different sequence formats.
394
395
Parameters:
396
- sequence: Input sequence
397
- input_format: Input format ('dna', 'rna', 'protein')
398
- output_format: Output format
399
400
Returns:
401
Converted sequence
402
"""
403
```
404
405
## Usage Examples
406
407
### Basic FASTA Processing
408
409
```python
410
from alphabase.protein.fasta import read_fasta_file, get_uniprot_gene_name
411
412
# Read FASTA file efficiently
413
protein_count = 0
414
for header, sequence in read_fasta_file('uniprot_human.fasta'):
415
protein_count += 1
416
417
# Extract gene name from UniProt header
418
gene_name = get_uniprot_gene_name(header)
419
420
# Process protein
421
if len(sequence) > 100: # Filter by length
422
print(f"Protein {protein_count}: {gene_name}, Length: {len(sequence)}")
423
424
if protein_count >= 10: # Process first 10 proteins
425
break
426
427
print(f"Processed {protein_count} proteins")
428
```
429
430
### Protein Digestion and Analysis
431
432
```python
433
from alphabase.protein.fasta import digest_protein, calculate_protein_properties
434
435
# Example protein sequence
436
protein_seq = "MKTVRQERLKSIVRILERSKEPVSGAQLAEELSVSRQVIVQDIAYLRSLGYNIVATPRGYVLAGGYKWENQPWLNGIPVEELENLTQHLPDLVDQAIGVGRQGKVFVLVPKGEAPGDYVNLNRVLPWLLPSLIHNMHSTPDFFKTGIPVLYLSRRILNQHGQNVEILGQKQSGEAGTMEVLDEAFLKGQRRSQKKSKKNSQGGSQIRKTCVSLNRLRREVSQYFISDRPLVLDMKIPEESRQSLAQVIRRQRGEKRGFTWVPVRDGNGIIDQTVLIARGKKRSSEDGGNNLLISRFGSIGGDGLSRFGDATLSSFGGDSGLMRGDQETVTFVPLSFSGNQGMSQGTFSPKQSLNLLDPGSMGGTSFMSQRRSQKASQGNNYSQSRKKLMSGQFCGQASGEAMRYKVKPEDFSYILRRRKLASQQKQSFDLIPVHNGKMKGSHGKMTPEMQGSQRQKMPLRNLLDFTEGQMGR"
437
438
# Calculate protein properties
439
properties = calculate_protein_properties(protein_seq)
440
print(f"Protein properties: {properties}")
441
442
# Digest with trypsin
443
peptides = digest_protein(
444
sequence=protein_seq,
445
enzyme='trypsin',
446
missed_cleavages=2,
447
min_length=6,
448
max_length=30
449
)
450
451
print(f"Generated {len(peptides)} tryptic peptides")
452
for i, peptide in enumerate(peptides[:5]): # Show first 5
453
print(f"Peptide {i+1}: {peptide}")
454
```
455
456
### Protein Database Processing
457
458
```python
459
import pandas as pd
460
from alphabase.protein.fasta import (
load_protein_database, create_decoy_database,
filter_database_by_taxa
)
461
462
# Load protein database
463
protein_db = load_protein_database('human_proteome.fasta')
464
print(f"Loaded {len(protein_db)} proteins")
465
466
# Create decoy database
467
create_decoy_database(
468
fasta_path='human_proteome.fasta',
469
output_path='human_proteome_with_decoys.fasta',
470
decoy_prefix='DECOY_',
471
method='reverse'
472
)
473
474
# Filter database by taxonomy (example: human proteins only)
475
filter_database_by_taxa(
476
fasta_path='uniprot_all.fasta',
477
output_path='uniprot_human.fasta',
478
taxa_ids=[9606] # Human NCBI taxonomy ID
479
)
480
```
481
482
### Protein Inference Workflow
483
484
```python
485
from alphabase.protein.fasta import (
load_protein_database, map_peptides_to_proteins,
perform_protein_inference, filter_proteins_by_evidence
)
486
from alphabase.psm_reader import MaxQuantReader
487
488
# Load PSM identifications
489
mq_reader = MaxQuantReader()
490
psm_df = mq_reader.import_file('msms.txt')
491
492
# Load protein database
493
protein_db = load_protein_database('proteome.fasta')
494
495
# Map peptides to proteins
496
peptides = psm_df['sequence'].unique().tolist()
497
peptide_protein_map = map_peptides_to_proteins(peptides, protein_db)
498
499
print(f"Mapped {len(peptides)} peptides to proteins")
500
501
# Perform protein inference
502
protein_results = perform_protein_inference(
503
psm_df=psm_df,
504
protein_db=protein_db,
505
method='parsimony'
506
)
507
508
print(f"Identified {len(protein_results)} protein groups")
509
510
# Filter by evidence
511
high_confidence_proteins = filter_proteins_by_evidence(
512
protein_results,
513
min_peptides=2,
514
min_unique_peptides=1
515
)
516
517
print(f"High confidence proteins: {len(high_confidence_proteins)}")
518
```
519
520
### Advanced Sequence Analysis
521
522
```python
523
from alphabase.protein.fasta import (
524
find_signal_peptide, find_transmembrane_domains,
525
get_amino_acid_composition, translate_dna_to_protein
526
)
527
528
# Analyze protein sequence features
529
protein_seq = "MKWVTFISLLLLFSSAYSRGVFRRDTHKSEIAHRFKDLGEEHFKGLVLIAFSQYLQQCPFDEHVKLVNELTEFAKTCVADESHAGCEKSLHTLFGDELCKVASLRETYGDMADCCEKQEPERNECFLSHKDDSPDLPKLKPDPNTLCDEFKADEKKFWGKYLYEIARRHPYFYAPELLYYANKYNGVFQECCQAEDKGACLLPKIETMREKVLASSARQRLRCASIQKFGERALKAWSVARLSQKFPKAEFVEVTKLVTDLTKVHKECCHGDLLECADDRADLAKYICDNQDTISSKLKECCDKPVNGFNSDYNWPLEKSPPDPNTPVDDEALEKFLPTTGIIVDMHRVLNKLLEKRHPVEAYHQIRSMSSAELFKHAAKSSLLHYVPASAQHVTLGYGYPYDAHLADAIYLKLLTKDTAELPKVAQGPGGKGQMRVAFLKDTPTDMHRVAFLRELHRRQHRGADELLSEKLLQSLMQRQVQLQIQAQEQRGRSQKLQRIEEALRKLAEVHTQNMEKFQFSLQMQLVQMQQQTVLLMQVQNLAHLQQQIQNQQMQMDLDTQVLDMLRNSPSLTEKLTEYAEDRMNHSDMSQDFHFPGLQCDRFMSPKFLEGLQSSLSEVNLPAQVKMVTKMFQKLDLDVLLQMQAQRQGRDQADKMIEKLAEMDDEQRAATDQKLAEERVRQLQADMRKCQTRQNQLSAARDLLKQKMNLMQQQVQMHQQHLQIAQQKRQFKAMQHVDHQTMIDRFLNDVQKLQRLQRQKRQQQQQQHQHQQMHQRQRRQHQQHHHQRQIAQQQLMQNQLPSFRSVHQMDLQKNQKQRRQRQKQKQMQKQKLLQRQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKQKGADQADKMIEKLMEMDDEQRAATDQKLAEERVRQLQADMRKCQTRQNQLSAARDLLKKMNLMQQQVQMHQQHLQIAQQKRQFKAMQHVDHQTMIDRFLNDVQKLQRLQRQKRQHQHQQHQQRQRKQKAQQKIAQHQMQKVQAEIHMQAKMKNQGQSRQKLRAIKGRPKRFQPSEPVLDVDPVVEKLMKKLSESVLEKGTVNTSSLMDNKFLLQRQAKILESLLRRQVNHRKLQMEMQARHTQRQKVNELQRRQQMHQRMHVSGHRGKLQKRNNSQKMAQHVMQAEKQRLSSLQNMQRQAIQMNQRQRDQLLRSRLRQQRSYRDKQFSQKIKMEERRSSRKRLVHAVRRHRIRRRASRSRSRS"
530
531
# Check for signal peptide
532
has_signal, cleavage_pos = find_signal_peptide(protein_seq)
533
print(f"Signal peptide: {has_signal}, Cleavage at: {cleavage_pos}")
534
535
# Find transmembrane domains
536
tm_domains = find_transmembrane_domains(protein_seq)
537
print(f"Transmembrane domains: {tm_domains}")
538
539
# Get amino acid composition
540
aa_comp = get_amino_acid_composition(protein_seq)
541
print(f"AA composition: {aa_comp}")
542
543
# Translate DNA to protein
544
dna_seq = "ATGAAGTGGGTAACATTTAT" # Example DNA sequence (20 nt: six complete codons plus a trailing partial codon "AT")
545
protein_from_dna = translate_dna_to_protein(dna_seq)
546
print(f"Translated protein: {protein_from_dna}")
547
```