0
# PSM Reading and Processing
1
2
Unified interface for reading Peptide-Spectrum Match (PSM) files from multiple proteomics search engines. Standardizes column mappings and data formats across different tools for seamless data integration and downstream analysis workflows.
3
4
## Capabilities
5
6
### Base PSM Reader Class
7
8
Foundation class providing common functionality for all PSM readers with standardized interface and column mapping.
9
10
```python { .api }
11
class PSMReaderBase:
12
"""Base class for all PSM readers with common functionality."""
13
14
def __init__(self):
15
"""Initialize PSM reader with default settings."""
16
17
def import_file(self, filepath: str) -> pd.DataFrame:
18
"""
19
Import PSM file and return standardized DataFrame.
20
21
Parameters:
22
- filepath: Path to PSM file
23
24
Returns:
25
DataFrame with standardized column names and data types
26
"""
27
28
def get_modification_mapping(self) -> dict:
29
"""
30
Get modification name mapping for this search engine.
31
32
Returns:
33
Dictionary mapping search engine mod names to standard names
34
"""
35
36
def get_column_mapping(self) -> dict:
37
"""
38
Get column name mapping for this search engine.
39
40
Returns:
41
Dictionary mapping search engine columns to standard names
42
"""
43
44
def set_modification_mapping(self, mod_mapping: dict) -> None:
45
"""
46
Set custom modification mapping.
47
48
Parameters:
49
- mod_mapping: Dictionary with modification name mappings
50
"""
51
52
def validate_file_format(self, filepath: str) -> bool:
53
"""
54
Validate if file format matches this reader.
55
56
Parameters:
57
- filepath: Path to file to validate
58
59
Returns:
60
True if file format is compatible
61
"""
62
```
63
64
### Search Engine Specific Readers
65
66
Individual reader classes for different proteomics search engines, each inheriting from PSMReaderBase.
67
68
```python { .api }
69
class MaxQuantReader(PSMReaderBase):
70
"""Reader for MaxQuant msms.txt and evidence.txt files."""
71
72
def __init__(self):
73
"""Initialize MaxQuant reader with specific column mappings."""
74
75
def import_file(self, filepath: str) -> pd.DataFrame:
76
"""
77
Import MaxQuant output file.
78
79
Parameters:
80
- filepath: Path to msms.txt or evidence.txt file
81
82
Returns:
83
Standardized DataFrame with MaxQuant PSM data
84
"""
85
86
class DiannReader(PSMReaderBase):
87
"""Reader for DIA-NN report.tsv files."""
88
89
def __init__(self):
90
"""Initialize DIA-NN reader with specific settings."""
91
92
def import_file(self, filepath: str) -> pd.DataFrame:
93
"""
94
Import DIA-NN report file.
95
96
Parameters:
97
- filepath: Path to DIA-NN report.tsv file
98
99
Returns:
100
Standardized DataFrame with DIA-NN results
101
"""
102
103
class SpectronautReader(PSMReaderBase):
104
"""Reader for Spectronaut export files."""
105
106
def __init__(self):
107
"""Initialize Spectronaut reader."""
108
109
def import_file(self, filepath: str) -> pd.DataFrame:
110
"""
111
Import Spectronaut export file.
112
113
Parameters:
114
- filepath: Path to Spectronaut export file
115
116
Returns:
117
Standardized DataFrame with Spectronaut data
118
"""
119
120
class SwathReader(PSMReaderBase):
121
"""Reader for SWATH output files."""
122
123
def __init__(self):
124
"""Initialize SWATH reader."""
125
126
class SpectronautReportReader(PSMReaderBase):
127
"""Reader for Spectronaut report files."""
128
129
def __init__(self):
130
"""Initialize Spectronaut report reader."""
131
132
class MSFragger_PSM_TSV_Reader(PSMReaderBase):
133
"""Reader for MSFragger PSM TSV files."""
134
135
def __init__(self):
136
"""Initialize MSFragger TSV reader."""
137
138
def import_file(self, filepath: str) -> pd.DataFrame:
139
"""
140
Import MSFragger PSM TSV file.
141
142
Parameters:
143
- filepath: Path to MSFragger psm.tsv file
144
145
Returns:
146
Standardized DataFrame with MSFragger PSM data
147
"""
148
149
class MSFraggerPepXMLReader(PSMReaderBase):
150
"""Reader for MSFragger pepXML files."""
151
152
def __init__(self):
153
"""Initialize MSFragger pepXML reader."""
154
155
def import_file(self, filepath: str) -> pd.DataFrame:
156
"""
157
Import MSFragger pepXML file.
158
159
Parameters:
160
- filepath: Path to pepXML file
161
162
Returns:
163
Standardized DataFrame with pepXML data
164
"""
165
166
class MSFraggerPepXML(MSFraggerPepXMLReader):
167
"""Alias for MSFraggerPepXMLReader for backwards compatibility."""
168
169
class pFindReader(PSMReaderBase):
170
"""Reader for pFind output files."""
171
172
def __init__(self):
173
"""Initialize pFind reader."""
174
175
class SageReaderTSV(PSMReaderBase):
176
"""Reader for Sage TSV output files."""
177
178
def __init__(self):
179
"""Initialize Sage TSV reader."""
180
181
def import_file(self, filepath: str) -> pd.DataFrame:
182
"""
183
Import Sage TSV file.
184
185
Parameters:
186
- filepath: Path to Sage results.sage.tsv file
187
188
Returns:
189
Standardized DataFrame with Sage results
190
"""
191
192
class SageReaderParquet(PSMReaderBase):
193
"""Reader for Sage Parquet output files."""
194
195
def __init__(self):
196
"""Initialize Sage Parquet reader."""
197
198
def import_file(self, filepath: str) -> pd.DataFrame:
199
"""
200
Import Sage Parquet file.
201
202
Parameters:
203
- filepath: Path to Sage .parquet file
204
205
Returns:
206
Standardized DataFrame with Sage results
207
"""
208
209
class AlphaPeptReader(PSMReaderBase):
210
"""Reader for AlphaPept output files."""
211
212
def __init__(self):
213
"""Initialize AlphaPept reader."""
214
215
class AlphaDiaReaderTsv(PSMReaderBase):
216
"""Reader for AlphaDIA TSV output files."""
217
218
def __init__(self):
219
"""Initialize AlphaDIA TSV reader."""
220
221
def import_file(self, filepath: str) -> pd.DataFrame:
222
"""
223
Import AlphaDIA TSV file.
224
225
Parameters:
226
- filepath: Path to AlphaDIA output.tsv file
227
228
Returns:
229
Standardized DataFrame with AlphaDIA results
230
"""
231
232
class AlphaDiaReaderParquet(PSMReaderBase):
233
"""Reader for AlphaDIA Parquet output files."""
234
235
def __init__(self):
236
"""Initialize AlphaDIA Parquet reader."""
237
238
def import_file(self, filepath: str) -> pd.DataFrame:
239
"""
240
Import AlphaDIA Parquet file.
241
242
Parameters:
243
- filepath: Path to AlphaDIA .parquet file
244
245
Returns:
246
Standardized DataFrame with AlphaDIA results
247
"""
248
```
249
250
### PSM Reader Provider System
251
252
Centralized system for managing and accessing PSM readers with automatic format detection.
253
254
```python { .api }
255
# Provider object for accessing registered readers
256
psm_reader_provider: dict # Dictionary of all registered PSM readers
257
258
# YAML configuration for reader settings
259
psm_reader_yaml: dict # Configuration settings for PSM readers
260
261
def get_reader_by_name(reader_name: str) -> PSMReaderBase:
262
"""
263
Get PSM reader instance by name.
264
265
Parameters:
266
- reader_name: Name of the reader ('maxquant', 'diann', etc.)
267
268
Returns:
269
Instantiated PSM reader
270
"""
271
272
def get_reader_by_file(filepath: str) -> PSMReaderBase:
273
"""
274
Auto-detect and return appropriate reader for file.
275
276
Parameters:
277
- filepath: Path to PSM file
278
279
Returns:
280
Best matching PSM reader for the file format
281
"""
282
283
def list_available_readers() -> List[str]:
284
"""
285
List all available PSM reader names.
286
287
Returns:
288
List of registered reader names
289
"""
290
291
def register_custom_reader(name: str, reader_class: type) -> None:
292
"""
293
Register custom PSM reader.
294
295
Parameters:
296
- name: Name for the custom reader
297
- reader_class: PSM reader class inheriting from PSMReaderBase
298
"""
299
```
300
301
### Column Standardization
302
303
Standard column names and data types used across all PSM readers for consistent output.
304
305
```python { .api }
306
# Standard column names used by all readers
307
STANDARD_COLUMNS: dict = {
308
'sequence': str, # Peptide sequence
309
'mods': str, # Modification string
310
'charge': int, # Precursor charge
311
'proteins': str, # Protein identifiers
312
'rt': float, # Retention time
313
'mz': float, # Precursor m/z
314
'mass': float, # Precursor mass
315
'score': float, # Primary identification score
316
'qvalue': float, # Q-value (FDR)
317
'pep': float, # Posterior error probability
318
'intensity': float, # Precursor intensity
319
'spec_idx': int, # Spectrum index
320
'run': str, # Run/file identifier
321
'scan': int, # Scan number
322
}
323
324
def standardize_columns(df: pd.DataFrame, column_mapping: dict) -> pd.DataFrame:
325
"""
326
Apply column standardization to DataFrame.
327
328
Parameters:
329
- df: Input DataFrame with search engine specific columns
330
- column_mapping: Mapping from original to standard column names
331
332
Returns:
333
DataFrame with standardized column names and types
334
"""
335
336
def validate_required_columns(df: pd.DataFrame, required: List[str] = None) -> bool:
337
"""
338
Validate that DataFrame contains required columns.
339
340
Parameters:
341
- df: DataFrame to validate
342
- required: List of required column names
343
344
Returns:
345
True if all required columns are present
346
"""
347
```
348
349
## Usage Examples
350
351
### Basic PSM File Reading
352
353
```python
354
from alphabase.psm_reader import MaxQuantReader, DiannReader, SpectronautReader
355
356
# Read MaxQuant msms.txt file
357
mq_reader = MaxQuantReader()
358
mq_df = mq_reader.import_file('msms.txt')
359
print(f"MaxQuant PSMs: {len(mq_df)}")
360
361
# Read DIA-NN report
362
diann_reader = DiannReader()
363
diann_df = diann_reader.import_file('report.tsv')
364
print(f"DIA-NN PSMs: {len(diann_df)}")
365
366
# Read Spectronaut export
367
spec_reader = SpectronautReader()
368
spec_df = spec_reader.import_file('spectronaut_export.tsv')
369
print(f"Spectronaut PSMs: {len(spec_df)}")
370
371
# All DataFrames now have standardized column names
372
print(f"Columns: {mq_df.columns.tolist()}")
373
```
374
375
### Using the Provider System
376
377
```python
378
from alphabase.psm_reader import psm_reader_provider, get_reader_by_file, list_available_readers
379
380
# Get reader by name
381
reader = psm_reader_provider['maxquant']()
382
df = reader.import_file('msms.txt')
383
384
# Auto-detect file format (if supported)
385
auto_reader = get_reader_by_file('unknown_format.tsv')
386
if auto_reader:
387
df = auto_reader.import_file('unknown_format.tsv')
388
389
# List all available readers
390
available = list_available_readers()
391
print(f"Available readers: {available}")
392
```
393
394
### Working with Multiple Search Engines
395
396
```python
397
import pandas as pd
398
from alphabase.psm_reader import MaxQuantReader, DiannReader, SageReaderTSV
399
400
# Read files from different search engines
401
readers_and_files = [
402
(MaxQuantReader(), 'maxquant/msms.txt'),
403
(DiannReader(), 'diann/report.tsv'),
404
(SageReaderTSV(), 'sage/results.sage.tsv')
405
]
406
407
all_psms = []
408
for reader, filepath in readers_and_files:
409
df = reader.import_file(filepath)
410
df['search_engine'] = reader.__class__.__name__
411
all_psms.append(df)
412
413
# Combine all PSMs with standardized columns
414
combined_df = pd.concat(all_psms, ignore_index=True)
415
print(f"Total PSMs from all engines: {len(combined_df)}")
416
print(f"Search engines: {combined_df['search_engine'].unique()}")
417
```
418
419
### Custom Modification Mappings
420
421
```python
422
from alphabase.psm_reader import MaxQuantReader
423
424
# Create a reader (the custom modification mapping is set further below)
425
reader = MaxQuantReader()
426
427
# Get current modification mapping
428
current_mapping = reader.get_modification_mapping()
429
print(f"Current mappings: {current_mapping}")
430
431
# Add custom modifications
432
custom_mapping = {
433
'Oxidation (M)': 'Oxidation',
434
'Phospho (STY)': 'Phosphorylation',
435
'Acetyl (Protein N-term)': 'Acetylation'
436
}
437
438
reader.set_modification_mapping(custom_mapping)
439
440
# Import file with custom mappings
441
df = reader.import_file('msms.txt')
442
```
443
444
### Advanced Processing Workflows
445
446
```python
447
from alphabase.psm_reader import DiannReader
448
import numpy as np
449
450
# Read DIA-NN results
451
reader = DiannReader()
452
df = reader.import_file('report.tsv')
453
454
# Apply quality filters using standardized columns
455
filtered_df = df[
456
(df['qvalue'] <= 0.01) & # 1% FDR
457
(df['score'] >= 0.99) & # High confidence
458
(df['rt'] > 0) # Valid retention time
459
].copy()
460
461
print(f"Original PSMs: {len(df)}")
462
print(f"After filtering: {len(filtered_df)}")
463
464
# Group by sequence for peptide-level analysis
465
peptide_level = filtered_df.groupby('sequence').agg({
466
'score': 'max',
467
'intensity': 'sum',
468
'proteins': 'first',
469
'rt': 'mean'
470
}).reset_index()
471
472
print(f"Unique peptides: {len(peptide_level)}")
473
```
474
475
### Custom Reader Development
476
477
```python
478
from alphabase.psm_reader import PSMReaderBase, register_custom_reader, standardize_columns
479
import pandas as pd
480
481
class CustomReader(PSMReaderBase):
482
"""Custom reader for proprietary format."""
483
484
def __init__(self):
485
super().__init__()
486
# Define column mappings specific to this format
487
self.column_mapping = {
488
'peptide_seq': 'sequence',
489
'precursor_charge': 'charge',
490
'protein_ids': 'proteins',
491
'retention_time': 'rt',
492
'confidence_score': 'score'
493
}
494
495
def import_file(self, filepath: str) -> pd.DataFrame:
496
"""Import custom format file."""
497
# Read raw file
498
raw_df = pd.read_csv(filepath, sep='\t')
499
500
# Apply column mapping
501
standardized_df = standardize_columns(raw_df, self.column_mapping)
502
503
# Apply any format-specific processing
504
standardized_df['mods'] = '' # No modifications in this format
505
506
return standardized_df
507
508
# Register custom reader
509
register_custom_reader('custom', CustomReader)
510
511
# Use custom reader
512
custom_reader = CustomReader()
513
df = custom_reader.import_file('custom_format.tsv')
514
```