Tessl Tile for pypi/alphabase@1.6.0

or run

npx @tessl/cli init

Version

Tile

Overview

Evals

Files

docs

advanced-peptide-operations.md advanced-spectral-libraries.md chemical-constants.md fragment-ions.md index.md io-utilities.md protein-analysis.md psm-readers.md quantification.md smiles-chemistry.md spectral-libraries.md

docs/psm-readers.md

0
# PSM Reading and Processing
1

2
Unified interface for reading Peptide-Spectrum Match (PSM) files from multiple proteomics search engines. Standardizes column mappings and data formats across different tools for seamless data integration and downstream analysis workflows.
3

4
## Capabilities
5

6
### Base PSM Reader Class
7

8
Foundation class providing common functionality for all PSM readers, with a standardized interface and column mapping.
9

10
```python { .api }
11
class PSMReaderBase:
12
    """Base class for all PSM readers with common functionality."""
13
    
14
    def __init__(self):
15
        """Initialize PSM reader with default settings."""
16
    
17
    def import_file(self, filepath: str) -> pd.DataFrame:
18
        """
19
        Import PSM file and return standardized DataFrame.
20
        
21
        Parameters:
22
        - filepath: Path to PSM file
23
        
24
        Returns:
25
        DataFrame with standardized column names and data types
26
        """
27
    
28
    def get_modification_mapping(self) -> dict:
29
        """
30
        Get modification name mapping for this search engine.
31
        
32
        Returns:
33
        Dictionary mapping search engine mod names to standard names
34
        """
35
    
36
    def get_column_mapping(self) -> dict:
37
        """
38
        Get column name mapping for this search engine.
39
        
40
        Returns:
41
        Dictionary mapping search engine columns to standard names
42
        """
43
    
44
    def set_modification_mapping(self, mod_mapping: dict) -> None:
45
        """
46
        Set custom modification mapping.
47
        
48
        Parameters:
49
        - mod_mapping: Dictionary with modification name mappings
50
        """
51
    
52
    def validate_file_format(self, filepath: str) -> bool:
53
        """
54
        Validate if file format matches this reader.
55
        
56
        Parameters:
57
        - filepath: Path to file to validate
58
        
59
        Returns:
60
        True if file format is compatible
61
        """
62
```
63

64
### Search Engine Specific Readers
65

66
Individual reader classes for different proteomics search engines, each inheriting from PSMReaderBase.
67

68
```python { .api }
69
class MaxQuantReader(PSMReaderBase):
70
    """Reader for MaxQuant msms.txt and evidence.txt files."""
71
    
72
    def __init__(self):
73
        """Initialize MaxQuant reader with specific column mappings."""
74
    
75
    def import_file(self, filepath: str) -> pd.DataFrame:
76
        """
77
        Import MaxQuant output file.
78
        
79
        Parameters:
80
        - filepath: Path to msms.txt or evidence.txt file
81
        
82
        Returns:
83
        Standardized DataFrame with MaxQuant PSM data
84
        """
85

86
class DiannReader(PSMReaderBase):
87
    """Reader for DIA-NN report.tsv files."""
88
    
89
    def __init__(self):
90
        """Initialize DIA-NN reader with specific settings."""
91
    
92
    def import_file(self, filepath: str) -> pd.DataFrame:
93
        """
94
        Import DIA-NN report file.
95
        
96
        Parameters:
97
        - filepath: Path to DIA-NN report.tsv file
98
        
99
        Returns:
100
        Standardized DataFrame with DIA-NN results
101
        """
102

103
class SpectronautReader(PSMReaderBase):
104
    """Reader for Spectronaut export files."""
105
    
106
    def __init__(self):
107
        """Initialize Spectronaut reader."""
108
    
109
    def import_file(self, filepath: str) -> pd.DataFrame:
110
        """
111
        Import Spectronaut export file.
112
        
113
        Parameters:
114
        - filepath: Path to Spectronaut export file
115
        
116
        Returns:
117
        Standardized DataFrame with Spectronaut data
118
        """
119

120
class SwathReader(PSMReaderBase):
121
    """Reader for SWATH output files."""
122
    
123
    def __init__(self):
124
        """Initialize SWATH reader."""
125

126
class SpectronautReportReader(PSMReaderBase):
127
    """Reader for Spectronaut report files."""
128
    
129
    def __init__(self):
130
        """Initialize Spectronaut report reader."""
131

132
class MSFragger_PSM_TSV_Reader(PSMReaderBase):
133
    """Reader for MSFragger PSM TSV files."""
134
    
135
    def __init__(self):
136
        """Initialize MSFragger TSV reader."""
137
    
138
    def import_file(self, filepath: str) -> pd.DataFrame:
139
        """
140
        Import MSFragger PSM TSV file.
141
        
142
        Parameters:
143
        - filepath: Path to MSFragger psm.tsv file
144
        
145
        Returns:
146
        Standardized DataFrame with MSFragger PSM data
147
        """
148

149
class MSFraggerPepXMLReader(PSMReaderBase):
150
    """Reader for MSFragger pepXML files."""
151
    
152
    def __init__(self):
153
        """Initialize MSFragger pepXML reader."""
154
    
155
    def import_file(self, filepath: str) -> pd.DataFrame:
156
        """
157
        Import MSFragger pepXML file.
158
        
159
        Parameters:
160
        - filepath: Path to pepXML file
161
        
162
        Returns:
163
        Standardized DataFrame with pepXML data
164
        """
165

166
class MSFraggerPepXML(MSFraggerPepXMLReader):
167
    """Alias for MSFraggerPepXMLReader for backwards compatibility."""
168

169
class pFindReader(PSMReaderBase):
170
    """Reader for pFind output files."""
171
    
172
    def __init__(self):
173
        """Initialize pFind reader."""
174

175
class SageReaderTSV(PSMReaderBase):
176
    """Reader for Sage TSV output files."""
177
    
178
    def __init__(self):
179
        """Initialize Sage TSV reader."""
180
    
181
    def import_file(self, filepath: str) -> pd.DataFrame:
182
        """
183
        Import Sage TSV file.
184
        
185
        Parameters:
186
        - filepath: Path to Sage results.sage.tsv file
187
        
188
        Returns:
189
        Standardized DataFrame with Sage results
190
        """
191

192
class SageReaderParquet(PSMReaderBase):
193
    """Reader for Sage Parquet output files."""
194
    
195
    def __init__(self):
196
        """Initialize Sage Parquet reader."""
197
    
198
    def import_file(self, filepath: str) -> pd.DataFrame:
199
        """
200
        Import Sage Parquet file.
201
        
202
        Parameters:
203
        - filepath: Path to Sage .parquet file
204
        
205
        Returns:
206
        Standardized DataFrame with Sage results
207
        """
208

209
class AlphaPeptReader(PSMReaderBase):
210
    """Reader for AlphaPept output files."""
211
    
212
    def __init__(self):
213
        """Initialize AlphaPept reader."""
214

215
class AlphaDiaReaderTsv(PSMReaderBase):
216
    """Reader for AlphaDIA TSV output files."""
217
    
218
    def __init__(self):
219
        """Initialize AlphaDIA TSV reader."""
220
    
221
    def import_file(self, filepath: str) -> pd.DataFrame:
222
        """
223
        Import AlphaDIA TSV file.
224
        
225
        Parameters:
226
        - filepath: Path to AlphaDIA output.tsv file
227
        
228
        Returns:
229
        Standardized DataFrame with AlphaDIA results
230
        """
231

232
class AlphaDiaReaderParquet(PSMReaderBase):
233
    """Reader for AlphaDIA Parquet output files."""
234
    
235
    def __init__(self):
236
        """Initialize AlphaDIA Parquet reader."""
237
    
238
    def import_file(self, filepath: str) -> pd.DataFrame:
239
        """
240
        Import AlphaDIA Parquet file.
241
        
242
        Parameters:
243
        - filepath: Path to AlphaDIA .parquet file
244
        
245
        Returns:
246
        Standardized DataFrame with AlphaDIA results
247
        """
248
```
249

250
### PSM Reader Provider System
251

252
Centralized system for managing and accessing PSM readers with automatic format detection.
253

254
```python { .api }
255
# Provider object for accessing registered readers
256
psm_reader_provider: dict  # Dictionary of all registered PSM readers
257

258
# YAML configuration for reader settings
259
psm_reader_yaml: dict  # Configuration settings for PSM readers
260

261
def get_reader_by_name(reader_name: str) -> PSMReaderBase:
262
    """
263
    Get PSM reader instance by name.
264
    
265
    Parameters:
266
    - reader_name: Name of the reader ('maxquant', 'diann', etc.)
267
    
268
    Returns:
269
    Instantiated PSM reader
270
    """
271

272
def get_reader_by_file(filepath: str) -> PSMReaderBase:
273
    """
274
    Auto-detect and return appropriate reader for file.
275
    
276
    Parameters:
277
    - filepath: Path to PSM file
278
    
279
    Returns:
280
    Best matching PSM reader for the file format
281
    """
282

283
def list_available_readers() -> List[str]:
284
    """
285
    List all available PSM reader names.
286
    
287
    Returns:
288
    List of registered reader names
289
    """
290

291
def register_custom_reader(name: str, reader_class: type) -> None:
292
    """
293
    Register custom PSM reader.
294
    
295
    Parameters:
296
    - name: Name for the custom reader
297
    - reader_class: PSM reader class inheriting from PSMReaderBase
298
    """
299
```
300

301
### Column Standardization
302

303
Standard column names and data types used across all PSM readers for consistent output.
304

305
```python { .api }
306
# Standard column names used by all readers
307
STANDARD_COLUMNS: dict = {
308
    'sequence': str,           # Peptide sequence
309
    'mods': str,              # Modification string
310
    'charge': int,            # Precursor charge
311
    'proteins': str,          # Protein identifiers
312
    'rt': float,              # Retention time
313
    'mz': float,              # Precursor m/z
314
    'mass': float,            # Precursor mass
315
    'score': float,           # Primary identification score
316
    'qvalue': float,          # Q-value (FDR)
317
    'pep': float,             # Posterior error probability
318
    'intensity': float,       # Precursor intensity
319
    'spec_idx': int,          # Spectrum index
320
    'run': str,               # Run/file identifier
321
    'scan': int,              # Scan number
322
}
323

324
def standardize_columns(df: pd.DataFrame, column_mapping: dict) -> pd.DataFrame:
325
    """
326
    Apply column standardization to DataFrame.
327
    
328
    Parameters:
329
    - df: Input DataFrame with search engine specific columns
330
    - column_mapping: Mapping from original to standard column names
331
    
332
    Returns:
333
    DataFrame with standardized column names and types
334
    """
335

336
def validate_required_columns(df: pd.DataFrame, required: List[str] = None) -> bool:
337
    """
338
    Validate that DataFrame contains required columns.
339
    
340
    Parameters:
341
    - df: DataFrame to validate
342
    - required: List of required column names
343
    
344
    Returns:
345
    True if all required columns are present
346
    """
347
```
348

349
## Usage Examples
350

351
### Basic PSM File Reading
352

353
```python
354
from alphabase.psm_reader import MaxQuantReader, DiannReader, SpectronautReader
355

356
# Read MaxQuant msms.txt file
357
mq_reader = MaxQuantReader()
358
mq_df = mq_reader.import_file('msms.txt')
359
print(f"MaxQuant PSMs: {len(mq_df)}")
360

361
# Read DIA-NN report
362
diann_reader = DiannReader()
363
diann_df = diann_reader.import_file('report.tsv')
364
print(f"DIA-NN PSMs: {len(diann_df)}")
365

366
# Read Spectronaut export
367
spec_reader = SpectronautReader()
368
spec_df = spec_reader.import_file('spectronaut_export.tsv')
369
print(f"Spectronaut PSMs: {len(spec_df)}")
370

371
# All DataFrames now have standardized column names
372
print(f"Columns: {mq_df.columns.tolist()}")
373
```
374

375
### Using the Provider System
376

377
```python
378
from alphabase.psm_reader import psm_reader_provider
379

380
# Get reader by name
381
reader = psm_reader_provider['maxquant']()
382
df = reader.import_file('msms.txt')
383

384
# Auto-detect file format (if supported)
385
auto_reader = get_reader_by_file('unknown_format.tsv')
386
if auto_reader:
387
    df = auto_reader.import_file('unknown_format.tsv')
388

389
# List all available readers
390
available = list_available_readers()
391
print(f"Available readers: {available}")
392
```
393

394
### Working with Multiple Search Engines
395

396
```python
397
import pandas as pd
398
from alphabase.psm_reader import MaxQuantReader, DiannReader, SageReaderTSV
399

400
# Read files from different search engines
401
readers_and_files = [
402
    (MaxQuantReader(), 'maxquant/msms.txt'),
403
    (DiannReader(), 'diann/report.tsv'), 
404
    (SageReaderTSV(), 'sage/results.sage.tsv')
405
]
406

407
all_psms = []
408
for reader, filepath in readers_and_files:
409
    df = reader.import_file(filepath)
410
    df['search_engine'] = reader.__class__.__name__
411
    all_psms.append(df)
412

413
# Combine all PSMs with standardized columns
414
combined_df = pd.concat(all_psms, ignore_index=True)
415
print(f"Total PSMs from all engines: {len(combined_df)}")
416
print(f"Search engines: {combined_df['search_engine'].unique()}")
417
```
418

419
### Custom Modification Mappings
420

421
```python
422
from alphabase.psm_reader import MaxQuantReader
423

424
# Create reader with custom modification mapping
425
reader = MaxQuantReader()
426

427
# Get current modification mapping
428
current_mapping = reader.get_modification_mapping()
429
print(f"Current mappings: {current_mapping}")
430

431
# Add custom modifications
432
custom_mapping = {
433
    'Oxidation (M)': 'Oxidation',
434
    'Phospho (STY)': 'Phosphorylation',
435
    'Acetyl (Protein N-term)': 'Acetylation'
436
}
437

438
reader.set_modification_mapping(custom_mapping)
439

440
# Import file with custom mappings
441
df = reader.import_file('msms.txt')
442
```
443

444
### Advanced Processing Workflows
445

446
```python
447
from alphabase.psm_reader import DiannReader
448
import numpy as np
449

450
# Read DIA-NN results
451
reader = DiannReader()
452
df = reader.import_file('report.tsv')
453

454
# Apply quality filters using standardized columns
455
filtered_df = df[
456
    (df['qvalue'] <= 0.01) &  # 1% FDR
457
    (df['score'] >= 0.99) &   # High confidence
458
    (df['rt'] > 0)            # Valid retention time
459
].copy()
460

461
print(f"Original PSMs: {len(df)}")
462
print(f"After filtering: {len(filtered_df)}")
463

464
# Group by sequence for peptide-level analysis
465
peptide_level = filtered_df.groupby('sequence').agg({
466
    'score': 'max',
467
    'intensity': 'sum',
468
    'proteins': 'first',
469
    'rt': 'mean'
470
}).reset_index()
471

472
print(f"Unique peptides: {len(peptide_level)}")
473
```
474

475
### Custom Reader Development
476

477
```python
478
from alphabase.psm_reader import PSMReaderBase
479
import pandas as pd
480

481
class CustomReader(PSMReaderBase):
482
    """Custom reader for proprietary format."""
483
    
484
    def __init__(self):
485
        super().__init__()
486
        # Define column mappings specific to this format
487
        self.column_mapping = {
488
            'peptide_seq': 'sequence',
489
            'precursor_charge': 'charge',
490
            'protein_ids': 'proteins',
491
            'retention_time': 'rt',
492
            'confidence_score': 'score'
493
        }
494
    
495
    def import_file(self, filepath: str) -> pd.DataFrame:
496
        """Import custom format file."""
497
        # Read raw file
498
        raw_df = pd.read_csv(filepath, sep='\t')
499
        
500
        # Apply column mapping
501
        standardized_df = self.standardize_columns(raw_df, self.column_mapping)
502
        
503
        # Apply any format-specific processing
504
        standardized_df['mods'] = ''  # No modifications in this format
505
        
506
        return standardized_df
507

508
# Register custom reader
509
register_custom_reader('custom', CustomReader)
510

511
# Use custom reader
512
custom_reader = CustomReader()
513
df = custom_reader.import_file('custom_format.tsv')
514
```

Version

Tile

Files

docs/psm-readers.md

docs/psm-readers.md