or run

npx @tessl/cli init
Log in

Version

Tile

Overview

Evals

Files

Files

docs

advanced-peptide-operations.md, advanced-spectral-libraries.md, chemical-constants.md, fragment-ions.md, index.md, io-utilities.md, protein-analysis.md, psm-readers.md, quantification.md, smiles-chemistry.md, spectral-libraries.md

docs/psm-readers.md

0

# PSM Reading and Processing

1

2

This module provides a unified interface for reading Peptide-Spectrum Match (PSM) files from multiple proteomics search engines. It standardizes column names and data formats across the different tools so that results can be combined and passed directly to downstream analysis workflows.

3

4

## Capabilities

5

6

### Base PSM Reader Class

7

8

Foundation class providing common functionality for all PSM readers with standardized interface and column mapping.

9

10

```python { .api }

11

class PSMReaderBase:

12

"""Base class for all PSM readers with common functionality."""

13

14

def __init__(self):

15

"""Initialize PSM reader with default settings."""

16

17

def import_file(self, filepath: str) -> pd.DataFrame:

18

"""

19

Import PSM file and return standardized DataFrame.

20

21

Parameters:

22

- filepath: Path to PSM file

23

24

Returns:

25

DataFrame with standardized column names and data types

26

"""

27

28

def get_modification_mapping(self) -> dict:

29

"""

30

Get modification name mapping for this search engine.

31

32

Returns:

33

Dictionary mapping search engine mod names to standard names

34

"""

35

36

def get_column_mapping(self) -> dict:

37

"""

38

Get column name mapping for this search engine.

39

40

Returns:

41

Dictionary mapping search engine columns to standard names

42

"""

43

44

def set_modification_mapping(self, mod_mapping: dict) -> None:

45

"""

46

Set custom modification mapping.

47

48

Parameters:

49

- mod_mapping: Dictionary with modification name mappings

50

"""

51

52

def validate_file_format(self, filepath: str) -> bool:

53

"""

54

Validate if file format matches this reader.

55

56

Parameters:

57

- filepath: Path to file to validate

58

59

Returns:

60

True if file format is compatible

61

"""

62

```

63

64

### Search Engine Specific Readers

65

66

Individual reader classes for different proteomics search engines, each inheriting from PSMReaderBase.

67

68

```python { .api }

69

class MaxQuantReader(PSMReaderBase):

70

"""Reader for MaxQuant msms.txt and evidence.txt files."""

71

72

def __init__(self):

73

"""Initialize MaxQuant reader with specific column mappings."""

74

75

def import_file(self, filepath: str) -> pd.DataFrame:

76

"""

77

Import MaxQuant output file.

78

79

Parameters:

80

- filepath: Path to msms.txt or evidence.txt file

81

82

Returns:

83

Standardized DataFrame with MaxQuant PSM data

84

"""

85

86

class DiannReader(PSMReaderBase):

87

"""Reader for DIA-NN report.tsv files."""

88

89

def __init__(self):

90

"""Initialize DIA-NN reader with specific settings."""

91

92

def import_file(self, filepath: str) -> pd.DataFrame:

93

"""

94

Import DIA-NN report file.

95

96

Parameters:

97

- filepath: Path to DIA-NN report.tsv file

98

99

Returns:

100

Standardized DataFrame with DIA-NN results

101

"""

102

103

class SpectronautReader(PSMReaderBase):

104

"""Reader for Spectronaut export files."""

105

106

def __init__(self):

107

"""Initialize Spectronaut reader."""

108

109

def import_file(self, filepath: str) -> pd.DataFrame:

110

"""

111

Import Spectronaut export file.

112

113

Parameters:

114

- filepath: Path to Spectronaut export file

115

116

Returns:

117

Standardized DataFrame with Spectronaut data

118

"""

119

120

class SwathReader(PSMReaderBase):

121

"""Reader for SWATH output files."""

122

123

def __init__(self):

124

"""Initialize SWATH reader."""

125

126

class SpectronautReportReader(PSMReaderBase):

127

"""Reader for Spectronaut report files."""

128

129

def __init__(self):

130

"""Initialize Spectronaut report reader."""

131

132

class MSFragger_PSM_TSV_Reader(PSMReaderBase):

133

"""Reader for MSFragger PSM TSV files."""

134

135

def __init__(self):

136

"""Initialize MSFragger TSV reader."""

137

138

def import_file(self, filepath: str) -> pd.DataFrame:

139

"""

140

Import MSFragger PSM TSV file.

141

142

Parameters:

143

- filepath: Path to MSFragger psm.tsv file

144

145

Returns:

146

Standardized DataFrame with MSFragger PSM data

147

"""

148

149

class MSFraggerPepXMLReader(PSMReaderBase):

150

"""Reader for MSFragger pepXML files."""

151

152

def __init__(self):

153

"""Initialize MSFragger pepXML reader."""

154

155

def import_file(self, filepath: str) -> pd.DataFrame:

156

"""

157

Import MSFragger pepXML file.

158

159

Parameters:

160

- filepath: Path to pepXML file

161

162

Returns:

163

Standardized DataFrame with pepXML data

164

"""

165

166

class MSFraggerPepXML(MSFraggerPepXMLReader):

167

"""Alias for MSFraggerPepXMLReader for backwards compatibility."""

168

169

class pFindReader(PSMReaderBase):

170

"""Reader for pFind output files."""

171

172

def __init__(self):

173

"""Initialize pFind reader."""

174

175

class SageReaderTSV(PSMReaderBase):

176

"""Reader for Sage TSV output files."""

177

178

def __init__(self):

179

"""Initialize Sage TSV reader."""

180

181

def import_file(self, filepath: str) -> pd.DataFrame:

182

"""

183

Import Sage TSV file.

184

185

Parameters:

186

- filepath: Path to Sage results.sage.tsv file

187

188

Returns:

189

Standardized DataFrame with Sage results

190

"""

191

192

class SageReaderParquet(PSMReaderBase):

193

"""Reader for Sage Parquet output files."""

194

195

def __init__(self):

196

"""Initialize Sage Parquet reader."""

197

198

def import_file(self, filepath: str) -> pd.DataFrame:

199

"""

200

Import Sage Parquet file.

201

202

Parameters:

203

- filepath: Path to Sage .parquet file

204

205

Returns:

206

Standardized DataFrame with Sage results

207

"""

208

209

class AlphaPeptReader(PSMReaderBase):

210

"""Reader for AlphaPept output files."""

211

212

def __init__(self):

213

"""Initialize AlphaPept reader."""

214

215

class AlphaDiaReaderTsv(PSMReaderBase):

216

"""Reader for AlphaDIA TSV output files."""

217

218

def __init__(self):

219

"""Initialize AlphaDIA TSV reader."""

220

221

def import_file(self, filepath: str) -> pd.DataFrame:

222

"""

223

Import AlphaDIA TSV file.

224

225

Parameters:

226

- filepath: Path to AlphaDIA output.tsv file

227

228

Returns:

229

Standardized DataFrame with AlphaDIA results

230

"""

231

232

class AlphaDiaReaderParquet(PSMReaderBase):

233

"""Reader for AlphaDIA Parquet output files."""

234

235

def __init__(self):

236

"""Initialize AlphaDIA Parquet reader."""

237

238

def import_file(self, filepath: str) -> pd.DataFrame:

239

"""

240

Import AlphaDIA Parquet file.

241

242

Parameters:

243

- filepath: Path to AlphaDIA .parquet file

244

245

Returns:

246

Standardized DataFrame with AlphaDIA results

247

"""

248

```

249

250

### PSM Reader Provider System

251

252

Centralized system for managing and accessing PSM readers with automatic format detection.

253

254

```python { .api }

255

# Provider object for accessing registered readers

256

psm_reader_provider: dict # Dictionary of all registered PSM readers

257

258

# YAML configuration for reader settings

259

psm_reader_yaml: dict # Configuration settings for PSM readers

260

261

def get_reader_by_name(reader_name: str) -> PSMReaderBase:

262

"""

263

Get PSM reader instance by name.

264

265

Parameters:

266

- reader_name: Name of the reader ('maxquant', 'diann', etc.)

267

268

Returns:

269

Instantiated PSM reader

270

"""

271

272

def get_reader_by_file(filepath: str) -> PSMReaderBase:

273

"""

274

Auto-detect and return appropriate reader for file.

275

276

Parameters:

277

- filepath: Path to PSM file

278

279

Returns:

280

Best matching PSM reader for the file format

281

"""

282

283

def list_available_readers() -> List[str]:

284

"""

285

List all available PSM reader names.

286

287

Returns:

288

List of registered reader names

289

"""

290

291

def register_custom_reader(name: str, reader_class: type) -> None:

292

"""

293

Register custom PSM reader.

294

295

Parameters:

296

- name: Name for the custom reader

297

- reader_class: PSM reader class inheriting from PSMReaderBase

298

"""

299

```

300

301

### Column Standardization

302

303

Standard column names and data types used across all PSM readers for consistent output.

304

305

```python { .api }

306

# Standard column names used by all readers

307

STANDARD_COLUMNS: dict = {

308

'sequence': str, # Peptide sequence

309

'mods': str, # Modification string

310

'charge': int, # Precursor charge

311

'proteins': str, # Protein identifiers

312

'rt': float, # Retention time

313

'mz': float, # Precursor m/z

314

'mass': float, # Precursor mass

315

'score': float, # Primary identification score

316

'qvalue': float, # Q-value (FDR)

317

'pep': float, # Posterior error probability

318

'intensity': float, # Precursor intensity

319

'spec_idx': int, # Spectrum index

320

'run': str, # Run/file identifier

321

'scan': int, # Scan number

322

}

323

324

def standardize_columns(df: pd.DataFrame, column_mapping: dict) -> pd.DataFrame:

325

"""

326

Apply column standardization to DataFrame.

327

328

Parameters:

329

- df: Input DataFrame with search engine specific columns

330

- column_mapping: Mapping from original to standard column names

331

332

Returns:

333

DataFrame with standardized column names and types

334

"""

335

336

def validate_required_columns(df: pd.DataFrame, required: List[str] = None) -> bool:

337

"""

338

Validate that DataFrame contains required columns.

339

340

Parameters:

341

- df: DataFrame to validate

342

- required: List of required column names

343

344

Returns:

345

True if all required columns are present

346

"""

347

```

348

349

## Usage Examples

350

351

### Basic PSM File Reading

352

353

```python

354

from alphabase.psm_reader import MaxQuantReader, DiannReader, SpectronautReader

355

356

# Read MaxQuant msms.txt file

357

mq_reader = MaxQuantReader()

358

mq_df = mq_reader.import_file('msms.txt')

359

print(f"MaxQuant PSMs: {len(mq_df)}")

360

361

# Read DIA-NN report

362

diann_reader = DiannReader()

363

diann_df = diann_reader.import_file('report.tsv')

364

print(f"DIA-NN PSMs: {len(diann_df)}")

365

366

# Read Spectronaut export

367

spec_reader = SpectronautReader()

368

spec_df = spec_reader.import_file('spectronaut_export.tsv')

369

print(f"Spectronaut PSMs: {len(spec_df)}")

370

371

# All DataFrames now have standardized column names

372

print(f"Columns: {mq_df.columns.tolist()}")

373

```

374

375

### Using the Provider System

376

377

```python

378

from alphabase.psm_reader import psm_reader_provider, get_reader_by_file, list_available_readers

379

380

# Get reader by name

381

reader = psm_reader_provider['maxquant']()

382

df = reader.import_file('msms.txt')

383

384

# Auto-detect file format (if supported)

385

auto_reader = get_reader_by_file('unknown_format.tsv')

386

if auto_reader:

387

df = auto_reader.import_file('unknown_format.tsv')

388

389

# List all available readers

390

available = list_available_readers()

391

print(f"Available readers: {available}")

392

```

393

394

### Working with Multiple Search Engines

395

396

```python

397

import pandas as pd

398

from alphabase.psm_reader import MaxQuantReader, DiannReader, SageReaderTSV

399

400

# Read files from different search engines

401

readers_and_files = [

402

(MaxQuantReader(), 'maxquant/msms.txt'),

403

(DiannReader(), 'diann/report.tsv'),

404

(SageReaderTSV(), 'sage/results.sage.tsv')

405

]

406

407

all_psms = []

408

for reader, filepath in readers_and_files:

409

df = reader.import_file(filepath)

410

df['search_engine'] = reader.__class__.__name__

411

all_psms.append(df)

412

413

# Combine all PSMs with standardized columns

414

combined_df = pd.concat(all_psms, ignore_index=True)

415

print(f"Total PSMs from all engines: {len(combined_df)}")

416

print(f"Search engines: {combined_df['search_engine'].unique()}")

417

```

418

419

### Custom Modification Mappings

420

421

```python

422

from alphabase.psm_reader import MaxQuantReader

423

424

# Create a reader; the custom modification mapping is applied below

425

reader = MaxQuantReader()

426

427

# Get current modification mapping

428

current_mapping = reader.get_modification_mapping()

429

print(f"Current mappings: {current_mapping}")

430

431

# Add custom modifications

432

custom_mapping = {

433

'Oxidation (M)': 'Oxidation',

434

'Phospho (STY)': 'Phosphorylation',

435

'Acetyl (Protein N-term)': 'Acetylation'

436

}

437

438

reader.set_modification_mapping(custom_mapping)

439

440

# Import file with custom mappings

441

df = reader.import_file('msms.txt')

442

```

443

444

### Advanced Processing Workflows

445

446

```python

447

from alphabase.psm_reader import DiannReader

448

import numpy as np

449

450

# Read DIA-NN results

451

reader = DiannReader()

452

df = reader.import_file('report.tsv')

453

454

# Apply quality filters using standardized columns

455

filtered_df = df[

456

(df['qvalue'] <= 0.01) & # 1% FDR

457

(df['score'] >= 0.99) & # High confidence

458

(df['rt'] > 0) # Valid retention time

459

].copy()

460

461

print(f"Original PSMs: {len(df)}")

462

print(f"After filtering: {len(filtered_df)}")

463

464

# Group by sequence for peptide-level analysis

465

peptide_level = filtered_df.groupby('sequence').agg({

466

'score': 'max',

467

'intensity': 'sum',

468

'proteins': 'first',

469

'rt': 'mean'

470

}).reset_index()

471

472

print(f"Unique peptides: {len(peptide_level)}")

473

```

474

475

### Custom Reader Development

476

477

```python

478

from alphabase.psm_reader import PSMReaderBase, register_custom_reader

479

import pandas as pd

480

481

class CustomReader(PSMReaderBase):

482

"""Custom reader for proprietary format."""

483

484

def __init__(self):

485

super().__init__()

486

# Define column mappings specific to this format

487

self.column_mapping = {

488

'peptide_seq': 'sequence',

489

'precursor_charge': 'charge',

490

'protein_ids': 'proteins',

491

'retention_time': 'rt',

492

'confidence_score': 'score'

493

}

494

495

def import_file(self, filepath: str) -> pd.DataFrame:

496

"""Import custom format file."""

497

# Read raw file

498

raw_df = pd.read_csv(filepath, sep='\t')

499

500

# Apply column mapping

501

standardized_df = self.standardize_columns(raw_df, self.column_mapping)

502

503

# Apply any format-specific processing

504

standardized_df['mods'] = '' # No modifications in this format

505

506

return standardized_df

507

508

# Register custom reader

509

register_custom_reader('custom', CustomReader)

510

511

# Use custom reader

512

custom_reader = CustomReader()

513

df = custom_reader.import_file('custom_format.tsv')

514

```