# Advanced Spectral Library Operations

Extended spectral library functionality including decoy generation, format conversion, library validation, and specialized library formats. Provides comprehensive tools for spectral library manipulation, quality control, and integration with various proteomics workflows and search engines.

## Capabilities

### Decoy Generation and Management

Comprehensive decoy generation capabilities supporting multiple strategies and integration with target-decoy search workflows.

```python { .api }
class SpecLibDecoy:
    """Extended spectral library with integrated decoy generation and management."""

    def __init__(self, target_lib: SpecLibBase = None):
        """
        Initialize spectral library with decoy capabilities.

        Parameters:
        - target_lib: Target spectral library to extend with decoys
        """

    def generate_decoys(self, method: str = 'diann',
                        decoy_prefix: str = 'DECOY_',
                        keep_peptide_types: bool = True) -> None:
        """
        Generate decoy sequences using specified method.

        Parameters:
        - method: Decoy generation method ('diann', 'pseudo_reverse', 'shuffle')
        - decoy_prefix: Prefix for decoy protein identifiers
        - keep_peptide_types: Preserve peptide characteristics in decoys
        """

    def validate_decoy_quality(self) -> dict:
        """
        Assess quality of generated decoy sequences.

        Returns:
        Dictionary with decoy quality metrics and statistics
        """

    def get_target_decoy_ratio(self) -> float:
        """
        Calculate ratio of target to decoy sequences.

        Returns:
        Target-to-decoy ratio
        """

    def separate_targets_and_decoys(self) -> tuple['SpecLibBase', 'SpecLibBase']:
        """
        Split library into separate target and decoy libraries.

        Returns:
        Tuple of (target_library, decoy_library)
        """

class DIANNDecoyGenerator:
    """DIANN-style decoy generation with advanced sequence manipulation."""

    def __init__(self, keep_peptide_types: bool = True,
                 min_peptide_length: int = 6,
                 max_peptide_length: int = 30):
        """
        Initialize DIANN decoy generator.

        Parameters:
        - keep_peptide_types: Preserve tryptic characteristics
        - min_peptide_length: Minimum length for generated decoys
        - max_peptide_length: Maximum length for generated decoys
        """

    def generate_decoy_sequence(self, target_sequence: str,
                                target_proteins: str) -> tuple[str, str]:
        """
        Generate single decoy sequence from target.

        Parameters:
        - target_sequence: Target peptide sequence
        - target_proteins: Target protein identifiers

        Returns:
        Tuple of (decoy_sequence, decoy_proteins)
        """

    def generate_decoy_library(self, target_lib: SpecLibBase,
                               decoy_prefix: str = 'DECOY_') -> SpecLibBase:
        """
        Generate complete decoy library from target library.

        Parameters:
        - target_lib: Target spectral library
        - decoy_prefix: Prefix for decoy identifiers

        Returns:
        New spectral library with decoy sequences
        """

    def validate_sequence_properties(self, target_seq: str,
                                     decoy_seq: str) -> dict:
        """
        Compare properties between target and decoy sequences.

        Parameters:
        - target_seq: Original target sequence
        - decoy_seq: Generated decoy sequence

        Returns:
        Dictionary with property comparisons
        """

class PseudoReverseDecoyGenerator:
    """Pseudo-reverse decoy generation with tryptic preservation."""

    def __init__(self, cleavage_rule: str = 'trypsin'):
        """
        Initialize pseudo-reverse generator.

        Parameters:
        - cleavage_rule: Enzyme cleavage specificity to preserve
        """

    def generate_pseudo_reverse(self, sequence: str) -> str:
        """
        Generate pseudo-reverse sequence preserving cleavage sites.

        Parameters:
        - sequence: Target peptide sequence

        Returns:
        Pseudo-reverse decoy sequence
        """

    def preserve_cleavage_specificity(self, sequence: str,
                                      enzyme: str = 'trypsin') -> str:
        """
        Ensure decoy maintains enzymatic cleavage characteristics.

        Parameters:
        - sequence: Input sequence
        - enzyme: Enzyme specificity to preserve

        Returns:
        Modified sequence with preserved cleavage sites
        """

class BaseDecoyGenerator:
    """Base class for custom decoy generation strategies."""

    def __init__(self):
        """Initialize base decoy generator."""

    def generate_decoy(self, target_sequence: str,
                       target_proteins: str,
                       **kwargs) -> tuple[str, str]:
        """
        Generate decoy sequence (to be implemented by subclasses).

        Parameters:
        - target_sequence: Target peptide sequence
        - target_proteins: Target protein identifiers
        - **kwargs: Strategy-specific parameters

        Returns:
        Tuple of (decoy_sequence, decoy_proteins)
        """
        raise NotImplementedError("Subclasses must implement generate_decoy")

    def validate_decoy(self, target_seq: str, decoy_seq: str) -> bool:
        """
        Validate generated decoy sequence.

        Parameters:
        - target_seq: Original target sequence
        - decoy_seq: Generated decoy sequence

        Returns:
        True if decoy passes validation checks
        """
        return True

class SpecLibDecoyProvider:
    """Provider system for decoy generation strategies."""

    @staticmethod
    def get_generator(method: str, **kwargs) -> BaseDecoyGenerator:
        """
        Get decoy generator instance by method name.

        Parameters:
        - method: Generator method ('diann', 'pseudo_reverse', 'shuffle')
        - **kwargs: Method-specific parameters

        Returns:
        Configured decoy generator instance
        """

    @staticmethod
    def list_available_methods() -> List[str]:
        """
        List all available decoy generation methods.

        Returns:
        List of method names
        """

    @staticmethod
    def register_custom_generator(name: str,
                                  generator_class: type) -> None:
        """
        Register custom decoy generation method.

        Parameters:
        - name: Name for the custom method
        - generator_class: Class implementing BaseDecoyGenerator
        """
```

### Flat Spectral Library Format

Specialized flat format for efficient storage and retrieval of large spectral libraries.

```python { .api }
class SpecLibFlat:
    """Flat spectral library format optimized for large-scale storage."""

    def __init__(self):
        """Initialize flat spectral library."""

    def from_spec_lib(self, spec_lib: SpecLibBase) -> None:
        """
        Convert standard spectral library to flat format.

        Parameters:
        - spec_lib: Standard SpecLibBase to convert
        """

    def to_spec_lib(self) -> SpecLibBase:
        """
        Convert flat library back to standard format.

        Returns:
        Standard SpecLibBase instance
        """

    def save_flat(self, filepath: str,
                  compression: str = 'gzip') -> None:
        """
        Save flat library to compressed file.

        Parameters:
        - filepath: Output file path
        - compression: Compression method ('gzip', 'bz2', 'xz')
        """

    def load_flat(self, filepath: str) -> None:
        """
        Load flat library from compressed file.

        Parameters:
        - filepath: Input file path
        """

    def get_precursor_range(self, start_idx: int,
                            end_idx: int) -> pd.DataFrame:
        """
        Get precursor range without loading full library.

        Parameters:
        - start_idx: Starting precursor index
        - end_idx: Ending precursor index

        Returns:
        DataFrame with precursor range
        """

    def query_by_mz_range(self, min_mz: float,
                          max_mz: float) -> pd.DataFrame:
        """
        Query precursors by m/z range efficiently.

        Parameters:
        - min_mz: Minimum m/z value
        - max_mz: Maximum m/z value

        Returns:
        DataFrame with precursors in m/z range
        """

    def create_index(self, index_type: str = 'mz') -> None:
        """
        Create optimized index for fast queries.

        Parameters:
        - index_type: Type of index ('mz', 'rt', 'sequence')
        """

    def optimize_storage(self) -> dict:
        """
        Optimize storage layout and compression.

        Returns:
        Dictionary with optimization statistics
        """
```

### Library Readers and Format Conversion

Comprehensive readers for various spectral library formats and conversion utilities.

```python { .api }
class LibraryReaderBase:
    """Base class for spectral library format readers."""

    def __init__(self):
        """Initialize library reader."""

    def read_library(self, filepath: str, **kwargs) -> SpecLibBase:
        """
        Read spectral library from file.

        Parameters:
        - filepath: Path to library file
        - **kwargs: Format-specific options

        Returns:
        Loaded spectral library
        """
        raise NotImplementedError("Subclasses must implement read_library")

    def validate_format(self, filepath: str) -> bool:
        """
        Validate if file matches expected format.

        Parameters:
        - filepath: File path to validate

        Returns:
        True if format is compatible
        """
        return True

    def get_library_info(self, filepath: str) -> dict:
        """
        Get library metadata without full loading.

        Parameters:
        - filepath: Library file path

        Returns:
        Dictionary with library information
        """
        return {}

class CSVLibraryReader(LibraryReaderBase):
    """Reader for CSV-format spectral libraries."""

    def __init__(self, delimiter: str = ','):
        """
        Initialize CSV reader.

        Parameters:
        - delimiter: CSV delimiter character
        """

    def read_library(self, filepath: str, **kwargs) -> SpecLibBase:
        """
        Read spectral library from CSV file.

        Parameters:
        - filepath: Path to CSV library file
        - **kwargs: CSV reading options

        Returns:
        Loaded spectral library
        """

    def set_column_mapping(self, mapping: dict) -> None:
        """
        Set custom column name mappings.

        Parameters:
        - mapping: Dictionary mapping CSV columns to standard names
        """

class TSVLibraryReader(LibraryReaderBase):
    """Reader for TSV-format spectral libraries."""

    def __init__(self):
        """Initialize TSV reader."""

    def read_library(self, filepath: str, **kwargs) -> SpecLibBase:
        """
        Read spectral library from TSV file.

        Parameters:
        - filepath: Path to TSV library file
        - **kwargs: TSV reading options

        Returns:
        Loaded spectral library
        """

class MSPLibraryReader(LibraryReaderBase):
    """Reader for MSP-format spectral libraries."""

    def __init__(self):
        """Initialize MSP reader."""

    def read_library(self, filepath: str, **kwargs) -> SpecLibBase:
        """
        Read spectral library from MSP file.

        Parameters:
        - filepath: Path to MSP library file
        - **kwargs: MSP reading options

        Returns:
        Loaded spectral library
        """

    def parse_msp_entry(self, entry_text: str) -> dict:
        """
        Parse individual MSP library entry.

        Parameters:
        - entry_text: Raw MSP entry text

        Returns:
        Dictionary with parsed entry information
        """

def get_library_reader(filepath: str) -> LibraryReaderBase:
    """
    Auto-detect and return appropriate library reader.

    Parameters:
    - filepath: Path to library file

    Returns:
    Appropriate reader instance for the file format
    """

def convert_library_format(input_path: str,
                           output_path: str,
                           input_format: str = None,
                           output_format: str = 'hdf5') -> None:
    """
    Convert spectral library between formats.

    Parameters:
    - input_path: Input library file path
    - output_path: Output library file path
    - input_format: Input format (auto-detected if None)
    - output_format: Output format ('hdf5', 'csv', 'msp')
    """
```

### Library Translation and Format Support

Utilities for translating between different spectral library formats and search engine requirements.

```python { .api }
class WritingProcess:
    """Multiprocessing writer for efficient library export."""

    def __init__(self, n_processes: int = 4):
        """
        Initialize multiprocessing writer.

        Parameters:
        - n_processes: Number of worker processes
        """

    def write_library_parallel(self, spec_lib: SpecLibBase,
                               output_path: str,
                               format_type: str = 'tsv',
                               chunk_size: int = 10000) -> None:
        """
        Write library using parallel processing.

        Parameters:
        - spec_lib: Spectral library to write
        - output_path: Output file path
        - format_type: Output format
        - chunk_size: Number of precursors per chunk
        """

    def write_multiple_formats(self, spec_lib: SpecLibBase,
                               base_path: str,
                               formats: List[str]) -> dict:
        """
        Write library in multiple formats simultaneously.

        Parameters:
        - spec_lib: Spectral library to write
        - base_path: Base output path (extensions added automatically)
        - formats: List of output formats

        Returns:
        Dictionary mapping formats to output file paths
        """

def translate_to_diann_format(spec_lib: SpecLibBase,
                              output_path: str) -> None:
    """
    Translate library to DIA-NN compatible format.

    Parameters:
    - spec_lib: Input spectral library
    - output_path: Output file path for DIA-NN library
    """

def translate_to_spectronaut_format(spec_lib: SpecLibBase,
                                    output_path: str) -> None:
    """
    Translate library to Spectronaut compatible format.

    Parameters:
    - spec_lib: Input spectral library
    - output_path: Output file path for Spectronaut library
    """

def translate_to_openswath_format(spec_lib: SpecLibBase,
                                  output_path: str) -> None:
    """
    Translate library to OpenSWATH compatible format.

    Parameters:
    - spec_lib: Input spectral library
    - output_path: Output file path for OpenSWATH library
    """

def translate_to_skyline_format(spec_lib: SpecLibBase,
                                output_path: str) -> None:
    """
    Translate library to Skyline compatible format.

    Parameters:
    - spec_lib: Input spectral library
    - output_path: Output file path for Skyline library
    """

def create_search_engine_libraries(spec_lib: SpecLibBase,
                                   output_dir: str,
                                   engines: List[str] = None) -> dict:
    """
    Create libraries for multiple search engines.

    Parameters:
    - spec_lib: Input spectral library
    - output_dir: Directory for output files
    - engines: List of search engines ('diann', 'spectronaut', 'openswath')

    Returns:
    Dictionary mapping engines to output file paths
    """
```

### Library Validation and Quality Control

Comprehensive validation system for assessing spectral library quality and completeness.

```python { .api }
class Schema:
    """Schema validation system for spectral libraries."""

    def __init__(self, required_columns: List[str] = None,
                 optional_columns: List[str] = None):
        """
        Initialize schema validator.

        Parameters:
        - required_columns: List of required column names
        - optional_columns: List of optional column names
        """

    def validate_library(self, spec_lib: SpecLibBase) -> dict:
        """
        Validate spectral library against schema.

        Parameters:
        - spec_lib: Spectral library to validate

        Returns:
        Dictionary with validation results and issues
        """

    def add_column_requirement(self, column: str,
                               requirement_type: str,
                               **kwargs) -> None:
        """
        Add column validation requirement.

        Parameters:
        - column: Column name
        - requirement_type: Type of requirement ('required', 'optional', 'forbidden')
        - **kwargs: Additional requirement parameters
        """

class Required:
    """Required column specification for schema validation."""

    def __init__(self, column_name: str,
                 data_type: type = None,
                 validation_func: callable = None):
        """
        Define required column.

        Parameters:
        - column_name: Name of required column
        - data_type: Expected data type
        - validation_func: Custom validation function
        """

    def validate(self, df: pd.DataFrame) -> dict:
        """
        Validate column presence and properties.

        Parameters:
        - df: DataFrame to validate

        Returns:
        Validation result dictionary
        """

class Optional:
    """Optional column specification for schema validation."""

    def __init__(self, column_name: str,
                 data_type: type = None,
                 default_value=None):
        """
        Define optional column.

        Parameters:
        - column_name: Name of optional column
        - data_type: Expected data type if present
        - default_value: Default value if column missing
        """

    def validate(self, df: pd.DataFrame) -> dict:
        """
        Validate optional column if present.

        Parameters:
        - df: DataFrame to validate

        Returns:
        Validation result dictionary
        """

class Column:
    """Generic column specification with flexible validation."""

    def __init__(self, name: str,
                 required: bool = True,
                 data_type: type = None,
                 min_value=None,
                 max_value=None,
                 allowed_values: List = None):
        """
        Define column specification.

        Parameters:
        - name: Column name
        - required: Whether column is required
        - data_type: Expected data type
        - min_value: Minimum allowed value
        - max_value: Maximum allowed value
        - allowed_values: List of allowed values
        """

    def validate(self, df: pd.DataFrame) -> dict:
        """
        Perform comprehensive column validation.

        Parameters:
        - df: DataFrame to validate

        Returns:
        Detailed validation results
        """

def validate_spectral_library_completeness(spec_lib: SpecLibBase) -> dict:
    """
    Validate spectral library completeness and consistency.

    Parameters:
    - spec_lib: Spectral library to validate

    Returns:
    Dictionary with completeness assessment
    """

def assess_library_quality_metrics(spec_lib: SpecLibBase) -> dict:
    """
    Calculate comprehensive library quality metrics.

    Parameters:
    - spec_lib: Spectral library to assess

    Returns:
    Dictionary with quality metrics and statistics
    """

def check_library_integrity(spec_lib: SpecLibBase) -> dict:
    """
    Check spectral library data integrity.

    Parameters:
    - spec_lib: Spectral library to check

    Returns:
    Dictionary with integrity check results
    """

def generate_library_report(spec_lib: SpecLibBase,
                            output_path: str = None) -> dict:
    """
    Generate comprehensive library quality report.

    Parameters:
    - spec_lib: Spectral library to analyze
    - output_path: Optional path to save HTML report

    Returns:
    Dictionary with report data and statistics
    """
```

## Usage Examples

### Decoy Generation and Management

```python
from alphabase.spectral_library.decoy import SpecLibDecoy, DIANNDecoyGenerator
from alphabase.spectral_library.base import SpecLibBase
import pandas as pd

# Create target library
target_lib = SpecLibBase()
target_lib.precursor_df = pd.DataFrame({
    'sequence': ['PEPTIDE', 'SEQUENCE', 'EXAMPLE'],
    'mods': ['', 'Phospho (STY)@2', ''],
    'charge': [2, 3, 2],
    'proteins': ['P12345', 'P67890', 'P11111']
})
target_lib.refine_df()

# Create decoy library using DIANN method
decoy_lib = SpecLibDecoy(target_lib)
decoy_lib.generate_decoys(method='diann', decoy_prefix='DECOY_')

print(f"Target precursors: {len(target_lib.precursor_df)}")
print(f"Total with decoys: {len(decoy_lib.precursor_df)}")
print(f"Target-decoy ratio: {decoy_lib.get_target_decoy_ratio():.1f}")

# Validate decoy quality
quality_metrics = decoy_lib.validate_decoy_quality()
print(f"Decoy quality metrics: {quality_metrics}")

# Separate targets and decoys
targets, decoys = decoy_lib.separate_targets_and_decoys()
print(f"Separated: {len(targets.precursor_df)} targets, {len(decoys.precursor_df)} decoys")
```

### Advanced Decoy Generation

```python
from alphabase.spectral_library.decoy import (
    DIANNDecoyGenerator, PseudoReverseDecoyGenerator, SpecLibDecoyProvider
)

# Use DIANN decoy generator directly
diann_gen = DIANNDecoyGenerator(keep_peptide_types=True)
target_seq = "PEPTIDE"
decoy_seq, decoy_proteins = diann_gen.generate_decoy_sequence(
    target_seq, "P12345"
)
print(f"DIANN decoy: {target_seq} -> {decoy_seq}")

# Validate sequence properties
properties = diann_gen.validate_sequence_properties(target_seq, decoy_seq)
print(f"Property comparison: {properties}")

# Use pseudo-reverse generator
pseudo_gen = PseudoReverseDecoyGenerator(cleavage_rule='trypsin')
pseudo_decoy = pseudo_gen.generate_pseudo_reverse(target_seq)
print(f"Pseudo-reverse decoy: {target_seq} -> {pseudo_decoy}")

# Use provider system
generator = SpecLibDecoyProvider.get_generator('diann', keep_peptide_types=True)
print(f"Available methods: {SpecLibDecoyProvider.list_available_methods()}")
```

### Flat Library Format Operations

```python
from alphabase.spectral_library.flat import SpecLibFlat

# Convert standard library to flat format
flat_lib = SpecLibFlat()
flat_lib.from_spec_lib(target_lib)

# Save in compressed format
flat_lib.save_flat('library_flat.gz', compression='gzip')

# Load flat library
new_flat = SpecLibFlat()
new_flat.load_flat('library_flat.gz')

# Efficient range queries
precursor_range = new_flat.get_precursor_range(0, 10)
print(f"First 10 precursors: {len(precursor_range)}")

# Query by m/z range
mz_range = new_flat.query_by_mz_range(400.0, 500.0)
print(f"Precursors in m/z 400-500: {len(mz_range)}")

# Create index for fast queries
new_flat.create_index(index_type='mz')

# Optimize storage
optimization_stats = new_flat.optimize_storage()
print(f"Storage optimization: {optimization_stats}")
```

### Library Format Conversion

```python
from alphabase.spectral_library.reader import (
    get_library_reader, convert_library_format
)
from alphabase.spectral_library.translate import (
    translate_to_diann_format, create_search_engine_libraries
)

# Auto-detect and read library format
reader = get_library_reader('unknown_library.tsv')
loaded_lib = reader.read_library('unknown_library.tsv')
print(f"Loaded library: {len(loaded_lib.precursor_df)} precursors")

# Convert between formats
convert_library_format(
    input_path='library.csv',
    output_path='library.h5',
    input_format='csv',
    output_format='hdf5'
)

# Translate to specific search engine formats
translate_to_diann_format(loaded_lib, 'library_diann.tsv')
print("Translated to DIA-NN format")

# Create libraries for multiple search engines
engine_libraries = create_search_engine_libraries(
    loaded_lib,
    output_dir='./libraries/',
    engines=['diann', 'spectronaut', 'openswath']
)
print(f"Created libraries: {list(engine_libraries.keys())}")
```

### Library Validation and Quality Control

```python
from alphabase.spectral_library.validate import (
    Schema, Required, Optional, validate_spectral_library_completeness,
    assess_library_quality_metrics, generate_library_report
)

# Create validation schema
schema = Schema()
schema.add_column_requirement('sequence', 'required', data_type=str)
schema.add_column_requirement('charge', 'required', data_type=int)
schema.add_column_requirement('proteins', 'required', data_type=str)
schema.add_column_requirement('rt', 'optional', data_type=float)

# Validate library against schema
validation_results = schema.validate_library(loaded_lib)
print(f"Schema validation: {validation_results['passed']}")
if not validation_results['passed']:
    print(f"Issues: {validation_results['issues']}")

# Check library completeness
completeness = validate_spectral_library_completeness(loaded_lib)
print(f"Library completeness:")
print(f"  Precursor completeness: {completeness['precursor_completeness']:.1%}")
print(f"  Fragment completeness: {completeness['fragment_completeness']:.1%}")

# Assess quality metrics
quality_metrics = assess_library_quality_metrics(loaded_lib)
print(f"Quality metrics:")
print(f"  Average fragments per precursor: {quality_metrics['avg_fragments_per_precursor']:.1f}")
print(f"  m/z range: {quality_metrics['mz_range']}")
print(f"  Charge distribution: {quality_metrics['charge_distribution']}")

# Generate comprehensive report
report_data = generate_library_report(loaded_lib, 'library_report.html')
print(f"Generated report with {len(report_data['sections'])} sections")
```

### Parallel Library Processing

```python
from alphabase.spectral_library.translate import WritingProcess

# Process large library with multiple workers
writer = WritingProcess(n_processes=8)

# Write library in parallel
writer.write_library_parallel(
    spec_lib=loaded_lib,
    output_path='large_library.tsv',
    format_type='tsv',
    chunk_size=50000
)

# Write multiple formats simultaneously
format_paths = writer.write_multiple_formats(
    spec_lib=loaded_lib,
    base_path='library',
    formats=['tsv', 'csv', 'msp']
)
print(f"Created formats: {format_paths}")
```

### Advanced Validation Workflows

```python
from alphabase.spectral_library.validate import Required, Optional, Column

# Create detailed column specifications
columns = [
    Required('sequence', data_type=str),
    Required('charge', data_type=int),
    Required('proteins', data_type=str),
    Optional('rt', data_type=float, default_value=0.0),
    Column('mz', required=True, data_type=float, min_value=100.0, max_value=2000.0),
    Column('intensity', required=False, data_type=float, min_value=0.0)
]

# Validate each column specification
validation_results = []
for col_spec in columns:
    result = col_spec.validate(loaded_lib.precursor_df)
    validation_results.append(result)
    print(f"Column {col_spec.name}: {'PASS' if result['valid'] else 'FAIL'}")

# Custom validation workflow
def validate_library_for_dia_analysis(spec_lib):
    """Custom validation for DIA analysis requirements."""
    issues = []

    # Check for minimum precursors
    if len(spec_lib.precursor_df) < 1000:
        issues.append("Insufficient precursors for DIA analysis")

    # Check charge distribution
    charge_dist = spec_lib.precursor_df['charge'].value_counts()
    if charge_dist.get(2, 0) / len(spec_lib.precursor_df) < 0.3:
        issues.append("Low proportion of doubly charged precursors")

    # Check m/z coverage
    mz_min = spec_lib.precursor_df['mz'].min()
    mz_max = spec_lib.precursor_df['mz'].max()
    if mz_max - mz_min < 500:
        issues.append("Limited m/z range coverage")

    return {
        'suitable_for_dia': len(issues) == 0,
        'issues': issues,
        'precursor_count': len(spec_lib.precursor_df),
        'mz_range': (mz_min, mz_max),
        'charge_distribution': charge_dist.to_dict()
    }

# Apply custom validation
dia_validation = validate_library_for_dia_analysis(loaded_lib)
print(f"DIA suitability: {dia_validation}")
```