0
# SAM/BAM/CRAM Alignment Files
1
2
Comprehensive support for reading and writing sequence alignment files in SAM, BAM, and CRAM formats. Provides random access, indexing, pileup analysis, and full metadata handling.
3
4
## Capabilities
5
6
### AlignmentFile
7
8
Main interface for reading and writing alignment files with support for all SAM/BAM/CRAM formats.
9
10
```python { .api }
11
class AlignmentFile:
12
def __init__(self, filepath_or_object, mode=None, template=None, reference_names=None, reference_lengths=None, text=None, header=None, add_sq_text=False, check_header=True, check_sq=True, reference_filename=None, filename=None, index_filename=None, filepath_index=None, require_index=False, duplicate_filehandle=True, ignore_truncation=False, threads=1):
13
"""
14
Open an alignment file for reading or writing.
15
16
Parameters:
17
- filepath_or_object: str or file-like, path to file or file object
18
- mode: str, file mode ('r', 'w', 'rb', 'wb', 'rc', 'wc')
19
- template: AlignmentFile, template for header when writing
20
- reference_names: list, sequence names for header
21
- reference_lengths: list, sequence lengths for header
22
- text: str, SAM header text
23
- header: AlignmentHeader, header object
24
- check_header: bool, validate file header
25
- check_sq: bool, validate sequence dictionary
26
- reference_filename: str, reference FASTA file
27
- index_filename: str, index file path
28
- require_index: bool, require index file
29
- threads: int, number of threads for compression
30
31
Returns:
32
AlignmentFile object
33
"""
34
35
def fetch(self, contig=None, start=None, stop=None, region=None, reference=None, end=None, multiple_iterators=False, until_eof=False):
36
"""
37
Fetch reads from a region.
38
39
Parameters:
40
- contig: str, chromosome/contig name
41
- start: int, 0-based start position
42
- stop: int, 0-based stop position
43
- region: str, region string (chr:start-stop)
44
- multiple_iterators: bool, allow concurrent iterators
45
- until_eof: bool, read until end of file
46
47
Returns:
48
Iterator of AlignedSegment objects
49
"""
50
51
def pileup(self, contig=None, start=None, stop=None, region=None, reference=None, end=None, **kwargs):
52
"""
53
Create pileup of reads covering each position.
54
55
Parameters:
56
- contig: str, chromosome/contig name
57
- start: int, 0-based start position
58
- stop: int, 0-based stop position
59
- stepper: str, pileup algorithm ('all', 'nofilter', 'samtools')
60
- fastafile: FastaFile, reference sequences
61
- mask: int, ignore reads with these flags
62
- max_depth: int, maximum read depth
63
- truncate: bool, truncate to region boundaries
64
- min_base_quality: int, minimum base quality
65
66
Returns:
67
Iterator of PileupColumn objects
68
"""
69
70
def count(self, contig=None, start=None, stop=None, region=None, reference=None, end=None, read_callback=None, until_eof=False):
71
"""
72
Count reads in a region.
73
74
Returns:
75
int, number of reads
76
"""
77
78
def head(self, n):
79
"""
80
Return first n reads.
81
82
Returns:
83
Iterator of AlignedSegment objects
84
"""
85
86
def mate(self, read):
87
"""
88
Find mate of a paired read.
89
90
Returns:
91
AlignedSegment or None
92
"""
93
94
def write(self, read):
95
"""
96
Write an aligned segment.
97
98
Parameters:
99
- read: AlignedSegment, read to write
100
"""
101
102
def close(self):
103
"""Close the file."""
104
105
# Properties
106
@property
107
def header(self) -> "AlignmentHeader":
108
"""File header information."""
109
110
@property
111
def references(self) -> tuple:
112
"""Reference sequence names."""
113
114
@property
115
def lengths(self) -> tuple:
116
"""Reference sequence lengths."""
117
118
@property
119
def nreferences(self) -> int:
120
"""Number of reference sequences."""
121
122
@property
123
def mapped(self) -> int:
124
"""Number of mapped reads."""
125
126
@property
127
def unmapped(self) -> int:
128
"""Number of unmapped reads."""
129
130
@property
131
def nocoordinate(self) -> int:
132
"""Number of reads without coordinates."""
133
134
@property
135
def filename(self) -> str:
136
"""Filename of the file."""
137
138
@property
139
def is_valid_tid(self) -> bool:
140
"""Check if template ID is valid."""
141
142
def get_index_statistics(self):
143
"""
144
Get index statistics for each reference.
145
146
Returns:
147
List of IndexStats namedtuples
148
"""
149
150
def count_coverage(self, contig=None, start=None, stop=None, region=None, reference=None, end=None, quality_threshold=0, read_callback=None):
151
"""
152
Count coverage per position across A, C, G, T bases.
153
154
Parameters:
155
- contig: str, chromosome/contig name
156
- start: int, 0-based start position
157
- stop: int, 0-based stop position
158
- region: str, region string (chr:start-stop)
159
- quality_threshold: int, minimum base quality
160
- read_callback: callable, filter reads
161
162
Returns:
163
Tuple of 4 arrays (A, C, G, T coverage per position)
164
"""
165
166
def has_index(self):
167
"""
168
Check if file has an index.
169
170
Returns:
171
bool, True if index exists
172
"""
173
174
def check_index(self):
175
"""
176
Check if existing index is valid.
177
178
Returns:
179
bool, True if index is valid
180
"""
181
182
def find_introns(self, read_iterator):
183
"""
184
Find introns from read iterator (optimized version).
185
186
Parameters:
187
- read_iterator: iterator, reads to analyze
188
189
Returns:
190
dict, mapping (start, end) tuples to counts
191
"""
192
193
def find_introns_slow(self, read_iterator):
194
"""
195
Find introns from read iterator (slower but more compatible).
196
197
Parameters:
198
- read_iterator: iterator, reads to analyze
199
200
Returns:
201
dict, mapping (start, end) tuples to counts
202
"""
203
204
@property
205
def reference_filename(self):
206
"""Reference FASTA filename if specified."""
207
```
208
209
### AlignmentHeader
210
211
Header information for alignment files with methods for accessing and manipulating metadata.
212
213
```python { .api }
214
class AlignmentHeader:
215
def __init__(self):
216
"""Create empty alignment header."""
217
218
@classmethod
219
def from_text(cls, text):
220
"""
221
Create header from SAM header text.
222
223
Parameters:
224
- text: str, SAM header text
225
226
Returns:
227
AlignmentHeader object
228
"""
229
230
@classmethod
231
def from_dict(cls, header_dict):
232
"""
233
Create header from dictionary.
234
235
Parameters:
236
- header_dict: dict, header data
237
238
Returns:
239
AlignmentHeader object
240
"""
241
242
@classmethod
243
def from_references(cls, reference_names, reference_lengths, text=None, add_sq_text=False):
244
"""
245
Create header from reference sequences.
246
247
Parameters:
248
- reference_names: list, sequence names
249
- reference_lengths: list, sequence lengths
250
- text: str, additional header text
251
- add_sq_text: bool, add @SQ lines
252
253
Returns:
254
AlignmentHeader object
255
"""
256
257
def copy(self):
258
"""
259
Create a copy of the header.
260
261
Returns:
262
AlignmentHeader object
263
"""
264
265
def to_dict(self):
266
"""
267
Convert header to dictionary.
268
269
Returns:
270
dict, header data
271
"""
272
273
def get_reference_name(self, tid):
274
"""
275
Get reference name by template ID.
276
277
Parameters:
278
- tid: int, template ID
279
280
Returns:
281
str, reference name or None
282
"""
283
284
def get_reference_length(self, reference):
285
"""
286
Get reference sequence length.
287
288
Parameters:
289
- reference: str, reference name
290
291
Returns:
292
int, sequence length
293
"""
294
295
def get_tid(self, reference):
296
"""
297
Get template ID for reference.
298
299
Parameters:
300
- reference: str, reference name
301
302
Returns:
303
int, template ID
304
"""
305
306
def is_valid_tid(self, tid):
307
"""
308
Check if template ID is valid.
309
310
Parameters:
311
- tid: int, template ID
312
313
Returns:
314
bool, True if valid
315
"""
316
317
@property
318
def nreferences(self):
319
"""Number of reference sequences."""
320
321
@property
322
def references(self):
323
"""Tuple of reference sequence names."""
324
325
@property
326
def lengths(self):
327
"""Tuple of reference sequence lengths."""
328
```
329
330
### IndexStats
331
332
Named tuple containing index statistics for a reference sequence.
333
334
```python { .api }
335
class IndexStats(NamedTuple):
336
contig: str # Reference sequence name
337
mapped: int # Number of mapped reads
338
unmapped: int # Number of unmapped reads
339
total: int # Total number of reads
340
```
341
342
### AlignedSegment
343
344
Individual aligned read or segment with all alignment information and optional tags.
345
346
```python { .api }
347
class AlignedSegment:
348
def __init__(self):
349
"""Create a new aligned segment."""
350
351
# Core properties
352
@property
353
def query_name(self) -> str:
354
"""Read name/identifier."""
355
356
@query_name.setter
357
def query_name(self, value: str):
358
"""Set read name."""
359
360
@property
361
def query_sequence(self) -> str:
362
"""Read sequence."""
363
364
@query_sequence.setter
365
def query_sequence(self, value: str):
366
"""Set read sequence."""
367
368
@property
369
def query_qualities(self) -> list:
370
"""Base quality scores as list of integers."""
371
372
@query_qualities.setter
373
def query_qualities(self, value: list):
374
"""Set base quality scores."""
375
376
@property
377
def flag(self) -> int:
378
"""SAM flag as integer."""
379
380
@flag.setter
381
def flag(self, value: int):
382
"""Set SAM flag."""
383
384
@property
385
def reference_id(self) -> int:
386
"""Reference sequence ID (-1 if unmapped)."""
387
388
@reference_id.setter
389
def reference_id(self, value: int):
390
"""Set reference sequence ID."""
391
392
@property
393
def reference_start(self) -> int:
394
"""0-based start position on reference."""
395
396
@reference_start.setter
397
def reference_start(self, value: int):
398
"""Set reference start position."""
399
400
@property
401
def reference_end(self) -> int:
402
"""0-based end position on reference."""
403
404
@property
405
def next_reference_id(self) -> int:
406
"""Reference ID of mate."""
407
408
@next_reference_id.setter
409
def next_reference_id(self, value: int):
410
"""Set mate reference ID."""
411
412
@property
413
def next_reference_start(self) -> int:
414
"""Start position of mate."""
415
416
@next_reference_start.setter
417
def next_reference_start(self, value: int):
418
"""Set mate start position."""
419
420
@property
421
def template_length(self) -> int:
422
"""Template length (insert size)."""
423
424
@template_length.setter
425
def template_length(self, value: int):
426
"""Set template length."""
427
428
@property
429
def mapping_quality(self) -> int:
430
"""Mapping quality score."""
431
432
@mapping_quality.setter
433
def mapping_quality(self, value: int):
434
"""Set mapping quality."""
435
436
@property
437
def cigar(self) -> tuple:
438
"""CIGAR alignment as tuple of (operation, length) pairs."""
439
440
@cigar.setter
441
def cigar(self, value: tuple):
442
"""Set CIGAR alignment."""
443
444
@property
445
def cigarstring(self) -> str:
446
"""CIGAR as string representation."""
447
448
@cigarstring.setter
449
def cigarstring(self, value: str):
450
"""Set CIGAR from string."""
451
452
# Flag properties
453
@property
454
def is_paired(self) -> bool:
455
"""True if read is paired."""
456
457
@property
458
def is_proper_pair(self) -> bool:
459
"""True if read is in proper pair."""
460
461
@property
462
def is_unmapped(self) -> bool:
463
"""True if read is unmapped."""
464
465
@property
466
def mate_is_unmapped(self) -> bool:
467
"""True if mate is unmapped."""
468
469
@property
470
def is_reverse(self) -> bool:
471
"""True if read is reverse complemented."""
472
473
@property
474
def mate_is_reverse(self) -> bool:
475
"""True if mate is reverse complemented."""
476
477
@property
478
def is_read1(self) -> bool:
479
"""True if first read in pair."""
480
481
@property
482
def is_read2(self) -> bool:
483
"""True if second read in pair."""
484
485
@property
486
def is_secondary(self) -> bool:
487
"""True if secondary alignment."""
488
489
@property
490
def is_qcfail(self) -> bool:
491
"""True if read fails quality control."""
492
493
@property
494
def is_duplicate(self) -> bool:
495
"""True if read is duplicate."""
496
497
@property
498
def is_supplementary(self) -> bool:
499
"""True if supplementary alignment."""
500
501
# Methods
502
def get_tag(self, tag, with_value_type=False):
503
"""
504
Get optional alignment tag.
505
506
Parameters:
507
- tag: str, two-character tag name
508
- with_value_type: bool, return (value, type) tuple
509
510
Returns:
511
Tag value or (value, type) tuple
512
"""
513
514
def set_tag(self, tag, value, value_type=None, replace=True):
515
"""
516
Set optional alignment tag.
517
518
Parameters:
519
- tag: str, two-character tag name
520
- value: tag value
521
- value_type: str, value type ('A', 'i', 'f', 'Z', 'H', 'B')
522
- replace: bool, replace existing tag
523
"""
524
525
def has_tag(self, tag):
526
"""
527
Check if tag exists.
528
529
Returns:
530
bool, True if tag exists
531
"""
532
533
def get_tags(self, with_value_type=False):
534
"""
535
Get all tags.
536
537
Returns:
538
List of (tag, value) or (tag, value, type) tuples
539
"""
540
541
def set_tags(self, tags):
542
"""
543
Set multiple tags.
544
545
Parameters:
546
- tags: list of (tag, value) or (tag, value, type) tuples
547
"""
548
549
def get_aligned_pairs(self, matches_only=False, with_seq=False):
550
"""
551
Get alignment between query and reference.
552
553
Parameters:
554
- matches_only: bool, only return matching positions
555
- with_seq: bool, include sequence characters
556
557
Returns:
558
List of (query_pos, ref_pos) or (query_pos, ref_pos, ref_base) tuples
559
"""
560
561
def get_reference_positions(self, full_length=False):
562
"""
563
Get reference positions corresponding to query.
564
565
Returns:
566
List of reference positions
567
"""
568
569
def to_string(self):
570
"""
571
Convert to SAM format string.
572
573
Returns:
574
str, SAM format representation
575
"""
576
577
def compare(self, other):
578
"""
579
Compare two aligned segments.
580
581
Returns:
582
int, comparison result
583
"""
584
```
585
586
### AlignmentHeader
587
588
Header information for alignment files including sequence dictionary and metadata.
589
590
```python { .api }
591
class AlignmentHeader:
592
def __init__(self, dictionary=None):
593
"""
594
Create alignment header.
595
596
Parameters:
597
- dictionary: dict, header information
598
"""
599
600
@property
601
def references(self) -> tuple:
602
"""Reference sequence names."""
603
604
@property
605
def lengths(self) -> tuple:
606
"""Reference sequence lengths."""
607
608
def to_dict(self):
609
"""
610
Convert header to dictionary.
611
612
Returns:
613
dict, header information
614
"""
615
616
def as_dict(self):
617
"""
618
Get header as dictionary.
619
620
Returns:
621
dict, header information
622
"""
623
624
def copy(self):
625
"""
626
Create copy of header.
627
628
Returns:
629
AlignmentHeader, header copy
630
"""
631
```
632
633
### PileupColumn
634
635
Column of reads covering a genomic position for variant calling and depth analysis.
636
637
```python { .api }
638
class PileupColumn:
639
@property
640
def reference_id(self) -> int:
641
"""Reference sequence ID."""
642
643
@property
644
def reference_pos(self) -> int:
645
"""0-based reference position."""
646
647
@property
648
def nsegments(self) -> int:
649
"""Number of reads covering position."""
650
651
@property
652
def pileups(self) -> list:
653
"""List of PileupRead objects."""
654
655
def get_query_sequences(self, mark_matches=False, mark_ends=False, add_indels=False):
656
"""
657
Get query sequences at position.
658
659
Parameters:
660
- mark_matches: bool, mark matches with '.'
661
- mark_ends: bool, mark read ends with '$'
662
- add_indels: bool, include indel information
663
664
Returns:
665
List of sequence strings
666
"""
667
668
def get_query_qualities(self):
669
"""
670
Get base qualities at position.
671
672
Returns:
673
List of quality scores
674
"""
675
676
def get_query_names(self):
677
"""
678
Get read names at position.
679
680
Returns:
681
List of read names
682
"""
683
```
684
685
### PileupRead
686
687
Individual read within a pileup column with position-specific information.
688
689
```python { .api }
690
class PileupRead:
691
@property
692
def alignment(self) -> "AlignedSegment":
693
"""Underlying aligned segment."""
694
695
@property
696
def query_position(self) -> int:
697
"""Position in read sequence (None for indels)."""
698
699
@property
700
def query_position_or_next(self) -> int:
701
"""Query position or next position for indels."""
702
703
@property
704
def is_del(self) -> bool:
705
"""True if position is deletion."""
706
707
@property
708
def is_head(self) -> bool:
709
"""True if start of read."""
710
711
@property
712
def is_tail(self) -> bool:
713
"""True if end of read."""
714
715
@property
716
def is_refskip(self) -> bool:
717
"""True if reference skip (N in CIGAR)."""
718
719
@property
720
def level(self) -> int:
721
"""Level in pileup display."""
722
723
@property
724
def indel(self) -> int:
725
"""Length of indel (+ insertion, - deletion)."""
726
```
727
728
### IndexStats
729
730
Named tuple containing index statistics for reference sequences.
731
732
```python { .api }
733
IndexStats = namedtuple("IndexStats", ["contig", "mapped", "unmapped", "total"])
734
```
735
736
### IndexedReads
737
738
Index alignment file by read name for fast lookup by query name.
739
740
```python { .api }
741
class IndexedReads:
742
def __init__(self, samfile, multiple_iterators=False):
743
"""
744
Create read name index.
745
746
Parameters:
747
- samfile: AlignmentFile, alignment file to index
748
- multiple_iterators: bool, allow concurrent access
749
"""
750
751
def find(self, query_name):
752
"""
753
Find reads by name.
754
755
Parameters:
756
- query_name: str, read name to find
757
758
Returns:
759
Iterator of AlignedSegment objects
760
"""
761
```
762
763
## Usage Examples
764
765
### Basic File Reading
766
767
```python
768
import pysam
769
770
# Read SAM/BAM file
771
with pysam.AlignmentFile("input.bam", "rb") as bamfile:
772
# Iterate over all reads
773
for read in bamfile:
774
print(f"{read.query_name}: {read.reference_start}-{read.reference_end}")
775
776
# Fetch reads in region
777
for read in bamfile.fetch("chr1", 1000, 2000):
778
if not read.is_unmapped:
779
print(f"Mapped read: {read.query_sequence}")
780
781
# Read with index
782
with pysam.AlignmentFile("input.bam", "rb", index_filename="input.bam.bai") as bamfile:
783
count = bamfile.count("chr1", 1000, 2000)
784
print(f"Reads in region: {count}")
785
```
786
787
### Pileup Analysis
788
789
```python
790
import pysam
791
792
with pysam.AlignmentFile("input.bam", "rb") as bamfile:
793
for pileupcolumn in bamfile.pileup("chr1", 1000, 2000):
794
print(f"Position {pileupcolumn.reference_pos}: {pileupcolumn.nsegments} reads")
795
796
# Get base calls at position
797
bases = pileupcolumn.get_query_sequences()
798
qualities = pileupcolumn.get_query_qualities()
799
800
for base, qual in zip(bases, qualities):
801
if qual >= 20: # Filter by quality
802
print(f" Base: {base}, Quality: {qual}")
803
```
804
805
### Writing Files
806
807
```python
808
import pysam
809
810
# Create new BAM file
811
header = {"HD": {"VN": "1.0", "SO": "coordinate"},
812
"SQ": [{"LN": 1575, "SN": "chr1"},
813
{"LN": 1584, "SN": "chr2"}]}
814
815
with pysam.AlignmentFile("output.bam", "wb", header=header) as outfile:
816
# Create aligned segment
817
read = pysam.AlignedSegment()
818
read.query_name = "read1"
819
read.query_sequence = "ACGTACGTACGT"
820
read.flag = 0
821
read.reference_id = 0 # chr1
822
read.reference_start = 100
823
read.mapping_quality = 60
824
read.cigar = [(0, 12)] # 12M
825
826
outfile.write(read)
827
```