Tessl Tile for pypi/pysam@0.23.0

or run

npx @tessl/cli init

Version

Tile

Overview

Evals

Files

docs

alignment-files.md bgzf-files.md command-tools.md index.md sequence-files.md tabix-files.md utilities.md variant-files.md

alignment-files.mddocs/

0
# SAM/BAM/CRAM Alignment Files
1

2
Comprehensive support for reading and writing sequence alignment files in SAM, BAM, and CRAM formats. Provides random access, indexing, pileup analysis, and full metadata handling.
3

4
## Capabilities
5

6
### AlignmentFile
7

8
Main interface for reading and writing alignment files with support for all SAM/BAM/CRAM formats.
9

10
```python { .api }
11
class AlignmentFile:
12
    def __init__(self, filepath_or_object, mode=None, template=None, reference_names=None, reference_lengths=None, text=None, header=None, add_sq_text=False, check_header=True, check_sq=True, reference_filename=None, filename=None, index_filename=None, filepath_index=None, require_index=False, duplicate_filehandle=True, ignore_truncation=False, threads=1):
13
        """
14
        Open an alignment file for reading or writing.
15
        
16
        Parameters:
17
        - filepath_or_object: str or file-like, path to file or file object
18
        - mode: str, file mode ('r', 'w', 'rb', 'wb', 'rc', 'wc')
19
        - template: AlignmentFile, template for header when writing
20
        - reference_names: list, sequence names for header
21
        - reference_lengths: list, sequence lengths for header  
22
        - text: str, SAM header text
23
        - header: AlignmentHeader, header object
24
        - check_header: bool, validate file header
25
        - check_sq: bool, validate sequence dictionary
26
        - reference_filename: str, reference FASTA file
27
        - index_filename: str, index file path
28
        - require_index: bool, require index file
29
        - threads: int, number of threads for compression
30
        
31
        Returns:
32
        AlignmentFile object
33
        """
34
    
35
    def fetch(self, contig=None, start=None, stop=None, region=None, reference=None, end=None, multiple_iterators=False, until_eof=False):
36
        """
37
        Fetch reads from a region.
38
        
39
        Parameters:
40
        - contig: str, chromosome/contig name
41
        - start: int, 0-based start position  
42
        - stop: int, 0-based stop position
43
        - region: str, region string (chr:start-stop)
44
        - multiple_iterators: bool, allow concurrent iterators
45
        - until_eof: bool, read until end of file
46
        
47
        Returns:
48
        Iterator of AlignedSegment objects
49
        """
50
    
51
    def pileup(self, contig=None, start=None, stop=None, region=None, reference=None, end=None, **kwargs):
52
        """
53
        Create pileup of reads covering each position.
54
        
55
        Parameters:
56
        - contig: str, chromosome/contig name
57
        - start: int, 0-based start position
58
        - stop: int, 0-based stop position  
59
        - stepper: str, pileup algorithm ('all', 'nofilter', 'samtools')
60
        - fastafile: FastaFile, reference sequences
61
        - mask: int, ignore reads with these flags
62
        - max_depth: int, maximum read depth
63
        - truncate: bool, truncate to region boundaries
64
        - min_base_quality: int, minimum base quality
65
        
66
        Returns:
67
        Iterator of PileupColumn objects
68
        """
69
    
70
    def count(self, contig=None, start=None, stop=None, region=None, reference=None, end=None, read_callback=None, until_eof=False):
71
        """
72
        Count reads in a region.
73
        
74
        Returns:
75
        int, number of reads
76
        """
77
    
78
    def head(self, n):
79
        """
80
        Return first n reads.
81
        
82
        Returns:
83
        Iterator of AlignedSegment objects
84
        """
85
    
86
    def mate(self, read):
87
        """
88
        Find mate of a paired read.
89
        
90
        Returns:
91
        AlignedSegment or None
92
        """
93
    
94
    def write(self, read):
95
        """
96
        Write an aligned segment.
97
        
98
        Parameters:
99
        - read: AlignedSegment, read to write
100
        """
101
    
102
    def close(self):
103
        """Close the file."""
104
    
105
    # Properties
106
    @property
107
    def header(self) -> "AlignmentHeader":
108
        """File header information."""
109
    
110
    @property
111
    def references(self) -> tuple:
112
        """Reference sequence names."""
113
    
114
    @property 
115
    def lengths(self) -> tuple:
116
        """Reference sequence lengths."""
117
    
118
    @property
119
    def nreferences(self) -> int:
120
        """Number of reference sequences."""
121
    
122
    @property
123
    def mapped(self) -> int:
124
        """Number of mapped reads."""
125
    
126
    @property
127
    def unmapped(self) -> int:
128
        """Number of unmapped reads."""
129
    
130
    @property
131
    def nocoordinate(self) -> int:
132
        """Number of reads without coordinates."""
133
    
134
    @property
135
    def filename(self) -> str:
136
        """Filename of the file."""
137
    
138
    @property
139
    def is_valid_tid(self) -> bool:
140
        """Check if template ID is valid."""
141
    
142
    def get_index_statistics(self):
143
        """
144
        Get index statistics for each reference.
145
        
146
        Returns:
147
        List of IndexStats namedtuples
148
        """
149
    
150
    def count_coverage(self, contig=None, start=None, stop=None, region=None, reference=None, end=None, quality_threshold=0, read_callback=None):
151
        """
152
        Count coverage per position across A, C, G, T bases.
153
        
154
        Parameters:
155
        - contig: str, chromosome/contig name
156
        - start: int, 0-based start position
157
        - stop: int, 0-based stop position
158
        - region: str, region string (chr:start-stop)
159
        - quality_threshold: int, minimum base quality
160
        - read_callback: callable, filter reads
161
        
162
        Returns:
163
        Tuple of 4 arrays (A, C, G, T coverage per position)
164
        """
165
    
166
    def has_index(self):
167
        """
168
        Check if file has an index.
169
        
170
        Returns:
171
        bool, True if index exists
172
        """
173
    
174
    def check_index(self):
175
        """
176
        Check if existing index is valid.
177
        
178
        Returns:
179
        bool, True if index is valid
180
        """
181
    
182
    def find_introns(self, read_iterator):
183
        """
184
        Find introns from read iterator (optimized version).
185
        
186
        Parameters:
187
        - read_iterator: iterator, reads to analyze
188
        
189
        Returns:
190
        dict, mapping (start, end) tuples to counts
191
        """
192
    
193
    def find_introns_slow(self, read_iterator):
194
        """
195
        Find introns from read iterator (slower but more compatible).
196
        
197
        Parameters:
198
        - read_iterator: iterator, reads to analyze
199
        
200
        Returns:
201
        dict, mapping (start, end) tuples to counts
202
        """
203
    
204
    @property
205
    def reference_filename(self):
206
        """Reference FASTA filename if specified."""
207
```
208

209
### AlignmentHeader
210

211
Header information for alignment files with methods for accessing and manipulating metadata.
212

213
```python { .api }
214
class AlignmentHeader:
215
    def __init__(self):
216
        """Create empty alignment header."""
217
    
218
    @classmethod
219
    def from_text(cls, text):
220
        """
221
        Create header from SAM header text.
222
        
223
        Parameters:
224
        - text: str, SAM header text
225
        
226
        Returns:
227
        AlignmentHeader object
228
        """
229
    
230
    @classmethod
231
    def from_dict(cls, header_dict):
232
        """
233
        Create header from dictionary.
234
        
235
        Parameters:
236
        - header_dict: dict, header data
237
        
238
        Returns:
239
        AlignmentHeader object
240
        """
241
    
242
    @classmethod
243
    def from_references(cls, reference_names, reference_lengths, text=None, add_sq_text=False):
244
        """
245
        Create header from reference sequences.
246
        
247
        Parameters:
248
        - reference_names: list, sequence names
249
        - reference_lengths: list, sequence lengths
250
        - text: str, additional header text
251
        - add_sq_text: bool, add @SQ lines
252
        
253
        Returns:
254
        AlignmentHeader object
255
        """
256
    
257
    def copy(self):
258
        """
259
        Create a copy of the header.
260
        
261
        Returns:
262
        AlignmentHeader object
263
        """
264
    
265
    def to_dict(self):
266
        """
267
        Convert header to dictionary.
268
        
269
        Returns:
270
        dict, header data
271
        """
272
    
273
    def get_reference_name(self, tid):
274
        """
275
        Get reference name by template ID.
276
        
277
        Parameters:
278
        - tid: int, template ID
279
        
280
        Returns:
281
        str, reference name or None
282
        """
283
    
284
    def get_reference_length(self, reference):
285
        """
286
        Get reference sequence length.
287
        
288
        Parameters:
289
        - reference: str, reference name
290
        
291
        Returns:
292
        int, sequence length
293
        """
294
    
295
    def get_tid(self, reference):
296
        """
297
        Get template ID for reference.
298
        
299
        Parameters:
300
        - reference: str, reference name
301
        
302
        Returns:
303
        int, template ID
304
        """
305
    
306
    def is_valid_tid(self, tid):
307
        """
308
        Check if template ID is valid.
309
        
310
        Parameters:
311
        - tid: int, template ID
312
        
313
        Returns:
314
        bool, True if valid
315
        """
316
    
317
    @property
318
    def nreferences(self):
319
        """Number of reference sequences."""
320
    
321
    @property
322
    def references(self):
323
        """Tuple of reference sequence names."""
324
    
325
    @property
326
    def lengths(self):
327
        """Tuple of reference sequence lengths."""
328
```
329

330
### IndexStats
331

332
Named tuple containing index statistics for a reference sequence.
333

334
```python { .api }
335
class IndexStats(NamedTuple):
336
    contig: str      # Reference sequence name
337
    mapped: int      # Number of mapped reads
338
    unmapped: int    # Number of unmapped reads  
339
    total: int       # Total number of reads
340
```
341

342
### AlignedSegment
343

344
Individual aligned read or segment with all alignment information and optional tags.
345

346
```python { .api }
347
class AlignedSegment:
348
    def __init__(self):
349
        """Create a new aligned segment."""
350
    
351
    # Core properties
352
    @property
353
    def query_name(self) -> str:
354
        """Read name/identifier."""
355
    
356
    @query_name.setter  
357
    def query_name(self, value: str):
358
        """Set read name."""
359
    
360
    @property
361
    def query_sequence(self) -> str:
362
        """Read sequence."""
363
    
364
    @query_sequence.setter
365
    def query_sequence(self, value: str):
366
        """Set read sequence."""
367
    
368
    @property
369
    def query_qualities(self) -> list:
370
        """Base quality scores as list of integers."""
371
    
372
    @query_qualities.setter
373
    def query_qualities(self, value: list):
374
        """Set base quality scores."""
375
    
376
    @property
377
    def flag(self) -> int:
378
        """SAM flag as integer."""
379
    
380
    @flag.setter
381
    def flag(self, value: int):
382
        """Set SAM flag."""
383
    
384
    @property
385
    def reference_id(self) -> int:
386
        """Reference sequence ID (-1 if unmapped)."""
387
    
388
    @reference_id.setter
389
    def reference_id(self, value: int):
390
        """Set reference sequence ID."""
391
    
392
    @property
393
    def reference_start(self) -> int:
394
        """0-based start position on reference."""
395
    
396
    @reference_start.setter
397
    def reference_start(self, value: int):
398
        """Set reference start position."""
399
    
400
    @property
401
    def reference_end(self) -> int:
402
        """0-based end position on reference."""
403
    
404
    @property
405
    def next_reference_id(self) -> int:
406
        """Reference ID of mate."""
407
    
408
    @next_reference_id.setter
409
    def next_reference_id(self, value: int):
410
        """Set mate reference ID."""
411
    
412
    @property
413
    def next_reference_start(self) -> int:
414
        """Start position of mate."""
415
    
416
    @next_reference_start.setter  
417
    def next_reference_start(self, value: int):
418
        """Set mate start position."""
419
    
420
    @property
421
    def template_length(self) -> int:
422
        """Template length (insert size)."""
423
    
424
    @template_length.setter
425
    def template_length(self, value: int):
426
        """Set template length."""
427
    
428
    @property
429
    def mapping_quality(self) -> int:
430
        """Mapping quality score."""
431
    
432
    @mapping_quality.setter
433
    def mapping_quality(self, value: int):
434
        """Set mapping quality."""
435
    
436
    @property
437
    def cigar(self) -> tuple:
438
        """CIGAR alignment as tuple of (operation, length) pairs."""
439
    
440
    @cigar.setter
441
    def cigar(self, value: tuple):
442
        """Set CIGAR alignment."""
443
    
444
    @property
445
    def cigarstring(self) -> str:
446
        """CIGAR as string representation."""
447
    
448
    @cigarstring.setter
449
    def cigarstring(self, value: str):
450
        """Set CIGAR from string."""
451
    
452
    # Flag properties
453
    @property
454
    def is_paired(self) -> bool:
455
        """True if read is paired."""
456
    
457
    @property
458
    def is_proper_pair(self) -> bool:
459
        """True if read is in proper pair."""
460
    
461
    @property
462
    def is_unmapped(self) -> bool:
463
        """True if read is unmapped."""
464
    
465
    @property
466
    def mate_is_unmapped(self) -> bool:
467
        """True if mate is unmapped."""
468
    
469
    @property
470
    def is_reverse(self) -> bool:
471
        """True if read is reverse complemented."""
472
    
473
    @property
474
    def mate_is_reverse(self) -> bool:
475
        """True if mate is reverse complemented."""
476
    
477
    @property
478
    def is_read1(self) -> bool:
479
        """True if first read in pair."""
480
    
481
    @property
482
    def is_read2(self) -> bool:
483
        """True if second read in pair."""
484
    
485
    @property
486
    def is_secondary(self) -> bool:
487
        """True if secondary alignment."""
488
    
489
    @property
490
    def is_qcfail(self) -> bool:
491
        """True if read fails quality control."""
492
    
493
    @property
494
    def is_duplicate(self) -> bool:
495
        """True if read is duplicate."""
496
    
497
    @property
498
    def is_supplementary(self) -> bool:
499
        """True if supplementary alignment."""
500
    
501
    # Methods
502
    def get_tag(self, tag, with_value_type=False):
503
        """
504
        Get optional alignment tag.
505
        
506
        Parameters:
507
        - tag: str, two-character tag name
508
        - with_value_type: bool, return (value, type) tuple
509
        
510
        Returns:
511
        Tag value or (value, type) tuple
512
        """
513
    
514
    def set_tag(self, tag, value, value_type=None, replace=True):
515
        """
516
        Set optional alignment tag.
517
        
518
        Parameters:
519
        - tag: str, two-character tag name
520
        - value: tag value
521
        - value_type: str, value type ('A', 'i', 'f', 'Z', 'H', 'B')
522
        - replace: bool, replace existing tag
523
        """
524
    
525
    def has_tag(self, tag):
526
        """
527
        Check if tag exists.
528
        
529
        Returns:
530
        bool, True if tag exists
531
        """
532
    
533
    def get_tags(self, with_value_type=False):
534
        """
535
        Get all tags.
536
        
537
        Returns:
538
        List of (tag, value) or (tag, value, type) tuples
539
        """
540
    
541
    def set_tags(self, tags):
542
        """
543
        Set multiple tags.
544
        
545
        Parameters:
546
        - tags: list of (tag, value) or (tag, value, type) tuples
547
        """
548
    
549
    def get_aligned_pairs(self, matches_only=False, with_seq=False):
550
        """
551
        Get alignment between query and reference.
552
        
553
        Parameters:
554
        - matches_only: bool, only return matching positions
555
        - with_seq: bool, include sequence characters
556
        
557
        Returns:
558
        List of (query_pos, ref_pos) or (query_pos, ref_pos, ref_base) tuples
559
        """
560
    
561
    def get_reference_positions(self, full_length=False):
562
        """
563
        Get reference positions corresponding to query.
564
        
565
        Returns:
566
        List of reference positions
567
        """
568
    
569
    def to_string(self):
570
        """
571
        Convert to SAM format string.
572
        
573
        Returns:
574
        str, SAM format representation
575
        """
576
    
577
    def compare(self, other):
578
        """
579
        Compare two aligned segments.
580
        
581
        Returns:
582
        int, comparison result
583
        """
584
```
585

586
### AlignmentHeader
587

588
Header information for alignment files including sequence dictionary and metadata.
589

590
```python { .api }
591
class AlignmentHeader:
592
    def __init__(self, dictionary=None):
593
        """
594
        Create alignment header.
595
        
596
        Parameters:
597
        - dictionary: dict, header information
598
        """
599
    
600
    @property
601
    def references(self) -> tuple:
602
        """Reference sequence names."""
603
    
604
    @property
605
    def lengths(self) -> tuple:
606
        """Reference sequence lengths."""
607
    
608
    def to_dict(self):
609
        """
610
        Convert header to dictionary.
611
        
612
        Returns:
613
        dict, header information
614
        """
615
    
616
    def as_dict(self):
617
        """
618
        Get header as dictionary.
619
        
620
        Returns:
621
        dict, header information
622
        """
623
    
624
    def copy(self):
625
        """
626
        Create copy of header.
627
        
628
        Returns:
629
        AlignmentHeader, header copy
630
        """
631
```
632

633
### PileupColumn
634

635
Column of reads covering a genomic position for variant calling and depth analysis.
636

637
```python { .api }
638
class PileupColumn:
639
    @property
640
    def reference_id(self) -> int:
641
        """Reference sequence ID."""
642
    
643
    @property
644
    def reference_pos(self) -> int:
645
        """0-based reference position."""
646
    
647
    @property
648
    def nsegments(self) -> int:
649
        """Number of reads covering position."""
650
    
651
    @property
652
    def pileups(self) -> list:
653
        """List of PileupRead objects."""
654
    
655
    def get_query_sequences(self, mark_matches=False, mark_ends=False, add_indels=False):
656
        """
657
        Get query sequences at position.
658
        
659
        Parameters:
660
        - mark_matches: bool, mark matches with '.'
661
        - mark_ends: bool, mark read ends with '$'
662
        - add_indels: bool, include indel information
663
        
664
        Returns:
665
        List of sequence strings
666
        """
667
    
668
    def get_query_qualities(self):
669
        """
670
        Get base qualities at position.
671
        
672
        Returns:
673
        List of quality scores
674
        """
675
    
676
    def get_query_names(self):
677
        """
678
        Get read names at position.
679
        
680
        Returns:
681
        List of read names
682
        """
683
```
684

685
### PileupRead
686

687
Individual read within a pileup column with position-specific information.
688

689
```python { .api }
690
class PileupRead:
691
    @property
692
    def alignment(self) -> "AlignedSegment":
693
        """Underlying aligned segment."""
694
    
695
    @property
696
    def query_position(self) -> int:
697
        """Position in read sequence (None for indels)."""
698
    
699
    @property
700
    def query_position_or_next(self) -> int:
701
        """Query position or next position for indels."""
702
    
703
    @property
704
    def is_del(self) -> bool:
705
        """True if position is deletion."""
706
    
707
    @property
708
    def is_head(self) -> bool:
709
        """True if start of read."""
710
    
711
    @property
712
    def is_tail(self) -> bool:
713
        """True if end of read."""
714
    
715
    @property
716
    def is_refskip(self) -> bool:
717
        """True if reference skip (N in CIGAR)."""
718
    
719
    @property
720
    def level(self) -> int:
721
        """Level in pileup display."""
722
    
723
    @property
724
    def indel(self) -> int:
725
        """Length of indel (+ insertion, - deletion)."""
726
```
727

728
### IndexStats
729

730
Named tuple containing index statistics for reference sequences.
731

732
```python { .api }
733
IndexStats = namedtuple("IndexStats", ["contig", "mapped", "unmapped", "total"])
734
```
735

736
### IndexedReads
737

738
Index alignment file by read name for fast lookup by query name.
739

740
```python { .api }
741
class IndexedReads:
742
    def __init__(self, samfile, multiple_iterators=False):
743
        """
744
        Create read name index.
745
        
746
        Parameters:
747
        - samfile: AlignmentFile, alignment file to index
748
        - multiple_iterators: bool, allow concurrent access
749
        """
750
    
751
    def find(self, query_name):
752
        """
753
        Find reads by name.
754
        
755
        Parameters:
756
        - query_name: str, read name to find
757
        
758
        Returns:
759
        Iterator of AlignedSegment objects
760
        """
761
```
762

763
## Usage Examples
764

765
### Basic File Reading
766

767
```python
768
import pysam
769

770
# Read SAM/BAM file
771
with pysam.AlignmentFile("input.bam", "rb") as bamfile:
772
    # Iterate over all reads
773
    for read in bamfile:
774
        print(f"{read.query_name}: {read.reference_start}-{read.reference_end}")
775
    
776
    # Fetch reads in region
777
    for read in bamfile.fetch("chr1", 1000, 2000):
778
        if not read.is_unmapped:
779
            print(f"Mapped read: {read.query_sequence}")
780

781
# Read with index
782
with pysam.AlignmentFile("input.bam", "rb", index_filename="input.bam.bai") as bamfile:
783
    count = bamfile.count("chr1", 1000, 2000) 
784
    print(f"Reads in region: {count}")
785
```
786

787
### Pileup Analysis
788

789
```python
790
import pysam
791

792
with pysam.AlignmentFile("input.bam", "rb") as bamfile:
793
    for pileupcolumn in bamfile.pileup("chr1", 1000, 2000):
794
        print(f"Position {pileupcolumn.reference_pos}: {pileupcolumn.nsegments} reads")
795
        
796
        # Get base calls at position
797
        bases = pileupcolumn.get_query_sequences()
798
        qualities = pileupcolumn.get_query_qualities()
799
        
800
        for base, qual in zip(bases, qualities):
801
            if qual >= 20:  # Filter by quality
802
                print(f"  Base: {base}, Quality: {qual}")
803
```
804

805
### Writing Files
806

807
```python
808
import pysam
809

810
# Create new BAM file
811
header = {"HD": {"VN": "1.0", "SO": "coordinate"},
812
          "SQ": [{"LN": 1575, "SN": "chr1"}, 
813
                 {"LN": 1584, "SN": "chr2"}]}
814

815
with pysam.AlignmentFile("output.bam", "wb", header=header) as outfile:
816
    # Create aligned segment
817
    read = pysam.AlignedSegment()
818
    read.query_name = "read1"
819
    read.query_sequence = "ACGTACGTACGT"
820
    read.flag = 0
821
    read.reference_id = 0  # chr1
822
    read.reference_start = 100
823
    read.mapping_quality = 60
824
    read.cigar = [(0, 12)]  # 12M
825
    
826
    outfile.write(read)
827
```

Version

Tile

Files

alignment-files.md.css-3qkkll{font-size:var(--chakra-font-sizes-sm);font-weight:var(--chakra-font-weights-normal);color:var(--chakra-colors-gray-300);}docs/

alignment-files.mddocs/