tessl/pypi-pysam

Package for reading, manipulating, and writing genomic data

—

Pending

Overview

Eval results

Files

SAM/BAM/CRAM Alignment Files

Name: tessl/pypi-pysam
Author: tessl

Comprehensive support for reading and writing sequence alignment files in SAM, BAM, and CRAM formats. Provides random access, indexing, pileup analysis, and full metadata handling.

Capabilities

AlignmentFile

Main interface for reading and writing alignment files with support for all SAM/BAM/CRAM formats.

class AlignmentFile:
    def __init__(self, filepath_or_object, mode=None, template=None, reference_names=None, reference_lengths=None, text=None, header=None, add_sq_text=False, check_header=True, check_sq=True, reference_filename=None, filename=None, index_filename=None, filepath_index=None, require_index=False, duplicate_filehandle=True, ignore_truncation=False, threads=1):
        """
        Open an alignment file for reading or writing.
        
        Parameters:
        - filepath_or_object: str or file-like, path to file or file object
        - mode: str, file mode ('r', 'w', 'rb', 'wb', 'rc', 'wc')
        - template: AlignmentFile, template for header when writing
        - reference_names: list, sequence names for header
        - reference_lengths: list, sequence lengths for header  
        - text: str, SAM header text
        - header: AlignmentHeader, header object
        - check_header: bool, validate file header
        - check_sq: bool, validate sequence dictionary
        - reference_filename: str, reference FASTA file
        - index_filename: str, index file path
        - require_index: bool, require index file
        - threads: int, number of threads for compression
        
        Returns:
        AlignmentFile object
        """
    
    def fetch(self, contig=None, start=None, stop=None, region=None, reference=None, end=None, multiple_iterators=False, until_eof=False):
        """
        Fetch reads from a region.
        
        Parameters:
        - contig: str, chromosome/contig name
        - start: int, 0-based start position  
        - stop: int, 0-based stop position
        - region: str, region string (chr:start-stop)
        - multiple_iterators: bool, allow concurrent iterators
        - until_eof: bool, read until end of file
        
        Returns:
        Iterator of AlignedSegment objects
        """
    
    def pileup(self, contig=None, start=None, stop=None, region=None, reference=None, end=None, **kwargs):
        """
        Create pileup of reads covering each position.
        
        Parameters:
        - contig: str, chromosome/contig name
        - start: int, 0-based start position
        - stop: int, 0-based stop position  
        - stepper: str, pileup algorithm ('all', 'nofilter', 'samtools')
        - fastafile: FastaFile, reference sequences
        - mask: int, ignore reads with these flags
        - max_depth: int, maximum read depth
        - truncate: bool, truncate to region boundaries
        - min_base_quality: int, minimum base quality
        
        Returns:
        Iterator of PileupColumn objects
        """
    
    def count(self, contig=None, start=None, stop=None, region=None, reference=None, end=None, read_callback=None, until_eof=False):
        """
        Count reads in a region.
        
        Returns:
        int, number of reads
        """
    
    def head(self, n):
        """
        Return first n reads.
        
        Returns:
        Iterator of AlignedSegment objects
        """
    
    def mate(self, read):
        """
        Find mate of a paired read.
        
        Returns:
        AlignedSegment or None
        """
    
    def write(self, read):
        """
        Write an aligned segment.
        
        Parameters:
        - read: AlignedSegment, read to write
        """
    
    def close(self):
        """Close the file."""
    
    # Properties
    @property
    def header(self) -> "AlignmentHeader":
        """File header information."""
    
    @property
    def references(self) -> tuple:
        """Reference sequence names."""
    
    @property 
    def lengths(self) -> tuple:
        """Reference sequence lengths."""
    
    @property
    def nreferences(self) -> int:
        """Number of reference sequences."""
    
    @property
    def mapped(self) -> int:
        """Number of mapped reads."""
    
    @property
    def unmapped(self) -> int:
        """Number of unmapped reads."""
    
    @property
    def nocoordinate(self) -> int:
        """Number of reads without coordinates."""
    
    @property
    def filename(self) -> str:
        """Filename of the file."""
    
    @property
    def is_valid_tid(self) -> bool:
        """Check if template ID is valid."""
    
    def get_index_statistics(self):
        """
        Get index statistics for each reference.
        
        Returns:
        List of IndexStats namedtuples
        """
    
    def count_coverage(self, contig=None, start=None, stop=None, region=None, reference=None, end=None, quality_threshold=0, read_callback=None):
        """
        Count coverage per position across A, C, G, T bases.
        
        Parameters:
        - contig: str, chromosome/contig name
        - start: int, 0-based start position
        - stop: int, 0-based stop position
        - region: str, region string (chr:start-stop)
        - quality_threshold: int, minimum base quality
        - read_callback: callable, filter reads
        
        Returns:
        Tuple of 4 arrays (A, C, G, T coverage per position)
        """
    
    def has_index(self):
        """
        Check if file has an index.
        
        Returns:
        bool, True if index exists
        """
    
    def check_index(self):
        """
        Check if existing index is valid.
        
        Returns:
        bool, True if index is valid
        """
    
    def find_introns(self, read_iterator):
        """
        Find introns from read iterator (optimized version).
        
        Parameters:
        - read_iterator: iterator, reads to analyze
        
        Returns:
        dict, mapping (start, end) tuples to counts
        """
    
    def find_introns_slow(self, read_iterator):
        """
        Find introns from read iterator (slower but more compatible).
        
        Parameters:
        - read_iterator: iterator, reads to analyze
        
        Returns:
        dict, mapping (start, end) tuples to counts
        """
    
    @property
    def reference_filename(self):
        """Reference FASTA filename if specified."""

AlignmentHeader

Header information for alignment files with methods for accessing and manipulating metadata.

class AlignmentHeader:
    def __init__(self):
        """Create empty alignment header."""
    
    @classmethod
    def from_text(cls, text):
        """
        Create header from SAM header text.
        
        Parameters:
        - text: str, SAM header text
        
        Returns:
        AlignmentHeader object
        """
    
    @classmethod
    def from_dict(cls, header_dict):
        """
        Create header from dictionary.
        
        Parameters:
        - header_dict: dict, header data
        
        Returns:
        AlignmentHeader object
        """
    
    @classmethod
    def from_references(cls, reference_names, reference_lengths, text=None, add_sq_text=False):
        """
        Create header from reference sequences.
        
        Parameters:
        - reference_names: list, sequence names
        - reference_lengths: list, sequence lengths
        - text: str, additional header text
        - add_sq_text: bool, add @SQ lines
        
        Returns:
        AlignmentHeader object
        """
    
    def copy(self):
        """
        Create a copy of the header.
        
        Returns:
        AlignmentHeader object
        """
    
    def to_dict(self):
        """
        Convert header to dictionary.
        
        Returns:
        dict, header data
        """
    
    def get_reference_name(self, tid):
        """
        Get reference name by template ID.
        
        Parameters:
        - tid: int, template ID
        
        Returns:
        str, reference name or None
        """
    
    def get_reference_length(self, reference):
        """
        Get reference sequence length.
        
        Parameters:
        - reference: str, reference name
        
        Returns:
        int, sequence length
        """
    
    def get_tid(self, reference):
        """
        Get template ID for reference.
        
        Parameters:
        - reference: str, reference name
        
        Returns:
        int, template ID
        """
    
    def is_valid_tid(self, tid):
        """
        Check if template ID is valid.
        
        Parameters:
        - tid: int, template ID
        
        Returns:
        bool, True if valid
        """
    
    @property
    def nreferences(self):
        """Number of reference sequences."""
    
    @property
    def references(self):
        """Tuple of reference sequence names."""
    
    @property
    def lengths(self):
        """Tuple of reference sequence lengths."""

IndexStats

Named tuple containing index statistics for a reference sequence.

class IndexStats(NamedTuple):
    contig: str      # Reference sequence name
    mapped: int      # Number of mapped reads
    unmapped: int    # Number of unmapped reads  
    total: int       # Total number of reads

AlignedSegment

Individual aligned read or segment with all alignment information and optional tags.

class AlignedSegment:
    def __init__(self):
        """Create a new aligned segment."""
    
    # Core properties
    @property
    def query_name(self) -> str:
        """Read name/identifier."""
    
    @query_name.setter  
    def query_name(self, value: str):
        """Set read name."""
    
    @property
    def query_sequence(self) -> str:
        """Read sequence."""
    
    @query_sequence.setter
    def query_sequence(self, value: str):
        """Set read sequence."""
    
    @property
    def query_qualities(self) -> list:
        """Base quality scores as list of integers."""
    
    @query_qualities.setter
    def query_qualities(self, value: list):
        """Set base quality scores."""
    
    @property
    def flag(self) -> int:
        """SAM flag as integer."""
    
    @flag.setter
    def flag(self, value: int):
        """Set SAM flag."""
    
    @property
    def reference_id(self) -> int:
        """Reference sequence ID (-1 if unmapped)."""
    
    @reference_id.setter
    def reference_id(self, value: int):
        """Set reference sequence ID."""
    
    @property
    def reference_start(self) -> int:
        """0-based start position on reference."""
    
    @reference_start.setter
    def reference_start(self, value: int):
        """Set reference start position."""
    
    @property
    def reference_end(self) -> int:
        """0-based end position on reference."""
    
    @property
    def next_reference_id(self) -> int:
        """Reference ID of mate."""
    
    @next_reference_id.setter
    def next_reference_id(self, value: int):
        """Set mate reference ID."""
    
    @property
    def next_reference_start(self) -> int:
        """Start position of mate."""
    
    @next_reference_start.setter  
    def next_reference_start(self, value: int):
        """Set mate start position."""
    
    @property
    def template_length(self) -> int:
        """Template length (insert size)."""
    
    @template_length.setter
    def template_length(self, value: int):
        """Set template length."""
    
    @property
    def mapping_quality(self) -> int:
        """Mapping quality score."""
    
    @mapping_quality.setter
    def mapping_quality(self, value: int):
        """Set mapping quality."""
    
    @property
    def cigar(self) -> tuple:
        """CIGAR alignment as tuple of (operation, length) pairs."""
    
    @cigar.setter
    def cigar(self, value: tuple):
        """Set CIGAR alignment."""
    
    @property
    def cigarstring(self) -> str:
        """CIGAR as string representation."""
    
    @cigarstring.setter
    def cigarstring(self, value: str):
        """Set CIGAR from string."""
    
    # Flag properties
    @property
    def is_paired(self) -> bool:
        """True if read is paired."""
    
    @property
    def is_proper_pair(self) -> bool:
        """True if read is in proper pair."""
    
    @property
    def is_unmapped(self) -> bool:
        """True if read is unmapped."""
    
    @property
    def mate_is_unmapped(self) -> bool:
        """True if mate is unmapped."""
    
    @property
    def is_reverse(self) -> bool:
        """True if read is reverse complemented."""
    
    @property
    def mate_is_reverse(self) -> bool:
        """True if mate is reverse complemented."""
    
    @property
    def is_read1(self) -> bool:
        """True if first read in pair."""
    
    @property
    def is_read2(self) -> bool:
        """True if second read in pair."""
    
    @property
    def is_secondary(self) -> bool:
        """True if secondary alignment."""
    
    @property
    def is_qcfail(self) -> bool:
        """True if read fails quality control."""
    
    @property
    def is_duplicate(self) -> bool:
        """True if read is duplicate."""
    
    @property
    def is_supplementary(self) -> bool:
        """True if supplementary alignment."""
    
    # Methods
    def get_tag(self, tag, with_value_type=False):
        """
        Get optional alignment tag.
        
        Parameters:
        - tag: str, two-character tag name
        - with_value_type: bool, return (value, type) tuple
        
        Returns:
        Tag value or (value, type) tuple
        """
    
    def set_tag(self, tag, value, value_type=None, replace=True):
        """
        Set optional alignment tag.
        
        Parameters:
        - tag: str, two-character tag name
        - value: tag value
        - value_type: str, value type ('A', 'i', 'f', 'Z', 'H', 'B')
        - replace: bool, replace existing tag
        """
    
    def has_tag(self, tag):
        """
        Check if tag exists.
        
        Returns:
        bool, True if tag exists
        """
    
    def get_tags(self, with_value_type=False):
        """
        Get all tags.
        
        Returns:
        List of (tag, value) or (tag, value, type) tuples
        """
    
    def set_tags(self, tags):
        """
        Set multiple tags.
        
        Parameters:
        - tags: list of (tag, value) or (tag, value, type) tuples
        """
    
    def get_aligned_pairs(self, matches_only=False, with_seq=False):
        """
        Get alignment between query and reference.
        
        Parameters:
        - matches_only: bool, only return matching positions
        - with_seq: bool, include sequence characters
        
        Returns:
        List of (query_pos, ref_pos) or (query_pos, ref_pos, ref_base) tuples
        """
    
    def get_reference_positions(self, full_length=False):
        """
        Get reference positions corresponding to query.
        
        Returns:
        List of reference positions
        """
    
    def to_string(self):
        """
        Convert to SAM format string.
        
        Returns:
        str, SAM format representation
        """
    
    def compare(self, other):
        """
        Compare two aligned segments.
        
        Returns:
        int, comparison result
        """

AlignmentHeader

Header information for alignment files including sequence dictionary and metadata.

class AlignmentHeader:
    def __init__(self, dictionary=None):
        """
        Create alignment header.
        
        Parameters:
        - dictionary: dict, header information
        """
    
    @property
    def references(self) -> tuple:
        """Reference sequence names."""
    
    @property
    def lengths(self) -> tuple:
        """Reference sequence lengths."""
    
    def to_dict(self):
        """
        Convert header to dictionary.
        
        Returns:
        dict, header information
        """
    
    def as_dict(self):
        """
        Get header as dictionary.
        
        Returns:
        dict, header information
        """
    
    def copy(self):
        """
        Create copy of header.
        
        Returns:
        AlignmentHeader, header copy
        """

PileupColumn

Column of reads covering a genomic position for variant calling and depth analysis.

class PileupColumn:
    @property
    def reference_id(self) -> int:
        """Reference sequence ID."""
    
    @property
    def reference_pos(self) -> int:
        """0-based reference position."""
    
    @property
    def nsegments(self) -> int:
        """Number of reads covering position."""
    
    @property
    def pileups(self) -> list:
        """List of PileupRead objects."""
    
    def get_query_sequences(self, mark_matches=False, mark_ends=False, add_indels=False):
        """
        Get query sequences at position.
        
        Parameters:
        - mark_matches: bool, mark matches with '.'
        - mark_ends: bool, mark read ends with '$'
        - add_indels: bool, include indel information
        
        Returns:
        List of sequence strings
        """
    
    def get_query_qualities(self):
        """
        Get base qualities at position.
        
        Returns:
        List of quality scores
        """
    
    def get_query_names(self):
        """
        Get read names at position.
        
        Returns:
        List of read names
        """

PileupRead

Individual read within a pileup column with position-specific information.

class PileupRead:
    @property
    def alignment(self) -> "AlignedSegment":
        """Underlying aligned segment."""
    
    @property
    def query_position(self) -> int:
        """Position in read sequence (None for indels)."""
    
    @property
    def query_position_or_next(self) -> int:
        """Query position or next position for indels."""
    
    @property
    def is_del(self) -> bool:
        """True if position is deletion."""
    
    @property
    def is_head(self) -> bool:
        """True if start of read."""
    
    @property
    def is_tail(self) -> bool:
        """True if end of read."""
    
    @property
    def is_refskip(self) -> bool:
        """True if reference skip (N in CIGAR)."""
    
    @property
    def level(self) -> int:
        """Level in pileup display."""
    
    @property
    def indel(self) -> int:
        """Length of indel (+ insertion, - deletion)."""

IndexStats

Named tuple containing index statistics for reference sequences.

IndexStats = namedtuple("IndexStats", ["contig", "mapped", "unmapped", "total"])

IndexedReads

Index alignment file by read name for fast lookup by query name.

class IndexedReads:
    def __init__(self, samfile, multiple_iterators=False):
        """
        Create read name index.
        
        Parameters:
        - samfile: AlignmentFile, alignment file to index
        - multiple_iterators: bool, allow concurrent access
        """
    
    def find(self, query_name):
        """
        Find reads by name.
        
        Parameters:
        - query_name: str, read name to find
        
        Returns:
        Iterator of AlignedSegment objects
        """

Usage Examples

Basic File Reading

import pysam

# Read SAM/BAM file
with pysam.AlignmentFile("input.bam", "rb") as bamfile:
    # Iterate over all reads
    for read in bamfile:
        print(f"{read.query_name}: {read.reference_start}-{read.reference_end}")
    
    # Fetch reads in region
    for read in bamfile.fetch("chr1", 1000, 2000):
        if not read.is_unmapped:
            print(f"Mapped read: {read.query_sequence}")

# Read with index
with pysam.AlignmentFile("input.bam", "rb", index_filename="input.bam.bai") as bamfile:
    count = bamfile.count("chr1", 1000, 2000) 
    print(f"Reads in region: {count}")

Pileup Analysis

import pysam

with pysam.AlignmentFile("input.bam", "rb") as bamfile:
    for pileupcolumn in bamfile.pileup("chr1", 1000, 2000):
        print(f"Position {pileupcolumn.reference_pos}: {pileupcolumn.nsegments} reads")
        
        # Get base calls at position
        bases = pileupcolumn.get_query_sequences()
        qualities = pileupcolumn.get_query_qualities()
        
        for base, qual in zip(bases, qualities):
            if qual >= 20:  # Filter by quality
                print(f"  Base: {base}, Quality: {qual}")

Writing Files

import pysam

# Create new BAM file
header = {"HD": {"VN": "1.0", "SO": "coordinate"},
          "SQ": [{"LN": 1575, "SN": "chr1"}, 
                 {"LN": 1584, "SN": "chr2"}]}

with pysam.AlignmentFile("output.bam", "wb", header=header) as outfile:
    # Create aligned segment
    read = pysam.AlignedSegment()
    read.query_name = "read1"
    read.query_sequence = "ACGTACGTACGT"
    read.flag = 0
    read.reference_id = 0  # chr1
    read.reference_start = 100
    read.mapping_quality = 60
    read.cigar = [(0, 12)]  # 12M
    
    outfile.write(read)

Install with Tessl CLI