Package for reading, manipulating, and writing genomic data
—
Comprehensive support for reading and writing sequence alignment files in SAM, BAM, and CRAM formats. Provides random access, indexing, pileup analysis, and full metadata handling.
Main interface for reading and writing alignment files with support for all SAM/BAM/CRAM formats.
class AlignmentFile:
def __init__(self, filepath_or_object, mode=None, template=None, reference_names=None, reference_lengths=None, text=None, header=None, add_sq_text=False, check_header=True, check_sq=True, reference_filename=None, filename=None, index_filename=None, filepath_index=None, require_index=False, duplicate_filehandle=True, ignore_truncation=False, threads=1):
"""
Open an alignment file for reading or writing.
Parameters:
- filepath_or_object: str or file-like, path to file or file object
- mode: str, file mode ('r', 'w', 'rb', 'wb', 'rc', 'wc')
- template: AlignmentFile, template for header when writing
- reference_names: list, sequence names for header
- reference_lengths: list, sequence lengths for header
- text: str, SAM header text
- header: AlignmentHeader, header object
- check_header: bool, validate file header
- check_sq: bool, validate sequence dictionary
- reference_filename: str, reference FASTA file
- index_filename: str, index file path
- require_index: bool, require index file
- threads: int, number of threads for compression
Returns:
AlignmentFile object
"""
def fetch(self, contig=None, start=None, stop=None, region=None, reference=None, end=None, multiple_iterators=False, until_eof=False):
"""
Fetch reads from a region.
Parameters:
- contig: str, chromosome/contig name
- start: int, 0-based start position
- stop: int, 0-based stop position
- region: str, region string (chr:start-stop)
- multiple_iterators: bool, allow concurrent iterators
- until_eof: bool, read until end of file
Returns:
Iterator of AlignedSegment objects
"""
def pileup(self, contig=None, start=None, stop=None, region=None, reference=None, end=None, **kwargs):
"""
Create pileup of reads covering each position.
Parameters:
- contig: str, chromosome/contig name
- start: int, 0-based start position
- stop: int, 0-based stop position
- stepper: str, pileup algorithm ('all', 'nofilter', 'samtools')
- fastafile: FastaFile, reference sequences
- mask: int, ignore reads with these flags
- max_depth: int, maximum read depth
- truncate: bool, truncate to region boundaries
- min_base_quality: int, minimum base quality
Returns:
Iterator of PileupColumn objects
"""
def count(self, contig=None, start=None, stop=None, region=None, reference=None, end=None, read_callback=None, until_eof=False):
"""
Count reads in a region.
Returns:
int, number of reads
"""
def head(self, n):
"""
Return first n reads.
Returns:
Iterator of AlignedSegment objects
"""
def mate(self, read):
"""
Find mate of a paired read.
Returns:
AlignedSegment or None
"""
def write(self, read):
"""
Write an aligned segment.
Parameters:
- read: AlignedSegment, read to write
"""
def close(self):
"""Close the file."""
# Properties
@property
def header(self) -> "AlignmentHeader":
"""File header information."""
@property
def references(self) -> tuple:
"""Reference sequence names."""
@property
def lengths(self) -> tuple:
"""Reference sequence lengths."""
@property
def nreferences(self) -> int:
"""Number of reference sequences."""
@property
def mapped(self) -> int:
"""Number of mapped reads."""
@property
def unmapped(self) -> int:
"""Number of unmapped reads."""
@property
def nocoordinate(self) -> int:
"""Number of reads without coordinates."""
@property
def filename(self) -> str:
"""Filename of the file."""
@property
def is_valid_tid(self) -> bool:
"""Check if template ID is valid."""
def get_index_statistics(self):
"""
Get index statistics for each reference.
Returns:
List of IndexStats namedtuples
"""
def count_coverage(self, contig=None, start=None, stop=None, region=None, reference=None, end=None, quality_threshold=0, read_callback=None):
"""
Count coverage per position across A, C, G, T bases.
Parameters:
- contig: str, chromosome/contig name
- start: int, 0-based start position
- stop: int, 0-based stop position
- region: str, region string (chr:start-stop)
- quality_threshold: int, minimum base quality
- read_callback: callable, filter reads
Returns:
Tuple of 4 arrays (A, C, G, T coverage per position)
"""
def has_index(self):
"""
Check if file has an index.
Returns:
bool, True if index exists
"""
def check_index(self):
"""
Check if existing index is valid.
Returns:
bool, True if index is valid
"""
def find_introns(self, read_iterator):
"""
Find introns from read iterator (optimized version).
Parameters:
- read_iterator: iterator, reads to analyze
Returns:
dict, mapping (start, end) tuples to counts
"""
def find_introns_slow(self, read_iterator):
"""
Find introns from read iterator (slower but more compatible).
Parameters:
- read_iterator: iterator, reads to analyze
Returns:
dict, mapping (start, end) tuples to counts
"""
@property
def reference_filename(self):
"""Reference FASTA filename if specified."""Header information for alignment files with methods for accessing and manipulating metadata.
class AlignmentHeader:
def __init__(self):
"""Create empty alignment header."""
@classmethod
def from_text(cls, text):
"""
Create header from SAM header text.
Parameters:
- text: str, SAM header text
Returns:
AlignmentHeader object
"""
@classmethod
def from_dict(cls, header_dict):
"""
Create header from dictionary.
Parameters:
- header_dict: dict, header data
Returns:
AlignmentHeader object
"""
@classmethod
def from_references(cls, reference_names, reference_lengths, text=None, add_sq_text=False):
"""
Create header from reference sequences.
Parameters:
- reference_names: list, sequence names
- reference_lengths: list, sequence lengths
- text: str, additional header text
- add_sq_text: bool, add @SQ lines
Returns:
AlignmentHeader object
"""
def copy(self):
"""
Create a copy of the header.
Returns:
AlignmentHeader object
"""
def to_dict(self):
"""
Convert header to dictionary.
Returns:
dict, header data
"""
def get_reference_name(self, tid):
"""
Get reference name by template ID.
Parameters:
- tid: int, template ID
Returns:
str, reference name or None
"""
def get_reference_length(self, reference):
"""
Get reference sequence length.
Parameters:
- reference: str, reference name
Returns:
int, sequence length
"""
def get_tid(self, reference):
"""
Get template ID for reference.
Parameters:
- reference: str, reference name
Returns:
int, template ID
"""
def is_valid_tid(self, tid):
"""
Check if template ID is valid.
Parameters:
- tid: int, template ID
Returns:
bool, True if valid
"""
@property
def nreferences(self):
"""Number of reference sequences."""
@property
def references(self):
"""Tuple of reference sequence names."""
@property
def lengths(self):
"""Tuple of reference sequence lengths."""Named tuple containing index statistics for a reference sequence.
class IndexStats(NamedTuple):
contig: str # Reference sequence name
mapped: int # Number of mapped reads
unmapped: int # Number of unmapped reads
total: int # Total number of readsIndividual aligned read or segment with all alignment information and optional tags.
class AlignedSegment:
def __init__(self):
"""Create a new aligned segment."""
# Core properties
@property
def query_name(self) -> str:
"""Read name/identifier."""
@query_name.setter
def query_name(self, value: str):
"""Set read name."""
@property
def query_sequence(self) -> str:
"""Read sequence."""
@query_sequence.setter
def query_sequence(self, value: str):
"""Set read sequence."""
@property
def query_qualities(self) -> list:
"""Base quality scores as list of integers."""
@query_qualities.setter
def query_qualities(self, value: list):
"""Set base quality scores."""
@property
def flag(self) -> int:
"""SAM flag as integer."""
@flag.setter
def flag(self, value: int):
"""Set SAM flag."""
@property
def reference_id(self) -> int:
"""Reference sequence ID (-1 if unmapped)."""
@reference_id.setter
def reference_id(self, value: int):
"""Set reference sequence ID."""
@property
def reference_start(self) -> int:
"""0-based start position on reference."""
@reference_start.setter
def reference_start(self, value: int):
"""Set reference start position."""
@property
def reference_end(self) -> int:
"""0-based end position on reference."""
@property
def next_reference_id(self) -> int:
"""Reference ID of mate."""
@next_reference_id.setter
def next_reference_id(self, value: int):
"""Set mate reference ID."""
@property
def next_reference_start(self) -> int:
"""Start position of mate."""
@next_reference_start.setter
def next_reference_start(self, value: int):
"""Set mate start position."""
@property
def template_length(self) -> int:
"""Template length (insert size)."""
@template_length.setter
def template_length(self, value: int):
"""Set template length."""
@property
def mapping_quality(self) -> int:
"""Mapping quality score."""
@mapping_quality.setter
def mapping_quality(self, value: int):
"""Set mapping quality."""
@property
def cigar(self) -> tuple:
"""CIGAR alignment as tuple of (operation, length) pairs."""
@cigar.setter
def cigar(self, value: tuple):
"""Set CIGAR alignment."""
@property
def cigarstring(self) -> str:
"""CIGAR as string representation."""
@cigarstring.setter
def cigarstring(self, value: str):
"""Set CIGAR from string."""
# Flag properties
@property
def is_paired(self) -> bool:
"""True if read is paired."""
@property
def is_proper_pair(self) -> bool:
"""True if read is in proper pair."""
@property
def is_unmapped(self) -> bool:
"""True if read is unmapped."""
@property
def mate_is_unmapped(self) -> bool:
"""True if mate is unmapped."""
@property
def is_reverse(self) -> bool:
"""True if read is reverse complemented."""
@property
def mate_is_reverse(self) -> bool:
"""True if mate is reverse complemented."""
@property
def is_read1(self) -> bool:
"""True if first read in pair."""
@property
def is_read2(self) -> bool:
"""True if second read in pair."""
@property
def is_secondary(self) -> bool:
"""True if secondary alignment."""
@property
def is_qcfail(self) -> bool:
"""True if read fails quality control."""
@property
def is_duplicate(self) -> bool:
"""True if read is duplicate."""
@property
def is_supplementary(self) -> bool:
"""True if supplementary alignment."""
# Methods
def get_tag(self, tag, with_value_type=False):
"""
Get optional alignment tag.
Parameters:
- tag: str, two-character tag name
- with_value_type: bool, return (value, type) tuple
Returns:
Tag value or (value, type) tuple
"""
def set_tag(self, tag, value, value_type=None, replace=True):
"""
Set optional alignment tag.
Parameters:
- tag: str, two-character tag name
- value: tag value
- value_type: str, value type ('A', 'i', 'f', 'Z', 'H', 'B')
- replace: bool, replace existing tag
"""
def has_tag(self, tag):
"""
Check if tag exists.
Returns:
bool, True if tag exists
"""
def get_tags(self, with_value_type=False):
"""
Get all tags.
Returns:
List of (tag, value) or (tag, value, type) tuples
"""
def set_tags(self, tags):
"""
Set multiple tags.
Parameters:
- tags: list of (tag, value) or (tag, value, type) tuples
"""
def get_aligned_pairs(self, matches_only=False, with_seq=False):
"""
Get alignment between query and reference.
Parameters:
- matches_only: bool, only return matching positions
- with_seq: bool, include sequence characters
Returns:
List of (query_pos, ref_pos) or (query_pos, ref_pos, ref_base) tuples
"""
def get_reference_positions(self, full_length=False):
"""
Get reference positions corresponding to query.
Returns:
List of reference positions
"""
def to_string(self):
"""
Convert to SAM format string.
Returns:
str, SAM format representation
"""
def compare(self, other):
"""
Compare two aligned segments.
Returns:
int, comparison result
"""Header information for alignment files including sequence dictionary and metadata.
class AlignmentHeader:
def __init__(self, dictionary=None):
"""
Create alignment header.
Parameters:
- dictionary: dict, header information
"""
@property
def references(self) -> tuple:
"""Reference sequence names."""
@property
def lengths(self) -> tuple:
"""Reference sequence lengths."""
def to_dict(self):
"""
Convert header to dictionary.
Returns:
dict, header information
"""
def as_dict(self):
"""
Get header as dictionary.
Returns:
dict, header information
"""
def copy(self):
"""
Create copy of header.
Returns:
AlignmentHeader, header copy
"""Column of reads covering a genomic position for variant calling and depth analysis.
class PileupColumn:
@property
def reference_id(self) -> int:
"""Reference sequence ID."""
@property
def reference_pos(self) -> int:
"""0-based reference position."""
@property
def nsegments(self) -> int:
"""Number of reads covering position."""
@property
def pileups(self) -> list:
"""List of PileupRead objects."""
def get_query_sequences(self, mark_matches=False, mark_ends=False, add_indels=False):
"""
Get query sequences at position.
Parameters:
- mark_matches: bool, mark matches with '.'
- mark_ends: bool, mark read ends with '$'
- add_indels: bool, include indel information
Returns:
List of sequence strings
"""
def get_query_qualities(self):
"""
Get base qualities at position.
Returns:
List of quality scores
"""
def get_query_names(self):
"""
Get read names at position.
Returns:
List of read names
"""Individual read within a pileup column with position-specific information.
class PileupRead:
@property
def alignment(self) -> "AlignedSegment":
"""Underlying aligned segment."""
@property
def query_position(self) -> int:
"""Position in read sequence (None for indels)."""
@property
def query_position_or_next(self) -> int:
"""Query position or next position for indels."""
@property
def is_del(self) -> bool:
"""True if position is deletion."""
@property
def is_head(self) -> bool:
"""True if start of read."""
@property
def is_tail(self) -> bool:
"""True if end of read."""
@property
def is_refskip(self) -> bool:
"""True if reference skip (N in CIGAR)."""
@property
def level(self) -> int:
"""Level in pileup display."""
@property
def indel(self) -> int:
"""Length of indel (+ insertion, - deletion)."""Named tuple containing index statistics for reference sequences.
IndexStats = namedtuple("IndexStats", ["contig", "mapped", "unmapped", "total"])Index alignment file by read name for fast lookup by query name.
class IndexedReads:
def __init__(self, samfile, multiple_iterators=False):
"""
Create read name index.
Parameters:
- samfile: AlignmentFile, alignment file to index
- multiple_iterators: bool, allow concurrent access
"""
def find(self, query_name):
"""
Find reads by name.
Parameters:
- query_name: str, read name to find
Returns:
Iterator of AlignedSegment objects
"""import pysam
# Read SAM/BAM file
with pysam.AlignmentFile("input.bam", "rb") as bamfile:
# Iterate over all reads
for read in bamfile:
print(f"{read.query_name}: {read.reference_start}-{read.reference_end}")
# Fetch reads in region
for read in bamfile.fetch("chr1", 1000, 2000):
if not read.is_unmapped:
print(f"Mapped read: {read.query_sequence}")
# Read with index
with pysam.AlignmentFile("input.bam", "rb", index_filename="input.bam.bai") as bamfile:
count = bamfile.count("chr1", 1000, 2000)
print(f"Reads in region: {count}")import pysam
with pysam.AlignmentFile("input.bam", "rb") as bamfile:
for pileupcolumn in bamfile.pileup("chr1", 1000, 2000):
print(f"Position {pileupcolumn.reference_pos}: {pileupcolumn.nsegments} reads")
# Get base calls at position
bases = pileupcolumn.get_query_sequences()
qualities = pileupcolumn.get_query_qualities()
for base, qual in zip(bases, qualities):
if qual >= 20: # Filter by quality
print(f" Base: {base}, Quality: {qual}")import pysam
# Create new BAM file
header = {"HD": {"VN": "1.0", "SO": "coordinate"},
"SQ": [{"LN": 1575, "SN": "chr1"},
{"LN": 1584, "SN": "chr2"}]}
with pysam.AlignmentFile("output.bam", "wb", header=header) as outfile:
# Create aligned segment
read = pysam.AlignedSegment()
read.query_name = "read1"
read.query_sequence = "ACGTACGTACGT"
read.flag = 0
read.reference_id = 0 # chr1
read.reference_start = 100
read.mapping_quality = 60
read.cigar = [(0, 12)] # 12M
outfile.write(read)Install with Tessl CLI
npx tessl i tessl/pypi-pysam