Package for reading, manipulating, and writing genomic data
—
Support for reading FASTA and FASTQ sequence files with both random access (indexed FASTA) and streaming capabilities. Handles sequence extraction, quality scores, and efficient iteration over large files.
Random access to indexed FASTA files using faidx index for efficient sequence retrieval.
class FastaFile:
def __init__(self, filename, filepath_index=None, one_based_attributes=True, as_raw=False, reference_filename=None, duplicate_filehandle=True):
"""
Open indexed FASTA file for random access.
Parameters:
- filename: str, path to FASTA file
- filepath_index: str, path to .fai index file
- one_based_attributes: bool, use 1-based coordinates for attributes
- as_raw: bool, return sequences as bytes
- reference_filename: str, alternative name for file
- duplicate_filehandle: bool, allow multiple handles
Returns:
FastaFile object
"""
def fetch(self, reference, start=None, end=None, region=None):
"""
Fetch sequence from region.
Parameters:
- reference: str, sequence name
- start: int, start position (0-based or 1-based)
- end: int, end position (0-based or 1-based)
- region: str, region string (chr:start-end)
Returns:
str, sequence string
"""
def get_reference_length(self, reference):
"""
Get length of reference sequence.
Parameters:
- reference: str, sequence name
Returns:
int, sequence length
"""
def close(self):
"""Close the file."""
# Properties
@property
def filename(self) -> str:
"""FASTA filename."""
@property
def references(self) -> tuple:
"""Reference sequence names."""
@property
def nreferences(self) -> int:
"""Number of reference sequences."""
@property
def lengths(self) -> tuple:
"""Reference sequence lengths."""
@property
def is_open(self) -> bool:
"""True if file is open."""
def __contains__(self, reference):
"""Check if reference exists."""
def __len__(self):
"""Number of references."""
def keys(self):
"""
Get reference names.
Returns:
Iterator of reference names
"""Streaming access to FASTA and FASTQ files for sequential reading without indexing.
class FastxFile:
def __init__(self, filename, mode="r", persist=True):
"""
Open FASTA/FASTQ file for streaming.
Parameters:
- filename: str, path to FASTA/FASTQ file
- mode: str, file mode ('r' for reading)
- persist: bool, keep file handle open between reads
Returns:
FastxFile object
"""
def __iter__(self):
"""
Iterate over sequences.
Returns:
Iterator of FastxRecord objects
"""
def __enter__(self):
"""Context manager entry."""
def __exit__(self, exc_type, exc_val, exc_tb):
"""Context manager exit."""
def close(self):
"""Close the file."""
@property
def filename(self) -> str:
"""File name."""
@property
def is_open(self) -> bool:
"""True if file is open."""Individual sequence record from FASTA or FASTQ file with sequence and optional quality data.
class FastxRecord:
def __init__(self, name=None, sequence=None, comment=None, quality=None):
"""
Create sequence record.
Parameters:
- name: str, sequence name/identifier
- sequence: str, sequence data
- comment: str, comment line
- quality: str, quality scores (FASTQ only)
"""
@property
def name(self) -> str:
"""Sequence name/identifier."""
@name.setter
def name(self, value: str):
"""Set sequence name."""
@property
def sequence(self) -> str:
"""Sequence data."""
@sequence.setter
def sequence(self, value: str):
"""Set sequence data."""
@property
def comment(self) -> str:
"""Comment/description."""
@comment.setter
def comment(self, value: str):
"""Set comment."""
@property
def quality(self) -> str:
"""Quality string (FASTQ only)."""
@quality.setter
def quality(self, value: str):
"""Set quality string."""
def __len__(self):
"""Sequence length."""
def __str__(self):
"""String representation."""
def get_quality_array(self, offset=33):
"""
Get quality scores as array.
Parameters:
- offset: int, quality score offset (33 for Sanger, 64 for Illumina 1.3+)
Returns:
list, quality scores as integers
"""Fast read-only access to FASTQ entries for compatibility with older versions.
class FastqProxy:
@property
def name(self) -> str:
"""Sequence name."""
@property
def sequence(self) -> str:
"""Sequence data."""
@property
def comment(self) -> str:
"""Comment line."""
@property
def quality(self) -> str:
"""Quality string."""
def get_quality_array(self, offset=33):
"""
Get quality scores as array.
Returns:
list, quality scores
"""Compatibility classes for older pysam versions.
class Fastafile:
"""Legacy alias for FastaFile."""
def __init__(self, *args, **kwargs):
"""Same as FastaFile.__init__."""
class FastqFile:
"""Legacy alias for FastxFile."""
def __init__(self, *args, **kwargs):
"""Same as FastxFile.__init__."""import pysam
# Random access to indexed FASTA
with pysam.FastaFile("reference.fa") as fastafile:
# Get full sequence
sequence = fastafile.fetch("chr1")
print(f"chr1 length: {len(sequence)}")
# Get subsequence
region_seq = fastafile.fetch("chr1", 1000, 2000)
print(f"Region sequence: {region_seq}")
# Check available references
for ref in fastafile.references:
length = fastafile.get_reference_length(ref)
print(f"{ref}: {length} bp")
# Use region string
seq = fastafile.fetch(region="chr1:1000-2000")
print(f"Fetched: {seq}")import pysam
# Read FASTA file sequentially
with pysam.FastxFile("sequences.fa") as fastafile:
for record in fastafile:
print(f">{record.name}")
print(f"Length: {len(record.sequence)}")
print(f"Sequence: {record.sequence[:50]}...")
if record.comment:
print(f"Comment: {record.comment}")
# Read FASTQ file with quality scores
with pysam.FastxFile("reads.fastq") as fastqfile:
for record in fastqfile:
print(f"@{record.name}")
print(f"Sequence: {record.sequence}")
print(f"Quality: {record.quality}")
# Convert quality to numeric scores
qual_scores = record.get_quality_array()
avg_qual = sum(qual_scores) / len(qual_scores)
print(f"Average quality: {avg_qual:.2f}")import pysam
# Filter sequences by length and quality
def filter_reads(input_file, output_file, min_length=50, min_avg_qual=20):
with pysam.FastxFile(input_file) as infile:
with open(output_file, 'w') as outfile:
for record in infile:
# Check length
if len(record.sequence) < min_length:
continue
# Check average quality (FASTQ only)
if record.quality:
qual_scores = record.get_quality_array()
avg_qual = sum(qual_scores) / len(qual_scores)
if avg_qual < min_avg_qual:
continue
# Write filtered record
if record.quality: # FASTQ format
outfile.write(f"@{record.name}\n")
outfile.write(f"{record.sequence}\n")
outfile.write("+\n")
outfile.write(f"{record.quality}\n")
else: # FASTA format
outfile.write(f">{record.name}\n")
outfile.write(f"{record.sequence}\n")
# Usage
filter_reads("input.fastq", "filtered.fastq", min_length=100, min_avg_qual=25)import pysam
from collections import Counter
# Analyze sequence composition
def analyze_composition(fasta_file):
composition = Counter()
total_length = 0
with pysam.FastxFile(fasta_file) as fastafile:
for record in fastafile:
sequence = record.sequence.upper()
composition.update(sequence)
total_length += len(sequence)
print(f"Total length: {total_length:,} bp")
print("Base composition:")
for base in ['A', 'T', 'G', 'C', 'N']:
count = composition[base]
percent = (count / total_length) * 100
print(f" {base}: {count:,} ({percent:.2f}%)")
gc_content = (composition['G'] + composition['C']) / total_length * 100
print(f"GC content: {gc_content:.2f}%")
# Extract sequences by region file
def extract_regions(fasta_file, regions_file, output_file):
"""Extract sequences from regions specified in BED-like format."""
with pysam.FastaFile(fasta_file) as fastafile:
with open(output_file, 'w') as outfile:
with open(regions_file) as regions:
for line in regions:
if line.startswith('#'):
continue
fields = line.strip().split('\t')
chrom, start, end = fields[0], int(fields[1]), int(fields[2])
name = f"{chrom}:{start}-{end}"
if len(fields) > 3:
name = fields[3]
# Extract sequence
sequence = fastafile.fetch(chrom, start, end)
# Write FASTA record
outfile.write(f">{name}\n")
outfile.write(f"{sequence}\n")
# Usage
analyze_composition("genome.fa")
extract_regions("genome.fa", "regions.bed", "extracted_sequences.fa")import pysam
def convert_quality_encoding(input_fastq, output_fastq, input_offset=64, output_offset=33):
"""Convert FASTQ quality encoding (e.g., Illumina 1.3+ to Sanger)."""
with pysam.FastxFile(input_fastq) as infile:
with open(output_fastq, 'w') as outfile:
for record in infile:
# Convert quality scores
qual_scores = record.get_quality_array(offset=input_offset)
# Convert to new encoding
new_quality = ''.join(chr(q + output_offset) for q in qual_scores)
# Write record with new quality
outfile.write(f"@{record.name}\n")
outfile.write(f"{record.sequence}\n")
outfile.write("+\n")
outfile.write(f"{new_quality}\n")
# Convert Illumina 1.3+ to Sanger encoding
convert_quality_encoding("illumina_reads.fastq", "sanger_reads.fastq",
input_offset=64, output_offset=33)Install with Tessl CLI
npx tessl i tessl/pypi-pysam