CtrlK
BlogDocsLog inGet started
Tessl Logo

tessl/pypi-pysam

Package for reading, manipulating, and writing genomic data

Pending
Overview
Eval results
Files

sequence-files.mddocs/

FASTA/FASTQ Sequence Files

Support for reading FASTA and FASTQ sequence files with both random access (indexed FASTA) and streaming capabilities. Handles sequence extraction, quality scores, and efficient iteration over large files.

Capabilities

FastaFile

Random access to indexed FASTA files using faidx index for efficient sequence retrieval.

class FastaFile:
    def __init__(self, filename, filepath_index=None, one_based_attributes=True, as_raw=False, reference_filename=None, duplicate_filehandle=True):
        """
        Open indexed FASTA file for random access.
        
        Parameters:
        - filename: str, path to FASTA file
        - filepath_index: str, path to .fai index file
        - one_based_attributes: bool, use 1-based coordinates for attributes
        - as_raw: bool, return sequences as bytes
        - reference_filename: str, alternative name for file
        - duplicate_filehandle: bool, allow multiple handles
        
        Returns:
        FastaFile object
        """
    
    def fetch(self, reference, start=None, end=None, region=None):
        """
        Fetch sequence from region.
        
        Parameters:
        - reference: str, sequence name
        - start: int, start position (0-based or 1-based)
        - end: int, end position (0-based or 1-based)
        - region: str, region string (chr:start-end)
        
        Returns:
        str, sequence string
        """
    
    def get_reference_length(self, reference):
        """
        Get length of reference sequence.
        
        Parameters:
        - reference: str, sequence name
        
        Returns:
        int, sequence length
        """
    
    def close(self):
        """Close the file."""
    
    # Properties
    @property
    def filename(self) -> str:
        """FASTA filename."""
    
    @property
    def references(self) -> tuple:
        """Reference sequence names."""
    
    @property
    def nreferences(self) -> int:
        """Number of reference sequences."""
    
    @property
    def lengths(self) -> tuple:
        """Reference sequence lengths."""
    
    @property
    def is_open(self) -> bool:
        """True if file is open."""
    
    def __contains__(self, reference):
        """Check if reference exists."""
    
    def __len__(self):
        """Number of references."""
    
    def keys(self):
        """
        Get reference names.
        
        Returns:
        Iterator of reference names
        """

FastxFile

Streaming access to FASTA and FASTQ files for sequential reading without indexing.

class FastxFile:
    def __init__(self, filename, mode="r", persist=True):
        """
        Open FASTA/FASTQ file for streaming.
        
        Parameters:
        - filename: str, path to FASTA/FASTQ file
        - mode: str, file mode ('r' for reading)
        - persist: bool, keep file handle open between reads
        
        Returns:
        FastxFile object
        """
    
    def __iter__(self):
        """
        Iterate over sequences.
        
        Returns:
        Iterator of FastxRecord objects
        """
    
    def __enter__(self):
        """Context manager entry."""
    
    def __exit__(self, exc_type, exc_val, exc_tb):
        """Context manager exit."""
    
    def close(self):
        """Close the file."""
    
    @property
    def filename(self) -> str:
        """File name."""
    
    @property
    def is_open(self) -> bool:
        """True if file is open."""

FastxRecord

Individual sequence record from FASTA or FASTQ file with sequence and optional quality data.

class FastxRecord:
    def __init__(self, name=None, sequence=None, comment=None, quality=None):
        """
        Create sequence record.
        
        Parameters:
        - name: str, sequence name/identifier
        - sequence: str, sequence data
        - comment: str, comment line
        - quality: str, quality scores (FASTQ only)
        """
    
    @property
    def name(self) -> str:
        """Sequence name/identifier."""
    
    @name.setter
    def name(self, value: str):
        """Set sequence name."""
    
    @property
    def sequence(self) -> str:
        """Sequence data."""
    
    @sequence.setter
    def sequence(self, value: str):
        """Set sequence data."""
    
    @property
    def comment(self) -> str:
        """Comment/description."""
    
    @comment.setter
    def comment(self, value: str):
        """Set comment."""
    
    @property
    def quality(self) -> str:
        """Quality string (FASTQ only)."""
    
    @quality.setter
    def quality(self, value: str):
        """Set quality string."""
    
    def __len__(self):
        """Sequence length."""
    
    def __str__(self):
        """String representation."""
    
    def get_quality_array(self, offset=33):
        """
        Get quality scores as array.
        
        Parameters:
        - offset: int, quality score offset (33 for Sanger, 64 for Illumina 1.3+)
        
        Returns:
        list, quality scores as integers
        """

FastqProxy (Legacy)

Fast read-only access to FASTQ entries for compatibility with older versions.

class FastqProxy:
    @property
    def name(self) -> str:
        """Sequence name."""
    
    @property
    def sequence(self) -> str:
        """Sequence data."""
    
    @property
    def comment(self) -> str:
        """Comment line."""
    
    @property
    def quality(self) -> str:
        """Quality string."""
    
    def get_quality_array(self, offset=33):
        """
        Get quality scores as array.
        
        Returns:
        list, quality scores
        """

Legacy Classes

Compatibility classes for older pysam versions.

class Fastafile:
    """Legacy alias for FastaFile."""
    def __init__(self, *args, **kwargs):
        """Same as FastaFile.__init__."""

class FastqFile:
    """Legacy alias for FastxFile.""" 
    def __init__(self, *args, **kwargs):
        """Same as FastxFile.__init__."""

Usage Examples

Reading Indexed FASTA Files

import pysam

# Random access to indexed FASTA
with pysam.FastaFile("reference.fa") as fastafile:
    # Get full sequence
    sequence = fastafile.fetch("chr1")
    print(f"chr1 length: {len(sequence)}")
    
    # Get subsequence
    region_seq = fastafile.fetch("chr1", 1000, 2000)
    print(f"Region sequence: {region_seq}")
    
    # Check available references
    for ref in fastafile.references:
        length = fastafile.get_reference_length(ref)
        print(f"{ref}: {length} bp")
    
    # Use region string
    seq = fastafile.fetch(region="chr1:1000-2000")
    print(f"Fetched: {seq}")

Streaming FASTA/FASTQ Files

import pysam

# Read FASTA file sequentially
with pysam.FastxFile("sequences.fa") as fastafile:
    for record in fastafile:
        print(f">{record.name}")
        print(f"Length: {len(record.sequence)}")
        print(f"Sequence: {record.sequence[:50]}...")
        if record.comment:
            print(f"Comment: {record.comment}")

# Read FASTQ file with quality scores
with pysam.FastxFile("reads.fastq") as fastqfile:
    for record in fastqfile:
        print(f"@{record.name}")
        print(f"Sequence: {record.sequence}")
        print(f"Quality: {record.quality}")
        
        # Convert quality to numeric scores
        qual_scores = record.get_quality_array()
        avg_qual = sum(qual_scores) / len(qual_scores)
        print(f"Average quality: {avg_qual:.2f}")

Processing and Filtering

import pysam

# Filter sequences by length and quality
def filter_reads(input_file, output_file, min_length=50, min_avg_qual=20):
    with pysam.FastxFile(input_file) as infile:
        with open(output_file, 'w') as outfile:
            for record in infile:
                # Check length
                if len(record.sequence) < min_length:
                    continue
                
                # Check average quality (FASTQ only)
                if record.quality:
                    qual_scores = record.get_quality_array()
                    avg_qual = sum(qual_scores) / len(qual_scores)
                    if avg_qual < min_avg_qual:
                        continue
                
                # Write filtered record
                if record.quality:  # FASTQ format
                    outfile.write(f"@{record.name}\n")
                    outfile.write(f"{record.sequence}\n")
                    outfile.write("+\n")
                    outfile.write(f"{record.quality}\n")
                else:  # FASTA format
                    outfile.write(f">{record.name}\n")
                    outfile.write(f"{record.sequence}\n")

# Usage
filter_reads("input.fastq", "filtered.fastq", min_length=100, min_avg_qual=25)

Sequence Analysis

import pysam
from collections import Counter

# Analyze sequence composition
def analyze_composition(fasta_file):
    composition = Counter()
    total_length = 0
    
    with pysam.FastxFile(fasta_file) as fastafile:
        for record in fastafile:
            sequence = record.sequence.upper()
            composition.update(sequence)
            total_length += len(sequence)
    
    print(f"Total length: {total_length:,} bp")
    print("Base composition:")
    for base in ['A', 'T', 'G', 'C', 'N']:
        count = composition[base]
        percent = (count / total_length) * 100
        print(f"  {base}: {count:,} ({percent:.2f}%)")
    
    gc_content = (composition['G'] + composition['C']) / total_length * 100
    print(f"GC content: {gc_content:.2f}%")

# Extract sequences by region file
def extract_regions(fasta_file, regions_file, output_file):
    """Extract sequences from regions specified in BED-like format."""
    with pysam.FastaFile(fasta_file) as fastafile:
        with open(output_file, 'w') as outfile:
            with open(regions_file) as regions:
                for line in regions:
                    if line.startswith('#'):
                        continue
                    
                    fields = line.strip().split('\t')
                    chrom, start, end = fields[0], int(fields[1]), int(fields[2])
                    name = f"{chrom}:{start}-{end}"
                    
                    if len(fields) > 3:
                        name = fields[3]
                    
                    # Extract sequence
                    sequence = fastafile.fetch(chrom, start, end)
                    
                    # Write FASTA record
                    outfile.write(f">{name}\n")
                    outfile.write(f"{sequence}\n")

# Usage
analyze_composition("genome.fa")
extract_regions("genome.fa", "regions.bed", "extracted_sequences.fa")

Quality Score Conversion

import pysam

def convert_quality_encoding(input_fastq, output_fastq, input_offset=64, output_offset=33):
    """Convert FASTQ quality encoding (e.g., Illumina 1.3+ to Sanger)."""
    with pysam.FastxFile(input_fastq) as infile:
        with open(output_fastq, 'w') as outfile:
            for record in infile:
                # Convert quality scores
                qual_scores = record.get_quality_array(offset=input_offset)
                
                # Convert to new encoding
                new_quality = ''.join(chr(q + output_offset) for q in qual_scores)
                
                # Write record with new quality
                outfile.write(f"@{record.name}\n")
                outfile.write(f"{record.sequence}\n")
                outfile.write("+\n")
                outfile.write(f"{new_quality}\n")

# Convert Illumina 1.3+ to Sanger encoding
convert_quality_encoding("illumina_reads.fastq", "sanger_reads.fastq", 
                        input_offset=64, output_offset=33)

Install with Tessl CLI

npx tessl i tessl/pypi-pysam

docs

alignment-files.md

bgzf-files.md

command-tools.md

index.md

sequence-files.md

tabix-files.md

utilities.md

variant-files.md

tile.json