tessl/pypi-pysam

Package for reading, manipulating, and writing genomic data

—

Pending

Overview

Eval results

Files

Tabix-Indexed Files

Name: tessl/pypi-pysam
Author: tessl

Support for compressed, indexed genomic files using tabix indexing. Enables efficient random access to large compressed files in various formats including BED, GFF, GTF, and VCF.

Capabilities

TabixFile

Main interface for accessing tabix-indexed compressed files with format-specific parsers.

class TabixFile:
    def __init__(self, filename, parser=None, encoding="ascii", duplicate_filehandle=True):
        """
        Open tabix-indexed file.
        
        Parameters:
        - filename: str, path to tabix-indexed file (.gz)
        - parser: class, parser for file format (asTuple, asBed, asGTF, etc.)
        - encoding: str, text encoding
        - duplicate_filehandle: bool, allow multiple handles
        
        Returns:
        TabixFile object
        """
    
    def fetch(self, reference=None, start=None, end=None, region=None, parser=None, multiple_iterators=False):
        """
        Fetch records from region.
        
        Parameters:
        - reference: str, chromosome/contig name
        - start: int, start position (0-based)
        - end: int, end position (0-based)
        - region: str, region string (chr:start-end)
        - parser: class, override default parser
        - multiple_iterators: bool, allow concurrent iterators
        
        Returns:
        Iterator of parsed records
        """
    
    def close(self):
        """Close the file."""
    
    # Properties
    @property
    def filename(self) -> str:
        """File name."""
    
    @property
    def header(self) -> list:
        """Header lines."""
    
    @property
    def contigs(self) -> list:
        """Available contigs/chromosomes."""
    
    def __enter__(self):
        """Context manager entry."""
    
    def __exit__(self, exc_type, exc_val, exc_tb):
        """Context manager exit."""

Parser Classes

Format-specific parsers for different genomic file types.

class asTuple:
    """Parse records as tuples of strings."""
    def __call__(self, line):
        """
        Parse line to tuple.
        
        Returns:
        tuple, fields as strings
        """

class asBed:
    """Parse BED format records."""
    def __call__(self, line):
        """
        Parse BED line.
        
        Returns:
        BedProxy object
        """

class asGTF:
    """Parse GTF format records."""
    def __call__(self, line):
        """
        Parse GTF line.
        
        Returns:
        GTFProxy object
        """

class asGFF3:
    """Parse GFF3 format records."""
    def __call__(self, line):
        """
        Parse GFF3 line.
        
        Returns:
        GFF3Proxy object
        """

class asVCF:
    """Parse VCF format records."""
    def __call__(self, line):
        """
        Parse VCF line.
        
        Returns:
        VCFProxy object
        """

Proxy Classes

Efficient access to parsed record fields without copying data.

class TupleProxy:
    """Fast read-only access to parsed records as tuples."""
    
    def __getitem__(self, index):
        """
        Get field by index.
        
        Parameters:
        - index: int, field index
        
        Returns:
        str, field value
        """
    
    def __len__(self):
        """Number of fields."""
    
    def __str__(self):
        """String representation."""

class BedProxy(TupleProxy):
    """BED format record with named field access."""
    
    @property
    def contig(self) -> str:
        """Chromosome name."""
    
    @property
    def start(self) -> int:
        """Start position (0-based)."""
    
    @property
    def end(self) -> int:
        """End position (0-based)."""
    
    @property
    def name(self) -> str:
        """Feature name."""
    
    @property
    def score(self) -> float:
        """Score value."""
    
    @property
    def strand(self) -> str:
        """Strand ('+', '-', or '.')."""
    
    @property
    def thickstart(self) -> int:
        """Thick start position."""
    
    @property
    def thickend(self) -> int:
        """Thick end position."""
    
    @property
    def itemrgb(self) -> str:
        """RGB color value."""
    
    @property
    def blockcount(self) -> int:
        """Number of blocks."""
    
    @property
    def blocksizes(self) -> list:
        """Block sizes."""
    
    @property
    def blockstarts(self) -> list:
        """Block start positions."""

class GTFProxy(TupleProxy):
    """GTF format record with named field access."""
    
    @property
    def contig(self) -> str:
        """Chromosome name."""
    
    @property
    def source(self) -> str:
        """Annotation source."""
    
    @property
    def feature(self) -> str:
        """Feature type."""
    
    @property
    def start(self) -> int:
        """Start position (1-based)."""
    
    @property
    def end(self) -> int:
        """End position (1-based)."""
    
    @property
    def score(self) -> str:
        """Score value."""
    
    @property
    def strand(self) -> str:
        """Strand ('+', '-', or '.')."""
    
    @property
    def frame(self) -> str:
        """Reading frame."""
    
    @property
    def attributes(self) -> str:
        """Attributes string."""
    
    def asDict(self):
        """
        Parse attributes as dictionary.
        
        Returns:
        dict, attribute key-value pairs
        """

class GFF3Proxy(TupleProxy):
    """GFF3 format record with named field access."""
    
    @property
    def contig(self) -> str:
        """Chromosome name."""
    
    @property
    def source(self) -> str:
        """Annotation source."""
    
    @property
    def feature(self) -> str:
        """Feature type."""
    
    @property
    def start(self) -> int:
        """Start position (1-based)."""
    
    @property
    def end(self) -> int:
        """End position (1-based)."""
    
    @property
    def score(self) -> str:
        """Score value."""
    
    @property
    def strand(self) -> str:
        """Strand ('+', '-', or '.')."""
    
    @property
    def frame(self) -> str:
        """Reading frame."""
    
    @property
    def attributes(self) -> str:
        """Attributes string."""
    
    def asDict(self):
        """
        Parse attributes as dictionary.
        
        Returns:
        dict, attribute key-value pairs
        """

class VCFProxy(TupleProxy):
    """VCF format record with named field access."""
    
    @property
    def contig(self) -> str:
        """Chromosome name."""
    
    @property
    def pos(self) -> int:
        """Position (1-based)."""
    
    @property
    def id(self) -> str:
        """Variant identifier."""
    
    @property
    def ref(self) -> str:
        """Reference allele."""
    
    @property
    def alt(self) -> str:
        """Alternate alleles."""
    
    @property
    def qual(self) -> str:
        """Quality score."""
    
    @property
    def filter(self) -> str:
        """Filter status."""
    
    @property
    def info(self) -> str:
        """Info fields."""
    
    @property
    def format(self) -> str:
        """Format specification."""

Utility Functions

Functions for creating and manipulating tabix-indexed files.

def tabix_index(filename, preset=None, seq_col=None, start_col=None, end_col=None, line_skip=0, zerobased=False, **kwargs):
    """
    Create tabix index for a file.
    
    Parameters:
    - filename: str, file to index
    - preset: str, format preset ('gff', 'bed', 'sam', 'vcf')
    - seq_col: int, sequence/chromosome column (1-based)
    - start_col: int, start position column (1-based)
    - end_col: int, end position column (1-based)
    - line_skip: int, number of header lines to skip
    - zerobased: bool, coordinates are 0-based
    
    Returns:
    str, index filename
    """

def tabix_compress(filename_in, filename_out=None, force=False, **kwargs):
    """
    Compress file with bgzip.
    
    Parameters:
    - filename_in: str, input file
    - filename_out: str, output file (default: filename_in + .gz)
    - force: bool, overwrite existing file
    
    Returns:
    str, compressed filename
    """

def tabix_iterator(infile, parser=None):
    """
    Create iterator for unindexed files.
    
    Parameters:
    - infile: file object or str, input file
    - parser: class, record parser
    
    Returns:
    Iterator of parsed records
    """

def tabix_generic_iterator(infile, parser=None):
    """
    Generic file iterator.
    
    Parameters:
    - infile: file object, input file
    - parser: class, record parser
    
    Returns:
    Iterator of parsed records
    """

def tabix_file_iterator(infile, parser=None):
    """
    Compressed file iterator.
    
    Parameters:
    - infile: str, compressed file path
    - parser: class, record parser
    
    Returns:
    Iterator of parsed records
    """

Iterator Classes

Specialized iterators for different file access patterns.

class GZIterator:
    """Iterator for gzip-compressed files."""
    
    def __init__(self, filename, parser=None):
        """
        Create gzip iterator.
        
        Parameters:
        - filename: str, gzip file path
        - parser: class, record parser
        """
    
    def __iter__(self):
        """Iterate over records."""
    
    def __next__(self):
        """Get next record."""

class GZIteratorHead:
    """Iterator for headers in gzip files."""
    
    def __init__(self, filename, parser=None):
        """
        Create header iterator.
        
        Parameters:
        - filename: str, gzip file path
        - parser: class, record parser
        """
    
    def __iter__(self):
        """Iterate over header lines."""

Legacy Compatibility

class Tabixfile:
    """Legacy alias for TabixFile."""
    def __init__(self, *args, **kwargs):
        """Same as TabixFile.__init__."""

Usage Examples

Basic File Access

import pysam

# Open tabix-indexed BED file
with pysam.TabixFile("annotations.bed.gz", parser=pysam.asBed()) as tabixfile:
    # Fetch features in region
    for record in tabixfile.fetch("chr1", 1000, 2000):
        print(f"{record.contig}:{record.start}-{record.end} {record.name}")
        print(f"  Score: {record.score}, Strand: {record.strand}")

# Open GTF file with GTF parser
with pysam.TabixFile("genes.gtf.gz", parser=pysam.asGTF()) as tabixfile:
    for record in tabixfile.fetch("chr1", 1000000, 2000000):
        if record.feature == "gene":
            attrs = record.asDict()
            gene_name = attrs.get("gene_name", ["Unknown"])[0]
            print(f"Gene: {gene_name} at {record.start}-{record.end}")

Creating Tabix Indexes

import pysam

# Index a BED file
pysam.tabix_compress("features.bed", "features.bed.gz")
pysam.tabix_index("features.bed.gz", preset="bed")

# Index a custom format file
pysam.tabix_compress("custom.txt", "custom.txt.gz")
pysam.tabix_index("custom.txt.gz", 
                  seq_col=1,     # chromosome in column 1
                  start_col=2,   # start in column 2  
                  end_col=3,     # end in column 3
                  line_skip=1)   # skip header line

# Index VCF file
pysam.tabix_compress("variants.vcf", "variants.vcf.gz")
pysam.tabix_index("variants.vcf.gz", preset="vcf")

Format-Specific Processing

import pysam

# Process GTF annotations
def extract_genes(gtf_file, output_file, gene_type="protein_coding"):
    with pysam.TabixFile(gtf_file, parser=pysam.asGTF()) as gtffile:
        with open(output_file, 'w') as outfile:
            outfile.write("gene_id\tgene_name\tchrom\tstart\tend\tstrand\n")
            
            for record in gtffile.fetch():
                if record.feature == "gene":
                    attrs = record.asDict()
                    
                    # Check gene type
                    if gene_type in attrs.get("gene_type", []):
                        gene_id = attrs.get("gene_id", [""])[0]
                        gene_name = attrs.get("gene_name", [""])[0]
                        
                        outfile.write(f"{gene_id}\t{gene_name}\t{record.contig}\t"
                                    f"{record.start}\t{record.end}\t{record.strand}\n")

# Process BED intervals
def merge_overlapping_intervals(bed_file, output_file):
    """Merge overlapping BED intervals."""
    intervals = []
    
    with pysam.TabixFile(bed_file, parser=pysam.asBed()) as bedfile:
        for record in bedfile.fetch():
            intervals.append((record.contig, record.start, record.end, record.name))
    
    # Sort by chromosome and position
    intervals.sort(key=lambda x: (x[0], x[1]))
    
    # Merge overlapping intervals
    merged = []
    for chrom, start, end, name in intervals:
        if merged and merged[-1][0] == chrom and merged[-1][2] >= start:
            # Overlapping - extend previous interval
            merged[-1] = (chrom, merged[-1][1], max(merged[-1][2], end), 
                         merged[-1][3] + "," + name)
        else:
            # Non-overlapping - add new interval
            merged.append((chrom, start, end, name))
    
    # Write merged intervals
    with open(output_file, 'w') as outfile:
        for chrom, start, end, name in merged:
            outfile.write(f"{chrom}\t{start}\t{end}\t{name}\n")

# Usage
extract_genes("gencode.gtf.gz", "protein_coding_genes.txt")
merge_overlapping_intervals("peaks.bed.gz", "merged_peaks.bed")

Multi-format Analysis

import pysam
from collections import defaultdict

def intersect_features(bed_file, gtf_file, feature_type="exon"):
    """Find BED intervals that overlap with GTF features."""
    
    # Load BED intervals by chromosome
    bed_intervals = defaultdict(list)
    with pysam.TabixFile(bed_file, parser=pysam.asBed()) as bedfile:
        for record in bedfile.fetch():
            bed_intervals[record.contig].append((record.start, record.end, record.name))
    
    # Check overlaps with GTF features
    overlaps = []
    with pysam.TabixFile(gtf_file, parser=pysam.asGTF()) as gtffile:
        for chrom in bed_intervals:
            for record in gtffile.fetch(chrom):
                if record.feature == feature_type:
                    gtf_start, gtf_end = record.start - 1, record.end  # Convert to 0-based
                    
                    # Check each BED interval for overlap
                    for bed_start, bed_end, bed_name in bed_intervals[chrom]:
                        if bed_start < gtf_end and bed_end > gtf_start:
                            # Calculate overlap
                            overlap_start = max(bed_start, gtf_start)
                            overlap_end = min(bed_end, gtf_end)
                            overlap_len = overlap_end - overlap_start
                            
                            attrs = record.asDict()
                            gene_name = attrs.get("gene_name", ["Unknown"])[0]
                            
                            overlaps.append({
                                'bed_name': bed_name,
                                'gene_name': gene_name,
                                'chrom': chrom,
                                'overlap_start': overlap_start,
                                'overlap_end': overlap_end,
                                'overlap_length': overlap_len
                            })
    
    return overlaps

# Find peaks overlapping exons
overlaps = intersect_features("chip_peaks.bed.gz", "gencode.gtf.gz", "exon")
for overlap in overlaps[:10]:  # Show first 10
    print(f"Peak {overlap['bed_name']} overlaps {overlap['gene_name']} "
          f"by {overlap['overlap_length']} bp")

Install with Tessl CLI