Package for reading, manipulating, and writing genomic data
—
Support for compressed, indexed genomic files using tabix indexing. Enables efficient random access to large compressed files in various formats including BED, GFF, GTF, and VCF.
Main interface for accessing tabix-indexed compressed files with format-specific parsers.
class TabixFile:
def __init__(self, filename, parser=None, encoding="ascii", duplicate_filehandle=True):
"""
Open tabix-indexed file.
Parameters:
- filename: str, path to tabix-indexed file (.gz)
- parser: class, parser for file format (asTuple, asBed, asGTF, etc.)
- encoding: str, text encoding
- duplicate_filehandle: bool, allow multiple handles
Returns:
TabixFile object
"""
def fetch(self, reference=None, start=None, end=None, region=None, parser=None, multiple_iterators=False):
"""
Fetch records from region.
Parameters:
- reference: str, chromosome/contig name
- start: int, start position (0-based)
- end: int, end position (0-based)
- region: str, region string (chr:start-end)
- parser: class, override default parser
- multiple_iterators: bool, allow concurrent iterators
Returns:
Iterator of parsed records
"""
def close(self):
"""Close the file."""
# Properties
@property
def filename(self) -> str:
"""File name."""
@property
def header(self) -> list:
"""Header lines."""
@property
def contigs(self) -> list:
"""Available contigs/chromosomes."""
def __enter__(self):
"""Context manager entry."""
def __exit__(self, exc_type, exc_val, exc_tb):
"""Context manager exit."""Format-specific parsers for different genomic file types.
class asTuple:
"""Parse records as tuples of strings."""
def __call__(self, line):
"""
Parse line to tuple.
Returns:
tuple, fields as strings
"""
class asBed:
"""Parse BED format records."""
def __call__(self, line):
"""
Parse BED line.
Returns:
BedProxy object
"""
class asGTF:
"""Parse GTF format records."""
def __call__(self, line):
"""
Parse GTF line.
Returns:
GTFProxy object
"""
class asGFF3:
"""Parse GFF3 format records."""
def __call__(self, line):
"""
Parse GFF3 line.
Returns:
GFF3Proxy object
"""
class asVCF:
"""Parse VCF format records."""
def __call__(self, line):
"""
Parse VCF line.
Returns:
VCFProxy object
"""Efficient access to parsed record fields without copying data.
class TupleProxy:
"""Fast read-only access to parsed records as tuples."""
def __getitem__(self, index):
"""
Get field by index.
Parameters:
- index: int, field index
Returns:
str, field value
"""
def __len__(self):
"""Number of fields."""
def __str__(self):
"""String representation."""
class BedProxy(TupleProxy):
"""BED format record with named field access."""
@property
def contig(self) -> str:
"""Chromosome name."""
@property
def start(self) -> int:
"""Start position (0-based)."""
@property
def end(self) -> int:
"""End position (0-based)."""
@property
def name(self) -> str:
"""Feature name."""
@property
def score(self) -> float:
"""Score value."""
@property
def strand(self) -> str:
"""Strand ('+', '-', or '.')."""
@property
def thickstart(self) -> int:
"""Thick start position."""
@property
def thickend(self) -> int:
"""Thick end position."""
@property
def itemrgb(self) -> str:
"""RGB color value."""
@property
def blockcount(self) -> int:
"""Number of blocks."""
@property
def blocksizes(self) -> list:
"""Block sizes."""
@property
def blockstarts(self) -> list:
"""Block start positions."""
class GTFProxy(TupleProxy):
"""GTF format record with named field access."""
@property
def contig(self) -> str:
"""Chromosome name."""
@property
def source(self) -> str:
"""Annotation source."""
@property
def feature(self) -> str:
"""Feature type."""
@property
def start(self) -> int:
"""Start position (1-based)."""
@property
def end(self) -> int:
"""End position (1-based)."""
@property
def score(self) -> str:
"""Score value."""
@property
def strand(self) -> str:
"""Strand ('+', '-', or '.')."""
@property
def frame(self) -> str:
"""Reading frame."""
@property
def attributes(self) -> str:
"""Attributes string."""
def asDict(self):
"""
Parse attributes as dictionary.
Returns:
dict, attribute key-value pairs
"""
class GFF3Proxy(TupleProxy):
"""GFF3 format record with named field access."""
@property
def contig(self) -> str:
"""Chromosome name."""
@property
def source(self) -> str:
"""Annotation source."""
@property
def feature(self) -> str:
"""Feature type."""
@property
def start(self) -> int:
"""Start position (1-based)."""
@property
def end(self) -> int:
"""End position (1-based)."""
@property
def score(self) -> str:
"""Score value."""
@property
def strand(self) -> str:
"""Strand ('+', '-', or '.')."""
@property
def frame(self) -> str:
"""Reading frame."""
@property
def attributes(self) -> str:
"""Attributes string."""
def asDict(self):
"""
Parse attributes as dictionary.
Returns:
dict, attribute key-value pairs
"""
class VCFProxy(TupleProxy):
"""VCF format record with named field access."""
@property
def contig(self) -> str:
"""Chromosome name."""
@property
def pos(self) -> int:
"""Position (1-based)."""
@property
def id(self) -> str:
"""Variant identifier."""
@property
def ref(self) -> str:
"""Reference allele."""
@property
def alt(self) -> str:
"""Alternate alleles."""
@property
def qual(self) -> str:
"""Quality score."""
@property
def filter(self) -> str:
"""Filter status."""
@property
def info(self) -> str:
"""Info fields."""
@property
def format(self) -> str:
"""Format specification."""Functions for creating and manipulating tabix-indexed files.
def tabix_index(filename, preset=None, seq_col=None, start_col=None, end_col=None, line_skip=0, zerobased=False, **kwargs):
"""
Create tabix index for a file.
Parameters:
- filename: str, file to index
- preset: str, format preset ('gff', 'bed', 'sam', 'vcf')
- seq_col: int, sequence/chromosome column (1-based)
- start_col: int, start position column (1-based)
- end_col: int, end position column (1-based)
- line_skip: int, number of header lines to skip
- zerobased: bool, coordinates are 0-based
Returns:
str, index filename
"""
def tabix_compress(filename_in, filename_out=None, force=False, **kwargs):
"""
Compress file with bgzip.
Parameters:
- filename_in: str, input file
- filename_out: str, output file (default: filename_in + .gz)
- force: bool, overwrite existing file
Returns:
str, compressed filename
"""
def tabix_iterator(infile, parser=None):
"""
Create iterator for unindexed files.
Parameters:
- infile: file object or str, input file
- parser: class, record parser
Returns:
Iterator of parsed records
"""
def tabix_generic_iterator(infile, parser=None):
"""
Generic file iterator.
Parameters:
- infile: file object, input file
- parser: class, record parser
Returns:
Iterator of parsed records
"""
def tabix_file_iterator(infile, parser=None):
"""
Compressed file iterator.
Parameters:
- infile: str, compressed file path
- parser: class, record parser
Returns:
Iterator of parsed records
"""Specialized iterators for different file access patterns.
class GZIterator:
"""Iterator for gzip-compressed files."""
def __init__(self, filename, parser=None):
"""
Create gzip iterator.
Parameters:
- filename: str, gzip file path
- parser: class, record parser
"""
def __iter__(self):
"""Iterate over records."""
def __next__(self):
"""Get next record."""
class GZIteratorHead:
"""Iterator for headers in gzip files."""
def __init__(self, filename, parser=None):
"""
Create header iterator.
Parameters:
- filename: str, gzip file path
- parser: class, record parser
"""
def __iter__(self):
"""Iterate over header lines."""class Tabixfile:
"""Legacy alias for TabixFile."""
def __init__(self, *args, **kwargs):
"""Same as TabixFile.__init__."""import pysam
# Open tabix-indexed BED file
with pysam.TabixFile("annotations.bed.gz", parser=pysam.asBed()) as tabixfile:
# Fetch features in region
for record in tabixfile.fetch("chr1", 1000, 2000):
print(f"{record.contig}:{record.start}-{record.end} {record.name}")
print(f" Score: {record.score}, Strand: {record.strand}")
# Open GTF file with GTF parser
with pysam.TabixFile("genes.gtf.gz", parser=pysam.asGTF()) as tabixfile:
for record in tabixfile.fetch("chr1", 1000000, 2000000):
if record.feature == "gene":
attrs = record.asDict()
gene_name = attrs.get("gene_name", ["Unknown"])[0]
print(f"Gene: {gene_name} at {record.start}-{record.end}")import pysam
# Index a BED file
pysam.tabix_compress("features.bed", "features.bed.gz")
pysam.tabix_index("features.bed.gz", preset="bed")
# Index a custom format file
pysam.tabix_compress("custom.txt", "custom.txt.gz")
pysam.tabix_index("custom.txt.gz",
seq_col=1, # chromosome in column 1
start_col=2, # start in column 2
end_col=3, # end in column 3
line_skip=1) # skip header line
# Index VCF file
pysam.tabix_compress("variants.vcf", "variants.vcf.gz")
pysam.tabix_index("variants.vcf.gz", preset="vcf")import pysam
# Process GTF annotations
def extract_genes(gtf_file, output_file, gene_type="protein_coding"):
with pysam.TabixFile(gtf_file, parser=pysam.asGTF()) as gtffile:
with open(output_file, 'w') as outfile:
outfile.write("gene_id\tgene_name\tchrom\tstart\tend\tstrand\n")
for record in gtffile.fetch():
if record.feature == "gene":
attrs = record.asDict()
# Check gene type
if gene_type in attrs.get("gene_type", []):
gene_id = attrs.get("gene_id", [""])[0]
gene_name = attrs.get("gene_name", [""])[0]
outfile.write(f"{gene_id}\t{gene_name}\t{record.contig}\t"
f"{record.start}\t{record.end}\t{record.strand}\n")
# Process BED intervals
def merge_overlapping_intervals(bed_file, output_file):
"""Merge overlapping BED intervals."""
intervals = []
with pysam.TabixFile(bed_file, parser=pysam.asBed()) as bedfile:
for record in bedfile.fetch():
intervals.append((record.contig, record.start, record.end, record.name))
# Sort by chromosome and position
intervals.sort(key=lambda x: (x[0], x[1]))
# Merge overlapping intervals
merged = []
for chrom, start, end, name in intervals:
if merged and merged[-1][0] == chrom and merged[-1][2] >= start:
# Overlapping - extend previous interval
merged[-1] = (chrom, merged[-1][1], max(merged[-1][2], end),
merged[-1][3] + "," + name)
else:
# Non-overlapping - add new interval
merged.append((chrom, start, end, name))
# Write merged intervals
with open(output_file, 'w') as outfile:
for chrom, start, end, name in merged:
outfile.write(f"{chrom}\t{start}\t{end}\t{name}\n")
# Usage
extract_genes("gencode.gtf.gz", "protein_coding_genes.txt")
merge_overlapping_intervals("peaks.bed.gz", "merged_peaks.bed")import pysam
from collections import defaultdict
def intersect_features(bed_file, gtf_file, feature_type="exon"):
"""Find BED intervals that overlap with GTF features."""
# Load BED intervals by chromosome
bed_intervals = defaultdict(list)
with pysam.TabixFile(bed_file, parser=pysam.asBed()) as bedfile:
for record in bedfile.fetch():
bed_intervals[record.contig].append((record.start, record.end, record.name))
# Check overlaps with GTF features
overlaps = []
with pysam.TabixFile(gtf_file, parser=pysam.asGTF()) as gtffile:
for chrom in bed_intervals:
for record in gtffile.fetch(chrom):
if record.feature == feature_type:
gtf_start, gtf_end = record.start - 1, record.end # Convert to 0-based
# Check each BED interval for overlap
for bed_start, bed_end, bed_name in bed_intervals[chrom]:
if bed_start < gtf_end and bed_end > gtf_start:
# Calculate overlap
overlap_start = max(bed_start, gtf_start)
overlap_end = min(bed_end, gtf_end)
overlap_len = overlap_end - overlap_start
attrs = record.asDict()
gene_name = attrs.get("gene_name", ["Unknown"])[0]
overlaps.append({
'bed_name': bed_name,
'gene_name': gene_name,
'chrom': chrom,
'overlap_start': overlap_start,
'overlap_end': overlap_end,
'overlap_length': overlap_len
})
return overlaps
# Find peaks overlapping exons
overlaps = intersect_features("chip_peaks.bed.gz", "gencode.gtf.gz", "exon")
for overlap in overlaps[:10]: # Show first 10
print(f"Peak {overlap['bed_name']} overlaps {overlap['gene_name']} "
f"by {overlap['overlap_length']} bp")Install with Tessl CLI
npx tessl i tessl/pypi-pysam