Package for reading, manipulating, and writing genomic data
—
Comprehensive support for reading and writing variant call format files in VCF and BCF formats. Provides header management, sample data access, filtering, and indexing capabilities.
Main interface for reading and writing VCF/BCF files with full header and sample support.
class VariantFile:
def __init__(self, filepath, mode="r", header=None, drop_samples=False, duplicate_filehandle=True, ignore_truncation=False, format_options=None, threads=1, index=None):
"""
Open a VCF/BCF file for reading or writing.
Parameters:
- filepath: str, path to VCF/BCF file
- mode: str, file mode ('r', 'w', 'rb', 'wb')
- header: VariantHeader, header for writing
- drop_samples: bool, ignore sample data
- duplicate_filehandle: bool, allow multiple handles
- ignore_truncation: bool, ignore truncated files
- threads: int, number of threads for compression
- index: str, path to index file
Returns:
VariantFile object
"""
def fetch(self, contig=None, start=None, stop=None, region=None, reopen=False):
"""
Fetch variant records from a region.
Parameters:
- contig: str, chromosome/contig name
- start: int, 0-based start position
- stop: int, 0-based stop position
- region: str, region string (chr:start-stop)
- reopen: bool, reopen file for iteration
Returns:
Iterator of VariantRecord objects
"""
def write(self, record):
"""
Write a variant record.
Parameters:
- record: VariantRecord, variant to write
"""
def new_record(self, contig=None, start=None, stop=None, alleles=None, id=None, qual=None, filter=None, info=None, samples=None, **kwargs):
"""
Create new variant record.
Parameters:
- contig: str, chromosome name
- start: int, 0-based start position
- stop: int, 0-based stop position
- alleles: tuple, reference and alternate alleles
- id: str, variant identifier
- qual: float, quality score
- filter: str/list, filter status
- info: dict, INFO field data
- samples: dict, sample data
Returns:
VariantRecord object
"""
def copy_record(self, record):
"""
Create copy of variant record.
Returns:
VariantRecord object
"""
def close(self):
"""Close the file."""
# Properties
@property
def header(self) -> "VariantHeader":
"""File header."""
@property
def index(self):
"""File index."""
@property
def filename(self) -> str:
"""Filename."""
@property
def is_open(self) -> bool:
"""True if file is open."""
@property
def category(self) -> str:
"""File category."""
@property
def format(self) -> str:
"""File format."""
def check_index(self):
"""
Check if index exists and is valid.
Returns:
bool, True if valid index
"""Individual variant record with position, alleles, quality, and sample information.
class VariantRecord:
# Core properties
@property
def rid(self) -> int:
"""Reference sequence ID."""
@property
def contig(self) -> str:
"""Chromosome/contig name."""
@contig.setter
def contig(self, value: str):
"""Set chromosome name."""
@property
def pos(self) -> int:
"""1-based position."""
@pos.setter
def pos(self, value: int):
"""Set position."""
@property
def start(self) -> int:
"""0-based start position."""
@property
def stop(self) -> int:
"""0-based stop position."""
@property
def id(self) -> str:
"""Variant identifier."""
@id.setter
def id(self, value: str):
"""Set variant identifier."""
@property
def ref(self) -> str:
"""Reference allele."""
@ref.setter
def ref(self, value: str):
"""Set reference allele."""
@property
def alts(self) -> tuple:
"""Alternate alleles."""
@alts.setter
def alts(self, value: tuple):
"""Set alternate alleles."""
@property
def alleles(self) -> tuple:
"""All alleles (reference + alternates)."""
@alleles.setter
def alleles(self, value: tuple):
"""Set all alleles."""
@property
def qual(self) -> float:
"""Quality score."""
@qual.setter
def qual(self, value: float):
"""Set quality score."""
# Complex properties
@property
def filter(self) -> "VariantRecordFilter":
"""Filter information."""
@property
def info(self) -> "VariantRecordInfo":
"""INFO field data."""
@property
def format(self) -> "VariantRecordFormat":
"""FORMAT field definition."""
@property
def samples(self) -> "VariantRecordSamples":
"""Sample data."""
# Methods
def copy(self):
"""
Create copy of record.
Returns:
VariantRecord object
"""
def translate(self, mapping):
"""
Translate chromosome names.
Parameters:
- mapping: dict, chromosome name mapping
"""
def to_string(self):
"""
Convert to VCF format string.
Returns:
str, VCF line
"""VCF/BCF header containing metadata, sample information, and field definitions.
class VariantHeader:
def __init__(self):
"""Create new variant header."""
# Properties
@property
def version(self) -> str:
"""VCF format version."""
@property
def samples(self) -> "VariantHeaderSamples":
"""Sample names and metadata."""
@property
def records(self) -> "VariantHeaderRecords":
"""Header records."""
@property
def contigs(self) -> "VariantHeaderContigs":
"""Contig information."""
@property
def filters(self) -> "VariantHeaderRecords":
"""FILTER definitions."""
@property
def info(self) -> "VariantHeaderRecords":
"""INFO field definitions."""
@property
def formats(self) -> "VariantHeaderRecords":
"""FORMAT field definitions."""
# Methods
def add_record(self, record):
"""
Add header record.
Parameters:
- record: VariantHeaderRecord, record to add
"""
def add_sample(self, name):
"""
Add sample.
Parameters:
- name: str, sample name
"""
def add_line(self, line):
"""
Add header line.
Parameters:
- line: str, header line
"""
def copy(self):
"""
Create copy of header.
Returns:
VariantHeader object
"""
def merge(self, other):
"""
Merge with another header.
Parameters:
- other: VariantHeader, header to merge
"""
def subset(self, samples):
"""
Create subset with specific samples.
Parameters:
- samples: list, sample names to include
Returns:
VariantHeader object
"""
def to_string(self):
"""
Convert to VCF header string.
Returns:
str, VCF header
"""Sample data access for variant records with genotype and field information.
class VariantRecordSamples:
def __getitem__(self, sample):
"""
Get sample data.
Parameters:
- sample: str/int, sample name or index
Returns:
VariantRecordSample object
"""
def __contains__(self, sample):
"""Check if sample exists."""
def __len__(self):
"""Number of samples."""
def __iter__(self):
"""Iterate over samples."""
def keys(self):
"""
Get sample names.
Returns:
Iterator of sample names
"""
def values(self):
"""
Get sample data.
Returns:
Iterator of VariantRecordSample objects
"""
def items(self):
"""
Get sample items.
Returns:
Iterator of (name, VariantRecordSample) tuples
"""Individual sample data within a variant record.
class VariantRecordSample:
def __getitem__(self, field):
"""
Get field value.
Parameters:
- field: str, field name
Returns:
Field value
"""
def __setitem__(self, field, value):
"""
Set field value.
Parameters:
- field: str, field name
- value: field value
"""
def __contains__(self, field):
"""Check if field exists."""
def get(self, field, default=None):
"""
Get field with default.
Returns:
Field value or default
"""
def keys(self):
"""
Get field names.
Returns:
Iterator of field names
"""
def values(self):
"""
Get field values.
Returns:
Iterator of field values
"""
def items(self):
"""
Get field items.
Returns:
Iterator of (field, value) tuples
"""
@property
def name(self) -> str:
"""Sample name."""
# Genotype shortcuts
@property
def allele_indices(self) -> tuple:
"""Genotype allele indices."""
@property
def alleles(self) -> tuple:
"""Genotype alleles."""
@property
def phased(self) -> bool:
"""True if genotype is phased."""INFO field data access for variant records.
class VariantRecordInfo:
def __getitem__(self, key):
"""
Get INFO field value.
Parameters:
- key: str, INFO field name
Returns:
Field value
"""
def __setitem__(self, key, value):
"""
Set INFO field value.
Parameters:
- key: str, INFO field name
- value: field value
"""
def __delitem__(self, key):
"""Delete INFO field."""
def __contains__(self, key):
"""Check if INFO field exists."""
def __len__(self):
"""Number of INFO fields."""
def __iter__(self):
"""Iterate over INFO field names."""
def get(self, key, default=None):
"""
Get INFO field with default.
Returns:
Field value or default
"""
def keys(self):
"""
Get INFO field names.
Returns:
Iterator of field names
"""
def values(self):
"""
Get INFO field values.
Returns:
Iterator of field values
"""
def items(self):
"""
Get INFO field items.
Returns:
Iterator of (field, value) tuples
"""
def clear(self):
"""Remove all INFO fields."""
def update(self, other):
"""
Update with another INFO object.
Parameters:
- other: dict/VariantRecordInfo, data to update with
"""Filter status information for variant records.
class VariantRecordFilter:
def __contains__(self, name):
"""Check if filter is applied."""
def __iter__(self):
"""Iterate over applied filters."""
def __len__(self):
"""Number of applied filters."""
def add(self, name):
"""
Add filter.
Parameters:
- name: str, filter name
"""
def clear(self):
"""Remove all filters."""import pysam
# Read VCF file
with pysam.VariantFile("input.vcf") as vcffile:
# Iterate over all variants
for record in vcffile:
print(f"{record.contig}:{record.pos} {record.ref}->{record.alts}")
# Fetch variants in region
for record in vcffile.fetch("chr1", 1000, 2000):
if record.qual >= 30:
print(f"High quality variant: {record.id}")
# Access sample data
with pysam.VariantFile("input.vcf") as vcffile:
for record in vcffile:
for sample_name in record.samples:
sample = record.samples[sample_name]
genotype = sample["GT"]
print(f"Sample {sample_name}: {genotype}")import pysam
# Create header
header = pysam.VariantHeader()
header.add_line('##fileformat=VCFv4.2')
header.add_line('##contig=<ID=chr1,length=249250621>')
header.add_sample("Sample1")
header.add_sample("Sample2")
# Add INFO and FORMAT definitions
header.add_line('##INFO=<ID=DP,Number=1,Type=Integer,Description="Total Depth">')
header.add_line('##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">')
header.add_line('##FORMAT=<ID=DP,Number=1,Type=Integer,Description="Read Depth">')
with pysam.VariantFile("output.vcf", "w", header=header) as vcffile:
# Create variant record
record = vcffile.new_record(
contig="chr1",
start=100,
alleles=("A", "T"),
qual=60.0,
info={"DP": 100}
)
# Set sample data
record.samples["Sample1"]["GT"] = (0, 1) # Het
record.samples["Sample1"]["DP"] = 50
record.samples["Sample2"]["GT"] = (1, 1) # Hom alt
record.samples["Sample2"]["DP"] = 45
vcffile.write(record)import pysam
with pysam.VariantFile("input.vcf") as infile:
# Create output with same header
with pysam.VariantFile("filtered.vcf", "w", header=infile.header) as outfile:
for record in infile:
# Filter by quality and depth
if record.qual >= 30 and record.info.get("DP", 0) >= 10:
# Check if any sample is homozygous alternate
has_hom_alt = False
for sample in record.samples.values():
if sample["GT"] == (1, 1):
has_hom_alt = True
break
if has_hom_alt:
outfile.write(record)Install with Tessl CLI
npx tessl i tessl/pypi-pysam