A VCFv4.0 and 4.1 parser for Python
—
Quality
Pending
Does it follow best practices?
Impact
Pending
No eval scenarios have been run
Comprehensive VCF file reading capabilities with streaming iteration, metadata extraction, and tabix support for efficient genomic data processing.
The main VCF file parser providing streaming access to variant records with comprehensive metadata support.
class Reader:
def __init__(self, fsock=None, filename=None, compressed=None,
prepend_chr=False, strict_whitespace=False, encoding='ascii'):
"""
Initialize VCF reader from file or stream.
Parameters:
- fsock: file-like object, open file handle
- filename: str, path to VCF file
- compressed: bool, whether file is gzip compressed (auto-detected)
- prepend_chr: bool, add 'chr' prefix to chromosome names
- strict_whitespace: bool, split records on tabs only (as in the VCF spec), which allows spaces in sample names
- encoding: str, file encoding (default 'ascii')
"""
def __iter__(self):
"""Iterator interface returning _Record objects."""
def __next__(self):
"""Get next variant record (Python 3.x iterator protocol)."""
def next(self):
"""Get next variant record (Python 2.x compatibility)."""
def fetch(self, chrom, start=None, end=None):
"""
Tabix-based region queries (requires pysam and indexed file).
Parameters:
- chrom: str, chromosome name
- start: int, start position (0-based, inclusive; optional)
- end: int, end position (0-based, exclusive, i.e. half-open interval; optional)
Returns:
Iterator of _Record objects in region
"""
Access to parsed VCF header metadata and file information.
# Reader properties
metadata: dict # Complete header metadata (OrderedDict)
infos: dict # INFO field definitions (OrderedDict of _Info objects)
filters: dict # FILTER field definitions (OrderedDict of _Filter objects)
formats: dict # FORMAT field definitions (OrderedDict of _Format objects)
alts: dict # ALT field definitions (OrderedDict of _Alt objects)
contigs: dict # Contig information (OrderedDict of _Contig objects)
samples: list # Sample names from header
filename: str # Input filename if provided
encoding: str # File encoding used
class VCFReader:
"""Alias for Reader class for backwards compatibility."""
pass
import vcf
# Basic file reading
reader = vcf.Reader(filename='variants.vcf')
for record in reader:
print(f"Variant at {record.CHROM}:{record.POS}")
# Reading from compressed file
reader = vcf.Reader(filename='variants.vcf.gz')
# Reading from file handle
with open('variants.vcf', 'r') as f:
reader = vcf.Reader(fsock=f)
# Access header information
reader = vcf.Reader(filename='variants.vcf')
print("Samples:", reader.samples)
print("INFO fields:", list(reader.infos.keys()))
# Tabix region queries (requires pysam and indexed file)
reader = vcf.Reader(filename='variants.vcf.gz')
for record in reader.fetch('chr1', 1000000, 2000000):
print(f"Variant in region: {record.CHROM}:{record.POS}")
class _Info:
"""INFO field metadata."""
id: str
num: str
type: str
desc: str
source: str
version: str
class _Filter:
"""FILTER field metadata."""
id: str
desc: str
class _Format:
"""FORMAT field metadata."""
id: str
num: str
type: str
desc: str
class _Contig:
"""Contig metadata."""
id: str
length: int
class _Alt:
"""ALT field metadata."""
id: str
desc: str
Install with Tessl CLI
npx tessl i tessl/pypi-pyvcf