A VCFv4.0 and 4.1 parser for Python
—
Quality
Pending
Does it follow best practices?
Impact
Pending
No eval scenarios have been run
Extensible filtering system with built-in filters for quality control and custom filter development for genomic variant analysis.
Base class for implementing custom VCF filters with standardized interface and argparse integration.
class Base:
    """Base class for VCF record filters."""

    name: str  # Filter identifier

    @classmethod
    def customize_parser(cls, parser):
        """
        Extend argparse parser with filter-specific options.

        Declared as a classmethod so the options can be registered before
        any filter instance exists.

        Parameters:
        - parser: argparse.ArgumentParser, parser to extend
        """

    def __init__(self, args):
        """
        Initialize filter with argparse arguments.

        Parameters:
        - args: argparse.Namespace, parsed arguments
        """

    def __call__(self, record):
        """
        Filter a variant record.

        Parameters:
        - record: _Record, variant record to filter

        Returns:
        A non-None value (for example the filter name or the offending
        measurement) if the record should be filtered, None to pass.
        Callers should test the result with ``is None`` rather than by
        truthiness.
        """

    def filter_name(self):
        """
        Generate filter name for VCF header.

        Returns:
        str: Filter name for FILTER field
        """


# Pre-implemented filters for common quality control tasks.
class SiteQuality(Base):
    """Filter by site quality score."""

    name = 'sq'  # Filter identifier
class VariantGenotypeQuality(Base):
    """Filter by minimum genotype quality across samples."""

    name = 'mgq'  # Filter identifier
class ErrorBiasFilter(Base):
    """Statistical error bias filter (requires rpy2)."""

    name = 'eb'  # Filter identifier
class DepthPerSample(Base):
    """Filter by minimum depth per sample."""

    name = 'dps'  # Filter identifier
class AvgDepthPerSample(Base):
    """Filter by average depth across samples."""

    name = 'avg-dps'  # Filter identifier
class SnpOnly(Base):
    """Keep only SNP variants, filter out indels and SVs."""

    name = 'snp-only'  # Filter identifier


import vcf
import argparse

from vcf.filters import SiteQuality, DepthPerSample, SnpOnly

# Filters are constructed from parsed command-line arguments, so when they
# are used programmatically the options are supplied as an
# argparse.Namespace whose attribute names mirror the CLI flags
# (--site-quality -> site_quality, --depth-per-sample -> depth_per_sample).

# Apply single filter
reader = vcf.Reader(filename='input.vcf')
site_filter = SiteQuality(argparse.Namespace(site_quality=30))
filtered_records = []
for record in reader:
    filter_result = site_filter(record)
    if filter_result is None:  # Record passes filter
        filtered_records.append(record)
    else:
        print(f"Filtered {record.CHROM}:{record.POS} - {filter_result}")

# Chain multiple filters
reader = vcf.Reader(filename='input.vcf')
filters = [
    SiteQuality(argparse.Namespace(site_quality=30)),
    DepthPerSample(argparse.Namespace(depth_per_sample=10)),
    SnpOnly(argparse.Namespace()),
]
for record in reader:
    # all() stops at the first filter that rejects the record.
    if all(filt(record) is None for filt in filters):
        print(f"Passed all filters: {record.CHROM}:{record.POS}")

# Write filtered VCF
reader = vcf.Reader(filename='input.vcf')
quality_filter = SiteQuality(argparse.Namespace(site_quality=30))
snp_filter = SnpOnly(argparse.Namespace())
# The context manager closes the output file even if a record fails to parse.
with open('filtered.vcf', 'w') as out_handle:
    writer = vcf.Writer(out_handle, reader)
    for record in reader:
        # Every record is written; failing records are tagged in FILTER.
        # Each filter is evaluated once, and the result is compared with
        # None because a filter may report a falsy value such as 0.
        for filt in (quality_filter, snp_filter):
            if filt(record) is not None:
                record.add_filter(filt.filter_name())
        writer.write_record(record)


import vcf
from vcf.filters import Base
class CustomQualityFilter(Base):
    """Custom filter combining quality and depth thresholds.

    A record is filtered when its site quality is missing or below
    ``min_qual``, or when the mean ``DP`` of the called samples that report
    a depth is below ``min_depth``.
    """

    name = 'custom_qual'

    @classmethod
    def customize_parser(cls, parser):
        """Register ``--min-qual`` and ``--min-depth`` on *parser*.

        Declared as a classmethod so the options can be added before a
        filter instance exists.
        """
        parser.add_argument('--min-qual', type=float, default=30,
                            help='Minimum site quality')
        parser.add_argument('--min-depth', type=int, default=10,
                            help='Minimum average depth')

    def __init__(self, args):
        """Store the thresholds found on *args* (``min_qual``, ``min_depth``)."""
        self.min_qual = args.min_qual
        self.min_depth = args.min_depth

    def __call__(self, record):
        """Return the filter name if *record* fails a check, else None."""
        # Check quality
        if record.QUAL is None or record.QUAL < self.min_qual:
            return self.filter_name()

        # Check average depth. Only a missing DP is skipped: a reported
        # depth of 0 is a real measurement and must lower the average.
        called_depths = (
            getattr(call.data, 'DP', None)
            for call in record.samples
            if call.called
        )
        depths = [depth for depth in called_depths if depth is not None]
        if depths and sum(depths) / len(depths) < self.min_depth:
            return self.filter_name()

        return None  # Pass filter

    def filter_name(self):
        """Return the FILTER label, which embeds both thresholds."""
        return f"CustomQual{self.min_qual}Depth{self.min_depth}"
# Use custom filter
import argparse

# The thresholds are passed directly as a namespace, so no parser is needed.
custom_filter = CustomQualityFilter(
    argparse.Namespace(min_qual=40, min_depth=15)
)
reader = vcf.Reader(filename='input.vcf')
for record in reader:
    if custom_filter(record) is None:
        print(f"Passed: {record.CHROM}:{record.POS}")

# PyVCF provides command-line filtering through entry points:
# Available filters through entry points
vcf_filter.py --site-quality 30 input.vcf
vcf_filter.py --variant-genotype-quality 20 input.vcf
vcf_filter.py --depth-per-sample 10 input.vcf
vcf_filter.py --snp-only input.vcf

Entry point filters:
site_quality = vcf.filters:SiteQuality
vgq = vcf.filters:VariantGenotypeQuality
eb = vcf.filters:ErrorBiasFilter
dps = vcf.filters:DepthPerSample
avg-dps = vcf.filters:AvgDepthPerSample
snp-only = vcf.filters:SnpOnly

Install with Tessl CLI
npx tessl i tessl/pypi-pyvcf