A VCFv4.0 and 4.1 parser for Python
—
Quality
Pending
Does it follow best practices?
Impact
Pending
No eval scenarios have been run
PyVCF constants for VCF specification compliance, field validation, and reserved field definitions for INFO and FORMAT fields.
VERSION: str # PyVCF version string (e.g., '0.6.8')Pre-defined INFO field specifications from VCF 4.0/4.1 specification.
RESERVED_INFO: dict = {
'AA': 'String', # Ancestral Allele
'AC': 'Integer', # Allele count in genotypes
'AF': 'Float', # Allele Frequency
'AN': 'Integer', # Total number of alleles in called genotypes
'BQ': 'Float', # Base Quality
'CIGAR': 'String', # CIGAR string describing alignment
'DB': 'Flag', # dbSNP membership
'DP': 'Integer', # Combined depth across samples
'END': 'Integer', # End position of variant
'H2': 'Flag', # HapMap2 membership
'H3': 'Flag', # HapMap3 membership
'MQ': 'Float', # Mapping Quality
'MQ0': 'Integer', # Number of MAPQ == 0 reads
'NS': 'Integer', # Number of samples with data
'SB': 'String', # Strand bias
'SOMATIC': 'Flag', # Somatic mutation
'VALIDATED': 'Flag', # Validated by follow-up experiment
'1000G': 'Flag', # 1000 Genomes membership
# Structural variant INFO fields
'IMPRECISE': 'Flag', # Imprecise structural variation
'NOVEL': 'Flag', # Novel structural variation
'SVEND': 'Integer', # End position of SV
'SVLEN': 'Integer', # Length of SV
'SVTYPE': 'String', # Type of structural variant
'MATEID': 'String', # ID of mate breakend
'EVENT': 'String', # ID of associated event
'HOMLEN': 'Integer', # Length of base pair homology
'DGVID': 'String', # ID from Database of Genomic Variants
'DBVARID': 'String', # ID from NCBI dbVar
}Pre-defined FORMAT field specifications from VCF 4.0/4.1 specification.
RESERVED_FORMAT: dict = {
'GT': 'String', # Genotype
'DP': 'Integer', # Read depth at this position for this sample
'FT': 'String', # Sample genotype filter
'GL': 'Float', # Genotype likelihoods
'GLE': 'String', # Genotype likelihoods (log10 encoded)
'PL': 'Integer', # Phred-scaled genotype likelihoods
'GP': 'Float', # Genotype posterior probabilities
'GQ': 'Integer', # Conditional genotype quality
'HQ': 'Integer', # Haplotype qualities
'PS': 'Integer', # Phase set identifier
'PQ': 'Integer', # Phasing quality
'EC': 'Integer', # Expected alternate allele counts
'MQ': 'Integer', # Mapping quality
'AD': 'Integer', # Allelic depths
'ADF': 'Integer', # Forward strand allelic depths
'ADR': 'Integer', # Reverse strand allelic depths
# Copy number fields
'CN': 'Integer', # Copy number genotype
'CNQ': 'Float', # Copy number genotype quality
'CNL': 'Float', # Copy number genotype likelihood
'NQ': 'Integer', # Phred style probability that the variant is novel
'HAP': 'Integer', # Unique haplotype identifier
'AHAP': 'Integer', # Alternative haplotype identifier
}Constants for interpreting field number specifications in VCF headers.
field_counts: dict = {
'.': None, # Unknown number of values
'A': -1, # Number of alternate alleles
'G': -2, # Number of genotypes (including reference)
'R': -3, # Number of alleles (reference + alternates)
}import vcf
# Access version information
print(f"PyVCF version: {vcf.VERSION}")
# Check if INFO field is reserved
if 'DP' in vcf.RESERVED_INFO:
print(f"DP is a reserved INFO field, type: {vcf.RESERVED_INFO['DP']}")
# Check if FORMAT field is reserved
if 'GT' in vcf.RESERVED_FORMAT:
print(f"GT is a reserved FORMAT field, type: {vcf.RESERVED_FORMAT['GT']}")
# Validate custom field names against reserved fields
def validate_custom_info_field(field_name):
if field_name in vcf.RESERVED_INFO:
print(f"Warning: {field_name} is a reserved INFO field")
return False
return True
def validate_custom_format_field(field_name):
if field_name in vcf.RESERVED_FORMAT:
print(f"Warning: {field_name} is a reserved FORMAT field")
return False
return True
# Example validation
validate_custom_info_field('CUSTOM_INFO') # OK
validate_custom_info_field('DP') # Warning: reserved
validate_custom_format_field('CUSTOM_GT') # OK
validate_custom_format_field('GT') # Warning: reservedimport vcf
reader = vcf.Reader(filename='variants.vcf')
for record in reader:
# Access standard INFO fields
if 'DP' in record.INFO:
depth = record.INFO['DP']
print(f"Total depth: {depth}")
if 'AF' in record.INFO:
allele_freqs = record.INFO['AF']
print(f"Allele frequencies: {allele_freqs}")
# Check for structural variant INFO fields
if record.is_sv:
if 'SVTYPE' in record.INFO:
sv_type = record.INFO['SVTYPE']
print(f"SV type: {sv_type}")
if 'SVLEN' in record.INFO:
sv_length = record.INFO['SVLEN']
print(f"SV length: {sv_length}")
# Access standard FORMAT fields for samples
for call in record.samples:
if hasattr(call.data, 'GT'):
print(f"Sample {call.sample} genotype: {call.data.GT}")
if hasattr(call.data, 'DP'):
print(f"Sample {call.sample} depth: {call.data.DP}")
if hasattr(call.data, 'GQ'):
print(f"Sample {call.sample} quality: {call.data.GQ}")import vcf
def validate_field_definition(field_dict, reserved_dict, field_type):
"""Validate field definitions against reserved specifications."""
for field_id, field_info in field_dict.items():
if field_id in reserved_dict:
expected_type = reserved_dict[field_id]
actual_type = field_info.type
if actual_type != expected_type:
print(f"Warning: {field_type} field {field_id} "
f"type mismatch: expected {expected_type}, "
f"got {actual_type}")
# Validate a VCF file's field definitions
reader = vcf.Reader(filename='variants.vcf')
# Validate INFO field definitions
validate_field_definition(reader.infos, vcf.RESERVED_INFO, 'INFO')
# Validate FORMAT field definitions
validate_field_definition(reader.formats, vcf.RESERVED_FORMAT, 'FORMAT')import vcf
def interpret_field_number(num_str, num_alts=None, num_samples=None):
"""Interpret VCF field number specifications."""
if num_str == '.':
return None # Variable number
elif num_str == 'A':
return num_alts if num_alts else -1
elif num_str == 'G':
# Number of genotypes = (n+1)(n+2)/2 where n = number of alleles
if num_alts:
n_alleles = num_alts + 1 # Include reference
return (n_alleles * (n_alleles + 1)) // 2
return -2
elif num_str == 'R':
return (num_alts + 1) if num_alts else -3
else:
try:
return int(num_str)
except ValueError:
return None
# Example usage with a record
reader = vcf.Reader(filename='variants.vcf')
record = next(reader)
# Interpret field numbers for this record's context
num_alts = len(record.ALT)
print(f"Number of alternates: {num_alts}")
for field_id, field_info in reader.infos.items():
expected_count = interpret_field_number(field_info.num, num_alts)
print(f"INFO {field_id}: expects {expected_count} values")Install with Tessl CLI
npx tessl i tessl/pypi-pyvcf