A VCFv4.0 and 4.1 parser for Python
—
Quality
Pending
Does it follow best practices?
Impact
Pending
No eval scenarios have been run
Comprehensive variant record representation with coordinate properties, variant classification, and population genetics statistics for genomic analysis.
The _Record class represents individual variant sites with complete VCF information and computed properties.
class _Record:
"""
Represents a single variant site (row) in a VCF file.
"""
# Standard VCF fields
CHROM: str # Chromosome name
POS: int # 1-based position
ID: str # Variant identifier
REF: str # Reference allele
ALT: list # List of alternate alleles
QUAL: float # Quality score
FILTER: list # Filter status
INFO: dict # INFO field dictionary
FORMAT: str # Format string
samples: list # List of _Call objects
# Coordinate properties
start: int # 0-based start coordinate (POS - 1)
end: int # 0-based end coordinate
affected_start: int # Start of affected region
affected_end: int # End of affected region
alleles: list # Combined REF and ALT alleles
# Variant classification properties
is_snp: bool # True if variant is a SNP
is_indel: bool # True if variant is an indel
is_sv: bool # True if variant is a structural variant
is_transition: bool # True if SNP is a transition
is_deletion: bool # True if indel is a deletion
is_monomorphic: bool # True for reference calls
is_filtered: bool # True if variant failed filters
var_type: str # "snp", "indel", "sv", "unknown"
var_subtype: str # "ts", "tv", "ins", "del", etc.
# Structural variant properties
sv_end: int # SV end position (from INFO.END)
is_sv_precise: bool # True if SV coordinates are precise
# Population statistics properties
num_called: int # Number of called samples
call_rate: float # Fraction of called genotypes
num_hom_ref: int # Number of homozygous reference calls
num_hom_alt: int # Number of homozygous alternate calls
num_het: int # Number of heterozygous calls
num_unknown: int # Number of uncalled genotypes
aaf: list # List of alternate allele frequencies
nucl_diversity: float # Nucleotide diversity estimate
heterozygosity: float # Site heterozygosity
def genotype(self, name: str):
"""
Get genotype call for specific sample.
Parameters:
- name: str, sample name
Returns:
_Call object for the sample
"""
def add_format(self, fmt: str):
"""
Add field to FORMAT.
Parameters:
- fmt: str, format field to add
"""
def add_filter(self, flt: str):
"""
Add filter to FILTER field.
Parameters:
- flt: str, filter name to add
"""
def add_info(self, info: str, value=True):
"""
Add INFO field.
Parameters:
- info: str, INFO field name
- value: INFO field value (default True for flags)
"""
def get_hom_refs(self):
"""
Get list of homozygous reference calls.
Returns:
List of _Call objects with homozygous reference genotypes
"""
def get_hom_alts(self):
"""
Get list of homozygous alternate calls.
Returns:
List of _Call objects with homozygous alternate genotypes
"""
def get_hets(self):
"""
Get list of heterozygous calls.
Returns:
List of _Call objects with heterozygous genotypes
"""
def get_unknowns(self):
"""
Get list of uncalled genotypes.
Returns:
List of _Call objects with uncalled genotypes
"""import vcf
reader = vcf.Reader(filename='variants.vcf')
for record in reader:
# Access basic variant information
print(f"Variant: {record.CHROM}:{record.POS} {record.REF}>{record.ALT}")
# Check variant classification
if record.is_snp:
print(f"SNP - Transition: {record.is_transition}")
elif record.is_indel:
print(f"Indel - Deletion: {record.is_deletion}")
# Population statistics
print(f"Call rate: {record.call_rate:.2f}")
print(f"Heterozygosity: {record.heterozygosity:.3f}")
if record.aaf:
print(f"Alternate allele frequencies: {record.aaf}")
# Access sample genotypes
hom_refs = record.get_hom_refs()
hets = record.get_hets()
hom_alts = record.get_hom_alts()
print(f"Genotype counts - Hom ref: {len(hom_refs)}, "
f"Het: {len(hets)}, Hom alt: {len(hom_alts)}")
# Get specific sample genotype
if 'SAMPLE1' in reader.samples:
call = record.genotype('SAMPLE1')
print(f"SAMPLE1 genotype: {call.gt_bases}")
# Modify record
record.add_info('ANALYZED', True)
if record.QUAL and record.QUAL < 30:
record.add_filter('LowQual')class _AltRecord:
"""Abstract base class for alternate allele representations."""
pass
class _Substitution(_AltRecord):
"""Regular nucleotide substitutions (SNV/MNV)."""
type: str # "SNV" or "MNV"
sequence: str # Alternate sequence
class _Breakend(_AltRecord):
"""Paired breakend for structural variants."""
type: str # "BND"
chr: str # Mate chromosome
pos: int # Mate position
orientation: bool # Breakend orientation
remoteOrientation: bool # Mate orientation
connectingSequence: str # Connecting sequence
withinMainAssembly: bool # True if mate in main assembly
class _SingleBreakend(_Breakend):
"""Single breakend (inherits from _Breakend)."""
pass
class _SV(_AltRecord):
"""Symbolic structural variant alleles (e.g., <DEL>, <DUP>)."""
type: str # SV type stringInstall with Tessl CLI
npx tessl i tessl/pypi-pyvcf