A VCFv4.0 and 4.1 parser for Python
—
Quality
Pending
Does it follow best practices?
Impact
Pending
No eval scenarios have been run
Individual sample genotype calls with classification, phase information, and variant analysis methods for population genetics and clinical genomics.
The _Call class represents individual sample genotype calls with comprehensive analysis properties.
class _Call:
"""
Represents a genotype call for one sample at one variant site.
"""
site: '_Record' # Reference to parent _Record
sample: str # Sample name
data: object # Namedtuple of FORMAT field data
called: bool # True if genotype was called
gt_nums: str # Raw genotype string (e.g., "0/1", "1|0")
gt_alleles: list # List of allele indices
ploidity: int # Number of alleles (e.g., 2 for diploid)
gt_bases: str # Actual DNA sequences (e.g., "A/G")
gt_type: int # Genotype type: 0=hom_ref, 1=het, 2=hom_alt, None=uncalled
phased: bool # True if genotype is phased
is_variant: bool # True if not reference call
is_het: bool # True if heterozygous
is_filtered: bool # True if call failed filters
def gt_phase_char(self):
"""
Get phase character for genotype.
Returns:
str: "/" for unphased, "|" for phased
"""import vcf
reader = vcf.Reader(filename='variants.vcf')
for record in reader:
print(f"Variant: {record.CHROM}:{record.POS}")
# Iterate through all sample calls
for call in record.samples:
print(f" Sample {call.sample}:")
if call.called:
print(f" Genotype: {call.gt_bases} ({call.gt_nums})")
print(f" Type: {['Hom Ref', 'Het', 'Hom Alt'][call.gt_type]}")
print(f" Phased: {call.phased}")
print(f" Variant: {call.is_variant}")
# Access FORMAT field data
if hasattr(call.data, 'DP'):
print(f" Depth: {call.data.DP}")
if hasattr(call.data, 'GQ'):
print(f" Genotype Quality: {call.data.GQ}")
else:
print(" Uncalled genotype")
# Get specific sample
if 'SAMPLE1' in reader.samples:
call = record.genotype('SAMPLE1')
if call.called and call.is_het:
print(f"SAMPLE1 is heterozygous: {call.gt_bases}")
# Count genotype types
called_samples = [call for call in record.samples if call.called]
het_count = sum(1 for call in called_samples if call.gt_type == 1)
hom_alt_count = sum(1 for call in called_samples if call.gt_type == 2)
print(f"Heterozygous calls: {het_count}")
print(f"Homozygous alternate calls: {hom_alt_count}")import vcf
reader = vcf.Reader(filename='variants.vcf')
for record in reader:
# Phase analysis
phased_calls = [call for call in record.samples if call.called and call.phased]
print(f"Phased genotypes: {len(phased_calls)}/{len(record.samples)}")
# Quality analysis (if GQ field present)
high_quality_calls = []
for call in record.samples:
if call.called and hasattr(call.data, 'GQ') and call.data.GQ >= 30:
high_quality_calls.append(call)
print(f"High quality calls (GQ>=30): {len(high_quality_calls)}")
# Depth analysis (if DP field present)
depths = []
for call in record.samples:
if call.called and hasattr(call.data, 'DP') and call.data.DP is not None:
depths.append(call.data.DP)
if depths:
avg_depth = sum(depths) / len(depths)
print(f"Average depth: {avg_depth:.1f}")
# Allele-specific analysis
for call in record.samples:
if call.called and call.gt_type == 1: # Heterozygous
allele_indices = call.gt_alleles
alleles = [record.alleles[int(i)] if i != '.' else '.' for i in allele_indices]
print(f"Het sample {call.sample}: alleles {alleles}")PyVCF automatically parses FORMAT fields into the call.data namedtuple. Common fields include:
# Common FORMAT fields accessible via call.data
call.data.GT # Genotype (string)
call.data.DP # Read depth (integer)
call.data.GQ # Genotype quality (integer)
call.data.PL # Phred-scaled genotype likelihoods (list of integers)
call.data.AD # Allelic depths (list of integers)
call.data.GL # Genotype likelihoods (list of floats)Install with Tessl CLI
npx tessl i tessl/pypi-pyvcf