CtrlK
BlogDocsLog inGet started
Tessl Logo

tessl/pypi-pyvcf

A VCFv4.0 and 4.1 parser for Python

Pending

Quality

Pending

Does it follow best practices?

Impact

Pending

No eval scenarios have been run

Overview
Eval results
Files

constants.mddocs/

Constants and Reserved Fields

PyVCF constants for VCF specification compliance, field validation, and reserved field definitions for INFO and FORMAT fields.

Capabilities

Package Version

VERSION: str  # PyVCF version string (e.g., '0.6.8')

Reserved INFO Fields

Pre-defined INFO field specifications from VCF 4.0/4.1 specification.

RESERVED_INFO: dict = {
    'AA': 'String',     # Ancestral Allele
    'AC': 'Integer',    # Allele count in genotypes
    'AF': 'Float',      # Allele Frequency
    'AN': 'Integer',    # Total number of alleles in called genotypes
    'BQ': 'Float',      # Base Quality
    'CIGAR': 'String',  # CIGAR string describing alignment
    'DB': 'Flag',       # dbSNP membership
    'DP': 'Integer',    # Combined depth across samples
    'END': 'Integer',   # End position of variant
    'H2': 'Flag',       # HapMap2 membership
    'H3': 'Flag',       # HapMap3 membership
    'MQ': 'Float',      # Mapping Quality
    'MQ0': 'Integer',   # Number of MAPQ == 0 reads
    'NS': 'Integer',    # Number of samples with data
    'SB': 'String',     # Strand bias
    'SOMATIC': 'Flag',  # Somatic mutation
    'VALIDATED': 'Flag', # Validated by follow-up experiment
    '1000G': 'Flag',    # 1000 Genomes membership
    
    # Structural variant INFO fields
    'IMPRECISE': 'Flag',     # Imprecise structural variation
    'NOVEL': 'Flag',         # Novel structural variation
    'SVEND': 'Integer',      # End position of SV
    'SVLEN': 'Integer',      # Length of SV
    'SVTYPE': 'String',      # Type of structural variant
    'MATEID': 'String',      # ID of mate breakend
    'EVENT': 'String',       # ID of associated event
    'HOMLEN': 'Integer',     # Length of base pair homology
    'DGVID': 'String',       # ID from Database of Genomic Variants
    'DBVARID': 'String',     # ID from NCBI dbVar
}

Reserved FORMAT Fields

Pre-defined FORMAT field specifications from VCF 4.0/4.1 specification.

RESERVED_FORMAT: dict = {
    'GT': 'String',     # Genotype
    'DP': 'Integer',    # Read depth at this position for this sample
    'FT': 'String',     # Sample genotype filter
    'GL': 'Float',      # Genotype likelihoods
    'GLE': 'String',    # Genotype likelihoods (log10 encoded)
    'PL': 'Integer',    # Phred-scaled genotype likelihoods
    'GP': 'Float',      # Genotype posterior probabilities
    'GQ': 'Integer',    # Conditional genotype quality
    'HQ': 'Integer',    # Haplotype qualities
    'PS': 'Integer',    # Phase set identifier
    'PQ': 'Integer',    # Phasing quality
    'EC': 'Integer',    # Expected alternate allele counts
    'MQ': 'Integer',    # Mapping quality
    'AD': 'Integer',    # Allelic depths
    'ADF': 'Integer',   # Forward strand allelic depths
    'ADR': 'Integer',   # Reverse strand allelic depths
    
    # Copy number fields
    'CN': 'Integer',    # Copy number genotype
    'CNQ': 'Float',     # Copy number genotype quality
    'CNL': 'Float',     # Copy number genotype likelihood
    'NQ': 'Integer',    # Phred style probability that the variant is novel
    'HAP': 'Integer',   # Unique haplotype identifier
    'AHAP': 'Integer',  # Alternative haplotype identifier
}

Field Number Constants

Constants for interpreting field number specifications in VCF headers.

field_counts: dict = {
    '.': None,  # Unknown number of values
    'A': -1,    # Number of alternate alleles
    'G': -2,    # Number of genotypes (including reference)
    'R': -3,    # Number of alleles (reference + alternates)
}

Usage Examples

import vcf

# Access version information
print(f"PyVCF version: {vcf.VERSION}")

# Check if INFO field is reserved
if 'DP' in vcf.RESERVED_INFO:
    print(f"DP is a reserved INFO field, type: {vcf.RESERVED_INFO['DP']}")

# Check if FORMAT field is reserved
if 'GT' in vcf.RESERVED_FORMAT:
    print(f"GT is a reserved FORMAT field, type: {vcf.RESERVED_FORMAT['GT']}")

# Validate custom field names against reserved fields
def validate_custom_info_field(field_name):
    if field_name in vcf.RESERVED_INFO:
        print(f"Warning: {field_name} is a reserved INFO field")
        return False
    return True

def validate_custom_format_field(field_name):
    if field_name in vcf.RESERVED_FORMAT:
        print(f"Warning: {field_name} is a reserved FORMAT field")
        return False
    return True

# Example validation
validate_custom_info_field('CUSTOM_INFO')  # OK
validate_custom_info_field('DP')           # Warning: reserved
validate_custom_format_field('CUSTOM_GT')  # OK
validate_custom_format_field('GT')         # Warning: reserved

Working with Reserved Fields

import vcf

reader = vcf.Reader(filename='variants.vcf')

for record in reader:
    # Access standard INFO fields
    if 'DP' in record.INFO:
        depth = record.INFO['DP']
        print(f"Total depth: {depth}")
    
    if 'AF' in record.INFO:
        allele_freqs = record.INFO['AF']
        print(f"Allele frequencies: {allele_freqs}")
    
    # Check for structural variant INFO fields
    if record.is_sv:
        if 'SVTYPE' in record.INFO:
            sv_type = record.INFO['SVTYPE']
            print(f"SV type: {sv_type}")
        
        if 'SVLEN' in record.INFO:
            sv_length = record.INFO['SVLEN']
            print(f"SV length: {sv_length}")
    
    # Access standard FORMAT fields for samples
    for call in record.samples:
        if hasattr(call.data, 'GT'):
            print(f"Sample {call.sample} genotype: {call.data.GT}")
        
        if hasattr(call.data, 'DP'):
            print(f"Sample {call.sample} depth: {call.data.DP}")
        
        if hasattr(call.data, 'GQ'):
            print(f"Sample {call.sample} quality: {call.data.GQ}")

Header Field Type Validation

import vcf

def validate_field_definition(field_dict, reserved_dict, field_type):
    """Validate field definitions against reserved specifications."""
    for field_id, field_info in field_dict.items():
        if field_id in reserved_dict:
            expected_type = reserved_dict[field_id]
            actual_type = field_info.type
            
            if actual_type != expected_type:
                print(f"Warning: {field_type} field {field_id} "
                     f"type mismatch: expected {expected_type}, "
                     f"got {actual_type}")

# Validate a VCF file's field definitions
reader = vcf.Reader(filename='variants.vcf')

# Validate INFO field definitions
validate_field_definition(reader.infos, vcf.RESERVED_INFO, 'INFO')

# Validate FORMAT field definitions  
validate_field_definition(reader.formats, vcf.RESERVED_FORMAT, 'FORMAT')

Custom Field Number Handling

import vcf

def interpret_field_number(num_str, num_alts=None, num_samples=None):
    """Interpret VCF field number specifications."""
    if num_str == '.':
        return None  # Variable number
    elif num_str == 'A':
        return num_alts if num_alts else -1
    elif num_str == 'G':
        # Number of genotypes = (n+1)(n+2)/2 where n = number of alleles
        if num_alts:
            n_alleles = num_alts + 1  # Include reference
            return (n_alleles * (n_alleles + 1)) // 2
        return -2
    elif num_str == 'R':
        return (num_alts + 1) if num_alts else -3
    else:
        try:
            return int(num_str)
        except ValueError:
            return None

# Example usage with a record
reader = vcf.Reader(filename='variants.vcf')
record = next(reader)

# Interpret field numbers for this record's context
num_alts = len(record.ALT)
print(f"Number of alternates: {num_alts}")

for field_id, field_info in reader.infos.items():
    expected_count = interpret_field_number(field_info.num, num_alts)
    print(f"INFO {field_id}: expects {expected_count} values")

Install with Tessl CLI

npx tessl i tessl/pypi-pyvcf

docs

constants.md

genotype-analysis.md

index.md

sample-filtering.md

utils.md

variant-records.md

vcf-filtering.md

vcf-parsing.md

vcf-writing.md

tile.json