CtrlK
BlogDocsLog inGet started
Tessl Logo

tessl/pypi-pyvcf

A VCFv4.0 and 4.1 parser for Python

Pending

Quality

Pending

Does it follow best practices?

Impact

Pending

No eval scenarios have been run

Overview
Eval results
Files

variant-records.mddocs/

Variant Record Analysis

Comprehensive variant record representation with coordinate properties, variant classification, and population genetics statistics for genomic analysis.

Capabilities

Variant Records

The _Record class represents individual variant sites with complete VCF information and computed properties.

class _Record:
    """
    Represents a single variant site (row) in a VCF file.
    """
    
    # Standard VCF fields
    CHROM: str  # Chromosome name
    POS: int    # 1-based position
    ID: str     # Variant identifier
    REF: str    # Reference allele
    ALT: list   # List of alternate alleles
    QUAL: float # Quality score
    FILTER: list # Filter status
    INFO: dict  # INFO field dictionary
    FORMAT: str # Format string
    samples: list # List of _Call objects
    
    # Coordinate properties
    start: int           # 0-based start coordinate (POS - 1)
    end: int             # 0-based end coordinate
    affected_start: int  # Start of affected region
    affected_end: int    # End of affected region
    alleles: list        # Combined REF and ALT alleles
    
    # Variant classification properties
    is_snp: bool         # True if variant is a SNP
    is_indel: bool       # True if variant is an indel
    is_sv: bool          # True if variant is a structural variant
    is_transition: bool  # True if SNP is a transition
    is_deletion: bool    # True if indel is a deletion
    is_monomorphic: bool # True for reference calls
    is_filtered: bool    # True if variant failed filters
    var_type: str        # "snp", "indel", "sv", "unknown"
    var_subtype: str     # "ts", "tv", "ins", "del", etc.
    
    # Structural variant properties
    sv_end: int          # SV end position (from INFO.END)
    is_sv_precise: bool  # True if SV coordinates are precise
    
    # Population statistics properties
    num_called: int      # Number of called samples
    call_rate: float     # Fraction of called genotypes
    num_hom_ref: int     # Number of homozygous reference calls
    num_hom_alt: int     # Number of homozygous alternate calls
    num_het: int         # Number of heterozygous calls
    num_unknown: int     # Number of uncalled genotypes
    aaf: list           # List of alternate allele frequencies
    nucl_diversity: float # Nucleotide diversity estimate
    heterozygosity: float # Site heterozygosity
    
    def genotype(self, name: str):
        """
        Get genotype call for specific sample.
        
        Parameters:
        - name: str, sample name
        
        Returns:
        _Call object for the sample
        """
        
    def add_format(self, fmt: str):
        """
        Add field to FORMAT.
        
        Parameters:
        - fmt: str, format field to add
        """
        
    def add_filter(self, flt: str):
        """
        Add filter to FILTER field.
        
        Parameters:
        - flt: str, filter name to add
        """
        
    def add_info(self, info: str, value=True):
        """
        Add INFO field.
        
        Parameters:
        - info: str, INFO field name
        - value: INFO field value (default True for flags)
        """
        
    def get_hom_refs(self):
        """
        Get list of homozygous reference calls.
        
        Returns:
        List of _Call objects with homozygous reference genotypes
        """
        
    def get_hom_alts(self):
        """
        Get list of homozygous alternate calls.
        
        Returns:
        List of _Call objects with homozygous alternate genotypes
        """
        
    def get_hets(self):
        """
        Get list of heterozygous calls.
        
        Returns:
        List of _Call objects with heterozygous genotypes
        """
        
    def get_unknowns(self):
        """
        Get list of uncalled genotypes.
        
        Returns:
        List of _Call objects with uncalled genotypes
        """

Usage Examples

import vcf

reader = vcf.Reader(filename='variants.vcf')

for record in reader:
    # Access basic variant information
    print(f"Variant: {record.CHROM}:{record.POS} {record.REF}>{record.ALT}")
    
    # Check variant classification
    if record.is_snp:
        print(f"SNP - Transition: {record.is_transition}")
    elif record.is_indel:
        print(f"Indel - Deletion: {record.is_deletion}")
    
    # Population statistics
    print(f"Call rate: {record.call_rate:.2f}")
    print(f"Heterozygosity: {record.heterozygosity:.3f}")
    if record.aaf:
        print(f"Alternate allele frequencies: {record.aaf}")
    
    # Access sample genotypes
    hom_refs = record.get_hom_refs()
    hets = record.get_hets()
    hom_alts = record.get_hom_alts()
    
    print(f"Genotype counts - Hom ref: {len(hom_refs)}, "
          f"Het: {len(hets)}, Hom alt: {len(hom_alts)}")
    
    # Get specific sample genotype
    if 'SAMPLE1' in reader.samples:
        call = record.genotype('SAMPLE1')
        print(f"SAMPLE1 genotype: {call.gt_bases}")
    
    # Modify record
    record.add_info('ANALYZED', True)
    if record.QUAL and record.QUAL < 30:
        record.add_filter('LowQual')

Types

Alternate Allele Classes

class _AltRecord:
    """Abstract base class for alternate allele representations."""
    pass

class _Substitution(_AltRecord):
    """Regular nucleotide substitutions (SNV/MNV)."""
    type: str      # "SNV" or "MNV"
    sequence: str  # Alternate sequence

class _Breakend(_AltRecord):
    """Paired breakend for structural variants."""
    type: str                    # "BND"
    chr: str                     # Mate chromosome
    pos: int                     # Mate position
    orientation: bool            # Breakend orientation
    remoteOrientation: bool      # Mate orientation
    connectingSequence: str      # Connecting sequence
    withinMainAssembly: bool     # True if mate in main assembly

class _SingleBreakend(_Breakend):
    """Single breakend (inherits from _Breakend)."""
    pass

class _SV(_AltRecord):
    """Symbolic structural variant alleles (e.g., <DEL>, <DUP>)."""
    type: str  # SV type string

Install with Tessl CLI

npx tessl i tessl/pypi-pyvcf

docs

constants.md

genotype-analysis.md

index.md

sample-filtering.md

utils.md

variant-records.md

vcf-filtering.md

vcf-parsing.md

vcf-writing.md

tile.json