CtrlK
Blog | Docs | Log in | Get started
Tessl Logo

tessl/pypi-pyvcf

A VCFv4.0 and 4.1 parser for Python

Pending

Quality

Pending

Does it follow best practices?

Impact

Pending

No eval scenarios have been run

Overview
Eval results
Files

docs/utils.md

VCF Utilities

Utility functions for advanced VCF file operations including multi-file synchronization and sequence manipulation for comparative genomics workflows.

Capabilities

Multi-File VCF Processing

Process multiple VCF files simultaneously with coordinated iteration through variant records.

def walk_together(*readers, **kwargs):
    """
    Iterate over multiple VCF files simultaneously, yielding coordinated variant records.

    Parameters:
    - *readers: VCF Reader objects to iterate over together
    - **kwargs: the only keyword documented here is ``vcf_record_sort_key``,
      an optional function used for custom record sorting/comparison

    Yields:
    A list of _Record objects with one slot per reader, in the order the
    readers were passed, taken at synchronized positions. A slot is None
    when that reader has no variant at the current position.

    NOTE(review): presumably every input must already be ordered consistently
    with the sort key, because the readers are consumed by iteration rather
    than loaded and sorted - confirm against the PyVCF documentation.
    """

Sequence Manipulation

Utilities for working with reference and alternate allele sequences.

def trim_common_suffix(*sequences):
    """
    Remove the suffix shared by all sequences while keeping at least 1 character in each.

    Parameters:
    - *sequences: str, input sequences to trim

    Returns:
    List[str]: the sequences, in input order, with the common suffix removed.
    No sequence is shortened below one character, and sequences that share
    no suffix come back unchanged.
    """

Usage Examples

import vcf
from vcf.utils import walk_together, trim_common_suffix

# Synchronize multiple VCF files
reader1 = vcf.Reader(filename='population1.vcf')
reader2 = vcf.Reader(filename='population2.vcf')
reader3 = vcf.Reader(filename='population3.vcf')

# Labels listed in the same order as the readers passed to walk_together,
# so slot i of every yielded list can be matched to its population.
pop_names = ['pop1', 'pop2', 'pop3']

# Process variants across all files simultaneously
for records in walk_together(reader1, reader2, reader3):
    # A slot is None when that reader has no variant at this position, so
    # test for None explicitly instead of relying on record truthiness.
    present = [
        (pop_name, record)
        for pop_name, record in zip(pop_names, records)
        if record is not None
    ]

    if len(present) > 1:
        # All records yielded together sit at the same synchronized
        # position, so the first one present supplies the chromosome name.
        first_record = present[0][1]
        print(f"Shared variant at {first_record.CHROM}:")
        print(f"  Present in: {', '.join(pop_name for pop_name, _ in present)}")

        # Compare allele frequencies across populations
        for pop_name, record in present:
            if record.aaf:
                print(f"  {pop_name} AAF: {record.aaf}")

# Sequence manipulation examples
sequences = ['ATCGATCG', 'TTCGATCG', 'CTCGATCG']
trimmed = trim_common_suffix(*sequences)
print(f"Original: {sequences}")
print(f"Trimmed: {trimmed}")  # ['A', 'T', 'C'] - removed common 'TCGATCG'

# Example with variant alleles: these two end in different bases (G vs C),
# so there is no common suffix and both alleles come back unchanged
ref = 'ATCGATCG'
alt = 'ATCGATCC'
trimmed_alleles = trim_common_suffix(ref, alt)
print(f"REF: {ref} -> {trimmed_alleles[0]}")  # ATCGATCG -> ATCGATCG
print(f"ALT: {alt} -> {trimmed_alleles[1]}")  # ATCGATCC -> ATCGATCC

Advanced Multi-File Analysis

import vcf
from vcf.utils import walk_together

# Compare variant calling across different methods
caller1_reader = vcf.Reader(filename='gatk_variants.vcf')
caller2_reader = vcf.Reader(filename='freebayes_variants.vcf')
caller3_reader = vcf.Reader(filename='varscan_variants.vcf')

concordant_variants = []
discordant_variants = []

for records in walk_together(caller1_reader, caller2_reader, caller3_reader):
    # Keep only the callers that reported a variant at this position;
    # callers without one contribute None to the yielded list.
    called_records = [record for record in records if record is not None]

    # Count how many callers found this variant
    called_by = len(called_records)

    if called_by >= 2:  # Concordant if found by 2+ callers
        # Use the record with highest quality (a missing QUAL counts as 0)
        best_record = max(called_records, key=lambda x: x.QUAL or 0)
        # Annotate here, while called_by still describes THIS position.
        # Doing it later in the write loop would stamp every variant with
        # the count left over from the final iteration.
        # NOTE(review): no CALLERS INFO header line is declared on the
        # template reader used below - confirm the writer accepts that.
        best_record.add_info('CALLERS', called_by)
        concordant_variants.append(best_record)
    elif called_by == 1:  # Discordant if found by only 1 caller
        discordant_variants.append(called_records[0])

print(f"Concordant variants: {len(concordant_variants)}")
print(f"Discordant variants: {len(discordant_variants)}")

# Write high-confidence variants (found by multiple callers)
template_reader = vcf.Reader(filename='gatk_variants.vcf')
with open('high_confidence.vcf', 'w') as output:
    writer = vcf.Writer(output, template_reader)
    for variant in concordant_variants:
        writer.write_record(variant)
    writer.close()

Custom Record Sorting

import vcf
from vcf.utils import walk_together

def custom_sort_key(record):
    """Build a (chromosome rank, position) key for ordering records.

    A leading 'chr' is dropped from ``record.CHROM`` before ranking.
    Numeric names rank as their integer value, X as 23, Y as 24 and
    MT/M as 25. Any other name ranks 26, so all unrecognized contigs
    share one rank and are not distinguished from each other.

    Returns:
        tuple: ``(rank, record.POS)``
    """
    # Non-numeric chromosome names that have a fixed rank
    named_ranks = {'X': 23, 'Y': 24, 'MT': 25, 'M': 25}

    name = record.CHROM
    if name.startswith('chr'):
        name = name[3:]

    if name in named_ranks:
        rank = named_ranks[name]
    else:
        try:
            rank = int(name)
        except ValueError:
            rank = 26  # Unknown chromosomes last

    return (rank, record.POS)

# Pass the custom key so records from both files are compared with it.
# NOTE(review): the 'unsorted' file names suggest walk_together sorts its
# inputs; presumably each file must already follow the order defined by the
# key - confirm against the PyVCF documentation.
reader1 = vcf.Reader(filename='unsorted1.vcf')
reader2 = vcf.Reader(filename='unsorted2.vcf')

for records in walk_together(reader1, reader2, vcf_record_sort_key=custom_sort_key):
    record1, record2 = records
    # Process synchronized records with custom sort order
    pass

Install with Tessl CLI

npx tessl i tessl/pypi-pyvcf

docs

constants.md

genotype-analysis.md

index.md

sample-filtering.md

utils.md

variant-records.md

vcf-filtering.md

vcf-parsing.md

vcf-writing.md

tile.json