CtrlK
Blog | Docs | Log in | Get started
Tessl Logo

tessl/pypi-pyvcf

A VCFv4.0 and 4.1 parser for Python

Pending

Quality

Pending

Does it follow best practices?

Impact

Pending

No eval scenarios have been run

Overview
Eval results
Files

docs/sample-filtering.md

Sample-Based Filtering

Filter VCF files by sample during parsing to create subset files with specific samples for population studies and cohort analysis.

Capabilities

Sample Filter

Filter VCF files by sample names to create subset files containing only specified samples.

class SampleFilter:
    """
    Filter a VCF file by sample name to produce a subset of its samples.

    NOTE(review): upstream PyVCF appears to take `filters` as one
    comma-separated string and to treat the listed samples as the ones to
    drop unless `invert` is set -- TODO confirm the list form and the
    include/exclude polarity documented below against the installed version.
    """

    # Attributes
    samples: list  # Sample list as read from the input, before filtering
    parser: 'Reader'  # Reader instance modified to expose only the filtered samples

    def __init__(self, infile, outfile=None, filters=None, invert=False):
        """
        Create a sample filter for a VCF file.

        Parameters:
        - infile: str or file-like, the VCF file to read
        - outfile: str or file-like, optional destination for the filtered VCF
        - filters: list of str, names of the samples to include/exclude
        - invert: bool, True excludes the listed samples, False keeps only the listed samples
        """

    def set_filters(self, filters=None, invert=False):
        """
        Replace the sample filters on an already-constructed filter.

        Parameters:
        - filters: list of str, names of the samples to filter
        - invert: bool, reverses the filter logic
        """

    def write(self, outfile=None):
        """
        Write the filtered VCF to an output file.

        Parameters:
        - outfile: str or file-like, destination; when None the value given at
          initialization is used
        """

Usage Examples

import vcf
from vcf.sample_filter import SampleFilter

# Keep only the listed samples (invert=False)
wanted_samples = ['SAMPLE1', 'SAMPLE2', 'SAMPLE3']
filter_obj = SampleFilter(infile='input.vcf', outfile='subset.vcf',
                          filters=wanted_samples, invert=False)
filter_obj.write()

# Drop the listed samples and keep everything else (invert=True)
unwanted_samples = ['BAD_SAMPLE1', 'BAD_SAMPLE2']
filter_obj = SampleFilter(infile='input.vcf', outfile='filtered.vcf',
                          filters=unwanted_samples, invert=True)
filter_obj.write()

# Pass already-open file objects instead of paths
# NOTE(review): confirm that an open handle is accepted for infile in the
# installed version -- upstream PyVCF appears to forward it as a filename.
with open('input.vcf', 'r') as vcf_in:
    with open('output.vcf', 'w') as vcf_out:
        filter_obj = SampleFilter(infile=vcf_in, outfile=vcf_out, filters=['KEEP1', 'KEEP2'])
        filter_obj.write()

# Dynamic filtering
# The filter's parser is a Reader, which iterates over its input once. After
# write() has consumed it, a second set_filters()/write() on the same object
# has no records left to emit, so build a fresh SampleFilter per output file.
population_outputs = {
    'population1.vcf': ['POP1_001', 'POP1_002', 'POP1_003'],  # population 1
    'population2.vcf': ['POP2_001', 'POP2_002', 'POP2_003'],  # population 2
}

for output_path, population_samples in population_outputs.items():
    # Construct without filters, then choose the samples and the destination
    filter_obj = SampleFilter('large_cohort.vcf')
    filter_obj.set_filters(population_samples, invert=False)
    filter_obj.write(output_path)

# Iterate over the filtered records directly instead of writing a file
filter_obj = SampleFilter(infile='input.vcf', filters=['SAMPLE1', 'SAMPLE2'])
reader = filter_obj.parser

# The parser's sample list reflects the filter
kept_samples = reader.samples
print("Filtered samples:", kept_samples)

for record in reader:
    print(f"Variant {record.CHROM}:{record.POS} has {len(record.samples)} sample calls")

Population Analysis Example

import vcf
from vcf.sample_filter import SampleFilter

input_file = 'multi_population.vcf'

# Sample names grouped by population code
populations = dict(
    EUR=['EUR001', 'EUR002', 'EUR003', 'EUR004', 'EUR005'],
    ASN=['ASN001', 'ASN002', 'ASN003', 'ASN004', 'ASN005'],
    AFR=['AFR001', 'AFR002', 'AFR003', 'AFR004', 'AFR005'],
)

# Write one VCF per population, named after the lower-cased population code
for pop_name, sample_list in populations.items():
    output_file = f'{pop_name.lower()}_variants.vcf'
    filter_obj = SampleFilter(infile=input_file, outfile=output_file,
                              filters=sample_list, invert=False)
    filter_obj.write()
    print(f"Created {output_file} with {len(sample_list)} samples")

# Count total and common variants per population
for pop_name, sample_list in populations.items():
    filter_obj = SampleFilter(input_file, filters=sample_list)
    reader = filter_obj.parser

    variant_count = high_freq_variants = 0

    for record in reader:
        variant_count += 1

        # Allele frequencies for this record within the filtered population
        allele_freqs = record.aaf
        if not allele_freqs:
            continue
        if max(allele_freqs) > 0.1:  # >10% frequency
            high_freq_variants += 1

    print(f"{pop_name}: {variant_count} variants, {high_freq_variants} common variants")

Command Line Usage

# Command line sample filtering
# Usage shape shown here: <option> <comma-separated sample names> <input VCF> <output VCF>
# NOTE(review): upstream PyVCF's vcf_sample_filter.py appears to use -f FILTERS,
# -i/--invert and -o OUTFILE rather than --include/--exclude -- TODO confirm
# with `vcf_sample_filter.py -h` before relying on these flags.
vcf_sample_filter.py --include SAMPLE1,SAMPLE2,SAMPLE3 input.vcf output.vcf
vcf_sample_filter.py --exclude BAD1,BAD2 input.vcf filtered.vcf

Advanced Sample Selection

import vcf
from vcf.sample_filter import SampleFilter

# Read the original file to inspect its samples. Open the file ourselves so
# the handle is closed as soon as the sample names have been read.
with open('input.vcf', 'r') as vcf_handle:
    reader = vcf.Reader(vcf_handle)
    all_samples = reader.samples
print(f"Total samples: {len(all_samples)}")

# Split samples by naming pattern: CASE_* are cases, CTRL_* are controls
case_samples = [s for s in all_samples if s.startswith('CASE_')]
control_samples = [s for s in all_samples if s.startswith('CTRL_')]

print(f"Cases: {len(case_samples)}, Controls: {len(control_samples)}")

# Write a VCF restricted to the case samples
case_filter = SampleFilter('input.vcf', outfile='cases_only.vcf', filters=case_samples)
case_filter.write()

# Write a VCF restricted to the control samples
control_filter = SampleFilter('input.vcf', outfile='controls_only.vcf', filters=control_samples)
control_filter.write()

# Random sample subset
import random

# Draw at most 100 samples; fewer when the file has fewer than 100
subset_size = min(100, len(all_samples))
random_samples = random.sample(all_samples, subset_size)

random_filter = SampleFilter('input.vcf', outfile='random_subset.vcf', filters=random_samples)
random_filter.write()

Install with Tessl CLI

npx tessl i tessl/pypi-pyvcf

docs

constants.md

genotype-analysis.md

index.md

sample-filtering.md

utils.md

variant-records.md

vcf-filtering.md

vcf-parsing.md

vcf-writing.md

tile.json