A VCFv4.0 and 4.1 parser for Python
—
Quality
Pending
Does it follow best practices?
Impact
Pending
No eval scenarios have been run
Filter VCF files by sample during parsing to create subset files with specific samples for population studies and cohort analysis.
Filter VCF files by sample names to create subset files containing only specified samples.
class SampleFilter:
def __init__(self, infile, outfile=None, filters=None, invert=False):
"""
Initialize sample filter for VCF files.
Parameters:
- infile: str or file-like, input VCF file
- outfile: str or file-like, optional output file
- filters: list of str, sample names to include/exclude
- invert: bool, if True exclude listed samples, if False include only listed samples
"""
def set_filters(self, filters=None, invert=False):
"""
Set sample filters after initialization.
Parameters:
- filters: list of str, sample names to filter
- invert: bool, invert filter logic
"""
def write(self, outfile=None):
"""
Write filtered VCF to output file.
Parameters:
- outfile: str or file-like, output file (uses initialization value if None)
"""
# Properties
samples: list # Original sample list
parser: 'Reader' # Modified Reader instance with filtered samples
import vcf
from vcf.sample_filter import SampleFilter
# Include specific samples
filter_obj = SampleFilter(
infile='input.vcf',
outfile='subset.vcf',
filters=['SAMPLE1', 'SAMPLE2', 'SAMPLE3'],
invert=False
)
filter_obj.write()
# Exclude specific samples
filter_obj = SampleFilter(
infile='input.vcf',
outfile='filtered.vcf',
filters=['BAD_SAMPLE1', 'BAD_SAMPLE2'],
invert=True
)
filter_obj.write()
# Use with file handles
with open('input.vcf', 'r') as infile, open('output.vcf', 'w') as outfile:
filter_obj = SampleFilter(
infile=infile,
outfile=outfile,
filters=['KEEP1', 'KEEP2']
)
filter_obj.write()
# Dynamic filtering
filter_obj = SampleFilter('large_cohort.vcf')
# Filter to population 1
filter_obj.set_filters(['POP1_001', 'POP1_002', 'POP1_003'], invert=False)
filter_obj.write('population1.vcf')
# Filter to population 2
filter_obj.set_filters(['POP2_001', 'POP2_002', 'POP2_003'], invert=False)
filter_obj.write('population2.vcf')
# Access filtered parser
filter_obj = SampleFilter('input.vcf', filters=['SAMPLE1', 'SAMPLE2'])
reader = filter_obj.parser
print("Filtered samples:", reader.samples)
for record in reader:
print(f"Variant {record.CHROM}:{record.POS} has {len(record.samples)} sample calls")
import vcf
from vcf.sample_filter import SampleFilter
# Define population groups
populations = {
'EUR': ['EUR001', 'EUR002', 'EUR003', 'EUR004', 'EUR005'],
'ASN': ['ASN001', 'ASN002', 'ASN003', 'ASN004', 'ASN005'],
'AFR': ['AFR001', 'AFR002', 'AFR003', 'AFR004', 'AFR005']
}
input_file = 'multi_population.vcf'
# Create population-specific VCF files
for pop_name, sample_list in populations.items():
output_file = f'{pop_name.lower()}_variants.vcf'
filter_obj = SampleFilter(
infile=input_file,
outfile=output_file,
filters=sample_list,
invert=False
)
filter_obj.write()
print(f"Created {output_file} with {len(sample_list)} samples")
# Analyze each population
for pop_name, sample_list in populations.items():
filter_obj = SampleFilter(input_file, filters=sample_list)
reader = filter_obj.parser
variant_count = 0
high_freq_variants = 0
for record in reader:
variant_count += 1
# Calculate allele frequency in this population
if record.aaf and max(record.aaf) > 0.1: # >10% frequency
high_freq_variants += 1
print(f"{pop_name}: {variant_count} variants, {high_freq_variants} common variants")
# Command line sample filtering
vcf_sample_filter.py --include SAMPLE1,SAMPLE2,SAMPLE3 input.vcf output.vcf
vcf_sample_filter.py --exclude BAD1,BAD2 input.vcf filtered.vcf
import vcf
from vcf.sample_filter import SampleFilter
# Read original file to inspect samples
reader = vcf.Reader(filename='input.vcf')
all_samples = reader.samples
print(f"Total samples: {len(all_samples)}")
# Filter samples by naming pattern
case_samples = [s for s in all_samples if s.startswith('CASE_')]
control_samples = [s for s in all_samples if s.startswith('CTRL_')]
print(f"Cases: {len(case_samples)}, Controls: {len(control_samples)}")
# Create case-only VCF
case_filter = SampleFilter(
infile='input.vcf',
outfile='cases_only.vcf',
filters=case_samples
)
case_filter.write()
# Create control-only VCF
control_filter = SampleFilter(
infile='input.vcf',
outfile='controls_only.vcf',
filters=control_samples
)
control_filter.write()
# Random sample subset
import random
random_samples = random.sample(all_samples, min(100, len(all_samples)))
random_filter = SampleFilter(
infile='input.vcf',
outfile='random_subset.vcf',
filters=random_samples
)
random_filter.write()
Install with Tessl CLI
npx tessl i tessl/pypi-pyvcf