0
# VCF Utilities
1
2
Utility functions for advanced VCF file operations including multi-file synchronization and sequence manipulation for comparative genomics workflows.
3
4
## Capabilities
5
6
### Multi-File VCF Processing
7
8
Process multiple VCF files simultaneously with coordinated iteration through variant records.
9
10
```python { .api }
11
def walk_together(*readers, **kwargs):
12
"""
13
Iterate over multiple VCF files simultaneously, yielding coordinated variant records.
14
15
Parameters:
16
- *readers: VCF Reader objects to iterate over together
17
- **kwargs: vcf_record_sort_key - Optional function for custom record sorting/comparison
18
19
Yields:
20
List of _Record objects, one from each reader at synchronized positions.
21
Records are None for readers that don't have variants at the current position.
22
"""
23
```
24
25
### Sequence Manipulation
26
27
Utilities for working with reference and alternate allele sequences.
28
29
```python { .api }
30
def trim_common_suffix(*sequences):
31
"""
32
Remove common suffix from sequences while keeping at least 1 character.
33
34
Parameters:
35
- *sequences: str, input sequences to trim
36
37
Returns:
38
List[str]: Sequences with common suffix removed, maintaining minimum length
39
"""
40
```
41
42
### Usage Examples
43
44
```python
45
import vcf
46
from vcf.utils import walk_together, trim_common_suffix
47
48
# Synchronize multiple VCF files
49
reader1 = vcf.Reader(filename='population1.vcf')
50
reader2 = vcf.Reader(filename='population2.vcf')
51
reader3 = vcf.Reader(filename='population3.vcf')
52
53
# Process variants across all files simultaneously
54
for records in walk_together(reader1, reader2, reader3):
55
record1, record2, record3 = records
56
57
# Check which files have variants at this position
58
present_in = []
59
if record1: present_in.append('pop1')
60
if record2: present_in.append('pop2')
61
if record3: present_in.append('pop3')
62
63
if len(present_in) > 1:
64
print(f"Shared variant at {record1.CHROM if record1 else record2.CHROM if record2 else record3.CHROM}:")
65
print(f" Present in: {', '.join(present_in)}")
66
67
# Compare allele frequencies across populations
68
for i, record in enumerate(records):
69
if record and record.aaf:
70
pop_name = ['pop1', 'pop2', 'pop3'][i]
71
print(f" {pop_name} AAF: {record.aaf}")
72
73
# Sequence manipulation examples
74
sequences = ['ATCGATCG', 'TTCGATCG', 'CTCGATCG']
75
trimmed = trim_common_suffix(*sequences)
76
print(f"Original: {sequences}")
77
print(f"Trimmed: {trimmed}") # ['ATC', 'TTC', 'CTC'] - removed common 'GATCG'
78
79
# Example with variant alleles
80
ref = 'ATCGATCG'
81
alt = 'ATCGATCC'
82
trimmed_alleles = trim_common_suffix(ref, alt)
83
print(f"REF: {ref} -> {trimmed_alleles[0]}") # ATCGATCG -> ATCGATCG
84
print(f"ALT: {alt} -> {trimmed_alleles[1]}") # ATCGATCC -> ATCGATCC
85
```
86
87
### Advanced Multi-File Analysis
88
89
```python
90
import vcf
91
from vcf.utils import walk_together
92
93
# Compare variant calling across different methods
94
caller1_reader = vcf.Reader(filename='gatk_variants.vcf')
95
caller2_reader = vcf.Reader(filename='freebayes_variants.vcf')
96
caller3_reader = vcf.Reader(filename='varscan_variants.vcf')
97
98
concordant_variants = []
99
discordant_variants = []
100
101
for records in walk_together(caller1_reader, caller2_reader, caller3_reader):
102
gatk_record, freebayes_record, varscan_record = records
103
104
# Count how many callers found this variant
105
called_by = sum(1 for record in records if record is not None)
106
107
if called_by >= 2: # Concordant if found by 2+ callers
108
# Use the record with highest quality
109
best_record = max([r for r in records if r], key=lambda x: x.QUAL or 0)
110
concordant_variants.append(best_record)
111
elif called_by == 1: # Discordant if found by only 1 caller
112
solo_record = next(r for r in records if r)
113
discordant_variants.append(solo_record)
114
115
print(f"Concordant variants: {len(concordant_variants)}")
116
print(f"Discordant variants: {len(discordant_variants)}")
117
118
# Write high-confidence variants (found by multiple callers)
119
template_reader = vcf.Reader(filename='gatk_variants.vcf')
120
with open('high_confidence.vcf', 'w') as output:
121
writer = vcf.Writer(output, template_reader)
122
for variant in concordant_variants:
123
variant.add_info('CALLERS', called_by) # Add metadata
124
writer.write_record(variant)
125
writer.close()
126
```
127
128
### Custom Record Sorting
129
130
```python
131
import vcf
132
from vcf.utils import walk_together
133
134
def custom_sort_key(record):
135
"""Custom sorting function prioritizing by chromosome, then position."""
136
# Convert chromosome names to sortable format
137
chrom = record.CHROM
138
if chrom.startswith('chr'):
139
chrom = chrom[3:]
140
141
# Handle sex chromosomes and others
142
if chrom == 'X':
143
chrom_num = 23
144
elif chrom == 'Y':
145
chrom_num = 24
146
elif chrom == 'MT' or chrom == 'M':
147
chrom_num = 25
148
else:
149
try:
150
chrom_num = int(chrom)
151
except ValueError:
152
chrom_num = 26 # Unknown chromosomes last
153
154
return (chrom_num, record.POS)
155
156
# Use custom sorting for non-standard VCF files
157
reader1 = vcf.Reader(filename='unsorted1.vcf')
158
reader2 = vcf.Reader(filename='unsorted2.vcf')
159
160
for records in walk_together(reader1, reader2, vcf_record_sort_key=custom_sort_key):
161
record1, record2 = records
162
# Process synchronized records with custom sort order
163
pass
164
```