# Sample-Based Filtering

Filter VCF files by sample during parsing to create subset files that contain only the chosen samples, for use in population studies and cohort analysis.

## Capabilities

### Sample Filter

Filter VCF files by sample name to create subset files that either contain only the specified samples or, with `invert=True`, exclude them.

```python { .api }
class SampleFilter:
    """Filter a VCF file by sample name and write the resulting subset."""

    def __init__(self, infile, outfile=None, filters=None, invert=False):
        """
        Initialize sample filter for VCF files.

        Parameters:
        - infile: str or file-like, input VCF file
        - outfile: str or file-like, optional output file
        - filters: list of str, sample names to include/exclude
        - invert: bool, if True exclude listed samples, if False include only listed samples
        """

    def set_filters(self, filters=None, invert=False):
        """
        Set sample filters after initialization.

        Parameters:
        - filters: list of str, sample names to include/exclude
        - invert: bool, if True exclude listed samples, if False include only listed samples
        """

    def write(self, outfile=None):
        """
        Write filtered VCF to output file.

        Parameters:
        - outfile: str or file-like, output file (uses initialization value if None)
        """

    # Attributes
    samples: list  # Original sample list
    parser: 'Reader'  # Modified Reader instance with filtered samples
```

### Usage Examples

```python
import vcf
from vcf.sample_filter import SampleFilter

# Include specific samples
filter_obj = SampleFilter(
    infile='input.vcf',
    outfile='subset.vcf',
    filters=['SAMPLE1', 'SAMPLE2', 'SAMPLE3'],
    invert=False
)
filter_obj.write()

# Exclude specific samples
filter_obj = SampleFilter(
    infile='input.vcf',
    outfile='filtered.vcf',
    filters=['BAD_SAMPLE1', 'BAD_SAMPLE2'],
    invert=True
)
filter_obj.write()

# Use with file handles
with open('input.vcf', 'r') as infile, open('output.vcf', 'w') as outfile:
    filter_obj = SampleFilter(
        infile=infile,
        outfile=outfile,
        filters=['KEEP1', 'KEEP2']
    )
    filter_obj.write()

# Dynamic filtering: set the filters after construction
filter_obj = SampleFilter('large_cohort.vcf')

# Filter to population 1
filter_obj.set_filters(['POP1_001', 'POP1_002', 'POP1_003'], invert=False)
filter_obj.write('population1.vcf')

# Filter to population 2
filter_obj.set_filters(['POP2_001', 'POP2_002', 'POP2_003'], invert=False)
filter_obj.write('population2.vcf')

# Access filtered parser
filter_obj = SampleFilter('input.vcf', filters=['SAMPLE1', 'SAMPLE2'])
reader = filter_obj.parser

print("Filtered samples:", reader.samples)
for record in reader:
    print(f"Variant {record.CHROM}:{record.POS} has {len(record.samples)} sample calls")
```

### Population Analysis Example

```python
import vcf
from vcf.sample_filter import SampleFilter

# Define population groups
populations = {
    'EUR': ['EUR001', 'EUR002', 'EUR003', 'EUR004', 'EUR005'],
    'ASN': ['ASN001', 'ASN002', 'ASN003', 'ASN004', 'ASN005'],
    'AFR': ['AFR001', 'AFR002', 'AFR003', 'AFR004', 'AFR005']
}

input_file = 'multi_population.vcf'

# Create population-specific VCF files
for pop_name, sample_list in populations.items():
    output_file = f'{pop_name.lower()}_variants.vcf'

    filter_obj = SampleFilter(
        infile=input_file,
        outfile=output_file,
        filters=sample_list,
        invert=False
    )
    filter_obj.write()
    print(f"Created {output_file} with {len(sample_list)} samples")

# Analyze each population
for pop_name, sample_list in populations.items():
    filter_obj = SampleFilter(input_file, filters=sample_list)
    reader = filter_obj.parser

    variant_count = 0
    high_freq_variants = 0

    for record in reader:
        variant_count += 1

        # Count the variant as common when any value in record.aaf exceeds 0.1
        if record.aaf and max(record.aaf) > 0.1:  # >10% frequency
            high_freq_variants += 1

    print(f"{pop_name}: {variant_count} variants, {high_freq_variants} common variants")
```

### Command Line Usage

```bash
# Command-line sample filtering
vcf_sample_filter.py --include SAMPLE1,SAMPLE2,SAMPLE3 input.vcf output.vcf
vcf_sample_filter.py --exclude BAD1,BAD2 input.vcf filtered.vcf
```

### Advanced Sample Selection

```python
import random

import vcf
from vcf.sample_filter import SampleFilter

# Read original file to inspect samples
reader = vcf.Reader(filename='input.vcf')
all_samples = reader.samples
print(f"Total samples: {len(all_samples)}")

# Filter samples by naming pattern
case_samples = [s for s in all_samples if s.startswith('CASE_')]
control_samples = [s for s in all_samples if s.startswith('CTRL_')]

print(f"Cases: {len(case_samples)}, Controls: {len(control_samples)}")

# Create case-only VCF
case_filter = SampleFilter(
    infile='input.vcf',
    outfile='cases_only.vcf',
    filters=case_samples
)
case_filter.write()

# Create control-only VCF
control_filter = SampleFilter(
    infile='input.vcf',
    outfile='controls_only.vcf',
    filters=control_samples
)
control_filter.write()

# Random subset of at most 100 samples
random_samples = random.sample(all_samples, min(100, len(all_samples)))

random_filter = SampleFilter(
    infile='input.vcf',
    outfile='random_subset.vcf',
    filters=random_samples
)
random_filter.write()
```