0
# VCF/BCF Variant Files
1
2
Comprehensive support for reading and writing Variant Call Format (VCF) files and their binary counterpart (BCF). Provides header management, sample data access, filtering, and indexed region queries.
3
4
## Capabilities
5
6
### VariantFile
7
8
Main interface for reading and writing VCF/BCF files with full header and sample support.
9
10
```python { .api }
11
class VariantFile:
12
def __init__(self, filepath, mode="r", header=None, drop_samples=False, duplicate_filehandle=True, ignore_truncation=False, format_options=None, threads=1, index=None):
13
"""
14
Open a VCF/BCF file for reading or writing.
15
16
Parameters:
17
- filepath: str, path to VCF/BCF file
18
- mode: str, file mode: 'r' (read) or 'w' (write) for text VCF; append 'b' for BCF ('rb', 'wb'); 'wbu' or 'wb0' write uncompressed BCF
19
- header: VariantHeader, header for writing
20
- drop_samples: bool, ignore sample data
21
- duplicate_filehandle: bool, duplicate a passed-in file handle before handing it to htslib, so the stream is not closed twice (set False to disable)
22
- ignore_truncation: bool, issue a warning instead of raising an error when a bgzipped file appears truncated (missing EOF marker)
23
- threads: int, number of threads for compression and decompression
24
- index: str, path to index file
25
26
Returns:
27
VariantFile object
28
"""
29
30
def fetch(self, contig=None, start=None, stop=None, region=None, reopen=False):
31
"""
32
Fetch variant records from a region.
33
34
Parameters:
35
- contig: str, chromosome/contig name
36
- start: int, 0-based inclusive start position
37
- stop: int, 0-based exclusive stop position (start/stop form a half-open interval)
38
- region: str, samtools-style region string (contig:start-stop, 1-based inclusive); alternative to contig/start/stop
39
- reopen: bool, open a separate file handle for this iterator so that several iterators can be used on the same file at once
40
41
Returns:
42
Iterator of VariantRecord objects
43
"""
44
45
def write(self, record):
46
"""
47
Write a variant record.
48
49
Parameters:
50
- record: VariantRecord, variant to write
51
"""
52
53
def new_record(self, contig=None, start=None, stop=None, alleles=None, id=None, qual=None, filter=None, info=None, samples=None, **kwargs):
54
"""
55
Create new variant record.
56
57
Parameters:
58
- contig: str, chromosome name
59
- start: int, 0-based start position
60
- stop: int, 0-based exclusive stop position
61
- alleles: tuple, reference and alternate alleles
62
- id: str, variant identifier
63
- qual: float, quality score
64
- filter: str/list, filter status
65
- info: dict, INFO field data
66
- samples: dict, sample data
67
68
Returns:
69
VariantRecord object
70
"""
71
72
def copy_record(self, record):
73
"""
74
Create copy of variant record.
75
76
Returns:
77
VariantRecord object
78
"""
79
80
def close(self):
81
"""Close the file."""
82
83
# Properties
84
@property
85
def header(self) -> "VariantHeader":
86
"""File header."""
87
88
@property
89
def index(self):
90
"""File index."""
91
92
@property
93
def filename(self) -> str:
94
"""Filename."""
95
96
@property
97
def is_open(self) -> bool:
98
"""True if file is open."""
99
100
@property
101
def category(self) -> str:
102
"""File category."""
103
104
@property
105
def format(self) -> str:
106
"""File format."""
107
108
def check_index(self):
109
"""
110
Check if index exists and is valid.
111
112
Returns:
113
bool, True if valid index
114
"""
115
```
116
117
### VariantRecord
118
119
Individual variant record with position, alleles, quality, and sample information.
120
121
```python { .api }
122
class VariantRecord:
123
# Core properties
124
@property
125
def rid(self) -> int:
126
"""Reference sequence ID."""
127
128
@property
129
def contig(self) -> str:
130
"""Chromosome/contig name."""
131
132
@contig.setter
133
def contig(self, value: str):
134
"""Set chromosome name."""
135
136
@property
137
def pos(self) -> int:
138
"""1-based position."""
139
140
@pos.setter
141
def pos(self, value: int):
142
"""Set position."""
143
144
@property
145
def start(self) -> int:
146
"""0-based inclusive start position."""
147
148
@property
149
def stop(self) -> int:
150
"""0-based exclusive stop position (end of the half-open interval)."""
151
152
@property
153
def id(self) -> str:
154
"""Variant identifier."""
155
156
@id.setter
157
def id(self, value: str):
158
"""Set variant identifier."""
159
160
@property
161
def ref(self) -> str:
162
"""Reference allele."""
163
164
@ref.setter
165
def ref(self, value: str):
166
"""Set reference allele."""
167
168
@property
169
def alts(self) -> tuple:
170
"""Alternate alleles."""
171
172
@alts.setter
173
def alts(self, value: tuple):
174
"""Set alternate alleles."""
175
176
@property
177
def alleles(self) -> tuple:
178
"""All alleles (reference + alternates)."""
179
180
@alleles.setter
181
def alleles(self, value: tuple):
182
"""Set all alleles."""
183
184
@property
185
def qual(self) -> float:
186
"""Quality score."""
187
188
@qual.setter
189
def qual(self, value: float):
190
"""Set quality score."""
191
192
# Complex properties
193
@property
194
def filter(self) -> "VariantRecordFilter":
195
"""Filter information."""
196
197
@property
198
def info(self) -> "VariantRecordInfo":
199
"""INFO field data."""
200
201
@property
202
def format(self) -> "VariantRecordFormat":
203
"""FORMAT field definition."""
204
205
@property
206
def samples(self) -> "VariantRecordSamples":
207
"""Sample data."""
208
209
# Methods
210
def copy(self):
211
"""
212
Create copy of record.
213
214
Returns:
215
VariantRecord object
216
"""
217
218
def translate(self, mapping):
219
"""
220
Translate chromosome names.
221
222
Parameters:
223
- mapping: dict, chromosome name mapping
224
"""
225
226
def to_string(self):
227
"""
228
Convert to VCF format string.
229
230
Returns:
231
str, VCF line
232
"""
233
```
234
235
### VariantHeader
236
237
VCF/BCF header containing metadata, sample information, and field definitions.
238
239
```python { .api }
240
class VariantHeader:
241
def __init__(self):
242
"""Create new variant header."""
243
244
# Properties
245
@property
246
def version(self) -> str:
247
"""VCF format version."""
248
249
@property
250
def samples(self) -> "VariantHeaderSamples":
251
"""Sample names and metadata."""
252
253
@property
254
def records(self) -> "VariantHeaderRecords":
255
"""Header records."""
256
257
@property
258
def contigs(self) -> "VariantHeaderContigs":
259
"""Contig information."""
260
261
@property
262
def filters(self) -> "VariantHeaderRecords":
263
"""FILTER definitions."""
264
265
@property
266
def info(self) -> "VariantHeaderRecords":
267
"""INFO field definitions."""
268
269
@property
270
def formats(self) -> "VariantHeaderRecords":
271
"""FORMAT field definitions."""
272
273
# Methods
274
def add_record(self, record):
275
"""
276
Add header record.
277
278
Parameters:
279
- record: VariantHeaderRecord, record to add
280
"""
281
282
def add_sample(self, name):
283
"""
284
Add sample.
285
286
Parameters:
287
- name: str, sample name
288
"""
289
290
def add_line(self, line):
291
"""
292
Add header line.
293
294
Parameters:
295
- line: str, header line
296
"""
297
298
def copy(self):
299
"""
300
Create copy of header.
301
302
Returns:
303
VariantHeader object
304
"""
305
306
def merge(self, other):
307
"""
308
Merge with another header.
309
310
Parameters:
311
- other: VariantHeader, header to merge
312
"""
313
314
def subset(self, samples):
315
"""
316
Create subset with specific samples.
317
318
Parameters:
319
- samples: list, sample names to include
320
321
Returns:
322
VariantHeader object
323
"""
324
325
def to_string(self):
326
"""
327
Convert to VCF header string.
328
329
Returns:
330
str, VCF header
331
"""
332
```
333
334
### VariantRecordSamples
335
336
Sample data access for variant records with genotype and field information.
337
338
```python { .api }
339
class VariantRecordSamples:
340
def __getitem__(self, sample):
341
"""
342
Get sample data.
343
344
Parameters:
345
- sample: str/int, sample name or index
346
347
Returns:
348
VariantRecordSample object
349
"""
350
351
def __contains__(self, sample):
352
"""Check if sample exists."""
353
354
def __len__(self):
355
"""Number of samples."""
356
357
def __iter__(self):
358
"""Iterate over samples."""
359
360
def keys(self):
361
"""
362
Get sample names.
363
364
Returns:
365
Iterator of sample names
366
"""
367
368
def values(self):
369
"""
370
Get sample data.
371
372
Returns:
373
Iterator of VariantRecordSample objects
374
"""
375
376
def items(self):
377
"""
378
Get sample items.
379
380
Returns:
381
Iterator of (name, VariantRecordSample) tuples
382
"""
383
```
384
385
### VariantRecordSample
386
387
Individual sample data within a variant record.
388
389
```python { .api }
390
class VariantRecordSample:
391
def __getitem__(self, field):
392
"""
393
Get field value.
394
395
Parameters:
396
- field: str, field name
397
398
Returns:
399
Field value
400
"""
401
402
def __setitem__(self, field, value):
403
"""
404
Set field value.
405
406
Parameters:
407
- field: str, field name
408
- value: field value
409
"""
410
411
def __contains__(self, field):
412
"""Check if field exists."""
413
414
def get(self, field, default=None):
415
"""
416
Get field with default.
417
418
Returns:
419
Field value or default
420
"""
421
422
def keys(self):
423
"""
424
Get field names.
425
426
Returns:
427
Iterator of field names
428
"""
429
430
def values(self):
431
"""
432
Get field values.
433
434
Returns:
435
Iterator of field values
436
"""
437
438
def items(self):
439
"""
440
Get field items.
441
442
Returns:
443
Iterator of (field, value) tuples
444
"""
445
446
@property
447
def name(self) -> str:
448
"""Sample name."""
449
450
# Genotype shortcuts
451
@property
452
def allele_indices(self) -> tuple:
453
"""Genotype allele indices."""
454
455
@property
456
def alleles(self) -> tuple:
457
"""Genotype alleles."""
458
459
@property
460
def phased(self) -> bool:
461
"""True if genotype is phased."""
462
```
463
464
### VariantRecordInfo
465
466
INFO field data access for variant records.
467
468
```python { .api }
469
class VariantRecordInfo:
470
def __getitem__(self, key):
471
"""
472
Get INFO field value.
473
474
Parameters:
475
- key: str, INFO field name
476
477
Returns:
478
Field value
479
"""
480
481
def __setitem__(self, key, value):
482
"""
483
Set INFO field value.
484
485
Parameters:
486
- key: str, INFO field name
487
- value: field value
488
"""
489
490
def __delitem__(self, key):
491
"""Delete INFO field."""
492
493
def __contains__(self, key):
494
"""Check if INFO field exists."""
495
496
def __len__(self):
497
"""Number of INFO fields."""
498
499
def __iter__(self):
500
"""Iterate over INFO field names."""
501
502
def get(self, key, default=None):
503
"""
504
Get INFO field with default.
505
506
Returns:
507
Field value or default
508
"""
509
510
def keys(self):
511
"""
512
Get INFO field names.
513
514
Returns:
515
Iterator of field names
516
"""
517
518
def values(self):
519
"""
520
Get INFO field values.
521
522
Returns:
523
Iterator of field values
524
"""
525
526
def items(self):
527
"""
528
Get INFO field items.
529
530
Returns:
531
Iterator of (field, value) tuples
532
"""
533
534
def clear(self):
535
"""Remove all INFO fields."""
536
537
def update(self, other):
538
"""
539
Update with another INFO object.
540
541
Parameters:
542
- other: dict/VariantRecordInfo, data to update with
543
"""
544
```
545
546
### VariantRecordFilter
547
548
Filter status information for variant records.
549
550
```python { .api }
551
class VariantRecordFilter:
552
def __contains__(self, name):
553
"""Check if filter is applied."""
554
555
def __iter__(self):
556
"""Iterate over applied filters."""
557
558
def __len__(self):
559
"""Number of applied filters."""
560
561
def add(self, name):
562
"""
563
Add filter.
564
565
Parameters:
566
- name: str, filter name
567
"""
568
569
def clear(self):
570
"""Remove all filters."""
571
```
572
573
## Usage Examples
574
575
### Basic File Reading
576
577
```python
578
import pysam
579
580
# Read VCF file
581
with pysam.VariantFile("input.vcf") as vcffile:
582
# Iterate over all variants
583
for record in vcffile:
584
print(f"{record.contig}:{record.pos} {record.ref}->{record.alts}")
585
586
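# Fetch variants in region (requires an indexed file: bgzipped VCF with a tabix index, or BCF with a CSI index; a plain-text .vcf cannot be fetched)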
587
for record in vcffile.fetch("chr1", 1000, 2000):
588
if record.qual is not None and record.qual >= 30:
589
print(f"High quality variant: {record.id}")
590
591
# Access sample data
592
with pysam.VariantFile("input.vcf") as vcffile:
593
for record in vcffile:
594
for sample_name in record.samples:
595
sample = record.samples[sample_name]
596
genotype = sample["GT"]
597
print(f"Sample {sample_name}: {genotype}")
598
```
599
600
### Writing VCF Files
601
602
```python
603
import pysam
604
605
# Create header
606
header = pysam.VariantHeader()
607
header.add_line('##fileformat=VCFv4.2')
608
header.add_line('##contig=<ID=chr1,length=249250621>')
609
header.add_sample("Sample1")
610
header.add_sample("Sample2")
611
612
# Add INFO and FORMAT definitions
613
header.add_line('##INFO=<ID=DP,Number=1,Type=Integer,Description="Total Depth">')
614
header.add_line('##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">')
615
header.add_line('##FORMAT=<ID=DP,Number=1,Type=Integer,Description="Read Depth">')
616
617
with pysam.VariantFile("output.vcf", "w", header=header) as vcffile:
618
# Create variant record
619
record = vcffile.new_record(
620
contig="chr1",
621
start=100,
622
alleles=("A", "T"),
623
qual=60.0,
624
info={"DP": 100}
625
)
626
627
# Set sample data
628
record.samples["Sample1"]["GT"] = (0, 1) # Het
629
record.samples["Sample1"]["DP"] = 50
630
record.samples["Sample2"]["GT"] = (1, 1) # Hom alt
631
record.samples["Sample2"]["DP"] = 45
632
633
vcffile.write(record)
634
```
635
636
### Filtering and Processing
637
638
```python
639
import pysam
640
641
with pysam.VariantFile("input.vcf") as infile:
642
# Create output with same header
643
with pysam.VariantFile("filtered.vcf", "w", header=infile.header) as outfile:
644
for record in infile:
645
# Filter by quality and depth
646
if record.qual is not None and record.qual >= 30 and record.info.get("DP", 0) >= 10:
647
# Check if any sample is homozygous alternate
648
has_hom_alt = False
649
for sample in record.samples.values():
650
if sample["GT"] == (1, 1):
651
has_hom_alt = True
652
break
653
654
if has_hom_alt:
655
outfile.write(record)
656
```