# I/O Operations

Scikit-allel provides comprehensive file I/O capabilities for working with standard genetic data formats including VCF, GFF, and FASTA files, with support for large datasets and various output formats.

## VCF File Operations

### Reading VCF Files

Read VCF (Variant Call Format) files into structured arrays.

```python
import allel

# Read entire VCF file
variants, samples, genotypes = allel.read_vcf('data.vcf')

# Read specific fields
variants, samples, genotypes = allel.read_vcf(
    'data.vcf',
    fields=['variants/CHROM', 'variants/POS', 'variants/REF', 'variants/ALT', 'calldata/GT']
)

# Read with region filtering
data = allel.read_vcf('data.vcf', region='chr1:1000000-2000000')

# Read with sample selection
data = allel.read_vcf('data.vcf', samples=['sample1', 'sample2', 'sample3'])
```
{ .api }

**Key Functions:**

```python
def read_vcf(input, fields=None, exclude_fields=None, rename_fields=None,
             types=None, numbers=None, alt_number=DEFAULT_ALT_NUMBER, fills=None,
             region=None, tabix='tabix', samples=None, transformers=None,
             buffer_size=DEFAULT_BUFFER_SIZE, chunk_length=DEFAULT_CHUNK_LENGTH, log=None):
    """
    Read a VCF file into structured arrays.

    Args:
        input (str): Path to VCF file or file-like object
        fields (list, optional): Specific fields to read (e.g., ['variants/CHROM', 'calldata/GT'])
        exclude_fields (list, optional): Fields to exclude from reading
        rename_fields (dict, optional): Mapping to rename fields
        types (dict, optional): Data types for specific fields
        numbers (dict, optional): Number specifications for INFO fields
        alt_number (int): Number of alternate alleles to read (default: DEFAULT_ALT_NUMBER)
        fills (dict, optional): Fill values for missing data
        region (str, optional): Genomic region to read (e.g., 'chr1:1000-2000')
        tabix (str): Path to tabix executable for indexed files
        samples (list, optional): Specific samples to include
        transformers (dict, optional): Custom transformation functions
        buffer_size (int): Buffer size for file reading
        chunk_length (int): Number of variants to read per chunk
        log (file, optional): Log file for messages

    Returns:
        tuple: (variants, samples, calldata) structured arrays
    """

def iter_vcf_chunks(input, fields=None, types=None, numbers=None, alt_number=None,
                    chunk_length=1000, buffer_size=DEFAULT_BUFFER_SIZE, region=None,
                    samples=None, transformers=None, log=None):
    """
    Iterate over VCF file in chunks for memory-efficient processing.

    Args:
        input (str): Path to VCF file
        fields (list, optional): Fields to read
        chunk_length (int): Variants per chunk
        buffer_size (int): File buffer size
        region (str, optional): Genomic region
        samples (list, optional): Sample subset
        transformers (dict, optional): Field transformers
        log (file, optional): Log file

    Yields:
        tuple: (variants, samples, calldata) for each chunk
    """

def read_vcf_headers(input):
    """
    Read only the header information from a VCF file.

    Args:
        input (str): Path to VCF file or file-like object

    Returns:
        dict: Header information including metadata and column names
    """
```
{ .api }

### VCF Format Conversion

Convert VCF files to other formats for analysis or storage.

```python
# Convert to NumPy compressed format
allel.vcf_to_npz('input.vcf', 'output.npz', compression='gzip')

# Convert to HDF5
allel.vcf_to_hdf5('input.vcf', 'output.h5', group='/variants',
                  compression='gzip', chunks=True)

# Convert to Zarr format
allel.vcf_to_zarr('input.vcf', 'output.zarr', compressor='blosc')

# Convert to pandas DataFrame
df = allel.vcf_to_dataframe('input.vcf', alt_number=1)

# Convert to CSV
allel.vcf_to_csv('input.vcf', 'output.csv', write_header=True)
```
{ .api }

```python
def vcf_to_npz(input, output, fields=None, types=None, numbers=None, alt_number=None,
               chunk_length=1000, buffer_size=DEFAULT_BUFFER_SIZE, region=None,
               samples=None, transformers=None, log=None, compression='gzip'):
    """
    Convert VCF file to NumPy compressed format.

    Args:
        input (str): Input VCF file path
        output (str): Output NPZ file path
        fields (list, optional): Fields to include
        compression (str): Compression method ('gzip', 'bz2', 'lzma')

    Returns:
        None
    """

def vcf_to_hdf5(input, output, group='/', fields=None, types=None, numbers=None,
                alt_number=None, chunk_length=1000, buffer_size=DEFAULT_BUFFER_SIZE,
                region=None, samples=None, transformers=None, log=None,
                compression='gzip', compression_opts=1, chunks=True, fletcher32=False, shuffle=False):
    """
    Convert VCF file to HDF5 format.

    Args:
        input (str): Input VCF file path
        output (str): Output HDF5 file path
        group (str): HDF5 group to write to
        compression (str): Compression method
        compression_opts (int): Compression level
        chunks (bool or tuple): Chunking strategy
        fletcher32 (bool): Enable Fletcher32 checksum
        shuffle (bool): Enable shuffle filter

    Returns:
        None
    """

def vcf_to_zarr(input, output, group='', fields=None, types=None, numbers=None,
                alt_number=None, chunk_length=1000, buffer_size=DEFAULT_BUFFER_SIZE,
                region=None, samples=None, transformers=None, log=None,
                compressor=None, chunks=True):
    """
    Convert VCF file to Zarr format.

    Args:
        input (str): Input VCF file path
        output (str): Output Zarr path
        group (str): Zarr group name
        compressor (object, optional): Compression algorithm (e.g., blosc.Blosc())
        chunks (bool or tuple): Chunking strategy

    Returns:
        None
    """

def vcf_to_dataframe(input, fields=None, alt_number=None, chunk_length=1000,
                     buffer_size=DEFAULT_BUFFER_SIZE, region=None, samples=None,
                     transformers=None, log=None):
    """
    Convert VCF file to pandas DataFrame.

    Args:
        input (str): Input VCF file path
        fields (list, optional): Fields to include
        alt_number (int, optional): Number of alternate alleles

    Returns:
        pandas.DataFrame: VCF data as DataFrame
    """
```
{ .api }

### Writing VCF Files

Write genetic data back to VCF format.

```python
# Write arrays to VCF file
allel.write_vcf('output.vcf', variants, calldata, samples=sample_names,
                meta={'fileformat': 'VCFv4.2'})

# Customize field descriptions
descriptions = {
    'DP': 'Total read depth',
    'GQ': 'Genotype quality'
}
allel.write_vcf('output.vcf', variants, calldata, description=descriptions)
```
{ .api }

```python
def write_vcf(path, variants, calldata, samples=None, meta=None, rename=None,
              number=None, description=None, fill=None, write_header=True):
    """
    Write variant data to VCF format.

    Args:
        path (str): Output VCF file path
        variants (dict): Variant information arrays
        calldata (dict): Call data arrays (per-sample information)
        samples (list, optional): Sample names
        meta (dict, optional): Metadata for VCF header
        rename (dict, optional): Field name mappings
        number (dict, optional): Number specifications for INFO fields
        description (dict, optional): Field descriptions for header
        fill (dict, optional): Fill values for missing data
        write_header (bool): Whether to write VCF header

    Returns:
        None
    """
```
{ .api }

## GFF File Operations

### Reading GFF Files

Parse GFF3 (General Feature Format) files for genomic annotations.

```python
# Read GFF3 file into record array
features = allel.gff3_to_recarray('annotations.gff3')

# Read specific genomic region
features = allel.gff3_to_recarray('annotations.gff3', region='chr1:1000000-2000000')

# Convert to pandas DataFrame for easier manipulation
df = allel.gff3_to_dataframe('annotations.gff3')

# Iterate over GFF records for memory efficiency
for record in allel.iter_gff3('annotations.gff3'):
    print(record['type'], record['start'], record['end'])
```
{ .api }

```python
def gff3_to_recarray(path, attributes=None, region=None, score_fill=-1):
    """
    Read GFF3 file into structured record array.

    Args:
        path (str): Path to GFF3 file
        attributes (list, optional): Attribute names to parse from INFO field
        region (str, optional): Genomic region to read (e.g., 'chr1:1000-2000')
        score_fill (float, optional): Fill value for missing scores

    Returns:
        numpy.recarray: Structured array with GFF3 fields
    """

def gff3_to_dataframe(input, region=None):
    """
    Read GFF3 file into pandas DataFrame.

    Args:
        input (str): Path to GFF3 file
        region (str, optional): Genomic region to read

    Returns:
        pandas.DataFrame: GFF3 data as DataFrame
    """

def iter_gff3(path, attributes=None, region=None, score_fill=-1):
    """
    Iterate over GFF3 records from file.

    Args:
        path (str): Path to GFF3 file
        attributes (list, optional): Attribute names to parse
        region (str, optional): Genomic region filter
        score_fill (float, optional): Fill value for missing scores

    Yields:
        dict: GFF3 record as dictionary with standard fields
    """
```
{ .api }

## FASTA File Operations

### Writing FASTA Files

Write FASTA format sequences for reference genomes and sequence analysis.

```python
# Write sequences to FASTA file
sequences = {'chr1': 'ATCGATCG', 'chr2': 'GCTAGCTA'}
allel.write_fasta('output.fasta', sequences, ['chr1', 'chr2'])
```
{ .api }

```python
def write_fasta(path, sequences, names, mode='w', width=80):
    """
    Write sequences to FASTA format file.

    Args:
        path (str): Output FASTA file path
        sequences (dict or list): Sequences to write
        names (list): Sequence names/identifiers
        mode (str): File open mode ('w' for write, 'a' for append)
        width (int): Line width for sequence wrapping

    Returns:
        None
    """
```
{ .api }

## File Format Utilities

### Format Detection and Handling

Utilities for working with different file formats and sources.

```python
# Handle compressed files automatically
data = allel.read_vcf('data.vcf.gz')  # Automatically handles gzip
data = allel.read_vcf('data.vcf.bz2')  # Automatically handles bzip2

# Read from URLs
data = allel.read_vcf('https://example.com/data.vcf.gz')

# Custom buffer sizes for large files
data = allel.read_vcf('large_file.vcf', buffer_size=1048576)  # 1MB buffer
```
{ .api }

## Data Import Examples

### Complete Workflow Examples

```python
# Example 1: Basic VCF to analysis workflow
import allel
import numpy as np

# Read VCF file
variants, samples, genotypes = allel.read_vcf(
    'my_data.vcf',
    fields=['variants/CHROM', 'variants/POS', 'variants/REF', 'variants/ALT', 'calldata/GT']
)

# Create genotype array
g = allel.GenotypeArray(genotypes['GT'])

# Basic quality control
ac = g.count_alleles()
segregating = ac.is_segregating()
biallelic = ac.allelism() == 2

# Filter to high-quality, biallelic, segregating variants
mask = segregating & biallelic
g_filtered = g[mask]
variants_filtered = {k: v[mask] for k, v in variants.items()}

# Example 2: Large file processing with chunking
def process_large_vcf(filename):
    results = []
    for variants, samples, calldata in allel.iter_vcf_chunks(filename, chunk_length=10000):
        g = allel.GenotypeArray(calldata['GT'])
        ac = g.count_alleles()
        diversity = allel.mean_pairwise_difference(ac)
        results.extend(diversity)
    return np.array(results)

# Example 3: Multi-format workflow
# Read VCF and annotations
variants, samples, genotypes = allel.read_vcf('variants.vcf')
annotations = allel.gff3_to_recarray('genes.gff3')
reference = allel.read_fasta('reference.fasta')

# Convert to efficient storage format
allel.vcf_to_zarr('variants.vcf', 'variants.zarr', compressor='blosc')
```
{ .api }

## Memory Management

### Efficient Data Loading

Handle large datasets efficiently with selective loading and chunking.

```python
# Read only essential fields to save memory
minimal_data = allel.read_vcf(
    'large_file.vcf',
    fields=['variants/CHROM', 'variants/POS', 'calldata/GT']
)

# Process in chunks for memory efficiency
def analyze_in_chunks(vcf_file, chunk_size=5000):
    all_stats = []
    for chunk in allel.iter_vcf_chunks(vcf_file, chunk_length=chunk_size):
        variants, samples, calldata = chunk
        g = allel.GenotypeArray(calldata['GT'])
        stats = g.count_alleles()
        all_stats.append(stats)
    return all_stats

# Use region-based loading for targeted analysis
region_data = allel.read_vcf('genome.vcf', region='chr22:20000000-25000000')
```
{ .api }

## Error Handling

### Common I/O Issues

Handle common file format and data issues gracefully.

```python
try:
    # Attempt to read VCF with error handling
    data = allel.read_vcf('data.vcf')
except Exception as e:
    print(f"Error reading VCF: {e}")
    # Try with different parameters
    data = allel.read_vcf('data.vcf', alt_number=1)

# Validate data after reading
def validate_genetic_data(variants, genotypes):
    """Validate genetic data integrity."""
    if len(variants['POS']) != genotypes['GT'].shape[0]:
        raise ValueError("Mismatch between variants and genotypes")

    # Check for reasonable chromosome names
    valid_chroms = set([str(i) for i in range(1, 23)] + ['X', 'Y', 'MT'])
    invalid_chroms = set(variants['CHROM']) - valid_chroms
    if invalid_chroms:
        print(f"Warning: Unusual chromosome names found: {invalid_chroms}")

    return True
```
{ .api }