# I/O Operations

Scikit-allel provides comprehensive file I/O capabilities for working with standard genetic data formats including VCF, GFF, and FASTA files, with support for large datasets and various output formats.

## VCF File Operations

### Reading VCF Files

Read VCF (Variant Call Format) files into structured arrays.

```python
import allel

# Read entire VCF file
variants, samples, genotypes = allel.read_vcf('data.vcf')

# Read specific fields
variants, samples, genotypes = allel.read_vcf(
    'data.vcf',
    fields=['variants/CHROM', 'variants/POS', 'variants/REF', 'variants/ALT', 'calldata/GT']
)

# Read with region filtering
data = allel.read_vcf('data.vcf', region='chr1:1000000-2000000')

# Read with sample selection
data = allel.read_vcf('data.vcf', samples=['sample1', 'sample2', 'sample3'])
```
{ .api }

**Key Functions:**

```python
def read_vcf(input, fields=None, exclude_fields=None, rename_fields=None,
             types=None, numbers=None, alt_number=DEFAULT_ALT_NUMBER, fills=None,
             region=None, tabix='tabix', samples=None, transformers=None,
             buffer_size=DEFAULT_BUFFER_SIZE, chunk_length=DEFAULT_CHUNK_LENGTH, log=None):
    """
    Read a VCF file into structured arrays.

    Args:
        input (str): Path to VCF file or file-like object
        fields (list, optional): Specific fields to read (e.g., ['variants/CHROM', 'calldata/GT'])
        exclude_fields (list, optional): Fields to exclude from reading
        rename_fields (dict, optional): Mapping to rename fields
        types (dict, optional): Data types for specific fields
        numbers (dict, optional): Number specifications for INFO fields
        alt_number (int): Number of alternate alleles to read (default: DEFAULT_ALT_NUMBER)
        fills (dict, optional): Fill values for missing data
        region (str, optional): Genomic region to read (e.g., 'chr1:1000-2000')
        tabix (str): Path to tabix executable for indexed files
        samples (list, optional): Specific samples to include
        transformers (dict, optional): Custom transformation functions
        buffer_size (int): Buffer size for file reading
        chunk_length (int): Number of variants to read per chunk
        log (file, optional): Log file for messages

    Returns:
        tuple: (variants, samples, calldata) structured arrays
    """

def iter_vcf_chunks(input, fields=None, types=None, numbers=None, alt_number=None,
                    chunk_length=1000, buffer_size=DEFAULT_BUFFER_SIZE, region=None,
                    samples=None, transformers=None, log=None):
    """
    Iterate over VCF file in chunks for memory-efficient processing.

    Args:
        input (str): Path to VCF file
        fields (list, optional): Fields to read
        chunk_length (int): Variants per chunk
        buffer_size (int): File buffer size
        region (str, optional): Genomic region
        samples (list, optional): Sample subset
        transformers (dict, optional): Field transformers
        log (file, optional): Log file

    Yields:
        tuple: (variants, samples, calldata) for each chunk
    """

def read_vcf_headers(input):
    """
    Read only the header information from a VCF file.

    Args:
        input (str): Path to VCF file or file-like object

    Returns:
        dict: Header information including metadata and column names
    """
```
{ .api }

### VCF Format Conversion

Convert VCF files to other formats for analysis or storage.

```python
# Convert to NumPy compressed format
allel.vcf_to_npz('input.vcf', 'output.npz', compression='gzip')

# Convert to HDF5
allel.vcf_to_hdf5('input.vcf', 'output.h5', group='/variants',
                  compression='gzip', chunks=True)

# Convert to Zarr format
allel.vcf_to_zarr('input.vcf', 'output.zarr', compressor='blosc')

# Convert to pandas DataFrame
df = allel.vcf_to_dataframe('input.vcf', alt_number=1)

# Convert to CSV
allel.vcf_to_csv('input.vcf', 'output.csv', write_header=True)
```
{ .api }

```python
def vcf_to_npz(input, output, fields=None, types=None, numbers=None, alt_number=None,
               chunk_length=1000, buffer_size=DEFAULT_BUFFER_SIZE, region=None,
               samples=None, transformers=None, log=None, compression='gzip'):
    """
    Convert VCF file to NumPy compressed format.

    Args:
        input (str): Input VCF file path
        output (str): Output NPZ file path
        fields (list, optional): Fields to include
        compression (str): Compression method ('gzip', 'bz2', 'lzma')

    Returns:
        None
    """

def vcf_to_hdf5(input, output, group='/', fields=None, types=None, numbers=None,
                alt_number=None, chunk_length=1000, buffer_size=DEFAULT_BUFFER_SIZE,
                region=None, samples=None, transformers=None, log=None,
                compression='gzip', compression_opts=1, chunks=True, fletcher32=False, shuffle=False):
    """
    Convert VCF file to HDF5 format.

    Args:
        input (str): Input VCF file path
        output (str): Output HDF5 file path
        group (str): HDF5 group to write to
        compression (str): Compression method
        compression_opts (int): Compression level
        chunks (bool or tuple): Chunking strategy
        fletcher32 (bool): Enable Fletcher32 checksum
        shuffle (bool): Enable shuffle filter

    Returns:
        None
    """

def vcf_to_zarr(input, output, group='', fields=None, types=None, numbers=None,
                alt_number=None, chunk_length=1000, buffer_size=DEFAULT_BUFFER_SIZE,
                region=None, samples=None, transformers=None, log=None,
                compressor=None, chunks=True):
    """
    Convert VCF file to Zarr format.

    Args:
        input (str): Input VCF file path
        output (str): Output Zarr path
        group (str): Zarr group name
        compressor (object, optional): Compression algorithm (e.g., blosc.Blosc())
        chunks (bool or tuple): Chunking strategy

    Returns:
        None
    """

def vcf_to_dataframe(input, fields=None, alt_number=None, chunk_length=1000,
                     buffer_size=DEFAULT_BUFFER_SIZE, region=None, samples=None,
                     transformers=None, log=None):
    """
    Convert VCF file to pandas DataFrame.

    Args:
        input (str): Input VCF file path
        fields (list, optional): Fields to include
        alt_number (int, optional): Number of alternate alleles

    Returns:
        pandas.DataFrame: VCF data as DataFrame
    """
```
{ .api }

### Writing VCF Files

Write genetic data back to VCF format.

```python
# Write arrays to VCF file
allel.write_vcf('output.vcf', variants, calldata, samples=sample_names,
                meta={'fileformat': 'VCFv4.2'})

# Customize field descriptions
descriptions = {
    'DP': 'Total read depth',
    'GQ': 'Genotype quality'
}
allel.write_vcf('output.vcf', variants, calldata, description=descriptions)
```
{ .api }

```python
def write_vcf(path, variants, calldata, samples=None, meta=None, rename=None,
              number=None, description=None, fill=None, write_header=True):
    """
    Write variant data to VCF format.

    Args:
        path (str): Output VCF file path
        variants (dict): Variant information arrays
        calldata (dict): Call data arrays (per-sample information)
        samples (list, optional): Sample names
        meta (dict, optional): Metadata for VCF header
        rename (dict, optional): Field name mappings
        number (dict, optional): Number specifications for INFO fields
        description (dict, optional): Field descriptions for header
        fill (dict, optional): Fill values for missing data
        write_header (bool): Whether to write VCF header

    Returns:
        None
    """
```
{ .api }

## GFF File Operations

### Reading GFF Files

Parse GFF3 (General Feature Format) files for genomic annotations.

```python
# Read GFF3 file into record array
features = allel.gff3_to_recarray('annotations.gff3')

# Read specific genomic region
features = allel.gff3_to_recarray('annotations.gff3', region='chr1:1000000-2000000')

# Convert to pandas DataFrame for easier manipulation
df = allel.gff3_to_dataframe('annotations.gff3')

# Iterate over GFF records for memory efficiency
for record in allel.iter_gff3('annotations.gff3'):
    print(record['type'], record['start'], record['end'])
```
{ .api }

```python
def gff3_to_recarray(path, attributes=None, region=None, score_fill=-1):
    """
    Read GFF3 file into structured record array.

    Args:
        path (str): Path to GFF3 file
        attributes (list, optional): Attribute names to parse from INFO field
        region (str, optional): Genomic region to read (e.g., 'chr1:1000-2000')
        score_fill (float, optional): Fill value for missing scores

    Returns:
        numpy.recarray: Structured array with GFF3 fields
    """

def gff3_to_dataframe(input, region=None):
    """
    Read GFF3 file into pandas DataFrame.

    Args:
        input (str): Path to GFF3 file
        region (str, optional): Genomic region to read

    Returns:
        pandas.DataFrame: GFF3 data as DataFrame
    """

def iter_gff3(path, attributes=None, region=None, score_fill=-1):
    """
    Iterate over GFF3 records from file.

    Args:
        path (str): Path to GFF3 file
        attributes (list, optional): Attribute names to parse
        region (str, optional): Genomic region filter
        score_fill (float, optional): Fill value for missing scores

    Yields:
        dict: GFF3 record as dictionary with standard fields
    """
```
{ .api }

## FASTA File Operations

### Writing FASTA Files

Write FASTA format sequences for reference genomes and sequence analysis.

```python
# Write sequences to FASTA file
sequences = {'chr1': 'ATCGATCG', 'chr2': 'GCTAGCTA'}
allel.write_fasta('output.fasta', sequences, ['chr1', 'chr2'])
```
{ .api }

```python
def write_fasta(path, sequences, names, mode='w', width=80):
    """
    Write sequences to FASTA format file.

    Args:
        path (str): Output FASTA file path
        sequences (dict or list): Sequences to write
        names (list): Sequence names/identifiers
        mode (str): File open mode ('w' for write, 'a' for append)
        width (int): Line width for sequence wrapping

    Returns:
        None
    """
```
{ .api }

## File Format Utilities

### Format Detection and Handling

Utilities for working with different file formats and sources.

```python
# Handle compressed files automatically
data = allel.read_vcf('data.vcf.gz')  # Automatically handles gzip
data = allel.read_vcf('data.vcf.bz2')  # Automatically handles bzip2

# Read from URLs
data = allel.read_vcf('https://example.com/data.vcf.gz')

# Custom buffer sizes for large files
data = allel.read_vcf('large_file.vcf', buffer_size=1048576)  # 1MB buffer
```
{ .api }

## Data Import Examples

### Complete Workflow Examples

```python
# Example 1: Basic VCF to analysis workflow
import allel
import numpy as np

# Read VCF file
variants, samples, genotypes = allel.read_vcf(
    'my_data.vcf',
    fields=['variants/CHROM', 'variants/POS', 'variants/REF', 'variants/ALT', 'calldata/GT']
)

# Create genotype array
g = allel.GenotypeArray(genotypes['GT'])

# Basic quality control
ac = g.count_alleles()
segregating = ac.is_segregating()
biallelic = ac.allelism() == 2

# Filter to high-quality, biallelic, segregating variants
mask = segregating & biallelic
g_filtered = g[mask]
variants_filtered = {k: v[mask] for k, v in variants.items()}

# Example 2: Large file processing with chunking
def process_large_vcf(filename):
    results = []
    for variants, samples, calldata in allel.iter_vcf_chunks(filename, chunk_length=10000):
        g = allel.GenotypeArray(calldata['GT'])
        ac = g.count_alleles()
        diversity = allel.mean_pairwise_difference(ac)
        results.extend(diversity)
    return np.array(results)

# Example 3: Multi-format workflow
# Read VCF and annotations
variants, samples, genotypes = allel.read_vcf('variants.vcf')
annotations = allel.gff3_to_recarray('genes.gff3')
reference = allel.read_fasta('reference.fasta')

# Convert to efficient storage format
allel.vcf_to_zarr('variants.vcf', 'variants.zarr', compressor='blosc')
```
{ .api }

## Memory Management

### Efficient Data Loading

Handle large datasets efficiently with selective loading and chunking.

```python
# Read only essential fields to save memory
minimal_data = allel.read_vcf(
    'large_file.vcf',
    fields=['variants/CHROM', 'variants/POS', 'calldata/GT']
)

# Process in chunks for memory efficiency
def analyze_in_chunks(vcf_file, chunk_size=5000):
    all_stats = []
    for chunk in allel.iter_vcf_chunks(vcf_file, chunk_length=chunk_size):
        variants, samples, calldata = chunk
        g = allel.GenotypeArray(calldata['GT'])
        stats = g.count_alleles()
        all_stats.append(stats)
    return all_stats

# Use region-based loading for targeted analysis
region_data = allel.read_vcf('genome.vcf', region='chr22:20000000-25000000')
```
{ .api }

## Error Handling

### Common I/O Issues

Handle common file format and data issues gracefully.

```python
try:
    # Attempt to read VCF with error handling
    data = allel.read_vcf('data.vcf')
except Exception as e:
    print(f"Error reading VCF: {e}")
    # Try with different parameters
    data = allel.read_vcf('data.vcf', alt_number=1)

# Validate data after reading
def validate_genetic_data(variants, genotypes):
    """Validate genetic data integrity."""
    if len(variants['POS']) != genotypes['GT'].shape[0]:
        raise ValueError("Mismatch between variants and genotypes")

    # Check for reasonable chromosome names
    valid_chroms = set([str(i) for i in range(1, 23)] + ['X', 'Y', 'MT'])
    invalid_chroms = set(variants['CHROM']) - valid_chroms
    if invalid_chroms:
        print(f"Warning: Unusual chromosome names found: {invalid_chroms}")

    return True
```
{ .api }