A Python package for exploring and analysing genetic variation data.

Scikit-allel provides comprehensive file I/O capabilities for working with standard genetic data formats, including VCF, GFF3, and FASTA files, with support for large datasets and various output formats.
Read VCF (Variant Call Format) files into structured arrays.
import allel
# Read entire VCF file; returns a dict keyed by field name
# (e.g. 'samples', 'variants/POS', 'calldata/GT')
callset = allel.read_vcf('data.vcf')
# Read specific fields
callset = allel.read_vcf(
    'data.vcf',
    fields=['variants/CHROM', 'variants/POS', 'variants/REF', 'variants/ALT', 'calldata/GT']
)
# Read with region filtering
callset = allel.read_vcf('data.vcf', region='chr1:1000000-2000000')
# Read with sample selection
callset = allel.read_vcf('data.vcf', samples=['sample1', 'sample2', 'sample3'])
Key Functions:
def read_vcf(input, fields=None, exclude_fields=None, rename_fields=None,
             types=None, numbers=None, alt_number=DEFAULT_ALT_NUMBER, fills=None,
             region=None, tabix='tabix', samples=None, transformers=None,
             buffer_size=DEFAULT_BUFFER_SIZE, chunk_length=DEFAULT_CHUNK_LENGTH, log=None):
    """
    Read a VCF file into NumPy arrays.

    Args:
        input (str or file-like): Path to VCF file or file-like object.
        fields (list, optional): Specific fields to read
            (e.g., ['variants/CHROM', 'calldata/GT']).
        exclude_fields (list, optional): Fields to exclude from reading.
        rename_fields (dict, optional): Mapping to rename fields in the output.
        types (dict, optional): Data types for specific fields.
        numbers (dict, optional): Number of values expected per field
            (INFO/FORMAT Number specifications).
        alt_number (int): Number of alternate alleles to allocate space for
            (default: DEFAULT_ALT_NUMBER).
        fills (dict, optional): Fill values for missing data, keyed by field.
        region (str, optional): Genomic region to read (e.g., 'chr1:1000-2000').
        tabix (str): Name/path of the tabix executable used for indexed
            region queries.
        samples (list, optional): Specific samples to include.
        transformers (optional): Custom field transformation functions/objects.
        buffer_size (int): Buffer size (bytes) used while parsing.
        chunk_length (int): Number of variants to parse per internal chunk.
        log (file, optional): File-like destination for progress messages.

    Returns:
        dict: Mapping of field name to numpy array, e.g. 'samples',
        'variants/POS', 'calldata/GT'.
        NOTE(review): some examples in this document unpack a 3-tuple from
        read_vcf; elsewhere (region/sample examples) a single value is
        assigned. The single-dict return is the documented scikit-allel
        behaviour — confirm against the installed version.
    """
def iter_vcf_chunks(input, fields=None, types=None, numbers=None, alt_number=None,
                    chunk_length=1000, buffer_size=DEFAULT_BUFFER_SIZE, region=None,
                    samples=None, transformers=None, log=None):
    """
    Iterate over a VCF file in chunks for memory-efficient processing.

    Args:
        input (str): Path to VCF file.
        fields (list, optional): Fields to read.
        types (dict, optional): Data types for specific fields.
        numbers (dict, optional): Number of values expected per field.
        alt_number (int, optional): Number of alternate alleles to allocate for.
        chunk_length (int): Variants per chunk.
        buffer_size (int): File buffer size (bytes).
        region (str, optional): Genomic region to restrict reading to.
        samples (list, optional): Sample subset to include.
        transformers (optional): Custom field transformers.
        log (file, optional): File-like destination for progress messages.

    Yields:
        tuple: (variants, samples, calldata) for each chunk.
        NOTE(review): current scikit-allel documents iter_vcf_chunks as
        returning (fields, samples, headers, chunk_iterator) rather than
        yielding 3-tuples directly — confirm against the installed version.
    """
def read_vcf_headers(input):
    """
    Read only the header information from a VCF file.

    Useful for inspecting available INFO/FORMAT fields and sample names
    without parsing variant records.

    Args:
        input (str or file-like): Path to VCF file or file-like object.

    Returns:
        dict: Header information including metadata lines and column names.
    """
Convert VCF files to other formats for analysis or storage.
# Convert to NumPy compressed format
allel.vcf_to_npz('input.vcf', 'output.npz', compression='gzip')
# Convert to HDF5
allel.vcf_to_hdf5('input.vcf', 'output.h5', group='/variants',
compression='gzip', chunks=True)
# Convert to Zarr format
allel.vcf_to_zarr('input.vcf', 'output.zarr', compressor='blosc')
# Convert to pandas DataFrame
df = allel.vcf_to_dataframe('input.vcf', alt_number=1)
# Convert to CSV
allel.vcf_to_csv('input.vcf', 'output.csv', write_header=True){ .api }
def vcf_to_npz(input, output, fields=None, types=None, numbers=None, alt_number=None,
               chunk_length=1000, buffer_size=DEFAULT_BUFFER_SIZE, region=None,
               samples=None, transformers=None, log=None, compression='gzip'):
    """
    Convert a VCF file to NumPy compressed (.npz) format.

    Args:
        input (str): Input VCF file path.
        output (str): Output NPZ file path.
        fields (list, optional): Fields to include.
        types (dict, optional): Data types for specific fields.
        numbers (dict, optional): Number of values expected per field.
        alt_number (int, optional): Number of alternate alleles to allocate for.
        chunk_length (int): Number of variants parsed per chunk.
        buffer_size (int): Parse buffer size (bytes).
        region (str, optional): Genomic region to read.
        samples (list, optional): Samples to include.
        transformers (optional): Custom field transformers.
        log (file, optional): Destination for progress messages.
        compression (str): Compression method ('gzip', 'bz2', 'lzma').

    Returns:
        None
    """
def vcf_to_hdf5(input, output, group='/', fields=None, types=None, numbers=None,
                alt_number=None, chunk_length=1000, buffer_size=DEFAULT_BUFFER_SIZE,
                region=None, samples=None, transformers=None, log=None,
                compression='gzip', compression_opts=1, chunks=True, fletcher32=False, shuffle=False):
    """
    Convert a VCF file to HDF5 format.

    Args:
        input (str): Input VCF file path.
        output (str): Output HDF5 file path.
        group (str): HDF5 group to write datasets into (default: root '/').
        fields (list, optional): Fields to include.
        types (dict, optional): Data types for specific fields.
        numbers (dict, optional): Number of values expected per field.
        alt_number (int, optional): Number of alternate alleles to allocate for.
        chunk_length (int): Number of variants parsed per chunk.
        buffer_size (int): Parse buffer size (bytes).
        region (str, optional): Genomic region to read.
        samples (list, optional): Samples to include.
        transformers (optional): Custom field transformers.
        log (file, optional): Destination for progress messages.
        compression (str): HDF5 compression method (e.g. 'gzip').
        compression_opts (int): Compression level passed to HDF5.
        chunks (bool or tuple): HDF5 dataset chunking strategy.
        fletcher32 (bool): Enable Fletcher32 checksum filter.
        shuffle (bool): Enable shuffle filter (can improve compression).

    Returns:
        None
    """
def vcf_to_zarr(input, output, group='', fields=None, types=None, numbers=None,
                alt_number=None, chunk_length=1000, buffer_size=DEFAULT_BUFFER_SIZE,
                region=None, samples=None, transformers=None, log=None,
                compressor=None, chunks=True):
    """
    Convert a VCF file to Zarr format.

    Args:
        input (str): Input VCF file path.
        output (str): Output Zarr store path.
        group (str): Zarr group name (default: store root).
        fields (list, optional): Fields to include.
        types (dict, optional): Data types for specific fields.
        numbers (dict, optional): Number of values expected per field.
        alt_number (int, optional): Number of alternate alleles to allocate for.
        chunk_length (int): Number of variants parsed per chunk.
        buffer_size (int): Parse buffer size (bytes).
        region (str, optional): Genomic region to read.
        samples (list, optional): Samples to include.
        transformers (optional): Custom field transformers.
        log (file, optional): Destination for progress messages.
        compressor (object, optional): Compression codec object
            (e.g. numcodecs.Blosc()); None selects the default.
        chunks (bool or tuple): Zarr array chunking strategy.

    Returns:
        None
    """
def vcf_to_dataframe(input, fields=None, alt_number=None, chunk_length=1000,
                     buffer_size=DEFAULT_BUFFER_SIZE, region=None, samples=None,
                     transformers=None, log=None):
    """
    Convert a VCF file to a pandas DataFrame (variant-level fields).

    Args:
        input (str): Input VCF file path.
        fields (list, optional): Fields to include.
        alt_number (int, optional): Number of alternate alleles to allocate for
            (multi-valued fields are expanded into one column per value).
        chunk_length (int): Number of variants parsed per chunk.
        buffer_size (int): Parse buffer size (bytes).
        region (str, optional): Genomic region to read.
        samples (list, optional): Samples to include.
        transformers (optional): Custom field transformers.
        log (file, optional): Destination for progress messages.

    Returns:
        pandas.DataFrame: VCF data as a DataFrame, one row per variant.
    """
Write genetic data back to VCF format.
# Write arrays to VCF file
allel.write_vcf('output.vcf', variants, calldata, samples=sample_names,
meta={'fileformat': 'VCFv4.2'})
# Customize field descriptions
descriptions = {
'DP': 'Total read depth',
'GQ': 'Genotype quality'
}
allel.write_vcf('output.vcf', variants, calldata, description=descriptions){ .api }
def write_vcf(path, variants, calldata, samples=None, meta=None, rename=None,
              number=None, description=None, fill=None, write_header=True):
    """
    Write variant data to a VCF-format file.

    Args:
        path (str): Output VCF file path.
        variants (dict): Variant information arrays (per-variant fields).
        calldata (dict): Call data arrays (per-sample information, e.g. 'GT').
        samples (list, optional): Sample names for the header columns.
        meta (dict, optional): Metadata for the VCF header
            (e.g. {'fileformat': 'VCFv4.2'}).
        rename (dict, optional): Field name mappings applied on output.
        number (dict, optional): Number specifications for INFO/FORMAT fields.
        description (dict, optional): Field descriptions for header lines.
        fill (dict, optional): Fill values treated as missing data on output.
        write_header (bool): Whether to write the VCF header block.

    Returns:
        None
    """
Parse GFF3 (General Feature Format) files for genomic annotations.
# Read GFF3 file into record array
features = allel.gff3_to_recarray('annotations.gff3')
# Read specific genomic region
features = allel.gff3_to_recarray('annotations.gff3', region='chr1:1000000-2000000')
# Convert to pandas DataFrame for easier manipulation
df = allel.gff3_to_dataframe('annotations.gff3')
# Iterate over GFF records for memory efficiency
for record in allel.iter_gff3('annotations.gff3'):
print(record['type'], record['start'], record['end']){ .api }
def gff3_to_recarray(path, attributes=None, region=None, score_fill=-1):
    """
    Read a GFF3 file into a structured record array.

    Args:
        path (str): Path to GFF3 file.
        attributes (list, optional): Attribute names to parse out of the
            GFF3 attributes (column 9) into their own fields.
        region (str, optional): Genomic region to read
            (e.g., 'chr1:1000-2000').
        score_fill (float, optional): Fill value used where the score
            column is missing ('.').

    Returns:
        numpy.recarray: Structured array with the standard GFF3 fields
        plus any requested attributes.
    """
def gff3_to_dataframe(input, region=None):
    """
    Read a GFF3 file into a pandas DataFrame.

    Args:
        input (str): Path to GFF3 file.
        region (str, optional): Genomic region to read.

    Returns:
        pandas.DataFrame: GFF3 data, one row per feature record.
    """
def iter_gff3(path, attributes=None, region=None, score_fill=-1):
    """
    Iterate over GFF3 records from a file, one record at a time.

    Memory-efficient alternative to gff3_to_recarray for large
    annotation files.

    Args:
        path (str): Path to GFF3 file.
        attributes (list, optional): Attribute names to parse from the
            attributes (column 9) field.
        region (str, optional): Genomic region filter.
        score_fill (float, optional): Fill value for missing scores ('.').

    Yields:
        dict: GFF3 record as a dictionary with the standard fields.
    """
Write FASTA format sequence files, e.g. for reference genomes and sequence analysis.
# Write sequences to FASTA file
sequences = {'chr1': 'ATCGATCG', 'chr2': 'GCTAGCTA'}
allel.write_fasta('output.fasta', sequences, ['chr1', 'chr2'])
def write_fasta(path, sequences, names, mode='w', width=80):
    """
    Write sequences to a FASTA-format file.

    Args:
        path (str): Output FASTA file path.
        sequences (dict or list): Sequences to write; when a dict,
            presumably keyed by sequence name — confirm against caller.
        names (list): Sequence names/identifiers, in output order.
        mode (str): File open mode ('w' to overwrite, 'a' to append).
        width (int): Line width used to wrap sequence text.

    Returns:
        None
    """
Utilities for working with different file formats and sources.
# Handle compressed files automatically
data = allel.read_vcf('data.vcf.gz') # Automatically handles gzip
data = allel.read_vcf('data.vcf.bz2') # Automatically handles bzip2
# Read from URLs
data = allel.read_vcf('https://example.com/data.vcf.gz')
# Custom buffer sizes for large files
data = allel.read_vcf('large_file.vcf', buffer_size=1048576) # 1MB buffer{ .api }
# Example 1: Basic VCF to analysis workflow
import allel
import numpy as np
# Read VCF file (returns a dict keyed by field name)
callset = allel.read_vcf(
    'my_data.vcf',
    fields=['variants/CHROM', 'variants/POS', 'variants/REF', 'variants/ALT', 'calldata/GT']
)
# Create genotype array
g = allel.GenotypeArray(callset['calldata/GT'])
# Basic quality control
ac = g.count_alleles()
segregating = ac.is_segregating()
biallelic = ac.allelism() == 2
# Filter to high-quality, biallelic, segregating variants
mask = segregating & biallelic
g_filtered = g[mask]
variants_filtered = {k: v[mask] for k, v in callset.items() if k.startswith('variants/')}
# Example 2: Large file processing with chunking
def process_large_vcf(filename):
    """Compute per-variant mean pairwise difference over a VCF, chunk by chunk.

    Args:
        filename (str): Path to the VCF file.

    Returns:
        numpy.ndarray: Diversity values accumulated across all chunks.
    """
    diversity_values = []
    chunk_iter = allel.iter_vcf_chunks(filename, chunk_length=10000)
    for chunk_variants, chunk_samples, chunk_calldata in chunk_iter:
        genotype_array = allel.GenotypeArray(chunk_calldata['GT'])
        allele_counts = genotype_array.count_alleles()
        diversity_values.extend(allel.mean_pairwise_difference(allele_counts))
    return np.array(diversity_values)
# Example 3: Multi-format workflow
# Read VCF and annotations
callset = allel.read_vcf('variants.vcf')
annotations = allel.gff3_to_dataframe('genes.gff3')
# NOTE: scikit-allel provides write_fasta only; use a dedicated FASTA
# reader (e.g. pyfasta/pysam) to load reference sequences.
# Convert to efficient storage format (compressor takes a numcodecs
# codec object; None selects the default)
allel.vcf_to_zarr('variants.vcf', 'variants.zarr')
Handle large datasets efficiently with selective loading and chunking.
# Read only essential fields to save memory
minimal_data = allel.read_vcf(
'large_file.vcf',
fields=['variants/CHROM', 'variants/POS', 'calldata/GT']
)
# Process in chunks for memory efficiency
def analyze_in_chunks(vcf_file, chunk_size=5000):
    """Collect per-chunk allele counts from a VCF without loading it whole.

    Args:
        vcf_file (str): Path to the VCF file.
        chunk_size (int): Number of variants per chunk.

    Returns:
        list: One allele-counts array per chunk, in file order.
    """
    collected = []
    for variants_chunk, samples_chunk, calldata_chunk in allel.iter_vcf_chunks(
            vcf_file, chunk_length=chunk_size):
        genotype_array = allel.GenotypeArray(calldata_chunk['GT'])
        collected.append(genotype_array.count_alleles())
    return collected
# Use region-based loading for targeted analysis
region_data = allel.read_vcf('genome.vcf', region='chr22:20000000-25000000'){ .api }
Handle common file format and data issues gracefully.
try:
# Attempt to read VCF with error handling
data = allel.read_vcf('data.vcf')
except Exception as e:
print(f"Error reading VCF: {e}")
# Try with different parameters
data = allel.read_vcf('data.vcf', alt_number=1)
# Validate data after reading
def validate_genetic_data(variants, genotypes):
    """Validate basic integrity of variant/genotype arrays read from a VCF.

    Args:
        variants (dict): Variant field arrays; must contain 'POS' and 'CHROM'.
        genotypes (dict): Call data arrays; must contain 'GT' with shape
            (n_variants, n_samples, ploidy).

    Returns:
        bool: True if the data passed validation.

    Raises:
        ValueError: If the number of variants does not match the first
            dimension of the genotype array.
    """
    if len(variants['POS']) != genotypes['GT'].shape[0]:
        raise ValueError("Mismatch between variants and genotypes")
    # Accepted human chromosome names. 'chr'-prefixed aliases (hg38-style
    # 'chr1', 'chrX') are normalized before comparison so they are not
    # spuriously flagged as unusual.
    valid_chroms = {str(i) for i in range(1, 23)} | {'X', 'Y', 'MT'}
    observed = {str(c) for c in variants['CHROM']}
    invalid_chroms = {
        c for c in observed
        if (c[3:] if c.lower().startswith('chr') else c) not in valid_chroms
    }
    if invalid_chroms:
        print(f"Warning: Unusual chromosome names found: {invalid_chroms}")
    return True
Install with Tessl CLI
npx tessl i tessl/pypi-scikit-allel