A Python package for exploring and analysing genetic variation data.
—
Scikit-allel provides essential utilities for data validation, array manipulation, and caching to support genetic analysis workflows.
Efficiently cache computational results using HDF5 storage.
import allel
import numpy as np
# Use HDF5 cache decorator for functions
@allel.hdf5_cache(filepath='analysis_cache.h5', group='/results')
def expensive_calculation(genotype_data, positions):
    """Compute sequence diversity for a set of genotype calls.

    Args:
        genotype_data (array_like): Genotype calls accepted by
            ``allel.GenotypeArray``.
        positions (array_like): Variant positions handed to
            ``allel.sequence_diversity`` (presumably one position per
            variant in ``genotype_data`` -- confirm against the
            scikit-allel documentation).

    Returns:
        Whatever ``allel.sequence_diversity`` returns for the given
        positions and allele counts.
    """
    g = allel.GenotypeArray(genotype_data)
    ac = g.count_alleles()
    # ``positions`` is an explicit argument: the original body read an
    # undefined name, which raised NameError on the first call.
    return allel.sequence_diversity(positions, ac)


# Function results are automatically cached
result = expensive_calculation(my_genotype_data, my_positions)
def hdf5_cache(filepath=None, parent=None, group=None, names=None, typed=False,
hashed_key=False, **h5dcreate_kwargs):
"""
HDF5 cache decorator for function results.
Args:
filepath (str, optional): Path to HDF5 file for caching
parent (str, optional): Parent group path within HDF5 file
group (str, optional): Specific group name for cache
names (list, optional): Names for cached results
typed (bool): Whether to consider argument types in cache key
hashed_key (bool): Whether to hash the cache key
**h5dcreate_kwargs: Additional HDF5 dataset creation arguments
Returns:
decorator: Function decorator that caches results
"""{ .api }
Utilities for validating and converting genetic data arrays.
import allel
import numpy as np
# Ensure array has correct dimensions (the validated array is returned)
array_2d = allel.asarray_ndim(data, 2)
array_3d = allel.asarray_ndim(genotypes, 3)
# Validate array properties; each check signals failure by raising
allel.check_ndim(array, 2) # Raises TypeError if wrong number of dimensions
allel.check_shape(array, (100, 4)) # Raises TypeError if shape doesn't match
allel.check_dtype(array, int) # Raises TypeError if dtype is not one of those given
allel.check_integer_dtype(array) # Raises TypeError if not an integer dtype
allel.check_ploidy(2, 'genotypes') # Raises ValueError if ploidy is invalid
# Check array alignment; raises ValueError if sizes differ along the dimension
allel.check_dim0_aligned(variants_array, genotypes_array)
allel.check_dim1_aligned(samples_array, genotypes_array){ .api }
Key Validation Functions:
def asarray_ndim(a, *ndims, **kwargs):
"""
Ensure array has one of the specified numbers of dimensions.
Args:
a (array_like): Input array
*ndims (int): Allowed numbers of dimensions
**kwargs: Additional arguments for array conversion
Returns:
numpy.ndarray: Array with validated dimensionality
Raises:
TypeError: If array has wrong number of dimensions
"""
def check_ndim(a, ndim):
"""
Check that array has expected number of dimensions.
Args:
a (array_like): Array to check
ndim (int): Expected number of dimensions
Raises:
TypeError: If array has wrong number of dimensions
"""
def check_shape(a, shape):
"""
Check that array has expected shape.
Args:
a (array_like): Array to check
shape (tuple): Expected shape
Raises:
TypeError: If array shape doesn't match
"""
def check_dtype(a, *dtypes):
"""
Check that array has one of the expected data types.
Args:
a (array_like): Array to check
*dtypes: Expected data types
Raises:
TypeError: If array has wrong data type
"""
def check_integer_dtype(a):
"""
Check that array has integer data type.
Args:
a (array_like): Array to check
Raises:
TypeError: If array is not integer type
"""
def check_ploidy(ploidy, name):
"""
Validate ploidy parameter.
Args:
ploidy (int): Ploidy value to validate
name (str): Name of the data for error messages
Raises:
ValueError: If ploidy is invalid
"""
def check_dim0_aligned(*arrays):
"""
Check that multiple arrays are aligned on first dimension.
Args:
*arrays: Variable number of arrays to check
Raises:
ValueError: If arrays have different sizes in first dimension
"""
def check_dim1_aligned(*arrays):
"""
Check that multiple arrays are aligned on second dimension.
Args:
*arrays: Variable number of arrays to check
Raises:
ValueError: If arrays have different sizes in second dimension
"""{ .api }
Utilities for handling numerical edge cases in genetic calculations.
# Silence NumPy invalid-value warnings for operations such as division by zero
with allel.ignore_invalid():
    # The results below may legitimately contain NaN/inf values
    frequencies = allele_counts / total_counts
    diversity = frequencies * (1 - frequencies)
def ignore_invalid():
"""
Context manager to temporarily ignore numpy invalid value warnings.
Useful for genetic calculations that may involve division by zero
or other operations that produce NaN values as expected behavior.
Returns:
context manager: Context that ignores numpy invalid warnings
"""{ .api }
Scikit-allel arrays are built on NumPy and integrate seamlessly with the scientific Python ecosystem.
import numpy as np
import allel
# Convert genetic arrays to NumPy for custom operations
g = allel.GenotypeArray(genotype_data)
numpy_array = np.array(g)

# Use NumPy functions directly
missing_mask = (numpy_array == -1)
# Missing calls are stored as -1; turn them into NaN so the nan-aware
# reductions below skip them instead of averaging the sentinel in.
allele_values = np.where(missing_mask, np.nan, numpy_array)
mean_genotype = np.nanmean(allele_values, axis=1)

# Statistical operations
allele_variance = np.nanvar(allele_values, axis=1)
# np.corrcoef accepts at most 2-D input, so collapse the ploidy axis to
# per-call alternate-allele counts before correlating samples (rows after .T).
sample_correlations = np.corrcoef(g.to_n_alt().T)
import pandas as pd
import matplotlib.pyplot as plt
# Create analysis workflows
def basic_qc_workflow(vcf_file):
    """Basic quality control workflow.

    Reads a VCF file and summarises per-variant QC statistics.

    Args:
        vcf_file: Path (or other input accepted by ``allel.read_vcf``)
            of the VCF file to analyse.

    Returns:
        pandas.DataFrame: One row per variant with the columns ``CHROM``,
        ``POS``, ``missing_rate``, ``het_obs`` and ``n_alleles``.

    Raises:
        ValueError: If ``allel.read_vcf`` returns no data for the file.
    """
    # Read data. read_vcf returns a single dict keyed by field path
    # ('variants/CHROM', 'calldata/GT', ...), not a tuple of three objects.
    callset = allel.read_vcf(vcf_file)
    if callset is None:
        raise ValueError('no variants could be read from {!r}'.format(vcf_file))
    g = allel.GenotypeArray(callset['calldata/GT'])

    # Calculate statistics
    ac = g.count_alleles()
    missing_rate = np.mean(g.is_missing(), axis=1)
    het_obs = allel.heterozygosity_observed(g)

    # Create summary DataFrame
    qc_stats = pd.DataFrame({
        'CHROM': callset['variants/CHROM'],
        'POS': callset['variants/POS'],
        'missing_rate': missing_rate,
        'het_obs': het_obs,
        'n_alleles': ac.allelism(),
    })
    return qc_stats
def plot_diversity_along_chromosome(positions, diversity_values):
    """Draw a diversity statistic as a line along genomic position.

    Args:
        positions: X values (genomic positions).
        diversity_values: Y values (diversity at each position).

    Returns:
        The matplotlib figure that was created for the plot.
    """
    figure = plt.figure(figsize=(12, 6))
    axes = plt.gca()
    axes.plot(positions, diversity_values, alpha=0.7)
    axes.set_xlabel('Genomic Position')
    axes.set_ylabel('Nucleotide Diversity (π)')
    axes.set_title('Genetic Diversity Along Chromosome')
    axes.grid(True, alpha=0.3)
    # The figure created above is still the current figure at this point.
    return figure
# Standard genetic constants used throughout scikit-allel
DIPLOID_PLOIDY = 2  # Standard diploid ploidy level


# Usage in validation
def validate_diploid_genotypes(g):
    """Validate that genotypes are diploid.

    Args:
        g: Object exposing a ``ploidy`` attribute (e.g. a genotype array).

    Returns:
        bool: ``True`` when ``g.ploidy`` equals ``DIPLOID_PLOIDY``.

    Raises:
        AssertionError: If ``g.ploidy`` differs from ``DIPLOID_PLOIDY``.
    """
    # Raise explicitly rather than via ``assert`` so the check still runs
    # under ``python -O``; the exception type is unchanged for callers.
    if g.ploidy != DIPLOID_PLOIDY:
        raise AssertionError(
            'expected ploidy {}, found {}'.format(DIPLOID_PLOIDY, g.ploidy)
        )
    return True
Install with Tessl CLI
npx tessl i tessl/pypi-scikit-allel