tessl/pypi-scikit-allel

A Python package for exploring and analysing genetic variation data.

—

Pending

Overview

Eval results

Files

Utilities

Name: tessl/pypi-scikit-allel
Author: tessl

Scikit-allel provides essential utilities for data validation, array manipulation, and caching to support genetic analysis workflows.

Caching System

HDF5 Caching

Efficiently cache computational results using HDF5 storage.

import allel
import numpy as np

# Use HDF5 cache decorator for functions.
# Every input the computation depends on is passed as an argument, so the
# function does not read an undefined (or stale) module-level name.
@allel.hdf5_cache(filepath='analysis_cache.h5', group='/results')
def expensive_calculation(genotype_data, positions):
    """Compute sequence diversity for a set of genotype calls.

    Args:
        genotype_data (array_like): Genotype calls accepted by
            ``allel.GenotypeArray``.
        positions (array_like): Variant positions handed to
            ``allel.sequence_diversity``; presumably one position per
            variant in ``genotype_data`` -- confirm against the caller.

    Returns:
        The value returned by ``allel.sequence_diversity``.
    """
    g = allel.GenotypeArray(genotype_data)
    ac = g.count_alleles()
    return allel.sequence_diversity(positions, ac)

# Function results are automatically cached
result = expensive_calculation(my_genotype_data, my_positions)

{ .api }

def hdf5_cache(filepath=None, parent=None, group=None, names=None, typed=False,
               hashed_key=False, **h5dcreate_kwargs):
    """
    Decorator that stores a function's results in an HDF5 file.

    Args:
        filepath (str, optional): Location of the HDF5 file holding the cache
        parent (str, optional): Path of the parent group inside the HDF5 file
        group (str, optional): Name of the group used for this cache
        names (list, optional): Names given to the cached results
        typed (bool): If true, argument types take part in the cache key
        hashed_key (bool): If true, the cache key is hashed
        **h5dcreate_kwargs: Extra arguments forwarded to HDF5 dataset creation

    Returns:
        decorator: Decorator that adds result caching to a function
    """

{ .api }

Array Validation and Conversion

Core Validation Functions

Utilities for validating and converting genetic data arrays.

import allel
import numpy as np

# Coerce the inputs to arrays with the required number of dimensions
array_2d = allel.asarray_ndim(data, 2)
array_3d = allel.asarray_ndim(genotypes, 3)

# Validate array properties; each check raises when the property does not hold
allel.check_ndim(array, 2)  # TypeError unless the array is 2-dimensional
allel.check_shape(array, (100, 4))  # TypeError unless the shape is (100, 4)
allel.check_dtype(array, int)  # TypeError unless the dtype is one of those given
allel.check_integer_dtype(array)  # TypeError unless the dtype is an integer type
# NOTE(review): confirm the argument order/meaning against the installed
# allel.util.check_ploidy -- upstream appears to compare two ploidy values
# (actual, expect) rather than take a label string.
allel.check_ploidy(2, 'genotypes')  # ValueError if the ploidy is invalid

# Check array alignment; ValueError if the compared dimension sizes differ
allel.check_dim0_aligned(variants_array, genotypes_array)
# NOTE(review): this compares the second dimension, so samples_array
# presumably needs at least two dimensions -- TODO confirm.
allel.check_dim1_aligned(samples_array, genotypes_array)

{ .api }

Key Validation Functions:

def asarray_ndim(a, *ndims, **kwargs):
    """
    Convert the input to an array and verify its dimensionality.

    Args:
        a (array_like): Data to convert
        *ndims (int): Every number of dimensions that is acceptable
        **kwargs: Extra options passed on to the array conversion

    Returns:
        numpy.ndarray: The array, once its number of dimensions is accepted

    Raises:
        TypeError: If the number of dimensions is not one of ``ndims``
    """

def check_ndim(a, ndim):
    """
    Verify that an array has exactly the expected number of dimensions.

    Args:
        a (array_like): Array under test
        ndim (int): Required number of dimensions

    Raises:
        TypeError: If the array's number of dimensions differs from ``ndim``
    """

def check_shape(a, shape):
    """
    Verify that an array has exactly the expected shape.

    Args:
        a (array_like): Array under test
        shape (tuple): Required shape

    Raises:
        TypeError: If the array's shape differs from ``shape``
    """

def check_dtype(a, *dtypes):
    """
    Verify that an array's data type is one of the accepted types.

    Args:
        a (array_like): Array under test
        *dtypes: Every data type that is acceptable

    Raises:
        TypeError: If the array's data type is not among ``dtypes``
    """

def check_integer_dtype(a):
    """
    Verify that an array holds an integer data type.

    Args:
        a (array_like): Array under test

    Raises:
        TypeError: If the array's data type is not an integer type
    """

def check_ploidy(ploidy, name):
    """
    Verify a ploidy parameter.

    NOTE(review): confirm this signature against the installed
    ``allel.util.check_ploidy``; upstream appears to take two ploidy values
    ``(actual, expect)`` and compare them.

    Args:
        ploidy (int): Ploidy value under test
        name (str): Label for the data, used in error messages

    Raises:
        ValueError: If the ploidy is not valid
    """

def check_dim0_aligned(*arrays):
    """
    Verify that all given arrays agree in the size of their first dimension.

    Args:
        *arrays: Any number of arrays to compare

    Raises:
        ValueError: If the first-dimension sizes are not all equal
    """

def check_dim1_aligned(*arrays):
    """
    Verify that all given arrays agree in the size of their second dimension.

    Args:
        *arrays: Any number of arrays to compare

    Raises:
        ValueError: If the second-dimension sizes are not all equal
    """

{ .api }

Error Handling Utilities

Numerical Computation Support

Utilities for handling numerical edge cases in genetic calculations.

# Silence NumPy invalid-value warnings for the duration of the block (e.g. 0/0).
# NOTE(review): NumPy reports a nonzero value divided by zero under its
# separate 'divide' category -- confirm whether that case also needs handling.
with allel.ignore_invalid():
    # These element-wise operations may legitimately yield NaN/inf entries
    frequencies = allele_counts / total_counts
    diversity = frequencies * (1 - frequencies)

{ .api }

def ignore_invalid():
    """
    Context manager that temporarily silences numpy invalid value warnings.

    Intended for genetic calculations where operations such as division by
    zero are expected to produce NaN values.

    Returns:
        context manager: Context within which numpy invalid warnings are ignored
    """

{ .api }

Integration with Scientific Python

Working with NumPy Arrays

Scikit-allel arrays are built on NumPy and integrate seamlessly with the scientific Python ecosystem.

import numpy as np
import allel

# Convert genetic arrays to NumPy for custom operations
g = allel.GenotypeArray(genotype_data)
numpy_array = np.array(g)  # 3-D: (variants, samples, ploidy)

# Missing allele calls are coded as -1
missing_mask = (numpy_array == -1)

# Collapse the ploidy axis to one number per call: the count of non-reference
# alleles, with NaN wherever any allele of the call is missing. Working on the
# raw allele indices would treat -1 as a real value and bias every statistic.
call_missing = missing_mask.any(axis=2)
n_alt = np.where(call_missing, np.nan, (numpy_array > 0).sum(axis=2))

# Use NumPy functions directly (NaN-aware, so missing calls are skipped)
mean_genotype = np.nanmean(n_alt, axis=1)

# Statistical operations
allele_variance = np.nanvar(n_alt, axis=1)
# np.corrcoef accepts at most 2-D input and propagates NaN, so correlate
# samples using only the variants that were called in every sample.
complete_variants = ~call_missing.any(axis=1)
sample_correlations = np.corrcoef(n_alt[complete_variants].T)

{ .api }

Integration Examples

import pandas as pd
import matplotlib.pyplot as plt

# Create analysis workflows
def basic_qc_workflow(vcf_file):
    """Basic quality control workflow.

    Reads a VCF file and summarises per-variant quality metrics.

    Args:
        vcf_file: Path (or other input accepted by ``allel.read_vcf``) of the
            VCF file to analyse.

    Returns:
        pandas.DataFrame: One row per variant with the columns ``CHROM``,
        ``POS``, ``missing_rate``, ``het_obs`` and ``n_alleles``. The frame is
        empty (same columns) when the file yields no variants.
    """
    columns = ['CHROM', 'POS', 'missing_rate', 'het_obs', 'n_alleles']

    # read_vcf returns one dict keyed by 'variants/<FIELD>' / 'calldata/<FIELD>'
    # (not a tuple); request only the fields this workflow uses.
    callset = allel.read_vcf(
        vcf_file,
        fields=['variants/CHROM', 'variants/POS', 'calldata/GT'],
    )
    if callset is None:
        # read_vcf yields None when no variants were read from the file.
        return pd.DataFrame(columns=columns)

    g = allel.GenotypeArray(callset['calldata/GT'])

    # Calculate statistics
    ac = g.count_alleles()
    missing_rate = np.mean(g.is_missing(), axis=1)  # fraction of samples per variant
    het_obs = allel.heterozygosity_observed(g)

    # Create summary DataFrame
    qc_stats = pd.DataFrame({
        'CHROM': callset['variants/CHROM'],
        'POS': callset['variants/POS'],
        'missing_rate': missing_rate,
        'het_obs': het_obs,
        'n_alleles': ac.allelism(),
    })

    return qc_stats

def plot_diversity_along_chromosome(positions, diversity_values):
    """Draw a line plot of diversity values against genomic position.

    Args:
        positions: X values, one per plotted point.
        diversity_values: Y values plotted against ``positions``.

    Returns:
        The figure holding the plot.
    """
    figure = plt.figure(figsize=(12, 6))
    axes = figure.gca()  # the axes that implicit pyplot calls would draw on
    axes.plot(positions, diversity_values, alpha=0.7)
    axes.set_xlabel('Genomic Position')
    axes.set_ylabel('Nucleotide Diversity (π)')
    axes.set_title('Genetic Diversity Along Chromosome')
    axes.grid(True, alpha=0.3)
    return figure

{ .api }

Constants

Genetic Analysis Constants

# Standard genetic constants used throughout scikit-allel
DIPLOID_PLOIDY = 2  # Standard diploid ploidy level

# Usage in validation
def validate_diploid_genotypes(g):
    """Validate that genotypes are diploid.

    Args:
        g: Object exposing a ``ploidy`` attribute.

    Returns:
        bool: ``True`` when ``g.ploidy`` equals ``allel.DIPLOID_PLOIDY``.

    Raises:
        AssertionError: If the ploidy differs. Raised explicitly instead of
            through an ``assert`` statement so the check is not stripped when
            Python runs with ``-O``; the exception type is unchanged for
            existing callers.
    """
    if g.ploidy != allel.DIPLOID_PLOIDY:
        raise AssertionError(
            'expected ploidy %s, found %s' % (allel.DIPLOID_PLOIDY, g.ploidy)
        )
    return True

{ .api }

Install with Tessl CLI