A Python package for exploring and analysing genetic variation data.
—
Scikit-allel provides specialized array classes for representing genetic variation data with efficient operations and memory management. These data structures are the foundation for all genetic analyses in the library.
A specialized numpy array for storing genotype data with shape (n_variants, n_samples, ploidy).
import allel
import numpy as np
# Create from list/array data
genotypes = [[[0, 0], [0, 1]], [[1, 1], [1, 0]]] # 2 variants, 2 samples, diploid
g = allel.GenotypeArray(genotypes)
# Properties
print(g.n_variants) # Number of variants
print(g.n_samples) # Number of samples
print(g.ploidy) # Ploidy (e.g., 2 for diploid)
print(g.shape) # Array shape
print(g.dtype) # Data type{ .api }
Key Methods:
def count_alleles(self, max_allele=None):
"""
Count alleles per variant across all samples.
Args:
max_allele (int, optional): Maximum allele number to count
Returns:
AlleleCountsArray: Allele counts with shape (n_variants, n_alleles)
"""
def to_haplotypes(self):
"""
Convert genotypes to haplotype representation.
Returns:
HaplotypeArray: Haplotype array with shape (n_variants, n_haplotypes)
"""
def is_phased(self):
"""
Check if genotypes are phased.
Returns:
bool: True if all genotypes are phased
"""
def map_alleles(self, mapping):
"""
Remap allele values using provided mapping.
Args:
mapping (array_like): New allele values for each position
Returns:
GenotypeArray: Array with remapped alleles
"""
def subset(self, sel0=None, sel1=None):
"""
Select subset of variants and/or samples.
Args:
sel0 (array_like, optional): Variant selection
sel1 (array_like, optional): Sample selection
Returns:
GenotypeArray: Subset of original array
"""
def count_het(self, allele=None):
"""
Count heterozygous genotypes per variant.
Args:
allele (int, optional): Specific allele to check heterozygosity
Returns:
ndarray: Heterozygous counts per variant
"""
def count_hom_alt(self):
"""
Count homozygous alternate genotypes per variant.
Returns:
ndarray: Homozygous alternate counts per variant
"""
def is_het(self, allele=None):
"""
Determine which genotypes are heterozygous.
Args:
allele (int, optional): Specific allele to check
Returns:
ndarray: Boolean array indicating heterozygous genotypes
"""
def is_hom_alt(self):
"""
Determine which genotypes are homozygous alternate.
Returns:
ndarray: Boolean array indicating homozygous alternate genotypes
"""
def is_call(self):
"""
Determine which genotypes are called (not missing).
Returns:
ndarray: Boolean array indicating called genotypes
"""{ .api }
A specialized array for haplotype data with shape (n_variants, n_haplotypes).
# Create haplotype array
haplotypes = [[0, 1, 0, 1], [1, 0, 1, 0]] # 2 variants, 4 haplotypes
h = allel.HaplotypeArray(haplotypes)
# Convert from genotypes
g = allel.GenotypeArray([[[0, 1], [1, 0]]])
h = g.to_haplotypes(){ .api }
Key Methods:
def count_alleles(self, max_allele=None):
"""
Count alleles per variant across all haplotypes.
Args:
max_allele (int, optional): Maximum allele number to count
Returns:
AlleleCountsArray: Allele counts per variant
"""
def distinct_frequencies(self):
"""
Calculate frequencies of distinct haplotypes.
Returns:
tuple: (distinct_haplotypes, frequencies)
"""
def map_alleles(self, mapping):
"""
Remap allele values using provided mapping.
Args:
mapping (array_like): New allele values
Returns:
HaplotypeArray: Array with remapped alleles
"""{ .api }
Array for storing allele count data with shape (n_variants, n_alleles).
# Create from genotype array
g = allel.GenotypeArray([[[0, 0], [0, 1]], [[1, 1], [1, 0]]])
ac = g.count_alleles()
# Direct creation
counts = [[3, 1], [2, 2]] # 2 variants, counts for alleles 0,1
ac = allel.AlleleCountsArray(counts){ .api }
Key Methods:
def count_segregating(self):
"""
Count number of segregating (polymorphic) variants.
Returns:
int: Number of segregating variants
"""
def is_segregating(self):
"""
Determine which variants are segregating.
Returns:
ndarray: Boolean array indicating segregating variants
"""
def count_singletons(self):
"""
Count singleton variants (minor allele count = 1).
Returns:
int: Number of singleton variants
"""
def allelism(self):
"""
Determine number of alleles per variant.
Returns:
ndarray: Number of alleles observed per variant
"""
def max_allele(self):
"""
Find maximum allele number per variant.
Returns:
ndarray: Maximum allele number per variant
"""
def to_frequencies(self, fill=0):
"""
Convert counts to frequencies.
Args:
fill (float): Value for missing data
Returns:
ndarray: Allele frequencies
"""{ .api }
def GenotypeArray(data, dtype=None):
"""
Create a genotype array from input data.
Args:
data (array_like): Genotype data with shape (n_variants, n_samples, ploidy)
dtype (numpy.dtype, optional): Data type for array
Returns:
GenotypeArray: Genotype array instance
"""{ .api }
def HaplotypeArray(data, dtype=None):
"""
Create a haplotype array from input data.
Args:
data (array_like): Haplotype data with shape (n_variants, n_haplotypes)
dtype (numpy.dtype, optional): Data type for array
Returns:
HaplotypeArray: Haplotype array instance
"""{ .api }
def AlleleCountsArray(data, dtype=None):
"""
Create an allele counts array from input data.
Args:
data (array_like): Allele count data with shape (n_variants, n_alleles)
dtype (numpy.dtype, optional): Data type for array
Returns:
AlleleCountsArray: Allele counts array instance
"""{ .api }
Structured array for storing variant metadata.
# Create variant table with metadata
variants = allel.VariantTable({
'CHROM': ['1', '1', '2'],
'POS': [100, 200, 150],
'REF': ['A', 'G', 'T'],
'ALT': ['T', 'A', 'C']
})
# Access fields
positions = variants['POS']
chromosomes = variants['CHROM']{ .api }
Structured array for genomic features like genes and exons.
# Create feature table
features = allel.FeatureTable({
'seqid': ['1', '1', '2'],
'start': [1000, 2000, 1500],
'end': [2000, 3000, 2500],
'type': ['gene', 'exon', 'gene']
}){ .api }
For large datasets, scikit-allel provides chunked versions using Dask.
import dask.array as da
# Create chunked genotype array
chunks = (1000, 100, 2) # chunk sizes for (variants, samples, ploidy)
g_chunked = allel.GenotypeDaskArray(data, chunks=chunks)
# Compute results
ac = g_chunked.count_alleles().compute(){ .api }
# Create chunked haplotype array
h_chunked = allel.HaplotypeDaskArray(data, chunks=(1000, 200)){ .api }
# Create chunked allele counts array
ac_chunked = allel.AlleleCountsDaskArray(data, chunks=(1000, 4)){ .api }
For backwards compatibility, scikit-allel provides chunked array implementations using Zarr or HDF5 storage. Note: These are maintained for backwards compatibility; prefer Dask arrays for new code.
# Create chunked genotype array with Zarr storage
g_chunked = allel.GenotypeChunkedArray(data, chunks=(1000, 100, 2), storage='zarr')
# With HDF5 storage
g_chunked = allel.GenotypeChunkedArray(data, chunks=(1000, 100, 2), storage='hdf5'){ .api }
# Create chunked haplotype array
h_chunked = allel.HaplotypeChunkedArray(data, chunks=(1000, 200), storage='zarr'){ .api }
# Create chunked allele counts array
ac_chunked = allel.AlleleCountsChunkedArray(data, chunks=(1000, 4), storage='zarr'){ .api }
# Create chunked genotype allele counts array
gac_chunked = allel.GenotypeAlleleCountsChunkedArray(data, chunks=(1000, 100, 4), storage='zarr'){ .api }
Index for unique identifiers with efficient lookup.
# Create unique index
sample_ids = ['sample_1', 'sample_2', 'sample_3']
idx = allel.UniqueIndex(sample_ids)
# Lookup by value
pos = idx.locate_key('sample_2') # Returns position{ .api }
Index for sorted values with range queries.
# Create sorted index for positions
positions = [100, 200, 300, 400, 500]
idx = allel.SortedIndex(positions)
# Range queries
selection = idx.locate_range(150, 350) # Find positions in range{ .api }
Multi-level index for hierarchical data.
# Create multi-level index (chromosome, position)
chroms = ['1', '1', '1', '2', '2']
positions = [100, 200, 300, 150, 250]
idx = allel.SortedMultiIndex([chroms, positions])
# Query specific chromosome
chrom1_variants = idx.locate_key('1'){ .api }
Specialized index for chromosome-position data.
# Create chromosome-position index
chroms = ['1', '1', '2', '2']
positions = [100, 200, 150, 250]
idx = allel.ChromPosIndex(chroms, positions)
# Query by chromosome and position range
variants = idx.locate_range('1', 50, 150){ .api }
1D genotype vector for single variant.
# Create genotype vector for single variant
genotypes_1d = [0, 1, 1, 0] # Single variant across samples
gv = allel.GenotypeVector(genotypes_1d)
# Properties
print(gv.n_samples) # Number of samples
print(gv.shape) # Vector shape{ .api }
1D allele counts vector for single variant.
# Create allele counts vector
counts_1d = [3, 1, 0] # Counts for alleles 0, 1, 2
acv = allel.GenotypeAlleleCountsVector(counts_1d)
# Analysis methods
print(acv.is_segregating()) # Check if variant is polymorphic
print(acv.allelism()) # Number of alleles{ .api }
Install with Tessl CLI
npx tessl i tessl/pypi-scikit-allel