CtrlK
BlogDocsLog inGet started
Tessl Logo

tessl/pypi-scikit-allel

A Python package for exploring and analysing genetic variation data.

Pending
Overview
Eval results
Files

data-structures.mddocs/

Data Structures

Scikit-allel provides specialized array classes for representing genetic variation data with efficient operations and memory management. These data structures are the foundation for all genetic analyses in the library.

Core Array Classes

GenotypeArray

A specialized numpy array for storing genotype data with shape (n_variants, n_samples, ploidy).

import allel
import numpy as np

# Create from list/array data
genotypes = [[[0, 0], [0, 1]], [[1, 1], [1, 0]]]  # 2 variants, 2 samples, diploid
g = allel.GenotypeArray(genotypes)

# Properties
print(g.n_variants)  # Number of variants
print(g.n_samples)   # Number of samples  
print(g.ploidy)      # Ploidy (e.g., 2 for diploid)
print(g.shape)       # Array shape
print(g.dtype)       # Data type

{ .api }

Key Methods:

def count_alleles(self, max_allele=None):
    """
    Count alleles per variant across all samples.
    
    Args:
        max_allele (int, optional): Maximum allele number to count
        
    Returns:
        AlleleCountsArray: Allele counts with shape (n_variants, n_alleles)
    """

def to_haplotypes(self):
    """
    Convert genotypes to haplotype representation.
    
    Returns:
        HaplotypeArray: Haplotype array with shape (n_variants, n_haplotypes)
    """

def is_phased(self):
    """
    Check if genotypes are phased.
    
    Returns:
        bool: True if all genotypes are phased
    """

def map_alleles(self, mapping):
    """
    Remap allele values using provided mapping.
    
    Args:
        mapping (array_like): New allele values for each position
        
    Returns:
        GenotypeArray: Array with remapped alleles
    """

def subset(self, sel0=None, sel1=None):
    """
    Select subset of variants and/or samples.
    
    Args:
        sel0 (array_like, optional): Variant selection
        sel1 (array_like, optional): Sample selection
        
    Returns:
        GenotypeArray: Subset of original array
    """

def count_het(self, allele=None):
    """
    Count heterozygous genotypes per variant.
    
    Args:
        allele (int, optional): Specific allele to check heterozygosity
        
    Returns:
        ndarray: Heterozygous counts per variant
    """

def count_hom_alt(self):
    """
    Count homozygous alternate genotypes per variant.
    
    Returns:
        ndarray: Homozygous alternate counts per variant
    """

def is_het(self, allele=None):
    """
    Determine which genotypes are heterozygous.
    
    Args:
        allele (int, optional): Specific allele to check
        
    Returns:
        ndarray: Boolean array indicating heterozygous genotypes
    """

def is_hom_alt(self):
    """
    Determine which genotypes are homozygous alternate.
    
    Returns:
        ndarray: Boolean array indicating homozygous alternate genotypes
    """

def is_call(self):
    """
    Determine which genotypes are called (not missing).
    
    Returns:
        ndarray: Boolean array indicating called genotypes
    """

{ .api }

HaplotypeArray

A specialized array for haplotype data with shape (n_variants, n_haplotypes).

# Create haplotype array
haplotypes = [[0, 1, 0, 1], [1, 0, 1, 0]]  # 2 variants, 4 haplotypes
h = allel.HaplotypeArray(haplotypes)

# Convert from genotypes
g = allel.GenotypeArray([[[0, 1], [1, 0]]])
h = g.to_haplotypes()

{ .api }

Key Methods:

def count_alleles(self, max_allele=None):
    """
    Count alleles per variant across all haplotypes.
    
    Args:
        max_allele (int, optional): Maximum allele number to count
        
    Returns:
        AlleleCountsArray: Allele counts per variant
    """

def distinct_frequencies(self):
    """
    Calculate frequencies of distinct haplotypes.
    
    Returns:
        tuple: (distinct_haplotypes, frequencies)
    """

def map_alleles(self, mapping):
    """
    Remap allele values using provided mapping.
    
    Args:
        mapping (array_like): New allele values
        
    Returns:
        HaplotypeArray: Array with remapped alleles
    """

{ .api }

AlleleCountsArray

Array for storing allele count data with shape (n_variants, n_alleles).

# Create from genotype array
g = allel.GenotypeArray([[[0, 0], [0, 1]], [[1, 1], [1, 0]]])
ac = g.count_alleles()

# Direct creation
counts = [[3, 1], [2, 2]]  # 2 variants, counts for alleles 0,1
ac = allel.AlleleCountsArray(counts)

{ .api }

Key Methods:

def count_segregating(self):
    """
    Count number of segregating (polymorphic) variants.
    
    Returns:
        int: Number of segregating variants
    """

def is_segregating(self):
    """
    Determine which variants are segregating.
    
    Returns:
        ndarray: Boolean array indicating segregating variants
    """

def count_singletons(self):
    """
    Count singleton variants (minor allele count = 1).
    
    Returns:
        int: Number of singleton variants
    """

def allelism(self):
    """
    Determine number of alleles per variant.
    
    Returns:
        ndarray: Number of alleles observed per variant
    """

def max_allele(self):
    """
    Find maximum allele number per variant.
    
    Returns:
        ndarray: Maximum allele number per variant
    """

def to_frequencies(self, fill=0):
    """
    Convert counts to frequencies.
    
    Args:
        fill (float): Value for missing data
        
    Returns:
        ndarray: Allele frequencies
    """

{ .api }

Array Creation Functions

Genotype Array Creation

def GenotypeArray(data, dtype=None):
    """
    Create a genotype array from input data.
    
    Args:
        data (array_like): Genotype data with shape (n_variants, n_samples, ploidy)
        dtype (numpy.dtype, optional): Data type for array
        
    Returns:
        GenotypeArray: Genotype array instance
    """

{ .api }

Haplotype Array Creation

def HaplotypeArray(data, dtype=None):
    """
    Create a haplotype array from input data.
    
    Args:
        data (array_like): Haplotype data with shape (n_variants, n_haplotypes)
        dtype (numpy.dtype, optional): Data type for array
        
    Returns:
        HaplotypeArray: Haplotype array instance
    """

{ .api }

Allele Counts Array Creation

def AlleleCountsArray(data, dtype=None):
    """
    Create an allele counts array from input data.
    
    Args:
        data (array_like): Allele count data with shape (n_variants, n_alleles)
        dtype (numpy.dtype, optional): Data type for array
        
    Returns:
        AlleleCountsArray: Allele counts array instance
    """

{ .api }

Structured Arrays

VariantTable

Structured array for storing variant metadata.

# Create variant table with metadata
variants = allel.VariantTable({
    'CHROM': ['1', '1', '2'],
    'POS': [100, 200, 150],
    'REF': ['A', 'G', 'T'],
    'ALT': ['T', 'A', 'C']
})

# Access fields
positions = variants['POS']
chromosomes = variants['CHROM']

{ .api }

FeatureTable

Structured array for genomic features like genes and exons.

# Create feature table
features = allel.FeatureTable({
    'seqid': ['1', '1', '2'],
    'start': [1000, 2000, 1500],
    'end': [2000, 3000, 2500],
    'type': ['gene', 'exon', 'gene']
})

{ .api }

Chunked Arrays

For large datasets, scikit-allel provides chunked versions using Dask.

GenotypeDaskArray

import dask.array as da

# Create chunked genotype array
chunks = (1000, 100, 2)  # chunk sizes for (variants, samples, ploidy)
g_chunked = allel.GenotypeDaskArray(data, chunks=chunks)

# Compute results
ac = g_chunked.count_alleles().compute()

{ .api }

HaplotypeDaskArray

# Create chunked haplotype array
h_chunked = allel.HaplotypeDaskArray(data, chunks=(1000, 200))

{ .api }

AlleleCountsDaskArray

# Create chunked allele counts array
ac_chunked = allel.AlleleCountsDaskArray(data, chunks=(1000, 4))

{ .api }

Chunked Arrays (Legacy)

For backwards compatibility, scikit-allel provides chunked array implementations using Zarr or HDF5 storage. Note: These are maintained for backwards compatibility; prefer Dask arrays for new code.

GenotypeChunkedArray

# Create chunked genotype array with Zarr storage
g_chunked = allel.GenotypeChunkedArray(data, chunks=(1000, 100, 2), storage='zarr')

# With HDF5 storage
g_chunked = allel.GenotypeChunkedArray(data, chunks=(1000, 100, 2), storage='hdf5')

{ .api }

HaplotypeChunkedArray

# Create chunked haplotype array
h_chunked = allel.HaplotypeChunkedArray(data, chunks=(1000, 200), storage='zarr')

{ .api }

AlleleCountsChunkedArray

# Create chunked allele counts array
ac_chunked = allel.AlleleCountsChunkedArray(data, chunks=(1000, 4), storage='zarr')

{ .api }

GenotypeAlleleCountsChunkedArray

# Create chunked genotype allele counts array
gac_chunked = allel.GenotypeAlleleCountsChunkedArray(data, chunks=(1000, 100, 4), storage='zarr')

{ .api }

Index Classes

UniqueIndex

Index for unique identifiers with efficient lookup.

# Create unique index
sample_ids = ['sample_1', 'sample_2', 'sample_3']
idx = allel.UniqueIndex(sample_ids)

# Lookup by value
pos = idx.locate_key('sample_2')  # Returns position

{ .api }

SortedIndex

Index for sorted values with range queries.

# Create sorted index for positions
positions = [100, 200, 300, 400, 500]
idx = allel.SortedIndex(positions)

# Range queries
selection = idx.locate_range(150, 350)  # Find positions in range

{ .api }

SortedMultiIndex

Multi-level index for hierarchical data.

# Create multi-level index (chromosome, position)
chroms = ['1', '1', '1', '2', '2']
positions = [100, 200, 300, 150, 250]
idx = allel.SortedMultiIndex([chroms, positions])

# Query specific chromosome
chrom1_variants = idx.locate_key('1')

{ .api }

ChromPosIndex

Specialized index for chromosome-position data.

# Create chromosome-position index
chroms = ['1', '1', '2', '2']
positions = [100, 200, 150, 250]
idx = allel.ChromPosIndex(chroms, positions)

# Query by chromosome and position range
variants = idx.locate_range('1', 50, 150)

{ .api }

Vector Classes

GenotypeVector

1D genotype vector for single variant.

# Create genotype vector for single variant
genotypes_1d = [0, 1, 1, 0]  # Single variant across samples
gv = allel.GenotypeVector(genotypes_1d)

# Properties
print(gv.n_samples)  # Number of samples
print(gv.shape)      # Vector shape

{ .api }

GenotypeAlleleCountsVector

1D allele counts vector for single variant.

# Create allele counts vector
counts_1d = [3, 1, 0]  # Counts for alleles 0, 1, 2
acv = allel.GenotypeAlleleCountsVector(counts_1d)

# Analysis methods
print(acv.is_segregating())  # Check if variant is polymorphic
print(acv.allelism())        # Number of alleles

{ .api }

Memory-Efficient Storage

Install with Tessl CLI

npx tessl i tessl/pypi-scikit-allel

docs

data-structures.md

index.md

io.md

statistics.md

utilities.md

tile.json