A Python package for exploring and analysing genetic variation data.
Scikit-allel provides comprehensive statistical methods for population genetics analysis, including diversity metrics, population structure analysis, linkage disequilibrium, and tests for natural selection.
Calculate nucleotide diversity (π) and related metrics.
import allel
# Basic sequence diversity
diversity = allel.sequence_diversity(positions, allele_counts, start=None, stop=None)
# Mean pairwise differences
mpd = allel.mean_pairwise_difference(allele_counts)
# Between-population differences
div = allel.mean_pairwise_difference_between(ac1, ac2){ .api }
Key Functions:
def sequence_diversity(pos, ac, start=None, stop=None, is_accessible=None):
"""
Calculate sequence diversity (π) over a genomic region.
Args:
pos (array_like): Variant positions
ac (AlleleCountsArray): Allele counts per variant
start (int, optional): Start position for calculation
stop (int, optional): Stop position for calculation
is_accessible (array_like, optional): Boolean array of accessible sites
Returns:
float: Sequence diversity value
"""
def mean_pairwise_difference(ac, an=None):
"""
Calculate mean pairwise nucleotide differences.
Args:
ac (AlleleCountsArray): Allele counts per variant
an (array_like, optional): Total allele numbers per variant
Returns:
ndarray: Mean pairwise differences per variant
"""
def windowed_diversity(pos, ac, windows, is_accessible=None):
"""
Calculate diversity in genomic windows.
Args:
pos (array_like): Variant positions
ac (AlleleCountsArray): Allele counts
windows (array_like): Window coordinates as (start, stop) pairs
is_accessible (array_like, optional): Accessible sites mask
Returns:
ndarray: Diversity values per window
"""{ .api }
Calculate divergence between populations or sequences.
# Between-population divergence
divergence = allel.sequence_divergence(positions, ac_pop1, ac_pop2, start=1000, stop=2000)
# Windowed divergence analysis
div_windows = allel.windowed_divergence(positions, ac_pop1, ac_pop2, windows)
# Windowed differentiation
df_windows = allel.windowed_df(positions, ac_pop1, ac_pop2, windows){ .api }
def sequence_divergence(pos, ac1, ac2, start=None, stop=None, is_accessible=None):
"""
Calculate sequence divergence between two populations.
Args:
pos (array_like): Variant positions
ac1 (AlleleCountsArray): Allele counts for population 1
ac2 (AlleleCountsArray): Allele counts for population 2
start (int, optional): Start position
stop (int, optional): Stop position
is_accessible (array_like, optional): Accessible sites mask
Returns:
float: Sequence divergence value
"""
def windowed_divergence(pos, ac1, ac2, windows, is_accessible=None):
"""
Calculate divergence between populations in genomic windows.
Args:
pos (array_like): Variant positions
ac1 (AlleleCountsArray): Allele counts for population 1
ac2 (AlleleCountsArray): Allele counts for population 2
windows (array_like): Window coordinates
is_accessible (array_like, optional): Accessible sites mask
Returns:
ndarray: Divergence values per window
"""
def windowed_df(pos, ac1, ac2, windows, is_accessible=None):
"""
Calculate density of fixed differences (Df) between two populations in windows.
Args:
pos (array_like): Variant positions
ac1 (AlleleCountsArray): Allele counts for population 1
ac2 (AlleleCountsArray): Allele counts for population 2
windows (array_like): Window coordinates
is_accessible (array_like, optional): Accessible sites mask
Returns:
ndarray: Df values per window
"""{ .api }
Estimate population mutation rate using Watterson's method.
# Single value
theta_w = allel.watterson_theta(positions, allele_counts)
# Windowed analysis
theta_windows = allel.windowed_watterson_theta(positions, allele_counts, windows){ .api }
def watterson_theta(pos, ac, start=None, stop=None, is_accessible=None):
"""
Calculate Watterson's estimator of theta (population mutation rate).
Args:
pos (array_like): Variant positions
ac (AlleleCountsArray): Allele counts per variant
start (int, optional): Start position
stop (int, optional): Stop position
is_accessible (array_like, optional): Accessible sites mask
Returns:
float: Watterson's theta estimate
"""
def windowed_watterson_theta(pos, ac, windows, is_accessible=None):
"""
Calculate Watterson's theta in genomic windows.
Args:
pos (array_like): Variant positions
ac (AlleleCountsArray): Allele counts
windows (array_like): Window coordinates
is_accessible (array_like, optional): Accessible sites mask
Returns:
ndarray: Theta values per window
"""{ .api }
Test for departure from neutral evolution.
# Single genomic region
tajima_d = allel.tajima_d(allele_counts, positions, start=1000, stop=2000)
# Windowed analysis
d_windows = allel.windowed_tajima_d(positions, allele_counts, windows)
# Moving window
d_moving = allel.moving_tajima_d(positions, allele_counts, size=1000, step=500){ .api }
def tajima_d(ac, pos=None, start=None, stop=None, is_accessible=None):
"""
Calculate Tajima's D test statistic.
Args:
ac (AlleleCountsArray): Allele counts per variant
pos (array_like, optional): Variant positions
start (int, optional): Start position
stop (int, optional): Stop position
is_accessible (array_like, optional): Accessible sites mask
Returns:
float: Tajima's D value
"""
def windowed_tajima_d(pos, ac, windows, is_accessible=None):
"""
Calculate Tajima's D in genomic windows.
Args:
pos (array_like): Variant positions
ac (AlleleCountsArray): Allele counts
windows (array_like): Window coordinates
is_accessible (array_like, optional): Accessible sites mask
Returns:
ndarray: Tajima's D values per window
"""
def moving_tajima_d(pos, ac, size, start=None, stop=None, step=None, is_accessible=None):
"""
Calculate Tajima's D in sliding windows.
Args:
pos (array_like): Variant positions
ac (AlleleCountsArray): Allele counts
size (int): Window size in base pairs
start (int, optional): Start position
stop (int, optional): Stop position
step (int, optional): Step size between windows
is_accessible (array_like, optional): Accessible sites mask
Returns:
tuple: (window_centers, tajima_d_values)
"""{ .api }
Calculate fixation indices between populations.
# Weir & Cockerham FST
fst_wc = allel.weir_cockerham_fst(genotypes, subpops)
# Hudson's FST (between two populations)
fst_hudson = allel.hudson_fst(ac_pop1, ac_pop2)
# Patterson's FST (for two populations)
fst_patterson = allel.patterson_fst(ac_pop1, ac_pop2){ .api }
Key Functions:
def weir_cockerham_fst(g, subpops, max_allele=None):
"""
Calculate Weir & Cockerham FST between subpopulations.
Args:
g (GenotypeArray): Genotype data
subpops (list): List of arrays containing sample indices for each subpopulation
max_allele (int, optional): Maximum allele number to consider
Returns:
tuple: (a, b, c) variance components for FST calculation
"""
def hudson_fst(ac1, ac2):
"""
Calculate Hudson's FST estimator between two populations.
Args:
ac1 (AlleleCountsArray): Allele counts for population 1
ac2 (AlleleCountsArray): Allele counts for population 2
Returns:
tuple: (num, den) numerator and denominator for FST calculation
"""
def average_weir_cockerham_fst(g, subpops, max_allele=None):
"""
Calculate average FST across all variants.
Args:
g (GenotypeArray): Genotype data
subpops (list): Subpopulation sample indices
max_allele (int, optional): Maximum allele number
Returns:
float: Average FST value
"""
def windowed_weir_cockerham_fst(pos, g, subpops, windows, max_allele=None):
"""
Calculate FST in genomic windows.
Args:
pos (array_like): Variant positions
g (GenotypeArray): Genotype data
subpops (list): Subpopulation definitions
windows (array_like): Window coordinates
max_allele (int, optional): Maximum allele number
Returns:
ndarray: FST values per window
"""{ .api }
Advanced FST calculations for specific analysis needs.
# Blockwise FST calculations (for bootstrapping)
fst_blocks_wc = allel.blockwise_weir_cockerham_fst(genotypes, subpops, blocks)
fst_blocks_hudson = allel.blockwise_hudson_fst(ac, subpops, blocks)
fst_blocks_patterson = allel.blockwise_patterson_fst(ac1, ac2, blocks)
# Moving window FST
fst_moving_hudson = allel.moving_hudson_fst(ac1, ac2, size=10000)
fst_moving_patterson = allel.moving_patterson_fst(positions, ac1, ac2, size=10000)
fst_moving_wc = allel.moving_weir_cockerham_fst(positions, genotypes, subpops, size=10000)
# Average FST calculations
avg_hudson = allel.average_hudson_fst(ac1, ac2)
avg_patterson = allel.average_patterson_fst(ac1, ac2){ .api }
def blockwise_weir_cockerham_fst(g, subpops, blocks, max_allele=None):
"""
Calculate Weir & Cockerham FST for genomic blocks.
Args:
g (GenotypeArray): Genotype data
subpops (list): Subpopulation sample indices
blocks (array_like): Block definitions
max_allele (int, optional): Maximum allele number
Returns:
ndarray: FST values per block
"""
def blockwise_hudson_fst(ac, subpops, blocks):
"""
Calculate Hudson's FST for genomic blocks.
Args:
ac (AlleleCountsArray): Allele counts
subpops (list): Subpopulation definitions
blocks (array_like): Block definitions
Returns:
ndarray: FST values per block
"""
def blockwise_patterson_fst(ac1, ac2, blocks):
"""
Calculate Patterson's FST for genomic blocks.
Args:
ac1 (AlleleCountsArray): Allele counts for population 1
ac2 (AlleleCountsArray): Allele counts for population 2
blocks (array_like): Block definitions
Returns:
ndarray: FST values per block
"""
def moving_hudson_fst(ac1, ac2, size, start=None, stop=None, step=None):
"""
Calculate Hudson's FST in sliding windows.
Args:
ac1 (AlleleCountsArray): Allele counts for population 1
ac2 (AlleleCountsArray): Allele counts for population 2
size (int): Window size (number of variants)
start (int, optional): Start index
stop (int, optional): Stop index
step (int, optional): Step size between windows
Returns:
ndarray: FST values per window
"""
def moving_patterson_fst(pos, ac1, ac2, size, start=None, stop=None, step=None):
"""
Calculate Patterson's FST in sliding windows.
Args:
pos (array_like): Variant positions
ac1 (AlleleCountsArray): Population 1 allele counts
ac2 (AlleleCountsArray): Population 2 allele counts
size (int): Window size in base pairs
start (int, optional): Start position
stop (int, optional): Stop position
step (int, optional): Step size between windows
Returns:
tuple: (window_centers, fst_values)
"""
def moving_weir_cockerham_fst(pos, g, subpops, size, start=None, stop=None, step=None, max_allele=None):
"""
Calculate Weir & Cockerham FST in sliding windows.
Args:
pos (array_like): Variant positions
g (GenotypeArray): Genotype data
subpops (list): Subpopulation sample indices
size (int): Window size in base pairs
start (int, optional): Start position
stop (int, optional): Stop position
step (int, optional): Step size between windows
max_allele (int, optional): Maximum allele number
Returns:
tuple: (window_centers, fst_values)
"""
def average_hudson_fst(ac1, ac2):
"""
Calculate average Hudson's FST across all variants.
Args:
ac1 (AlleleCountsArray): Allele counts for population 1
ac2 (AlleleCountsArray): Allele counts for population 2
Returns:
float: Average FST value
"""
def average_patterson_fst(ac1, ac2):
"""
Calculate average Patterson's FST across all variants.
Args:
ac1 (AlleleCountsArray): Population 1 allele counts
ac2 (AlleleCountsArray): Population 2 allele counts
Returns:
float: Average FST value
"""{ .api }
Measure linkage disequilibrium between variants.
# Rogers & Huff r correlation
r = allel.rogers_huff_r(genotype_matrix)
# Between two sets of variants
r_between = allel.rogers_huff_r_between(gn_a, gn_b)
# Windowed r-squared
r2_windows = allel.windowed_r_squared(positions, genotype_matrix, size=10000)
# Visualize LD patterns
allel.plot_pairwise_ld(gn_subset, positions_subset){ .api }
def rogers_huff_r(gn):
"""
Calculate Rogers & Huff r correlation coefficient matrix.
Args:
gn (array_like): Genotype matrix (n_variants, n_samples)
Returns:
ndarray: Correlation matrix between all pairs of variants
"""
def rogers_huff_r_between(gn1, gn2):
"""
Calculate Rogers & Huff r correlation between two sets of variants.
Args:
gn1 (array_like): First set of genotype data (n_variants1, n_samples)
gn2 (array_like): Second set of genotype data (n_variants2, n_samples)
Returns:
ndarray: Correlation matrix between variants in the two sets
"""
def windowed_r_squared(pos, gn, size, start=None, stop=None, step=None):
"""
Calculate r² in sliding windows.
Args:
pos (array_like): Variant positions
gn (array_like): Genotype matrix
size (int): Window size in base pairs
start (int, optional): Start position
stop (int, optional): Stop position
step (int, optional): Step size between windows
Returns:
tuple: (window_centers, mean_r2_values)
"""
def locate_unlinked(gn, size, step, threshold):
"""
Identify unlinked variants based on LD threshold.
Args:
gn (array_like): Genotype matrix
size (int): Window size for LD calculation
step (int): Step size between windows
threshold (float): LD threshold for considering variants unlinked
Returns:
ndarray: Indices of unlinked variants
"""
def plot_pairwise_ld(gn, pos, start=None, stop=None, ax=None):
"""
Plot pairwise linkage disequilibrium patterns.
Args:
gn (array_like): Genotype matrix
pos (array_like): Variant positions
start (int, optional): Start position for plotting
stop (int, optional): Stop position for plotting
ax (matplotlib axis, optional): Axis to plot on
Returns:
matplotlib figure: LD heatmap plot
"""{ .api }
Dimensionality reduction for population structure analysis.
# Standard PCA
coords, model = allel.pca(genotype_matrix, n_components=10)
# Randomized PCA for large datasets
coords, model = allel.randomized_pca(genotype_matrix, n_components=10)
# Use different scalers
coords, model = allel.pca(genotype_matrix, scaler='patterson'){ .api }
def pca(gn, n_components=10, scaler=None, copy=True, random_state=None):
"""
Perform principal component analysis on genotype data.
Args:
gn (array_like): Genotype matrix (n_variants, n_samples)
n_components (int): Number of principal components to compute
scaler (str or object, optional): Scaling method ('standard', 'patterson', or scaler object)
copy (bool): Whether to copy input data
random_state (int, optional): Random seed for reproducibility
Returns:
tuple: (coordinates, fitted_model) where coordinates are sample projections
"""
def randomized_pca(gn, n_components=10, scaler=None, copy=True, random_state=None):
"""
Randomized PCA for large datasets.
Args:
gn (array_like): Genotype matrix
n_components (int): Number of components
scaler (str or object, optional): Scaling method
copy (bool): Whether to copy data
random_state (int, optional): Random seed
Returns:
tuple: (coordinates, fitted_model)
"""{ .api }
Standardize genotype data for analysis.
# Standard scaling (mean=0, std=1)
scaler = allel.StandardScaler()
gn_scaled = scaler.fit_transform(genotype_matrix)
# Center scaling (mean=0)
scaler = allel.CenterScaler()
gn_centered = scaler.fit_transform(genotype_matrix)
# Patterson scaling (for PCA)
scaler = allel.PattersonScaler()
gn_patterson = scaler.fit_transform(genotype_matrix)
# Get scaler by name
scaler = allel.get_scaler('patterson'){ .api }
class StandardScaler:
"""
Standardize genotype data to have zero mean and unit variance.
"""
def fit(self, X):
"""Compute mean and standard deviation for later scaling."""
def transform(self, X):
"""Scale data using computed statistics."""
def fit_transform(self, X):
"""Fit scaler and transform data in one step."""
class CenterScaler:
"""
Center genotype data to have zero mean.
"""
def fit(self, X):
"""Compute mean for later centering."""
def transform(self, X):
"""Center data using computed mean."""
def fit_transform(self, X):
"""Fit scaler and center data in one step."""
class PattersonScaler:
"""
Scale genotype data using Patterson method for PCA.
Scales variants by their expected variance under Hardy-Weinberg
equilibrium, which is appropriate for principal component analysis.
"""
def fit(self, X):
"""Compute scaling factors."""
def transform(self, X):
"""Apply Patterson scaling."""
def fit_transform(self, X):
"""Fit and transform in one step."""
def get_scaler(name):
"""
Get a scaler by name.
Args:
name (str): Scaler name ('standard', 'center', or 'patterson')
Returns:
scaler: Scaler object
"""{ .api }
Test for recent positive selection using haplotype structure.
# Integrated Haplotype Score (iHS)
ihs_scores = allel.ihs(haplotypes, map_positions, min_ehh=0.05)
# Cross-population Extended Haplotype Homozygosity (XP-EHH)
xpehh_scores = allel.xpehh(hap_pop1, hap_pop2, map_positions)
# Standardize scores
ihs_std = allel.standardize(ihs_scores){ .api }
def ihs(h, map_pos, min_ehh=0.05, include_edges=False, gap_scale=20000, max_gap=200000, discard_integration_warnings=False):
"""
Calculate integrated haplotype score (iHS) for each variant.
Args:
h (HaplotypeArray): Haplotype data
map_pos (array_like): Genetic map positions in centiMorgans
min_ehh (float): Minimum EHH value for integration cutoff
include_edges (bool): Whether to include variants at edges
gap_scale (float): Scale parameter for gap penalty
max_gap (float): Maximum gap size to allow
discard_integration_warnings (bool): Whether to suppress integration warnings
Returns:
ndarray: iHS scores per variant
"""
def xpehh(h1, h2, map_pos, min_ehh=0.05, include_edges=False, gap_scale=20000, max_gap=200000, discard_integration_warnings=False):
"""
Calculate cross-population EHH (XP-EHH) between two populations.
Args:
h1 (HaplotypeArray): Haplotypes from population 1
h2 (HaplotypeArray): Haplotypes from population 2
map_pos (array_like): Genetic map positions
min_ehh (float): Minimum EHH cutoff
include_edges (bool): Include edge variants
gap_scale (float): Gap penalty scale
max_gap (float): Maximum allowed gap
discard_integration_warnings (bool): Suppress warnings
Returns:
ndarray: XP-EHH scores per variant
"""
def standardize_by_allele_count(score, aac, flip=None):
"""
Standardize test statistic by allele count.
Args:
score (array_like): Raw test statistic values
aac (array_like): Alternate allele counts
flip (array_like, optional): Boolean array indicating which scores to flip
Returns:
ndarray: Standardized scores
"""{ .api }
Alternative selection test less sensitive to recombination rate variation.
# NSL scores
nsl_scores = allel.nsl(haplotypes, map_positions)
# Cross-population NSL
xpnsl_scores = allel.xpnsl(hap_pop1, hap_pop2, map_positions){ .api }
def nsl(h, map_pos=None, max_extent=None):
"""
Calculate number of segregating sites by length (NSL).
Args:
h (HaplotypeArray): Haplotype data
map_pos (array_like, optional): Map positions for physical distance
max_extent (float, optional): Maximum extent for calculation
Returns:
ndarray: NSL scores per variant
"""
def xpnsl(h1, h2, map_pos=None, max_extent=None):
"""
Calculate cross-population NSL.
Args:
h1 (HaplotypeArray): Population 1 haplotypes
h2 (HaplotypeArray): Population 2 haplotypes
map_pos (array_like, optional): Map positions
max_extent (float, optional): Maximum extent
Returns:
ndarray: XP-NSL scores per variant
"""{ .api }
More methods for detecting natural selection.
# EHH decay analysis
ehh_values = allel.ehh_decay(haplotypes, map_positions, focal_variant=100)
# Voight painting (visualization of haplotype decay)
painting = allel.voight_painting(haplotypes, map_positions, focal_variant=100)
allel.plot_voight_painting(painting)
# Standardization methods
scores_std = allel.standardize(scores)
scores_std_ac = allel.standardize_by_allele_count(scores, allele_counts)
# Moving Tajima's D difference
delta_td = allel.moving_delta_tajima_d(positions, ac_pop1, ac_pop2, size=10000)
# Population Branch Statistic (PBS)
pbs_scores = allel.pbs(fst_AB, fst_AC, fst_BC){ .api }
def ehh_decay(h, map_pos, focal=None, max_gap=200000, gap_scale=20000, truncate_left=None, truncate_right=None):
"""
Calculate Extended Haplotype Homozygosity (EHH) decay.
Args:
h (HaplotypeArray): Haplotype data
map_pos (array_like): Genetic map positions
focal (int, optional): Index of focal variant
max_gap (float): Maximum gap between variants
gap_scale (float): Gap scaling factor
truncate_left (float, optional): Left truncation point
truncate_right (float, optional): Right truncation point
Returns:
tuple: (distances, ehh_values)
"""
def voight_painting(h, map_pos, focal=None, max_gap=200000, gap_scale=20000):
"""
Generate Voight painting data for haplotype visualization.
Args:
h (HaplotypeArray): Haplotype data
map_pos (array_like): Genetic map positions
focal (int, optional): Index of focal variant
max_gap (float): Maximum gap between variants
gap_scale (float): Gap scaling factor
Returns:
dict: Painting data for visualization
"""
def plot_voight_painting(painting, ax=None):
"""
Plot Voight painting visualization.
Args:
painting (dict): Painting data from voight_painting()
ax (matplotlib axis, optional): Axis to plot on
Returns:
matplotlib figure: Voight painting plot
"""
def fig_voight_painting(painting, height=None):
"""
Create figure for Voight painting.
Args:
painting (dict): Painting data
height (float, optional): Figure height
Returns:
matplotlib figure: Figure object
"""
def plot_haplotype_frequencies(h, start=None, stop=None, ax=None):
"""
Plot haplotype frequencies.
Args:
h (HaplotypeArray): Haplotype data
start (int, optional): Start position index
stop (int, optional): Stop position index
ax (matplotlib axis, optional): Axis to plot on
Returns:
matplotlib figure: Haplotype frequency plot
"""
def plot_moving_haplotype_frequencies(h, size, start=None, stop=None, step=None, ax=None):
"""
Plot moving window haplotype frequencies.
Args:
h (HaplotypeArray): Haplotype data
size (int): Window size
start (int, optional): Start position
stop (int, optional): Stop position
step (int, optional): Step size
ax (matplotlib axis, optional): Axis to plot on
Returns:
matplotlib figure: Moving haplotype frequency plot
"""
def standardize(score):
"""
Standardize test statistic to z-scores.
Args:
score (array_like): Raw test statistic values
Returns:
ndarray: Standardized scores
"""
def moving_delta_tajima_d(pos, ac1, ac2, size, start=None, stop=None, step=None, is_accessible=None):
"""
Calculate difference in Tajima's D between populations in sliding windows.
Args:
pos (array_like): Variant positions
ac1 (AlleleCountsArray): Allele counts for population 1
ac2 (AlleleCountsArray): Allele counts for population 2
size (int): Window size in base pairs
start (int, optional): Start position
stop (int, optional): Stop position
step (int, optional): Step size
is_accessible (array_like, optional): Accessible sites mask
Returns:
tuple: (window_centers, delta_tajima_d_values)
"""
def pbs(fst_AB, fst_AC, fst_BC):
"""
Calculate Population Branch Statistic (PBS) for three populations.
Args:
fst_AB (array_like): FST between populations A and B
fst_AC (array_like): FST between populations A and C
fst_BC (array_like): FST between populations B and C
Returns:
ndarray: PBS values for population A
"""
def moving_haplotype_diversity(h, size, start=None, stop=None, step=None):
"""
Calculate haplotype diversity in sliding windows.
Args:
h (HaplotypeArray): Haplotype data
size (int): Window size (number of variants)
start (int, optional): Start position index
stop (int, optional): Stop position index
step (int, optional): Step size
Returns:
tuple: (window_centers, diversity_values)
"""
def moving_garud_h(h, size, start=None, stop=None, step=None):
"""
Calculate Garud's H statistics in sliding windows.
Args:
h (HaplotypeArray): Haplotype data
size (int): Window size (number of variants)
start (int, optional): Start position index
stop (int, optional): Stop position index
step (int, optional): Step size
Returns:
tuple: (window_centers, h1_values, h12_values, h123_values, h2_h1_values)
"""{ .api }
Measure diversity of haplotype patterns.
# Basic haplotype diversity
h_div = allel.haplotype_diversity(haplotypes, start=1000, stop=2000)
# Moving window analysis
h_div_moving = allel.moving_haplotype_diversity(haplotypes, size=1000, step=500)
# Garud's H statistics
h_stats = allel.garud_h(haplotypes, start=1000, stop=2000){ .api }
def haplotype_diversity(h, start=None, stop=None):
"""
Calculate haplotype diversity in a genomic region.
Args:
h (HaplotypeArray): Haplotype data
start (int, optional): Start position index
stop (int, optional): Stop position index
Returns:
float: Haplotype diversity value
"""
def garud_h(h, start=None, stop=None):
"""
Calculate Garud's H1, H12, H123, and H2/H1 statistics.
Args:
h (HaplotypeArray): Haplotype data
start (int, optional): Start position index
stop (int, optional): Stop position index
Returns:
tuple: (H1, H12, H123, H2_H1) statistics
"""{ .api }
Analyze allele frequency distributions.
# Basic SFS
sfs = allel.sfs(derived_allele_counts, n=sample_size)
# Folded SFS (when ancestral state unknown)
sfs_folded = allel.sfs_folded(derived_allele_counts, n=sample_size)
# Joint SFS between populations
joint_sfs = allel.joint_sfs(dac_pop1, dac_pop2, n1=n_pop1, n2=n_pop2){ .api }
def sfs(dac, n=None):
"""
Calculate site frequency spectrum.
Args:
dac (array_like): Derived allele counts per variant
n (int, optional): Sample size (number of chromosomes)
Returns:
ndarray: Site frequency spectrum
"""
def sfs_folded(dac, n=None):
"""
Calculate folded site frequency spectrum.
Args:
dac (array_like): Derived allele counts
n (int, optional): Sample size
Returns:
ndarray: Folded SFS
"""
def joint_sfs(dac1, dac2, n1=None, n2=None):
"""
Calculate joint site frequency spectrum between two populations.
Args:
dac1 (array_like): Derived allele counts for population 1
dac2 (array_like): Derived allele counts for population 2
n1 (int, optional): Sample size for population 1
n2 (int, optional): Sample size for population 2
Returns:
ndarray: Joint SFS as 2D array
"""{ .api }
Transform and scale site frequency spectra.
# Scale SFS by mutation rate
sfs_scaled = allel.sfs_scaled(derived_counts, n=sample_size, theta=0.01)
sfs_folded_scaled = allel.sfs_folded_scaled(derived_counts, n=sample_size, theta=0.01)
# Joint SFS scaling
joint_sfs_scaled = allel.joint_sfs_scaled(dac1, dac2, n1=n1, n2=n2, theta=0.01)
joint_sfs_folded_scaled = allel.joint_sfs_folded_scaled(dac1, dac2, n1=n1, n2=n2, theta=0.01)
# Fold existing SFS
folded_sfs = allel.fold_sfs(unfolded_sfs)
folded_joint_sfs = allel.fold_joint_sfs(unfolded_joint_sfs)
# Scale existing SFS
scaled_sfs = allel.scale_sfs(sfs, theta=0.01)
scaled_folded_sfs = allel.scale_sfs_folded(sfs, theta=0.01)
scaled_joint_sfs = allel.scale_joint_sfs(joint_sfs, theta=0.01)
scaled_joint_folded_sfs = allel.scale_joint_sfs_folded(joint_sfs, theta=0.01){ .api }
def sfs_scaled(dac, n=None, theta=1.0):
"""
Calculate scaled site frequency spectrum.
Args:
dac (array_like): Derived allele counts
n (int, optional): Sample size
theta (float): Scaling factor (mutation rate)
Returns:
ndarray: Scaled SFS
"""
def sfs_folded_scaled(dac, n=None, theta=1.0):
"""
Calculate scaled folded site frequency spectrum.
Args:
dac (array_like): Derived allele counts
n (int, optional): Sample size
theta (float): Scaling factor
Returns:
ndarray: Scaled folded SFS
"""
def joint_sfs_scaled(dac1, dac2, n1=None, n2=None, theta=1.0):
"""
Calculate scaled joint site frequency spectrum.
Args:
dac1 (array_like): Population 1 derived allele counts
dac2 (array_like): Population 2 derived allele counts
n1 (int, optional): Population 1 sample size
n2 (int, optional): Population 2 sample size
theta (float): Scaling factor
Returns:
ndarray: Scaled joint SFS
"""
def joint_sfs_folded_scaled(dac1, dac2, n1=None, n2=None, theta=1.0):
"""
Calculate scaled folded joint site frequency spectrum.
Args:
dac1 (array_like): Population 1 derived allele counts
dac2 (array_like): Population 2 derived allele counts
n1 (int, optional): Population 1 sample size
n2 (int, optional): Population 2 sample size
theta (float): Scaling factor
Returns:
ndarray: Scaled folded joint SFS
"""
def fold_sfs(sfs):
"""
Fold an existing site frequency spectrum.
Args:
sfs (array_like): Unfolded SFS
Returns:
ndarray: Folded SFS
"""
def fold_joint_sfs(joint_sfs):
"""
Fold an existing joint site frequency spectrum.
Args:
joint_sfs (array_like): Unfolded joint SFS
Returns:
ndarray: Folded joint SFS
"""
def scale_sfs(sfs, theta=1.0):
"""
Scale an existing site frequency spectrum.
Args:
sfs (array_like): SFS to scale
theta (float): Scaling factor
Returns:
ndarray: Scaled SFS
"""
def scale_sfs_folded(sfs, theta=1.0):
"""
Scale an existing folded site frequency spectrum.
Args:
sfs (array_like): Folded SFS to scale
theta (float): Scaling factor
Returns:
ndarray: Scaled folded SFS
"""
def scale_joint_sfs(joint_sfs, theta=1.0):
"""
Scale an existing joint site frequency spectrum.
Args:
joint_sfs (array_like): Joint SFS to scale
theta (float): Scaling factor
Returns:
ndarray: Scaled joint SFS
"""
def scale_joint_sfs_folded(joint_sfs, theta=1.0):
"""
Scale an existing folded joint site frequency spectrum.
Args:
joint_sfs (array_like): Folded joint SFS to scale
theta (float): Scaling factor
Returns:
ndarray: Scaled folded joint SFS
"""{ .api }
Plot site frequency spectra.
# Basic SFS plots
allel.plot_sfs(sfs)
allel.plot_sfs_folded(sfs_folded)
allel.plot_sfs_scaled(sfs_scaled)
allel.plot_sfs_folded_scaled(sfs_folded_scaled)
# Joint SFS plots
allel.plot_joint_sfs(joint_sfs)
allel.plot_joint_sfs_folded(joint_sfs_folded)
allel.plot_joint_sfs_scaled(joint_sfs_scaled)
allel.plot_joint_sfs_folded_scaled(joint_sfs_folded_scaled){ .api }
def plot_sfs(sfs, ax=None, n=None):
"""
Plot site frequency spectrum.
Args:
sfs (array_like): SFS values
ax (matplotlib axis, optional): Axis to plot on
n (int, optional): Sample size for x-axis labels
Returns:
matplotlib figure: SFS plot
"""
def plot_sfs_folded(sfs, ax=None, n=None):
"""
Plot folded site frequency spectrum.
Args:
sfs (array_like): Folded SFS values
ax (matplotlib axis, optional): Axis to plot on
n (int, optional): Sample size for x-axis labels
Returns:
matplotlib figure: Folded SFS plot
"""
def plot_sfs_scaled(sfs, ax=None, n=None):
"""
Plot scaled site frequency spectrum.
Args:
sfs (array_like): Scaled SFS values
ax (matplotlib axis, optional): Axis to plot on
n (int, optional): Sample size
Returns:
matplotlib figure: Scaled SFS plot
"""
def plot_sfs_folded_scaled(sfs, ax=None, n=None):
"""
Plot scaled folded site frequency spectrum.
Args:
sfs (array_like): Scaled folded SFS values
ax (matplotlib axis, optional): Axis to plot on
n (int, optional): Sample size
Returns:
matplotlib figure: Scaled folded SFS plot
"""
def plot_joint_sfs(joint_sfs, ax=None):
"""
Plot joint site frequency spectrum as heatmap.
Args:
joint_sfs (array_like): Joint SFS matrix
ax (matplotlib axis, optional): Axis to plot on
Returns:
matplotlib figure: Joint SFS heatmap
"""
def plot_joint_sfs_folded(joint_sfs, ax=None):
"""
Plot folded joint site frequency spectrum.
Args:
joint_sfs (array_like): Folded joint SFS matrix
ax (matplotlib axis, optional): Axis to plot on
Returns:
matplotlib figure: Folded joint SFS heatmap
"""
def plot_joint_sfs_scaled(joint_sfs, ax=None):
"""
Plot scaled joint site frequency spectrum.
Args:
joint_sfs (array_like): Scaled joint SFS matrix
ax (matplotlib axis, optional): Axis to plot on
Returns:
matplotlib figure: Scaled joint SFS heatmap
"""
def plot_joint_sfs_folded_scaled(joint_sfs, ax=None):
"""
Plot scaled folded joint site frequency spectrum.
Args:
joint_sfs (array_like): Scaled folded joint SFS matrix
ax (matplotlib axis, optional): Axis to plot on
Returns:
matplotlib figure: Scaled folded joint SFS heatmap
"""{ .api }
Test for deviations from Hardy-Weinberg equilibrium.
# Observed heterozygosity
het_obs = allel.heterozygosity_observed(genotypes)
# Expected heterozygosity
het_exp = allel.heterozygosity_expected(allele_frequencies)
# Inbreeding coefficient (FIS)
fis = allel.inbreeding_coefficient(genotypes){ .api }
def heterozygosity_observed(g, alleles=None):
"""
Calculate observed heterozygosity per variant.
Args:
g (GenotypeArray): Genotype data
alleles (array_like, optional): Specific alleles to consider
Returns:
ndarray: Observed heterozygosity per variant
"""
def heterozygosity_expected(af, ploidy=2):
"""
Calculate expected heterozygosity under Hardy-Weinberg equilibrium.
Args:
af (array_like): Allele frequencies
ploidy (int): Ploidy level
Returns:
ndarray: Expected heterozygosity per variant
"""
def inbreeding_coefficient(g, alleles=None):
"""
Calculate inbreeding coefficient (FIS) per variant.
Args:
g (GenotypeArray): Genotype data
alleles (array_like, optional): Specific alleles to consider
Returns:
ndarray: Inbreeding coefficient per variant
"""{ .api }
Test for population admixture and phylogenetic relationships.
# F2 statistic (genetic distance)
f2_values = allel.patterson_f2(ac_pop1, ac_pop2)
# F3 statistic (admixture test)
f3_values = allel.patterson_f3(ac_pop1, ac_pop2, ac_pop3)
# D statistic (4-population test)
d_values = allel.patterson_d(ac_pop1, ac_pop2, ac_pop3, ac_pop4)
# Blockwise calculations for bootstrapping
f3_blocks = allel.blockwise_patterson_f3(ac_pop1, ac_pop2, ac_pop3, blocks)
d_blocks = allel.blockwise_patterson_d(ac_pop1, ac_pop2, ac_pop3, ac_pop4, blocks)
# Average statistics
avg_f3 = allel.average_patterson_f3(ac_pop1, ac_pop2, ac_pop3)
avg_d = allel.average_patterson_d(ac_pop1, ac_pop2, ac_pop3, ac_pop4)
# Moving window analysis
f3_moving = allel.moving_patterson_f3(positions, ac_pop1, ac_pop2, ac_pop3, size=100000)
d_moving = allel.moving_patterson_d(positions, ac_pop1, ac_pop2, ac_pop3, ac_pop4, size=100000){ .api }
Key Functions:
def patterson_f2(ac1, ac2):
"""
Calculate Patterson's F2 statistic (genetic distance).
Args:
ac1 (AlleleCountsArray): Allele counts for population 1
ac2 (AlleleCountsArray): Allele counts for population 2
Returns:
ndarray: F2 values per variant
"""
def patterson_f3(ac1, ac2, ac3):
"""
Calculate Patterson's F3 statistic (test for admixture).
Args:
ac1 (AlleleCountsArray): Allele counts for population 1 (test population)
ac2 (AlleleCountsArray): Allele counts for population 2 (source 1)
ac3 (AlleleCountsArray): Allele counts for population 3 (source 2)
Returns:
ndarray: F3 values per variant
"""
def patterson_d(ac1, ac2, ac3, ac4):
"""
Calculate Patterson's D statistic (4-population test).
Args:
ac1 (AlleleCountsArray): Population 1 allele counts
ac2 (AlleleCountsArray): Population 2 allele counts
ac3 (AlleleCountsArray): Population 3 allele counts
ac4 (AlleleCountsArray): Population 4 allele counts (outgroup)
Returns:
ndarray: D values per variant
"""
def blockwise_patterson_f3(ac1, ac2, ac3, blocks):
"""
Calculate F3 statistic for genomic blocks.
Args:
ac1 (AlleleCountsArray): Population 1 allele counts
ac2 (AlleleCountsArray): Population 2 allele counts
ac3 (AlleleCountsArray): Population 3 allele counts
blocks (array_like): Block definitions
Returns:
ndarray: F3 values per block
"""
def blockwise_patterson_d(ac1, ac2, ac3, ac4, blocks):
"""
Calculate D statistic for genomic blocks.
Args:
ac1 (AlleleCountsArray): Population 1 allele counts
ac2 (AlleleCountsArray): Population 2 allele counts
ac3 (AlleleCountsArray): Population 3 allele counts
ac4 (AlleleCountsArray): Population 4 allele counts
blocks (array_like): Block definitions
Returns:
ndarray: D values per block
"""
def average_patterson_f3(ac1, ac2, ac3):
"""
Calculate average F3 statistic across all variants.
Args:
ac1 (AlleleCountsArray): Population 1 allele counts
ac2 (AlleleCountsArray): Population 2 allele counts
ac3 (AlleleCountsArray): Population 3 allele counts
Returns:
float: Average F3 value
"""
def average_patterson_d(ac1, ac2, ac3, ac4):
"""
Calculate average D statistic across all variants.
Args:
ac1 (AlleleCountsArray): Population 1 allele counts
ac2 (AlleleCountsArray): Population 2 allele counts
ac3 (AlleleCountsArray): Population 3 allele counts
ac4 (AlleleCountsArray): Population 4 allele counts
Returns:
float: Average D value
"""
def moving_patterson_f3(pos, ac1, ac2, ac3, size, start=None, stop=None, step=None):
"""
Calculate F3 statistic in sliding windows.
Args:
pos (array_like): Variant positions
ac1 (AlleleCountsArray): Population 1 allele counts
ac2 (AlleleCountsArray): Population 2 allele counts
ac3 (AlleleCountsArray): Population 3 allele counts
size (int): Window size in base pairs
start (int, optional): Start position
stop (int, optional): Stop position
step (int, optional): Step size
Returns:
tuple: (window_centers, f3_values)
"""
def moving_patterson_d(pos, ac1, ac2, ac3, ac4, size, start=None, stop=None, step=None):
"""
Calculate D statistic in sliding windows.
Args:
pos (array_like): Variant positions
ac1 (AlleleCountsArray): Population 1 allele counts
ac2 (AlleleCountsArray): Population 2 allele counts
ac3 (AlleleCountsArray): Population 3 allele counts
ac4 (AlleleCountsArray): Population 4 allele counts
size (int): Window size in base pairs
start (int, optional): Start position
stop (int, optional): Stop position
step (int, optional): Step size
Returns:
tuple: (window_centers, d_values)
"""{ .api }
Apply statistical methods over genomic windows.
# Create windows containing equal numbers of accessible bases
windows = allel.equally_accessible_windows(is_accessible, size=10000)
# Apply any statistic to windows
window_stats = allel.windowed_statistic(positions, values, np.mean, windows)
# Moving window statistics
moving_stats = allel.moving_statistic(values, np.mean, size=100, step=50)
# Count variants in windows
counts = allel.windowed_count(positions, windows)
# Per-base analysis
per_base_values = allel.per_base(positions, values, start=1000, stop=2000)
# Moving statistics with different functions
moving_means = allel.moving_mean(values, size=100, step=50)
moving_stds = allel.moving_std(values, size=100, step=50)
moving_mids = allel.moving_midpoint(positions, size=100, step=50)
# Window coordinate helpers
index_wins = allel.index_windows(positions, window_size=100)
pos_wins = allel.position_windows(positions, window_size=10000, step=5000)
win_locs = allel.window_locations(positions, windows){ .api }
More specialized windowing utilities.
def windowed_count(pos, windows, is_accessible=None):
"""
Count variants in genomic windows.
Args:
pos (array_like): Variant positions
windows (array_like): Window coordinates as (start, stop) pairs
is_accessible (array_like, optional): Accessible sites mask
Returns:
ndarray: Variant counts per window
"""
def per_base(pos, values, start=None, stop=None, is_accessible=None):
"""
Calculate per-base values over a genomic region.
Args:
pos (array_like): Variant positions
values (array_like): Values per variant
start (int, optional): Start position
stop (int, optional): Stop position
is_accessible (array_like, optional): Accessible sites mask
Returns:
float: Per-base value
"""
def moving_mean(values, size, step=1):
"""
Calculate moving mean over a sequence of values.
Args:
values (array_like): Input values
size (int): Window size
step (int): Step size between windows
Returns:
ndarray: Moving mean values
"""
def moving_std(values, size, step=1):
"""
Calculate moving standard deviation.
Args:
values (array_like): Input values
size (int): Window size
step (int): Step size between windows
Returns:
ndarray: Moving standard deviation values
"""
def moving_midpoint(positions, size, step=1):
"""
Calculate midpoints for moving windows.
Args:
positions (array_like): Position coordinates
size (int): Window size
step (int): Step size between windows
Returns:
ndarray: Window midpoint positions
"""
def index_windows(positions, window_size):
"""
Create windows based on variant indices.
Args:
positions (array_like): Variant positions
window_size (int): Number of variants per window
Returns:
ndarray: Window definitions as index ranges
"""
def position_windows(positions, window_size, step=None):
"""
Create windows based on genomic positions.
Args:
positions (array_like): Variant positions
window_size (int): Window size in base pairs
step (int, optional): Step size between windows
Returns:
ndarray: Window coordinates as (start, stop) pairs
"""
def window_locations(pos, windows):
"""
Locate items positioned within genomic windows.
Args:
pos (array_like): Sorted positions of items to locate
windows (array_like): Window coordinates as (start, stop) pairs
Returns:
ndarray: (start, stop) indices into pos for each window
"""{ .api }
Analyze transmission patterns in family data to identify Mendelian errors and phase genotypes.
# Identify Mendelian errors in trio data
errors = allel.mendel_errors(parent_genotypes, offspring_genotypes)
# Phase genotypes using transmission patterns
phased_offspring = allel.phase_progeny_by_transmission(offspring_genotypes, transmission)
phased_parents = allel.phase_parents_by_transmission(parent_genotypes, transmission)
# Complete phasing workflow
phased_offspring, phased_parents = allel.phase_by_transmission(offspring_genotypes, parent_genotypes){ .api }
Key Functions:
def mendel_errors(parent_genotypes, offspring_genotypes):
"""
Identify Mendelian inheritance errors in family data.
Args:
parent_genotypes (GenotypeArray): Genotypes of the two parents
offspring_genotypes (GenotypeArray): Genotypes of the progeny
Returns:
ndarray: Count of Mendelian errors for each variant/offspring pair
"""
def phase_progeny_by_transmission(offspring_genotypes, transmission):
"""
Phase offspring genotypes using transmission patterns.
Args:
offspring_genotypes (GenotypeArray): Offspring genotype data
transmission (array_like): Transmission pattern information
Returns:
GenotypeArray: Phased offspring genotypes
"""
def phase_parents_by_transmission(parent_genotypes, transmission):
"""
Phase parent genotypes using transmission patterns.
Args:
parent_genotypes (GenotypeArray): Parent genotype data
transmission (array_like): Transmission pattern information
Returns:
GenotypeArray: Phased parent genotypes
"""
def phase_by_transmission(offspring_genotypes, parent_genotypes):
"""
Phase both offspring and parent genotypes using transmission patterns.
Args:
offspring_genotypes (GenotypeArray): Offspring genotypes
parent_genotypes (GenotypeArray): Parent genotypes
Returns:
tuple: (phased_offspring, phased_parents) GenotypeArray objects
"""{ .api }
Inheritance Constants:
# Mendelian inheritance pattern constants
INHERIT_UNDETERMINED = 0 # Cannot determine inheritance
INHERIT_PARENT1 = 1 # Inherited from parent 1
INHERIT_PARENT2 = 2 # Inherited from parent 2
INHERIT_NONSEG_REF = 3 # Non-segregating reference allele
INHERIT_NONSEG_ALT = 4 # Non-segregating alternate allele
INHERIT_NONPARENTAL = 5 # Allele not present in either parent
INHERIT_PARENT_MISSING = 6 # Parent has missing data
INHERIT_MISSING = 7 # Missing data in offspring{ .api }
Detect runs of homozygosity using Hidden Markov Models.
# Multi-state HMM for ROH detection
roh_results = allel.roh_mhmm(genotypes, positions, phet_roh=0.001, phet_nonroh=0.01, transition=1e-6)
# Poisson HMM approach
roh_poisson = allel.roh_poissonhmm(genotypes, positions, phet_roh=0.001, phet_nonroh=0.01,
transition_hethet=1e-5, transition_homhom=1e-8){ .api }
def roh_mhmm(g, pos, phet_roh, phet_nonroh, transition, map_pos=None):
"""
Detect runs of homozygosity using multi-state Hidden Markov Model.
Args:
g (GenotypeArray): Genotype data
pos (array_like): Variant positions
phet_roh (float): Probability of heterozygosity in ROH segments
phet_nonroh (float): Probability of heterozygosity in non-ROH segments
transition (float): Transition probability between states
map_pos (array_like, optional): Genetic map positions
Returns:
tuple: (df_roh, froh) - DataFrame describing ROH segments, and fraction of genome in ROH
"""
def roh_poissonhmm(g, pos, phet_roh, phet_nonroh, transition_hethet, transition_homhom, map_pos=None):
"""
Detect runs of homozygosity using Poisson Hidden Markov Model.
Args:
g (GenotypeArray): Genotype data
pos (array_like): Variant positions
phet_roh (float): Het probability in ROH
phet_nonroh (float): Het probability in non-ROH
transition_hethet (float): Transition probability between het states
transition_homhom (float): Transition probability between hom states
map_pos (array_like, optional): Genetic map positions
Returns:
tuple: (df_roh, froh) - DataFrame describing ROH segments, and fraction of genome in ROH
"""{ .api }
Apply statistical methods over genomic windows.
# Create windows containing equal numbers of accessible bases
windows = allel.equally_accessible_windows(is_accessible, size=10000)
# Apply any statistic to windows
window_stats = allel.windowed_statistic(positions, values, np.mean, windows)
# Moving window statistics
moving_stats = allel.moving_statistic(values, np.mean, size=100, step=50){ .api }
def windowed_statistic(pos, values, statistic, windows, fill=None):
"""
Apply a statistic function to values within genomic windows.
Args:
pos (array_like): Positions corresponding to values
values (array_like): Values to calculate statistics on
statistic (callable): Function to apply (e.g., np.mean, np.sum)
windows (array_like): Window coordinates as (start, stop) pairs
fill (float, optional): Fill value for empty windows
Returns:
ndarray: Statistic values per window
"""
def equally_accessible_windows(is_accessible, size, start=0, stop=None):
"""
Create windows each containing the same number of accessible bases.
Args:
is_accessible (array_like): Boolean array marking accessible genome positions
size (int): Number of accessible bases per window
start (int, optional): Start coordinate
stop (int, optional): Stop coordinate
Returns:
ndarray: Window coordinates as (start, stop) pairs
"""
def moving_statistic(values, statistic, size, step=1):
"""
Apply statistic over a moving window.
Args:
values (array_like): Input values
statistic (callable): Function to apply
size (int): Window size
step (int): Step size between windows
Returns:
ndarray: Statistic values per window
"""{ .api }
Additional statistical utilities and visualization tools.
# Variant locator plot for exploring data
allel.plot_variant_locator(positions, start=1000, stop=2000)
# Tabulate state transitions (for HMM analysis)
transitions = allel.tabulate_state_transitions(states)
# Tabulate state blocks
blocks = allel.tabulate_state_blocks(states){ .api }
def plot_variant_locator(pos, step=None, ax=None, start=None, stop=None):
"""
Plot lines showing the physical positions of variants.
Args:
pos (array_like): Variant positions
step (int, optional): Plot only every `step` variant
ax (matplotlib axis, optional): Axis to plot on
start (int, optional): Start position for plotting
stop (int, optional): Stop position for plotting
Returns:
matplotlib axis: Axis with the variant locator plot
"""
def tabulate_state_transitions(states, pos=None):
"""
Tabulate transitions between states in a sequence.
Args:
states (array_like): State sequence
pos (array_like, optional): Positions corresponding to states
Returns:
DataFrame: Transition table with counts and positions
"""
def tabulate_state_blocks(states, pos=None):
"""
Tabulate contiguous blocks of the same state.
Args:
states (array_like): State sequence
pos (array_like, optional): Positions corresponding to states
Returns:
DataFrame: Block table with start/stop positions and lengths
"""{ .api }
Install with Tessl CLI
npx tessl i tessl/pypi-scikit-allel