Comprehensive toolkit for analyzing single-cell gene expression data with scalable Python implementation supporting preprocessing, visualization, clustering, trajectory inference, and differential expression testing.
—
Scanpy's external module provides integration with popular external single-cell analysis tools and methods through a unified interface. This extends scanpy's capabilities with specialized algorithms for dimensionality reduction, trajectory inference, batch correction, imputation, and more.
Advanced analysis methods from specialized single-cell packages.
def phate(adata, n_components=2, knn=5, decay=40, n_landmark=2000, t='auto', gamma=1, n_pca=100, solver='exact', seed=None, n_jobs=1, random_state=None, copy=False, **kwargs):
"""
PHATE (Potential of Heat-diffusion for Affinity-based Embedding) dimensionality reduction.
Parameters:
- adata (AnnData): Annotated data object
- n_components (int): Number of dimensions for embedding
- knn (int): Number of nearest neighbors
- decay (int): Alpha decay parameter
- n_landmark (int): Number of landmark points
- t (str or int): Time parameter for diffusion
- gamma (float): Informational distance parameter
- n_pca (int): Number of PCA components for preprocessing
- solver (str): Solver for eigenvalue decomposition
- seed (int, optional): Random seed
- n_jobs (int): Number of parallel jobs
- random_state (int, optional): Random state
- copy (bool): Return copy
- **kwargs: Additional PHATE parameters
Returns:
AnnData or None: Object with PHATE embedding (if copy=True)
"""
def palantir(adata, start_cell=None, num_waypoints=1200, terminal_states=None, copy=False, **kwargs):
"""
Palantir trajectory inference algorithm.
Parameters:
- adata (AnnData): Annotated data object
- start_cell (str, optional): Starting cell for trajectory
- num_waypoints (int): Number of waypoints for trajectory
- terminal_states (list, optional): Terminal cell states
- copy (bool): Return copy
- **kwargs: Additional Palantir parameters
Returns:
AnnData or None: Object with trajectory results (if copy=True)
"""
def palantir_results(adata, early_cell=None, ms_data=None, copy=False):
"""
Process Palantir trajectory inference results.
Parameters:
- adata (AnnData): Annotated data object with Palantir results
- early_cell (str, optional): Early cell identifier
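- ms_data (str, optional): Palantir multiscale data, i.e. the multiscale diffusion-space representation of the cells (typically the `adata.obsm` key that stores it, e.g. 'X_palantir_multiscale')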
- copy (bool): Return copy
Returns:
AnnData or None: Object with processed results (if copy=True)
"""
def phenograph(adata, clustering_algo='leiden', k=30, directed=False, prune=False, min_cluster_size=10, jaccard=True, primary_metric='euclidean', n_jobs=-1, q_tol=1e-3, louvain_time_limit=2000, nn_method='kdtree', copy=False, **kwargs):
"""
PhenoGraph clustering algorithm.
Parameters:
- adata (AnnData): Annotated data object
- clustering_algo (str): Clustering algorithm ('leiden' or 'louvain')
- k (int): Number of nearest neighbors
- directed (bool): Use directed graph
- prune (bool): Prune graph
- min_cluster_size (int): Minimum cluster size
- jaccard (bool): Use Jaccard coefficient
- primary_metric (str): Distance metric
- n_jobs (int): Number of parallel jobs
- q_tol (float): Quality tolerance for clustering
- louvain_time_limit (int): Time limit for Louvain algorithm
- nn_method (str): Nearest neighbor method
- copy (bool): Return copy
- **kwargs: Additional parameters
Returns:
AnnData or None: Object with clustering results (if copy=True)
"""
def trimap(adata, n_inliers=10, n_outliers=5, n_random=5, lr=1000.0, n_iters=400, copy=False, **kwargs):
"""
TriMap dimensionality reduction.
Parameters:
- adata (AnnData): Annotated data object
- n_inliers (int): Number of inlier points
- n_outliers (int): Number of outlier points
- n_random (int): Number of random triplets
- lr (float): Learning rate
- n_iters (int): Number of iterations
- copy (bool): Return copy
- **kwargs: Additional TriMap parameters
Returns:
AnnData or None: Object with TriMap embedding (if copy=True)
"""
def wishbone(adata, start_cell=None, copy=False, **kwargs):
"""
Wishbone trajectory inference algorithm.
Parameters:
- adata (AnnData): Annotated data object
- start_cell (str, optional): Starting cell for trajectory
- copy (bool): Return copy
- **kwargs: Additional Wishbone parameters
Returns:
AnnData or None: Object with trajectory results (if copy=True)
"""
def sam(adata, max_iter=10, num_norm_avg=50, k=20, distance='correlation', copy=False, **kwargs):
"""
SAM (Self-Assembling Manifolds) for iterative clustering.
Parameters:
- adata (AnnData): Annotated data object
- max_iter (int): Maximum number of iterations
- num_norm_avg (int): Number of averages for normalization
- k (int): Number of nearest neighbors
- distance (str): Distance metric
- copy (bool): Return copy
- **kwargs: Additional SAM parameters
Returns:
AnnData or None: Object with SAM results (if copy=True)
"""
def harmony_timeseries(adata_list, tp=None, copy=False, **kwargs):
"""
Harmony integration for time series data.
Parameters:
- adata_list (list): List of AnnData objects from different time points
- tp (list, optional): Time point labels
- copy (bool): Return copy
- **kwargs: Additional Harmony parameters
Returns:
AnnData or None: Integrated dataset (if copy=True)
"""
Specialized tools for cell cycle phase analysis.
def cyclone(adata, species='human', copy=False, **kwargs):
"""
Cyclone cell cycle phase assignment.
Parameters:
- adata (AnnData): Annotated data object
- species (str): Species for marker genes ('human' or 'mouse')
- copy (bool): Return copy
- **kwargs: Additional parameters
Returns:
AnnData or None: Object with cell cycle scores (if copy=True)
"""
def sandbag(adata, fraction=0.5, copy=False, **kwargs):
"""
Sandbag cell cycle gene identification.
Parameters:
- adata (AnnData): Annotated data object
- fraction (float): Fraction threshold for gene selection
- copy (bool): Return copy
- **kwargs: Additional parameters
Returns:
AnnData or None: Object with cell cycle gene markers (if copy=True)
"""
Batch correction and integration methods from external packages.
def bbknn(adata, batch_key='batch', neighbors_within_batch=3, n_pcs=50, trim=None, copy=False, **kwargs):
"""
BBKNN (Batch Balanced k-Nearest Neighbors) batch correction.
Parameters:
- adata (AnnData): Annotated data object
- batch_key (str): Key in obs containing batch information
- neighbors_within_batch (int): Neighbors within each batch
- n_pcs (int): Number of principal components to use
- trim (int, optional): Trim neighbors per batch
- copy (bool): Return copy
- **kwargs: Additional BBKNN parameters
Returns:
AnnData or None: Object with corrected neighborhood graph (if copy=True)
"""
def dca(adata, mode='denoise', ae_type='nb-conddisp', normalize_per_cell=True, scale=True, log1p=True, hidden_size=(64, 32, 64), hidden_dropout=0.0, batchnorm=True, activation='relu', init='glorot_uniform', network_kwds={}, epochs=300, reduce_lr=10, early_stop=15, batch_size=32, optimizer='rmsprop', learning_rate=None, random_state=0, threads=None, verbose=False, training_kwds={}, return_model=False, return_info=False, copy=False):
"""
Deep Count Autoencoder (DCA) for denoising and batch correction.
Parameters:
- adata (AnnData): Annotated data object
- mode (str): Mode of operation ('denoise', 'latent')
- ae_type (str): Autoencoder type
- normalize_per_cell (bool): Normalize per cell
- scale (bool): Scale features
- log1p (bool): Log transform
- hidden_size (tuple): Hidden layer sizes
- hidden_dropout (float): Dropout rate
- batchnorm (bool): Use batch normalization
- activation (str): Activation function
- init (str): Weight initialization
- network_kwds (dict): Additional network parameters
- epochs (int): Number of training epochs
- reduce_lr (int): Learning rate reduction patience
- early_stop (int): Early stopping patience
- batch_size (int): Training batch size
- optimizer (str): Optimizer
- learning_rate (float, optional): Learning rate
- random_state (int): Random seed
- threads (int, optional): Number of threads
- verbose (bool): Verbose output
- training_kwds (dict): Additional training parameters
- return_model (bool): Return trained model
- return_info (bool): Return training information
- copy (bool): Return copy
Returns:
AnnData or tuple: Denoised data and optionally model/info
"""
def harmony_integrate(adata, key, basis='X_pca', adjusted_basis='X_pca_harmony', copy=False, **kwargs):
"""
Harmony batch integration.
Parameters:
- adata (AnnData): Annotated data object
- key (str): Key in obs for batch variable
- basis (str): Basis to integrate
- adjusted_basis (str): Key for integrated embedding
- copy (bool): Return copy
- **kwargs: Additional Harmony parameters
Returns:
AnnData or None: Object with integrated embedding (if copy=True)
"""
def hashsolo(adata, priors=[0.01, 0.8, 0.19], pre_existing_clusters=None, number_of_noise_barcodes=None, copy=False, **kwargs):
"""
HashSolo for demultiplexing cell hashing data and doublet detection.
Parameters:
- adata (AnnData): Annotated data object with hashtag data
- priors (list): Prior probabilities of each hypothesis, in the order [negative, singlet, doublet]
- pre_existing_clusters (str, optional): Key for existing clusters
- number_of_noise_barcodes (int, optional): Number of noise barcodes
- copy (bool): Return copy
- **kwargs: Additional HashSolo parameters
Returns:
AnnData or None: Object with demultiplexing results (if copy=True)
"""
def magic(adata, name_list=None, knn=10, decay=1, knn_max=None, t=3, n_pca=20, solver='exact', knn_dist='euclidean', random_state=None, n_jobs=None, copy=False, **kwargs):
"""
MAGIC (Markov Affinity-based Graph Imputation of Cells) imputation.
Parameters:
- adata (AnnData): Annotated data object
- name_list (list, optional): Genes to impute (None for all)
- knn (int): Number of nearest neighbors
- decay (int): Alpha decay parameter
- knn_max (int, optional): Maximum number of neighbors
- t (int): Number of diffusion steps
- n_pca (int): Number of PCA components
- solver (str): Solver for eigenvalue decomposition
- knn_dist (str): Distance metric for KNN
- random_state (int, optional): Random seed
- n_jobs (int, optional): Number of parallel jobs
- copy (bool): Return copy
- **kwargs: Additional MAGIC parameters
Returns:
AnnData or None: Object with imputed data (if copy=True)
"""
def mnn_correct(adata_list, var_subset=None, batch_key='batch', index_unique='-', batch_categories=None, k=20, sigma=0.1, cos_norm_in=True, cos_norm_out=True, svd_dim=0, var_adj=True, compute_angle=False, mnn_order=None, svd_mode='rsvd', do_concatenate=True, save_raw=False, n_jobs=None, **kwargs):
"""
MNN (Mutual Nearest Neighbors) batch correction.
Parameters:
- adata_list (list): List of AnnData objects to correct
- var_subset (list, optional): Subset of variables for correction
- batch_key (str): Key for batch information
- index_unique (str): Separator for making indices unique
- batch_categories (list, optional): Batch category order
- k (int): Number of nearest neighbors
- sigma (float): Gaussian smoothing parameter
- cos_norm_in (bool): Cosine normalization before correction
- cos_norm_out (bool): Cosine normalization after correction
- svd_dim (int): Number of SVD dimensions (0 for no SVD)
- var_adj (bool): Adjust variance
- compute_angle (bool): Compute angle between batches
- mnn_order (list, optional): Order for MNN correction
- svd_mode (str): SVD computation mode
- do_concatenate (bool): Concatenate results
- save_raw (bool): Save uncorrected data
- n_jobs (int, optional): Number of parallel jobs
- **kwargs: Additional parameters
Returns:
AnnData or list: Corrected data
"""
def scanorama_integrate(adata_list, key=None, basis='X_pca', adjusted_basis='X_scanorama', copy=False, **kwargs):
"""
Scanorama integration for batch correction.
Parameters:
- adata_list (list): List of AnnData objects to integrate
- key (str, optional): Key for batch information
- basis (str): Basis for integration
- adjusted_basis (str): Key for integrated embedding
- copy (bool): Return copy
- **kwargs: Additional Scanorama parameters
Returns:
AnnData or list: Integrated datasets
"""
Export scanpy results to other software platforms.
def cellbrowser(adata, outdir, name, **kwargs):
"""
Export to UCSC Cell Browser format.
Parameters:
- adata (AnnData): Annotated data object
- outdir (str): Output directory
- name (str): Dataset name
- **kwargs: Additional export parameters
Returns:
None: Creates Cell Browser files
"""
def spring_project(adata, project_dir, **kwargs):
"""
Export to SPRING visualization tool.
Parameters:
- adata (AnnData): Annotated data object
- project_dir (str): Project directory
- **kwargs: Additional export parameters
Returns:
None: Creates SPRING project files
"""
import matplotlib.pyplot as plt
import scanpy as sc
# PHATE embedding
sc.external.tl.phate(adata, n_components=2, knn=15, t=20)
sc.pl.embedding(adata, basis='X_phate', color='leiden')
# Compare with UMAP
fig, axes = plt.subplots(1, 2, figsize=(10, 4))
sc.pl.umap(adata, color='leiden', ax=axes[0], show=False, frameon=False)
sc.pl.embedding(adata, basis='X_phate', color='leiden', ax=axes[1], show=False, frameon=False)
axes[0].set_title('UMAP')
axes[1].set_title('PHATE')
plt.show()
# Set up for Palantir
sc.external.tl.palantir(adata, start_cell='ATGCCAGAACGACT-1')
# Plot pseudotime and entropy
sc.pl.umap(adata, color=['palantir_pseudotime', 'palantir_entropy'])
# Plot differentiation potential
sc.pl.umap(adata, color='palantir_diff_potential')
# Harmony integration
sc.external.pp.harmony_integrate(adata, 'batch')
# Compare before and after
sc.pl.umap(adata, color='batch', title='Before Harmony')
sc.pl.embedding(adata, basis='X_pca_harmony', color='batch', title='After Harmony')
# Recompute neighbors on integrated data
sc.pp.neighbors(adata, use_rep='X_pca_harmony')
sc.tl.umap(adata)
# MAGIC imputation for specific genes
genes_to_impute = ['CD34', 'GATA1', 'GATA2']
sc.external.pp.magic(adata, name_list=genes_to_impute, t=3)
# Compare before and after imputation
sc.pl.violin(adata, genes_to_impute, groupby='leiden',
use_raw=True, title='Before MAGIC')
sc.pl.violin(adata, genes_to_impute, groupby='leiden',
layer='MAGIC_imputed', title='After MAGIC')
# Cell cycle scoring with Cyclone
sc.external.tl.cyclone(adata, species='human')
# Plot cell cycle phases
sc.pl.umap(adata, color=['cyclone_G1', 'cyclone_S', 'cyclone_G2M'])
# Custom marker identification with Sandbag
sc.external.tl.sandbag(adata)
# PhenoGraph clustering
sc.external.tl.phenograph(adata, k=30, clustering_algo='leiden')
# Compare with Leiden
sc.pl.umap(adata, color=['leiden', 'phenograph'], ncols=2)
# BBKNN for batch-balanced neighbors
sc.external.pp.bbknn(adata, batch_key='batch', n_pcs=50)
# Recompute UMAP with corrected neighbors
sc.tl.umap(adata)
sc.pl.umap(adata, color='batch')
# Export to UCSC Cell Browser
sc.external.exporting.cellbrowser(
adata,
outdir='cellbrowser_output',
name='my_dataset'
)
# Export to SPRING
sc.external.exporting.spring_project(
adata,
project_dir='spring_output'
)
Many external tools require additional dependencies:
# For PHATE
pip install phate
# For Palantir
pip install palantir
# For Harmony
pip install harmonypy
# For MAGIC
pip install magic-impute
# For BBKNN
pip install bbknn
# For DCA
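pip install dca
Several tools (e.g. `phate`, `phenograph`, `magic`, `mnn_correct`) accept an `n_jobs` parameter to control the number of parallel jobs.
Install with Tessl CLI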
npx tessl i tessl/pypi-scanpy