Comprehensive toolkit for analyzing single-cell gene expression data with scalable Python implementation supporting preprocessing, visualization, clustering, trajectory inference, and differential expression testing.
—
Scanpy's tools module provides advanced analysis methods for single-cell data including dimensionality reduction, clustering, trajectory inference, differential expression testing, and specialized single-cell analysis algorithms.
Generate low-dimensional representations of high-dimensional single-cell data.
def tsne(adata, n_pcs=None, use_rep=None, perplexity=30, early_exaggeration=12, learning_rate=1000, random_state=0, use_fast_tsne=True, n_jobs=None, copy=False):
"""
t-distributed stochastic neighbor embedding (t-SNE).
Parameters:
- adata (AnnData): Annotated data object
- n_pcs (int, optional): Number of PCs to use
- use_rep (str, optional): Representation to use
- perplexity (float): t-SNE perplexity parameter
- early_exaggeration (float): Early exaggeration parameter
- learning_rate (float): Learning rate
- random_state (int): Random seed
- use_fast_tsne (bool): Use fast t-SNE implementation
- n_jobs (int, optional): Number of parallel jobs
- copy (bool): Return copy
Returns:
AnnData or None: Object with t-SNE coordinates (if copy=True)
"""
def umap(adata, min_dist=0.5, spread=1.0, n_components=2, maxiter=None, alpha=1.0, gamma=1.0, negative_sample_rate=5, init_pos='spectral', random_state=0, a=None, b=None, copy=False, method='umap', neighbors_key=None):
"""
Uniform Manifold Approximation and Projection (UMAP) embedding.
Parameters:
- adata (AnnData): Annotated data object
- min_dist (float): Minimum distance between embedded points
- spread (float): Effective scale of embedded points
- n_components (int): Number of dimensions for embedding
- maxiter (int, optional): Maximum number of iterations
- alpha (float): Learning rate
- gamma (float): Repulsive strength
- negative_sample_rate (int): Number of negative samples
- init_pos (str): Initialization method
- random_state (int): Random seed
- a (float, optional): Curve parameter
- b (float, optional): Curve parameter
- copy (bool): Return copy
- method (str): UMAP method to use
- neighbors_key (str, optional): Key for neighbors data
Returns:
AnnData or None: Object with UMAP coordinates (if copy=True)
"""
def diffmap(adata, n_comps=15, neighbors_key=None, random_state=0, copy=False):
"""
Diffusion map embedding.
Parameters:
- adata (AnnData): Annotated data object
- n_comps (int): Number of diffusion components
- neighbors_key (str, optional): Key for neighbors data
- random_state (int): Random seed
- copy (bool): Return copy
Returns:
AnnData or None: Object with diffusion map (if copy=True)
"""
def draw_graph(adata, layout='fa', random_state=0, root=None, neighbors_key=None, copy=False, **kwds):
"""
Force-directed graph drawing.
Parameters:
- adata (AnnData): Annotated data object
- layout (str): Layout algorithm ('fa', 'fr', etc.)
- random_state (int): Random seed
- root (int, optional): Root node for certain layouts
- neighbors_key (str, optional): Key for neighbors data
- copy (bool): Return copy
- **kwds: Additional layout parameters
Returns:
AnnData or None: Object with graph layout (if copy=True)
"""

Identify groups of similar cells using various clustering algorithms.
def leiden(adata, resolution=1, restrict_to=None, random_state=0, key_added='leiden', adjacency=None, directed=True, use_weights=True, n_iterations=-1, partition_type=None, neighbors_key=None, obsp=None, copy=False):
"""
Leiden clustering algorithm.
Parameters:
- adata (AnnData): Annotated data object
- resolution (float): Resolution parameter for clustering
- restrict_to (tuple, optional): Restrict clustering to subset
- random_state (int): Random seed
- key_added (str): Key for storing cluster labels
- adjacency (array, optional): Adjacency matrix
- directed (bool): Use directed graph
- use_weights (bool): Use edge weights
- n_iterations (int): Number of iterations (-1 for convergence)
- partition_type (object, optional): Partition type
- neighbors_key (str, optional): Key for neighbors data
- obsp (str, optional): Key in obsp for adjacency
- copy (bool): Return copy
Returns:
AnnData or None: Object with cluster labels (if copy=True)
"""
def louvain(adata, resolution=1, random_state=0, restrict_to=None, key_added='louvain', adjacency=None, flavor='vtraag', directed=True, use_weights=False, partition_type=None, neighbors_key=None, obsp=None, copy=False):
"""
Louvain clustering algorithm.
Parameters:
- adata (AnnData): Annotated data object
- resolution (float): Resolution parameter for clustering
- random_state (int): Random seed
- restrict_to (tuple, optional): Restrict clustering to subset
- key_added (str): Key for storing cluster labels
- adjacency (array, optional): Adjacency matrix
- flavor (str): Implementation flavor
- directed (bool): Use directed graph
- use_weights (bool): Use edge weights
- partition_type (object, optional): Partition type
- neighbors_key (str, optional): Key for neighbors data
- obsp (str, optional): Key in obsp for adjacency
- copy (bool): Return copy
Returns:
AnnData or None: Object with cluster labels (if copy=True)
"""

Analyze developmental trajectories and compute pseudotime.
def dpt(adata, n_dcs=10, n_branchings=0, min_group_size=0.01, allow_kendall_tau_shift=True, neighbors_key=None, copy=False):
"""
Diffusion pseudotime analysis.
Parameters:
- adata (AnnData): Annotated data object
- n_dcs (int): Number of diffusion components
- n_branchings (int): Number of branchings to detect
- min_group_size (float): Minimum group size for branching
- allow_kendall_tau_shift (bool): Allow Kendall tau shift
- neighbors_key (str, optional): Key for neighbors data
- copy (bool): Return copy
Returns:
AnnData or None: Object with pseudotime (if copy=True)
"""
def paga(adata, groups=None, use_rna_velocity=False, model='v1.2', neighbors_key=None, copy=False):
"""
Partition-based graph abstraction (PAGA).
Parameters:
- adata (AnnData): Annotated data object
- groups (str, optional): Key for grouping observations
- use_rna_velocity (bool): Use RNA velocity information
- model (str): PAGA model version
- neighbors_key (str, optional): Key for neighbors data
- copy (bool): Return copy
Returns:
AnnData or None: Object with PAGA results (if copy=True)
"""
def paga_degrees(adata, groups=None):
"""
Calculate node degrees in PAGA graph.
Parameters:
- adata (AnnData): Annotated data object
- groups (str, optional): Key for grouping observations
Returns:
array: Node degrees
"""
def paga_expression_entropies(adata, groups=None):
"""
Calculate expression entropies for PAGA nodes.
Parameters:
- adata (AnnData): Annotated data object
- groups (str, optional): Key for grouping observations
Returns:
array: Expression entropies
"""
def paga_compare_paths(adata1, adata2, adjacency_key='paga_adjacency', adjacency_key2=None, embeddings_key='X_umap', embedding_key2=None, annotation_key=None, annotation_key2=None):
"""
Compare PAGA paths between datasets.
Parameters:
- adata1 (AnnData): First dataset
- adata2 (AnnData): Second dataset
- adjacency_key (str): Key for adjacency matrix in first dataset
- adjacency_key2 (str, optional): Key for adjacency matrix in second dataset
- embeddings_key (str): Key for embeddings in first dataset
- embedding_key2 (str, optional): Key for embeddings in second dataset
- annotation_key (str, optional): Key for annotations in first dataset
- annotation_key2 (str, optional): Key for annotations in second dataset
Returns:
dict: Path comparison results
"""

Identify genes that are differentially expressed between groups.
def rank_genes_groups(adata, groupby, use_raw=None, groups='all', reference='rest', n_genes=None, rankby_abs=False, pts=False, key_added=None, copy=False, method='wilcoxon', corr_method='benjamini-hochberg', tie_correct=False, layer=None, **kwds):
"""
Rank genes for characterizing groups.
Parameters:
- adata (AnnData): Annotated data object
- groupby (str): Key in obs for grouping cells
- use_raw (bool, optional): Use raw data
- groups (str or list): Groups to compare
- reference (str): Reference group for comparison
- n_genes (int, optional): Number of genes to return per group
- rankby_abs (bool): Rank by absolute values
- pts (bool): Calculate percentage of cells expressing gene
- key_added (str, optional): Key for storing results
- copy (bool): Return copy
- method (str): Statistical test method
- corr_method (str): Multiple testing correction method
- tie_correct (bool): Apply tie correction
- layer (str, optional): Layer to use
- **kwds: Additional method-specific parameters
Returns:
AnnData or None: Object with ranking results (if copy=True)
"""
def filter_rank_genes_groups(adata, key='rank_genes_groups', groupby=None, use_raw=None, log=True, key_added='rank_genes_groups_filtered', min_in_group_fraction=0.25, min_fold_change=2, max_out_group_fraction=0.5):
"""
Filter ranked genes based on fold change and expression criteria.
Parameters:
- adata (AnnData): Annotated data object
- key (str): Key for ranked genes results
- groupby (str, optional): Key for grouping
- use_raw (bool, optional): Use raw data
- log (bool): Data is log-transformed
- key_added (str): Key for filtered results
- min_in_group_fraction (float): Minimum fraction expressing in group
- min_fold_change (float): Minimum fold change
- max_out_group_fraction (float): Maximum fraction expressing out of group
Returns:
dict: Filtered gene rankings
"""

Score cells based on gene set expression.
def score_genes(adata, gene_list, ctrl_size=50, gene_pool=None, n_bins=25, score_name='score', random_state=0, copy=False, use_raw=None):
"""
Score a set of genes.
Parameters:
- adata (AnnData): Annotated data object
- gene_list (list): List of gene names to score
- ctrl_size (int): Number of control genes per test gene
- gene_pool (list, optional): Pool of genes for controls
- n_bins (int): Number of expression bins
- score_name (str): Name for score in obs
- random_state (int): Random seed
- copy (bool): Return copy
- use_raw (bool, optional): Use raw data
Returns:
AnnData or None: Object with gene scores (if copy=True)
"""
def score_genes_cell_cycle(adata, s_genes, g2m_genes, copy=False, **kwargs):
"""
Score cell cycle phase based on marker genes.
Parameters:
- adata (AnnData): Annotated data object
- s_genes (list): S phase marker genes
- g2m_genes (list): G2/M phase marker genes
- copy (bool): Return copy
- **kwargs: Additional parameters for score_genes
Returns:
AnnData or None: Object with cell cycle scores (if copy=True)
"""

Additional analysis tools for specific use cases.
def dendrogram(adata, groupby, n_pcs=None, use_rep=None, var_names=None, use_raw=None, cor_method='pearson', linkage_method='complete', optimal_ordering=False, key_added=None, inplace=True):
"""
Compute hierarchical clustering dendrogram.
Parameters:
- adata (AnnData): Annotated data object
- groupby (str): Key for grouping observations
- n_pcs (int, optional): Number of PCs to use
- use_rep (str, optional): Representation to use
- var_names (list, optional): Variable names to use
- use_raw (bool, optional): Use raw data
- cor_method (str): Correlation method
- linkage_method (str): Linkage method for clustering
- optimal_ordering (bool): Compute optimal leaf ordering
- key_added (str, optional): Key for storing results
- inplace (bool): Store results in adata
Returns:
dict or None: Dendrogram results (if not inplace)
"""
def embedding_density(adata, basis='umap', groupby=None, key_added=None):
"""
Calculate density of cells in embedding space.
Parameters:
- adata (AnnData): Annotated data object
- basis (str): Embedding basis to use
- groupby (str, optional): Key for grouping
- key_added (str, optional): Key for storing density
Returns:
None: Modifies adata in place
"""
def marker_gene_overlap(adata, reference_markers, key='rank_genes_groups', normalize='reference', top_n_markers=None, adj_pval_threshold=None, key_added='marker_gene_overlap'):
"""
Calculate overlap between marker genes and reference.
Parameters:
- adata (AnnData): Annotated data object
- reference_markers (dict): Reference marker genes
- key (str): Key for marker gene results
- normalize (str): Normalization method
- top_n_markers (int, optional): Number of top markers to consider
- adj_pval_threshold (float, optional): Adjusted p-value threshold
- key_added (str): Key for storing overlap results
Returns:
None: Modifies adata in place
"""
def ingest(adata, adata_ref, obs=None, embedding_method='umap', labeling_method='knn', neighbors_key=None, inplace=True, **kwargs):
"""
Map new data to reference using ingest method.
Parameters:
- adata (AnnData): Query data to map
- adata_ref (AnnData): Reference data
- obs (list, optional): Observations to map
- embedding_method (str): Method for embedding mapping
- labeling_method (str): Method for label transfer
- neighbors_key (str, optional): Key for neighbors data
- inplace (bool): Modify adata in place
- **kwargs: Additional parameters
Returns:
AnnData or None: Mapped data (if not inplace)
"""
def sim(adata, tmax=None, n_obs=None, copy=False, **kwargs):
"""
Simulate single-cell data.
Parameters:
- adata (AnnData): Template data object
- tmax (float, optional): Maximum time for simulation
- n_obs (int, optional): Number of observations to simulate
- copy (bool): Return copy
- **kwargs: Additional simulation parameters
Returns:
AnnData or None: Simulated data (if copy=True)
"""

class Ingest:
"""Class for mapping query data to reference."""
def __init__(self, adata_ref, **kwargs):
"""
Initialize Ingest object.
Parameters:
- adata_ref (AnnData): Reference dataset
- **kwargs: Additional parameters
"""
def fit(self, **kwargs):
"""Fit the ingest model."""
def map_embedding(self, adata, **kwargs):
"""Map query data to reference embedding."""

import scanpy as sc
# Assume data is preprocessed with neighbors computed
# Leiden clustering
sc.tl.leiden(adata, resolution=0.5)
# UMAP embedding
sc.tl.umap(adata)
# Find marker genes
sc.tl.rank_genes_groups(adata, 'leiden', method='wilcoxon')
# Plot results
sc.pl.umap(adata, color=['leiden'])
sc.pl.rank_genes_groups(adata, n_genes=5, sharey=False)

# Diffusion pseudotime analysis
import numpy as np  # required for np.flatnonzero below
adata.uns['iroot'] = np.flatnonzero(adata.obs['leiden'] == '2')[0] # set root: first cell in cluster '2'
sc.tl.dpt(adata)
# PAGA analysis
sc.tl.paga(adata, groups='leiden')
sc.pl.paga(adata, plot=False)
# Plot trajectory
sc.pl.umap(adata, color=['leiden', 'dpt_pseudotime'])

# Compare specific groups
sc.tl.rank_genes_groups(adata, 'leiden', groups=['2'], reference='1', method='wilcoxon')
# Filter results
sc.tl.filter_rank_genes_groups(adata, min_fold_change=2)
# Get results as dataframe
result = sc.get.rank_genes_groups_df(adata, group='2')

Install with Tessl CLI:
npx tessl i tessl/pypi-scanpy