Comprehensive toolkit for analyzing single-cell gene expression data with scalable Python implementation supporting preprocessing, visualization, clustering, trajectory inference, and differential expression testing.
—
Scanpy provides various utility functions, configuration options, and helper tools for managing analysis workflows, extracting data, and configuring the analysis environment.
Configure scanpy's behavior and matplotlib plotting parameters.
# Global settings object
settings: ScanpyConfig
class ScanpyConfig:
"""Global scanpy configuration object."""
# Core settings
verbosity: int = 1 # Logging verbosity level (0-5)
n_jobs: int = 1 # Number of parallel jobs (-1 for all cores)
# Data settings
max_memory: str = '2G' # Maximum memory for operations
n_pcs: int = 50 # Default number of PCs
# Figure settings
figdir: str = './figures/' # Default figure output directory
file_format_figs: str = 'pdf' # Default figure format
dpi: int = 80 # Default DPI for figures
dpi_save: int = 150 # DPI for saved figures
transparent: bool = False # Transparent backgrounds
# Cache settings
cache_compression: str = 'lzf' # Compression for cached files
def set_figure_params(self, dpi=80, dpi_save=150, transparent=False, fontsize=14, color_map='viridis', format='pdf', facecolor='white', **kwargs):
"""
Set matplotlib figure parameters.
Parameters:
- dpi (int): Resolution for display
- dpi_save (int): Resolution for saved figures
- transparent (bool): Transparent background
- fontsize (int): Base font size
- color_map (str): Default colormap
- format (str): Default save format
- facecolor (str): Figure background color
- **kwargs: Additional matplotlib rcParams
"""Extract and manipulate data from AnnData objects.
def obs_df(adata, keys: list | None = None, obsm_keys: list | None = None, layer: str | None = None, gene_symbols: str | None = None, use_raw: bool = False):
    """
    Extract observation metadata as pandas DataFrame.

    Parameters:
    - adata (AnnData): Annotated data object
    - keys (list, optional): Keys from obs to include
    - obsm_keys (list, optional): Keys from obsm to include
    - layer (str, optional): Layer to extract data from
    - gene_symbols (str, optional): Gene symbols key
    - use_raw (bool): Use raw data

    Returns:
    DataFrame: Observation data with requested keys
    """
def var_df(adata, keys: list | None = None, varm_keys: list | None = None, layer: str | None = None):
    """
    Extract variable metadata as pandas DataFrame.

    Parameters:
    - adata (AnnData): Annotated data object
    - keys (list, optional): Keys from var to include
    - varm_keys (list, optional): Keys from varm to include
    - layer (str, optional): Layer to extract data from

    Returns:
    DataFrame: Variable data with requested keys
    """
def rank_genes_groups_df(adata, group: str | None = None, key: str = 'rank_genes_groups', pval_cutoff: float | None = None, log2fc_min: float | None = None, log2fc_max: float | None = None, gene_symbols: str | None = None):
    """
    Extract ranked genes results as pandas DataFrame.

    Parameters:
    - adata (AnnData): Annotated data object
    - group (str, optional): Specific group to extract
    - key (str): Key for ranked genes results
    - pval_cutoff (float, optional): P-value cutoff
    - log2fc_min (float, optional): Minimum log2 fold change
    - log2fc_max (float, optional): Maximum log2 fold change
    - gene_symbols (str, optional): Gene symbols key

    Returns:
    DataFrame: Ranked genes with statistics
    """
def aggregate(adata, by, func='mean', layer=None, obsm=None, varm=None):
"""
Aggregate observations by grouping variable.
Parameters:
- adata (AnnData): Annotated data object
- by (str): Key in obs for grouping
- func (str or callable): Aggregation function
- layer (str, optional): Layer to aggregate
- obsm (str, optional): Obsm key to aggregate
- varm (str, optional): Varm key to aggregate
Returns:
AnnData: Aggregated data object
"""Low-level utilities for accessing AnnData representations.
def _get_obs_rep(adata, use_rep: str | None = None, n_pcs: int | None = None, use_raw: bool = False, layer: str | None = None, obsm: str | None = None, obsp: str | None = None):
    """
    Get observation representation for analysis.

    Parameters:
    - adata (AnnData): Annotated data object
    - use_rep (str, optional): Representation key in obsm
    - n_pcs (int, optional): Number of PCs if using PCA
    - use_raw (bool): Use raw data
    - layer (str, optional): Layer to use
    - obsm (str, optional): Obsm key
    - obsp (str, optional): Obsp key

    Returns:
    array: Data representation
    """
def _set_obs_rep(adata, X_new, use_rep: str | None = None, n_pcs: int | None = None, layer: str | None = None, obsm: str | None = None):
    """
    Set observation representation in AnnData.

    Parameters:
    - adata (AnnData): Annotated data object
    - X_new (array): New data representation
    - use_rep (str, optional): Representation key
    - n_pcs (int, optional): Number of PCs
    - layer (str, optional): Layer key
    - obsm (str, optional): Obsm key
    """
def _check_mask(adata, mask_var, mask_obs=None):
"""
Validate and process mask for subsetting.
Parameters:
- adata (AnnData): Annotated data object
- mask_var (array or str): Variable mask
- mask_obs (array or str, optional): Observation mask
Returns:
tuple: Processed masks
"""Control logging output and verbosity levels.
def print_versions():
"""
Print version information for scanpy and dependencies.
Returns:
None: Prints version information to stdout
"""
# Logging levels
CRITICAL: int = 50
ERROR: int = 40
WARNING: int = 30
INFO: int = 20
DEBUG: int = 10
HINT: int = 15 # Custom level between INFO and DEBUG
# Verbosity levels
class Verbosity:
"""Verbosity level enumeration."""
error: int = 0
warn: int = 1
info: int = 2
hint: int = 3
debug: int = 4
trace: int = 5
Tools for managing memory usage and performance.
def memory_usage():
"""
Get current memory usage.
Returns:
str: Memory usage information
"""
def check_versions():
"""
Check versions of key dependencies.
Returns:
None: Prints warnings for version issues
"""Utilities for working with files and paths.
def _check_datasetdir_exists():
"""Check if dataset directory exists."""
def _get_filename_from_key(key):
"""Generate filename from key."""
def _doc_params(**kwds):
"""Decorator for parameter documentation."""Configure matplotlib and plotting behavior.
def set_figure_params(scanpy: bool = True, dpi: int = 80, dpi_save: int = 150, transparent: bool = False, fontsize: int = 14, color_map: str = 'viridis', format: str = 'pdf', facecolor: str = 'white', **kwargs):
    """
    Set global figure parameters for matplotlib.

    Parameters:
    - scanpy (bool): Use scanpy-specific settings
    - dpi (int): Display resolution
    - dpi_save (int): Save resolution
    - transparent (bool): Transparent background
    - fontsize (int): Base font size
    - color_map (str): Default colormap
    - format (str): Default save format
    - facecolor (str): Figure background color
    - **kwargs: Additional rcParams
    """
def reset_rcParams():
"""Reset matplotlib rcParams to defaults."""Important constants used throughout scanpy.
# Default number of PCs
N_PCS: int = 50
# Default number of diffusion components
N_DCS: int = 15
# File format constants
FIGDIR_DEFAULT: str = './figures/'
FORMAT_DEFAULT: str = 'pdf'
# Cache settings
CACHE_DEFAULT: str = './cache/'
import scanpy as sc
# Set verbosity level
sc.settings.verbosity = 3 # hint level
# Configure parallel processing
sc.settings.n_jobs = -1 # use all available cores
# Set figure parameters
sc.settings.set_figure_params(
dpi=100,
dpi_save=300,
fontsize=12,
color_map='plasma',
format='png',
transparent=True
)
# Set output directory
sc.settings.figdir = './my_figures/'
# Check current settings
print(f"Verbosity: {sc.settings.verbosity}")
print(f"N jobs: {sc.settings.n_jobs}")
print(f"Figure dir: {sc.settings.figdir}")# Extract observation data with specific columns
obs_data = sc.get.obs_df(adata, keys=['total_counts', 'n_genes', 'leiden'])
print(obs_data.head())
# Get ranked genes as DataFrame
marker_genes = sc.get.rank_genes_groups_df(adata, group='0')
top_genes = marker_genes.head(20)
# Extract variable information
var_data = sc.get.var_df(adata, keys=['highly_variable', 'dispersions'])
# Aggregate data by clusters
adata_agg = sc.get.aggregate(adata, by='leiden', func='mean')
print(f"Aggregated to {adata_agg.n_obs} pseudo-bulk samples")# Get PCA representation
X_pca = sc.get._get_obs_rep(adata, use_rep='X_pca', n_pcs=30)
print(f"PCA shape: {X_pca.shape}")
# Get UMAP representation
X_umap = sc.get._get_obs_rep(adata, use_rep='X_umap')
print(f"UMAP shape: {X_umap.shape}")
# Get raw data representation
X_raw = sc.get._get_obs_rep(adata, use_raw=True)
print(f"Raw data shape: {X_raw.shape}")# Print comprehensive version information
sc.logging.print_versions()
# Check for version compatibility issues
sc._utils.check_versions()
# Print memory usage
print(f"Current memory usage: {sc._utils.memory_usage()}")# Custom matplotlib configuration
sc.set_figure_params(fontsize=10, color_map='viridis')
# Reset to defaults
sc.pl.reset_rcParams()
# Fine-grained matplotlib control
import matplotlib.pyplot as plt
plt.rcParams['figure.figsize'] = (8, 6)
plt.rcParams['axes.grid'] = True
plt.rcParams['grid.alpha'] = 0.3
# Apply custom color palette
import seaborn as sns
custom_palette = sns.color_palette("husl", 8)
sc.pl.palettes.default_20 = custom_palette
# Configure for large datasets
sc.settings.max_memory = '16G' # Set memory limit
sc.settings.n_jobs = 8 # Limit parallel jobs
sc.settings.verbosity = 1 # Reduce logging overhead
# Enable caching for repeated operations
sc.settings.cachedir = '/tmp/scanpy_cache/'
# Use chunked operations for large matrices
sc.pp.scale(adata, chunked=True, chunk_size=1000)
def run_standard_analysis(adata, resolution=0.5, n_pcs=50):
"""Custom analysis function using scanpy utilities."""
# Configure for this analysis
original_verbosity = sc.settings.verbosity
sc.settings.verbosity = 2
try:
# Preprocessing
sc.pp.filter_cells(adata, min_genes=200)
sc.pp.filter_genes(adata, min_cells=3)
sc.pp.normalize_total(adata, target_sum=1e4)
sc.pp.log1p(adata)
# Analysis
sc.pp.highly_variable_genes(adata)
adata.raw = adata
adata = adata[:, adata.var.highly_variable]
sc.pp.scale(adata)
sc.pp.pca(adata, n_comps=n_pcs)
sc.pp.neighbors(adata)
sc.tl.umap(adata)
sc.tl.leiden(adata, resolution=resolution)
# Extract results
results = {
'clusters': sc.get.obs_df(adata, keys=['leiden']),
'embedding': sc.get._get_obs_rep(adata, use_rep='X_umap'),
'n_clusters': len(adata.obs['leiden'].unique())
}
return adata, results
finally:
# Restore original settings
sc.settings.verbosity = original_verbosity
# Run analysis
adata_processed, analysis_results = run_standard_analysis(adata)
print(f"Found {analysis_results['n_clusters']} clusters")# Enable debug logging
sc.settings.verbosity = 4 # debug level
# Check data integrity
def check_adata_integrity(adata):
    """Report basic diagnostics and common problems for an AnnData object.

    Prints the matrix shape, dtype, and sparsity, counts NaN and negative
    entries, and warns when observation or variable names are duplicated.
    """
    X = adata.X
    is_sparse = scipy.sparse.issparse(X)
    # Sparse matrices expose their stored entries via `.data`;
    # dense arrays are scanned directly.
    values = X.data if is_sparse else X
    print(f"Shape: {adata.shape}")
    print(f"Data type: {X.dtype}")
    print(f"Sparse: {is_sparse}")
    print(f"NaN values: {np.isnan(values).sum()}")
    print(f"Negative values: {(values < 0).sum()}")
    # Duplicated index labels break lookups and plotting downstream.
    for index, label in ((adata.obs.index, "observation"), (adata.var.index, "variable")):
        if index.duplicated().any():
            print(f"WARNING: Duplicate {label} names found")
check_adata_integrity(adata)
# Memory profiling for large operations
import time
start_time = time.time()
start_memory = sc._utils.memory_usage()
# Your analysis here
sc.pp.neighbors(adata, n_neighbors=15)
end_time = time.time()
end_memory = sc._utils.memory_usage()
print(f"Operation took {end_time - start_time:.2f} seconds")
print(f"Memory before: {start_memory}")
print(f"Memory after: {end_memory}")# Create configuration file (~/.scanpy/config.yaml)
import os
import yaml
config_dir = os.path.expanduser('~/.scanpy')
os.makedirs(config_dir, exist_ok=True)
config = {
'verbosity': 2,
'n_jobs': -1,
'figdir': './figures/',
'file_format_figs': 'pdf',
'dpi_save': 300,
'transparent': True
}
with open(os.path.join(config_dir, 'config.yaml'), 'w') as f:
yaml.dump(config, f)
Tip: adjust n_jobs and max_memory based on available system resources.
Install with Tessl CLI

npx tessl i tessl/pypi-scanpy