CtrlK
Blog · Docs · Log in · Get started
Tessl Logo

tessl/pypi-scanpy

Comprehensive toolkit for analyzing single-cell gene expression data with scalable Python implementation supporting preprocessing, visualization, clustering, trajectory inference, and differential expression testing.

Pending
Overview
Eval results
Files

docs/preprocessing.md

Preprocessing

Scanpy's preprocessing module provides a comprehensive pipeline for preparing raw single-cell data for downstream analysis. This includes quality control, filtering, normalization, scaling, feature selection, and dimensionality reduction.

Capabilities

Quality Control and Filtering

Calculate quality control metrics and filter cells and genes based on various criteria.

def calculate_qc_metrics(adata, expr_type='counts', var_type='genes', qc_vars=None, percent_top=None, log1p=False, inplace=False):
    """
    Calculate quality control metrics for cells and genes.

    Adds per-cell metrics (e.g. total counts, number of genes detected) and
    per-gene metrics (e.g. number of cells expressing the gene) named after
    expr_type / var_type.

    Parameters:
    - adata (AnnData): Annotated data object
    - expr_type (str): Name for expression type used in metric column names
    - var_type (str): Name for variable type used in metric column names
    - qc_vars (list, optional): Keys of boolean columns in .var marking gene
      sets (e.g. 'mt') for which percentage metrics such as pct_counts_mt
      are computed
    - percent_top (list, optional): Ranks at which to report the cumulative
      percentage of counts in the top expressed genes
    - log1p (bool): Also compute log1p-transformed versions of the metrics
    - inplace (bool): Write metrics to adata.obs/adata.var instead of returning

    Returns:
    None or dict: QC metrics (if not inplace)

    NOTE(review): upstream scanpy defaults are percent_top=(50, 100, 200, 500)
    and log1p=True, and the non-inplace return is a pair of DataFrames —
    confirm the divergence in this spec is intentional.
    """

def filter_cells(adata, min_counts=None, min_genes=None, max_counts=None, max_genes=None, inplace=True, copy=False):
    """
    Filter cells based on counts and numbers of genes expressed.

    Parameters:
    - adata (AnnData): Annotated data object
    - min_counts (int, optional): Minimum number of counts required per cell
    - min_genes (int, optional): Minimum number of genes detected per cell
    - max_counts (int, optional): Maximum number of counts allowed per cell
    - max_genes (int, optional): Maximum number of genes allowed per cell
    - inplace (bool): Modify adata in place
    - copy (bool): Return copy instead of modifying

    Returns:
    AnnData or None: Filtered object (if copy=True)

    NOTE(review): upstream scanpy accepts only ONE of the four threshold
    options per call (raising on multiple) — confirm whether this spec
    intends to allow combinations.
    """

def filter_genes(adata, min_counts=None, min_cells=None, max_counts=None, max_cells=None, inplace=True, copy=False):
    """
    Filter genes based on counts and numbers of cells expressing them.

    Parameters:
    - adata (AnnData): Annotated data object
    - min_counts (int, optional): Minimum number of counts required per gene
    - min_cells (int, optional): Minimum number of cells expressing the gene
    - max_counts (int, optional): Maximum number of counts allowed per gene
    - max_cells (int, optional): Maximum number of cells expressing the gene
    - inplace (bool): Modify adata in place
    - copy (bool): Return copy instead of modifying

    Returns:
    AnnData or None: Filtered object (if copy=True)

    NOTE(review): upstream scanpy accepts only ONE of the four threshold
    options per call — confirm intended semantics, mirroring filter_cells.
    """

Normalization and Transformation

Normalize and transform count data to make it suitable for analysis.

def normalize_total(adata, target_sum=None, exclude_highly_expressed=False, max_fraction=0.05, key_added=None, layer=None, inplace=True):
    """
    Normalize counts per cell to a common library size.

    Each cell's counts are scaled so they sum to target_sum, making
    expression values comparable across cells.

    Parameters:
    - adata (AnnData): Annotated data object
    - target_sum (float, optional): Target sum per cell after normalization
      (default: the median of counts per cell before normalization)
    - exclude_highly_expressed (bool): Exclude highly expressed genes when
      computing each cell's size factor (they would otherwise dominate it)
    - max_fraction (float): A gene counts as "highly expressed" when it
      exceeds this fraction of a cell's total counts
    - key_added (str, optional): obs key under which to store the per-cell
      normalization factors
    - layer (str, optional): Layer to normalize instead of X
    - inplace (bool): Modify adata in place

    Returns:
    AnnData or None: Normalized object (if not inplace)
    """

def normalize_per_cell(adata, counts_per_cell_after=None, counts_per_cell=None, key_n_counts='n_counts', copy=False):
    """
    Normalize total counts per cell.

    .. deprecated:: use normalize_total instead; kept for backward
       compatibility with older workflows.

    Parameters:
    - adata (AnnData): Annotated data object
    - counts_per_cell_after (float, optional): Target counts per cell after
      normalization
    - counts_per_cell (array, optional): Precomputed counts per cell to use
      instead of summing the matrix
    - key_n_counts (str): obs key under which per-cell counts are stored
    - copy (bool): Return copy

    Returns:
    AnnData or None: Normalized object (if copy=True)
    """

def log1p(adata, base=None, copy=False, chunked=False, chunk_size=None, layer=None, obsm=None):
    """
    Logarithmize the data matrix: X = log(X + 1).

    The +1 offset keeps zero counts at zero and avoids log(0).

    Parameters:
    - adata (AnnData): Annotated data object
    - base (float, optional): Base of the logarithm (default: natural log)
    - copy (bool): Return copy
    - chunked (bool): Process the matrix in chunks to bound peak memory on
      large datasets
    - chunk_size (int, optional): Number of observations per chunk
    - layer (str, optional): Layer to transform instead of X
    - obsm (str, optional): obsm key to transform instead of X

    Returns:
    AnnData or None: Transformed object (if copy=True)
    """

def sqrt(adata, copy=False):
    """
    Square-root transform the data matrix: X = sqrt(X).

    A variance-stabilizing alternative to log1p for count data.

    Parameters:
    - adata (AnnData): Annotated data object
    - copy (bool): Return copy

    Returns:
    AnnData or None: Transformed object (if copy=True)
    """

Scaling and Centering

Scale and center data for downstream analysis.

def scale(adata, zero_center=True, max_value=None, copy=False, layer=None, obsm=None):
    """
    Scale each gene to unit variance, optionally centering to zero mean.

    Parameters:
    - adata (AnnData): Annotated data object
    - zero_center (bool): Center data to zero mean (note: densifies sparse
      matrices, since centering destroys sparsity)
    - max_value (float, optional): Clip scaled values at this maximum to
      limit the influence of outlier cells
    - copy (bool): Return copy
    - layer (str, optional): Layer to scale instead of X
    - obsm (str, optional): obsm key to scale instead of X

    Returns:
    AnnData or None: Scaled object (if copy=True)
    """

Feature Selection

Identify highly variable genes and other feature selection methods.

def highly_variable_genes(adata, layer=None, n_top_genes=None, min_disp=0.5, max_disp=np.inf, min_mean=0.0125, max_mean=3, span=0.3, n_bins=20, flavor='seurat_v3', subset=False, inplace=True, batch_key=None):
    """
    Identify highly variable genes across cells.

    Flags genes whose expression varies more than expected, which are then
    typically used for PCA and downstream analysis.

    Parameters:
    - adata (AnnData): Annotated data object
    - layer (str, optional): Layer to use for the calculation
    - n_top_genes (int, optional): Number of top genes to select
    - min_disp (float): Minimum normalized dispersion cutoff
    - max_disp (float): Maximum normalized dispersion cutoff
    - min_mean (float): Minimum mean expression cutoff
    - max_mean (float): Maximum mean expression cutoff
    - span (float): Span for the LOWESS mean-variance fit
    - n_bins (int): Number of expression bins for dispersion normalization
    - flavor (str): Method for calculation ('seurat', 'cell_ranger', 'seurat_v3')
    - subset (bool): Subset adata to the highly variable genes
    - inplace (bool): Modify adata in place
    - batch_key (str, optional): obs key; genes are selected within each
      batch and results combined, mitigating batch effects in selection

    Returns:
    AnnData or None: Modified object (if not inplace)

    NOTE(review): upstream scanpy's default flavor is 'seurat', not
    'seurat_v3' — confirm this default is intentional. Also note that
    'seurat_v3' expects raw counts and requires n_top_genes to be set,
    whereas 'seurat'/'cell_ranger' expect log-transformed data.
    NOTE(review): the np.inf default requires numpy to be imported as np
    wherever this signature is realized.
    """

def filter_genes_dispersion(adata, flavor='seurat', min_disp=None, max_disp=None, min_mean=None, max_mean=None, n_top_genes=None, log=True, subset=True, copy=False):
    """
    Filter genes by dispersion.

    .. deprecated:: use highly_variable_genes instead; kept for backward
       compatibility with older workflows.

    Parameters:
    - adata (AnnData): Annotated data object
    - flavor (str): Method for the dispersion calculation
    - min_disp (float, optional): Minimum dispersion cutoff
    - max_disp (float, optional): Maximum dispersion cutoff
    - min_mean (float, optional): Minimum mean expression cutoff
    - max_mean (float, optional): Maximum mean expression cutoff
    - n_top_genes (int, optional): Number of top genes to keep
    - log (bool): Log-transform data before the calculation
    - subset (bool): Subset the data to the selected genes
    - copy (bool): Return copy

    Returns:
    AnnData or None: Filtered object (if copy=True)
    """

Dimensionality Reduction

Perform principal component analysis for dimensionality reduction.

def pca(adata, n_comps=50, zero_center=True, svd_solver=None, random_state=0, return_info=False, use_highly_variable=None, dtype='float32', copy=False, chunked=False, chunk_size=None):
    """
    Principal component analysis for dimensionality reduction.

    Parameters:
    - adata (AnnData): Annotated data object
    - n_comps (int): Number of principal components to compute
    - zero_center (bool): Zero-center the data before the SVD (implicitly,
      so sparse input is supported without densifying)
    - svd_solver (str, optional): SVD solver ('arpack', 'randomized', 'auto')
    - random_state (int): Random seed for stochastic solvers
    - return_info (bool): Also return loadings and explained variance
    - use_highly_variable (bool, optional): Restrict to highly variable
      genes (requires highly_variable_genes to have been run)
    - dtype (str): Data type used for the computation
    - copy (bool): Return copy
    - chunked (bool): Process in chunks (incremental PCA) to bound memory
    - chunk_size (int, optional): Number of observations per chunk
    - copy (bool): see above

    Returns:
    AnnData or None: Object with PCA results (if copy=True)
    """

Neighborhood Graph Construction

Compute neighborhood graphs for downstream analysis.

def neighbors(adata, n_neighbors=15, n_pcs=None, use_rep=None, knn=True, method='umap', transformer=None, metric='euclidean', metric_kwds=None, random_state=0, key_added=None, copy=False):
    """
    Compute the nearest neighbors distance matrix and neighborhood graph.

    Parameters:
    - adata (AnnData): Annotated data object
    - n_neighbors (int): Number of nearest neighbors
    - n_pcs (int, optional): Number of PCs to use
    - use_rep (str, optional): Representation to use ('X_pca', etc.)
    - knn (bool): Use a hard k-nearest-neighbors cutoff (vs. Gaussian kernel
      weighting of all pairs)
    - method (str): Method for computing connectivities ('umap', 'gauss')
    - transformer (object, optional): Custom nearest-neighbors transformer
    - metric (str): Distance metric
    - metric_kwds (dict, optional): Additional keyword arguments for the
      metric; treated as an empty dict when None. (A None sentinel is used
      instead of a literal {} default to avoid the shared-mutable-default
      pitfall.)
    - random_state (int): Random seed
    - key_added (str, optional): Key under which to store the results
    - copy (bool): Return copy

    Returns:
    AnnData or None: Object with neighbors graph (if copy=True)
    """

Batch Effect Correction

Correct for batch effects and technical variation.

def combat(adata, key='batch', covariates=None, inplace=True):
    """
    ComBat batch effect correction.

    Uses an empirical Bayes framework to adjust expression values for
    batch-specific location and scale effects.

    Parameters:
    - adata (AnnData): Annotated data object
    - key (str): obs key containing the batch assignment of each cell
    - covariates (list, optional): obs keys for biological covariates whose
      effects should be preserved (not regressed out with the batch effect)
    - inplace (bool): Modify adata in place

    Returns:
    AnnData or None: Batch-corrected object (if not inplace)
    """

Doublet Detection

Detect potential cell doublets using Scrublet.

def scrublet(adata, adata_sim=None, sim_doublet_ratio=2.0, n_neighbors=None, expected_doublet_rate=0.1, stdev_doublet_rate=0.02, synthetic_doublet_umi_subsampling=1.0, knn_dist_metric='euclidean', normalize_variance=True, log_transform=False, mean_center=True, n_prin_comps=30, use_approx_neighbors=True, get_doublet_neighbor_parents=False, random_state=0, copy=False):
    """
    Predict cell doublets using Scrublet.

    Simulates doublets by combining random pairs of observed cells and
    scores each observed cell by its similarity to the simulated doublets.

    Parameters:
    - adata (AnnData): Annotated data object
    - adata_sim (AnnData, optional): Pre-simulated doublets (e.g. from
      scrublet_simulate_doublets); simulated internally when None
    - sim_doublet_ratio (float): Number of doublets to simulate relative to
      the number of observed cells
    - n_neighbors (int, optional): Number of neighbors for the KNN graph
    - expected_doublet_rate (float): Expected doublet rate of the assay
    - stdev_doublet_rate (float): Uncertainty in the expected doublet rate
    - synthetic_doublet_umi_subsampling (float): UMI subsampling rate when
      combining cell pairs into synthetic doublets
    - knn_dist_metric (str): Distance metric for the KNN graph
    - normalize_variance (bool): Normalize gene variance before PCA
    - log_transform (bool): Log-transform data before PCA
    - mean_center (bool): Mean-center data before PCA
    - n_prin_comps (int): Number of principal components to use
    - use_approx_neighbors (bool): Use approximate nearest-neighbor search
    - get_doublet_neighbor_parents (bool): Also report the parent cells of
      doublet neighbors
    - random_state (int): Random seed
    - copy (bool): Return copy

    Returns:
    AnnData or None: Object with doublet scores (if copy=True)

    NOTE(review): upstream scanpy's default expected_doublet_rate is 0.05 —
    confirm the 0.1 default here is intentional.
    """

def scrublet_simulate_doublets(adata, sim_doublet_ratio=2.0, synthetic_doublet_umi_subsampling=1.0, random_state=0):
    """
    Simulate doublets for Scrublet analysis.

    Creates synthetic doublets by summing the counts of randomly chosen
    pairs of observed cells; the result can be passed to scrublet() as
    adata_sim.

    Parameters:
    - adata (AnnData): Annotated data object (observed cells)
    - sim_doublet_ratio (float): Number of doublets to simulate relative to
      the number of observed cells
    - synthetic_doublet_umi_subsampling (float): UMI subsampling rate when
      combining cell pairs
    - random_state (int): Random seed

    Returns:
    AnnData: Simulated doublets
    """

Utility Functions

Additional preprocessing utilities.

def downsample_counts(adata, counts_per_cell=None, total_counts=None, random_state=0, replace=False, copy=False):
    """
    Downsample counts so that cells (or the whole matrix) reach a target total.

    Useful for equalizing sequencing depth across cells or datasets.

    Parameters:
    - adata (AnnData): Annotated data object
    - counts_per_cell (int, optional): Target counts per cell; cells already
      below the target are left unchanged
    - total_counts (int, optional): Target total counts over the whole matrix
    - random_state (int): Random seed
    - replace (bool): Sample counts with replacement
    - copy (bool): Return copy

    Returns:
    AnnData or None: Downsampled object (if copy=True)

    NOTE(review): in upstream scanpy, counts_per_cell and total_counts are
    mutually exclusive — confirm that constraint applies here.
    """

def sample(adata, n_obs=None, fraction=None, copy=False, random_state=0):
    """
    Randomly sample observations (cells) from the data.

    Parameters:
    - adata (AnnData): Annotated data object
    - n_obs (int, optional): Absolute number of observations to sample
    - fraction (float, optional): Fraction of observations to sample
    - copy (bool): Return copy
    - random_state (int): Random seed

    Returns:
    AnnData or None: Sampled object (if copy=True)

    NOTE(review): n_obs and fraction are presumably mutually exclusive,
    matching upstream scanpy — confirm.
    """

def regress_out(adata, keys, n_jobs=None, copy=False):
    """
    Regress out unwanted sources of variation.

    Fits a linear regression of each gene's expression on the given obs
    covariates (e.g. total counts, percent mitochondrial) and keeps the
    residuals.

    Parameters:
    - adata (AnnData): Annotated data object
    - keys (list): obs keys of the covariates to regress out
    - n_jobs (int, optional): Number of parallel jobs
    - copy (bool): Return copy

    Returns:
    AnnData or None: Corrected object (if copy=True)
    """

Recipe Functions

Predefined preprocessing workflows based on published methods.

def recipe_seurat(adata, log=True, plot=True, copy=False):
    """
    Seurat-style preprocessing recipe.

    Bundles filtering, normalization, optional log transformation, and gene
    selection into a single call with Seurat-like defaults.

    Parameters:
    - adata (AnnData): Annotated data object (expects raw count data)
    - log (bool): Apply log transformation
    - plot (bool): Generate diagnostic plots
    - copy (bool): Return copy

    Returns:
    AnnData or None: Preprocessed object (if copy=True)
    """

def recipe_weinreb17(adata, log=True, mean_threshold=0.01, cv_threshold=2, n_top_genes=1000, plot=True, copy=False):
    """
    Preprocessing recipe from Weinreb et al., 2017.

    Selects genes by mean expression and coefficient of variation, then
    keeps the top variable genes.

    Parameters:
    - adata (AnnData): Annotated data object (expects raw count data)
    - log (bool): Apply log transformation
    - mean_threshold (float): Minimum mean expression for a gene to be kept
    - cv_threshold (float): Minimum coefficient of variation for a gene
    - n_top_genes (int): Number of top genes to select
    - plot (bool): Generate diagnostic plots
    - copy (bool): Return copy

    Returns:
    AnnData or None: Preprocessed object (if copy=True)
    """

def recipe_zheng17(adata, n_top_genes=1000, log=True, plot=True, copy=False):
    """
    Preprocessing recipe from Zheng et al., 2017 (10x Genomics workflow).

    Parameters:
    - adata (AnnData): Annotated data object (expects raw count data)
    - n_top_genes (int): Number of top genes to select
    - log (bool): Apply log transformation
    - plot (bool): Generate diagnostic plots
    - copy (bool): Return copy

    Returns:
    AnnData or None: Preprocessed object (if copy=True)
    """

Usage Examples

Basic Preprocessing Pipeline

import scanpy as sc
import numpy as np

# Load 10x Genomics output (matrix.mtx + barcodes/genes) into an AnnData object.
adata = sc.read_10x_mtx('data/filtered_gene_bc_matrices/hg19/')

# Basic filtering: drop near-empty cells and rarely detected genes.
sc.pp.filter_cells(adata, min_genes=200)  # keep cells with >= 200 detected genes
sc.pp.filter_genes(adata, min_cells=3)   # keep genes detected in >= 3 cells

# Calculate QC metrics. Flag mitochondrial genes first (human gene names use
# the 'MT-' prefix); qc_vars=['mt'] is required so that the pct_counts_mt
# column used below actually gets created.
adata.var['mt'] = adata.var_names.str.startswith('MT-')
sc.pp.calculate_qc_metrics(adata, qc_vars=['mt'], percent_top=None, log1p=False, inplace=True)

# Filter based on QC metrics: remove probable doublets (unusually many genes)
# and low-quality/dying cells (high mitochondrial fraction).
adata = adata[adata.obs.n_genes_by_counts < 2500, :]
adata = adata[adata.obs.pct_counts_mt < 20, :]

# Normalization to 10,000 counts per cell, then log1p transformation.
sc.pp.normalize_total(adata, target_sum=1e4)
sc.pp.log1p(adata)

# Feature selection: flag highly variable genes and subset to them,
# keeping the full (normalized) matrix in .raw for later reference.
sc.pp.highly_variable_genes(adata, min_mean=0.0125, max_mean=3, min_disp=0.5)
adata.raw = adata  # save the full data
adata = adata[:, adata.var.highly_variable]

# Scaling to unit variance, clipping extreme values at 10 standard deviations.
sc.pp.scale(adata, max_value=10)

# PCA
sc.pp.pca(adata, svd_solver='arpack')

# Neighborhood graph on the first 40 PCs.
sc.pp.neighbors(adata, n_neighbors=10, n_pcs=40)

Recipe-based Preprocessing

# Use the Seurat-like preprocessing recipe: bundles filtering, normalization,
# log transformation, and gene selection into a single call.
adata = sc.read_10x_mtx('data/')
sc.pp.recipe_seurat(adata, log=True, plot=False)

Install with Tessl CLI

npx tessl i tessl/pypi-scanpy

docs

analysis-tools.md

data-io.md

datasets.md

external-tools.md

index.md

preprocessing.md

queries.md

spatial-analysis.md

utilities.md

visualization.md

tile.json