tessl/pypi-scanpy

Comprehensive toolkit for analyzing single-cell gene expression data with scalable Python implementation supporting preprocessing, visualization, clustering, trajectory inference, and differential expression testing.

—

Pending

Overview

Eval results

Files

Utilities and Settings

Name: tessl/pypi-scanpy
Author: tessl

Scanpy provides various utility functions, configuration options, and helper tools for managing analysis workflows, extracting data, and configuring the analysis environment.

Capabilities

Global Settings and Configuration

Configure scanpy's behavior and matplotlib plotting parameters.

# Global settings object
settings: ScanpyConfig

class ScanpyConfig:
    """Global scanpy configuration object."""
    
    # Core settings
    verbosity: int = 1  # Logging verbosity level (0-5)
    n_jobs: int = 1     # Number of parallel jobs (-1 for all cores)
    
    # Data settings  
    max_memory: str = '2G'  # Maximum memory for operations
    n_pcs: int = 50         # Default number of PCs
    
    # Figure settings
    figdir: str = './figures/'  # Default figure output directory
    file_format_figs: str = 'pdf'  # Default figure format
    dpi: int = 80              # Default DPI for figures
    dpi_save: int = 150        # DPI for saved figures
    transparent: bool = False   # Transparent backgrounds
    
    # Cache settings
    cache_compression: str = 'lzf'  # Compression for cached files
    
    def set_figure_params(self, dpi=80, dpi_save=150, transparent=False, fontsize=14, color_map='viridis', format='pdf', facecolor='white', **kwargs):
        """
        Set matplotlib figure parameters.
        
        Parameters:
        - dpi (int): Resolution for display
        - dpi_save (int): Resolution for saved figures
        - transparent (bool): Transparent background
        - fontsize (int): Base font size
        - color_map (str): Default colormap
        - format (str): Default save format
        - facecolor (str): Figure background color
        - **kwargs: Additional matplotlib rcParams
        """

Data Extraction Utilities

Extract and manipulate data from AnnData objects.

def obs_df(adata, keys=None, obsm_keys=None, layer=None, gene_symbols=None, use_raw=False):
    """
    Extract observation metadata as pandas DataFrame.
    
    Parameters:
    - adata (AnnData): Annotated data object
    - keys (list, optional): Keys from obs to include
    - obsm_keys (list, optional): Keys from obsm to include  
    - layer (str, optional): Layer to extract data from
    - gene_symbols (str, optional): Gene symbols key
    - use_raw (bool): Use raw data
    
    Returns:
    DataFrame: Observation data with requested keys
    """

def var_df(adata, keys=None, varm_keys=None, layer=None):
    """
    Extract variable metadata as pandas DataFrame.
    
    Parameters:
    - adata (AnnData): Annotated data object
    - keys (list, optional): Keys from var to include
    - varm_keys (list, optional): Keys from varm to include
    - layer (str, optional): Layer to extract data from
    
    Returns:
    DataFrame: Variable data with requested keys
    """

def rank_genes_groups_df(adata, group=None, key='rank_genes_groups', pval_cutoff=None, log2fc_min=None, log2fc_max=None, gene_symbols=None):
    """
    Extract ranked genes results as pandas DataFrame.
    
    Parameters:
    - adata (AnnData): Annotated data object
    - group (str, optional): Specific group to extract
    - key (str): Key for ranked genes results
    - pval_cutoff (float, optional): P-value cutoff
    - log2fc_min (float, optional): Minimum log2 fold change
    - log2fc_max (float, optional): Maximum log2 fold change
    - gene_symbols (str, optional): Gene symbols key
    
    Returns:
    DataFrame: Ranked genes with statistics
    """

def aggregate(adata, by, func='mean', layer=None, obsm=None, varm=None):
    """
    Aggregate observations by grouping variable.
    
    Parameters:
    - adata (AnnData): Annotated data object
    - by (str): Key in obs for grouping
    - func (str or callable): Aggregation function
    - layer (str, optional): Layer to aggregate
    - obsm (str, optional): Obsm key to aggregate
    - varm (str, optional): Varm key to aggregate
    
    Returns:
    AnnData: Aggregated data object
    """

Internal Data Access Utilities

Low-level utilities for accessing AnnData representations.

def _get_obs_rep(adata, use_rep=None, n_pcs=None, use_raw=False, layer=None, obsm=None, obsp=None):
    """
    Get observation representation for analysis.
    
    Parameters:
    - adata (AnnData): Annotated data object
    - use_rep (str, optional): Representation key in obsm
    - n_pcs (int, optional): Number of PCs if using PCA
    - use_raw (bool): Use raw data
    - layer (str, optional): Layer to use
    - obsm (str, optional): Obsm key
    - obsp (str, optional): Obsp key
    
    Returns:
    array: Data representation
    """

def _set_obs_rep(adata, X_new, use_rep=None, n_pcs=None, layer=None, obsm=None):
    """
    Set observation representation in AnnData.
    
    Parameters:
    - adata (AnnData): Annotated data object
    - X_new (array): New data representation
    - use_rep (str, optional): Representation key
    - n_pcs (int, optional): Number of PCs
    - layer (str, optional): Layer key
    - obsm (str, optional): Obsm key
    """

def _check_mask(adata, mask_var, mask_obs=None):
    """
    Validate and process mask for subsetting.
    
    Parameters:
    - adata (AnnData): Annotated data object
    - mask_var (array or str): Variable mask
    - mask_obs (array or str, optional): Observation mask
    
    Returns:
    tuple: Processed masks
    """

Logging and Verbosity

Control logging output and verbosity levels.

def print_versions():
    """
    Print version information for scanpy and dependencies.
    
    Returns:
    None: Prints version information to stdout
    """

# Logging levels
CRITICAL: int = 50
ERROR: int = 40  
WARNING: int = 30
INFO: int = 20
DEBUG: int = 10
HINT: int = 15  # Custom level between INFO and DEBUG

# Verbosity levels
class Verbosity:
    """Verbosity level enumeration."""
    error: int = 0
    warn: int = 1
    info: int = 2
    hint: int = 3
    debug: int = 4
    trace: int = 5

Memory and Performance Utilities

Tools for managing memory usage and performance.

def memory_usage():
    """
    Get current memory usage.
    
    Returns:
    str: Memory usage information
    """

def check_versions():
    """
    Check versions of key dependencies.
    
    Returns:
    None: Prints warnings for version issues
    """

File and Path Utilities

Utilities for working with files and paths.

def _check_datasetdir_exists():
    """Check if dataset directory exists."""
    
def _get_filename_from_key(key):
    """Generate filename from key."""
    
def _doc_params(**kwds):
    """Decorator for parameter documentation."""

Plotting Configuration

Configure matplotlib and plotting behavior.

def set_figure_params(scanpy=True, dpi=80, dpi_save=150, transparent=False, fontsize=14, color_map='viridis', format='pdf', facecolor='white', **kwargs):
    """
    Set global figure parameters for matplotlib.
    
    Parameters:
    - scanpy (bool): Use scanpy-specific settings
    - dpi (int): Display resolution
    - dpi_save (int): Save resolution  
    - transparent (bool): Transparent background
    - fontsize (int): Base font size
    - color_map (str): Default colormap
    - format (str): Default save format
    - facecolor (str): Figure background color
    - **kwargs: Additional rcParams
    """

def reset_rcParams():
    """Reset matplotlib rcParams to defaults."""

Constants and Enumerations

Important constants used throughout scanpy.

# Default number of PCs
N_PCS: int = 50

# Default number of diffusion components  
N_DCS: int = 15

# File format constants
FIGDIR_DEFAULT: str = './figures/'
FORMAT_DEFAULT: str = 'pdf'

# Cache settings
CACHE_DEFAULT: str = './cache/'

Usage Examples

Configuring Scanpy Settings

import scanpy as sc

# Set verbosity level
sc.settings.verbosity = 3  # hint level

# Configure parallel processing
sc.settings.n_jobs = -1  # use all available cores

# Set figure parameters
sc.settings.set_figure_params(
    dpi=100, 
    dpi_save=300,
    fontsize=12,
    color_map='plasma',
    format='png',
    transparent=True
)

# Set output directory
sc.settings.figdir = './my_figures/'

# Check current settings
print(f"Verbosity: {sc.settings.verbosity}")
print(f"N jobs: {sc.settings.n_jobs}")
print(f"Figure dir: {sc.settings.figdir}")

Data Extraction and Analysis

# Extract observation data with specific columns
obs_data = sc.get.obs_df(adata, keys=['total_counts', 'n_genes', 'leiden'])
print(obs_data.head())

# Get ranked genes as DataFrame
marker_genes = sc.get.rank_genes_groups_df(adata, group='0')
top_genes = marker_genes.head(20)

# Extract variable information
var_data = sc.get.var_df(adata, keys=['highly_variable', 'dispersions'])

# Aggregate data by clusters
adata_agg = sc.get.aggregate(adata, by='leiden', func='mean')
print(f"Aggregated to {adata_agg.n_obs} pseudo-bulk samples")

Working with Different Data Representations

# Get PCA representation
X_pca = sc.get._get_obs_rep(adata, use_rep='X_pca', n_pcs=30)
print(f"PCA shape: {X_pca.shape}")

# Get UMAP representation
X_umap = sc.get._get_obs_rep(adata, use_rep='X_umap')
print(f"UMAP shape: {X_umap.shape}")

# Get raw data representation
X_raw = sc.get._get_obs_rep(adata, use_raw=True)
print(f"Raw data shape: {X_raw.shape}")

Environment and Version Information

# Print comprehensive version information
sc.logging.print_versions()

# Check for version compatibility issues
sc._utils.check_versions()

# Print memory usage
print(f"Current memory usage: {sc._utils.memory_usage()}")

Advanced Configuration

# Custom matplotlib configuration
sc.pl.set_rcParams_scanpy(fontsize=10, color_map='viridis')

# Reset to defaults
sc.pl.set_rcParams_defaults()

# Fine-grained matplotlib control
import matplotlib.pyplot as plt
plt.rcParams['figure.figsize'] = (8, 6)
plt.rcParams['axes.grid'] = True
plt.rcParams['grid.alpha'] = 0.3

# Apply custom color palette
import seaborn as sns
custom_palette = sns.color_palette("husl", 8)
sc.pl.palettes.default_20 = custom_palette

Performance Optimization

# Configure for large datasets
sc.settings.max_memory = '16G'  # Set memory limit
sc.settings.n_jobs = 8         # Limit parallel jobs
sc.settings.verbosity = 1      # Reduce logging overhead

# Enable caching for repeated operations
sc.settings.cachedir = '/tmp/scanpy_cache/'

# Use chunked operations for large matrices
sc.pp.scale(adata, chunked=True, chunk_size=1000)

Custom Analysis Workflows

def run_standard_analysis(adata, resolution=0.5, n_pcs=50):
    """Custom analysis function using scanpy utilities."""
    
    # Configure for this analysis
    original_verbosity = sc.settings.verbosity
    sc.settings.verbosity = 2
    
    try:
        # Preprocessing
        sc.pp.filter_cells(adata, min_genes=200)
        sc.pp.filter_genes(adata, min_cells=3)
        sc.pp.normalize_total(adata, target_sum=1e4)
        sc.pp.log1p(adata)
        
        # Analysis
        sc.pp.highly_variable_genes(adata)
        adata.raw = adata
        adata = adata[:, adata.var.highly_variable]
        sc.pp.scale(adata)
        sc.pp.pca(adata, n_comps=n_pcs)
        sc.pp.neighbors(adata)
        sc.tl.umap(adata)
        sc.tl.leiden(adata, resolution=resolution)
        
        # Extract results
        results = {
            'clusters': sc.get.obs_df(adata, keys=['leiden']),
            'embedding': sc.get._get_obs_rep(adata, use_rep='X_umap'),
            'n_clusters': len(adata.obs['leiden'].unique())
        }
        
        return adata, results
        
    finally:
        # Restore original settings
        sc.settings.verbosity = original_verbosity

# Run analysis
adata_processed, analysis_results = run_standard_analysis(adata)
print(f"Found {analysis_results['n_clusters']} clusters")

Debugging and Troubleshooting

# Enable debug logging
sc.settings.verbosity = 4  # debug level

# Check data integrity
def check_adata_integrity(adata):
    """Check AnnData object for common issues."""
    print(f"Shape: {adata.shape}")
    print(f"Data type: {adata.X.dtype}")
    print(f"Sparse: {scipy.sparse.issparse(adata.X)}")
    print(f"NaN values: {np.isnan(adata.X.data).sum() if scipy.sparse.issparse(adata.X) else np.isnan(adata.X).sum()}")
    print(f"Negative values: {(adata.X.data < 0).sum() if scipy.sparse.issparse(adata.X) else (adata.X < 0).sum()}")
    
    # Check for common issues
    if adata.obs.index.duplicated().any():
        print("WARNING: Duplicate observation names found")
    if adata.var.index.duplicated().any():
        print("WARNING: Duplicate variable names found")

check_adata_integrity(adata)

# Memory profiling for large operations
import time
start_time = time.time()
start_memory = sc._utils.memory_usage()

# Your analysis here
sc.pp.neighbors(adata, n_neighbors=15)

end_time = time.time()
end_memory = sc._utils.memory_usage()

print(f"Operation took {end_time - start_time:.2f} seconds")
print(f"Memory before: {start_memory}")  
print(f"Memory after: {end_memory}")

Configuration Files

Setting up scanpy configuration

# Create configuration file (~/.scanpy/config.yaml)
import os
import yaml

config_dir = os.path.expanduser('~/.scanpy')
os.makedirs(config_dir, exist_ok=True)

config = {
    'verbosity': 2,
    'n_jobs': -1,
    'figdir': './figures/',
    'file_format_figs': 'pdf',
    'dpi_save': 300,
    'transparent': True
}

with open(os.path.join(config_dir, 'config.yaml'), 'w') as f:
    yaml.dump(config, f)