Comprehensive toolkit for analyzing single-cell gene expression data with scalable Python implementation supporting preprocessing, visualization, clustering, trajectory inference, and differential expression testing.
—
Scanpy's queries module provides tools for enriching single-cell analysis with external database information. This includes querying Biomart for gene annotations, performing gene enrichment analysis, and retrieving genomic coordinates and mitochondrial gene lists.
Query Ensembl Biomart database for comprehensive gene information.
def biomart_annotations(org, attrs, values=None, use_cache=True, **kwargs):
"""
Retrieve gene annotations from Ensembl Biomart.
Parameters:
- org (str): Organism name (e.g., 'hsapiens', 'mmusculus')
- attrs (list): List of attributes to retrieve from Biomart
- values (list, optional): List of gene identifiers to query
- use_cache (bool): Use cached results if available
- **kwargs: Additional Biomart query parameters
Returns:
DataFrame: Gene annotations with requested attributes
"""Perform functional enrichment analysis using g:Profiler.
def enrich(gene_list, organism='hsapiens', sources=None, background=None, domain_scope='annotated', significance_threshold_method='g_SCS', user_threshold=0.05, ordered_query=False, measure_underrepresentation=False, evcodes=False, combined=True, **kwargs):
"""
Gene enrichment analysis using g:Profiler API.
Parameters:
- gene_list (list): List of gene identifiers for enrichment
- organism (str): Organism code ('hsapiens', 'mmusculus', etc.)
- sources (list, optional): Databases to query (GO:BP, GO:MF, GO:CC, KEGG, etc.)
- background (list, optional): Background gene set
- domain_scope (str): Statistical domain scope
- significance_threshold_method (str): Multiple testing correction method
- user_threshold (float): Significance threshold
- ordered_query (bool): Whether gene list is ordered by importance
- measure_underrepresentation (bool): Test for underrepresentation
- evcodes (bool): Include GO evidence codes
- combined (bool): Use combined g:SCS threshold
- **kwargs: Additional g:Profiler parameters
Returns:
DataFrame: Enrichment results with p-values, terms, and descriptions
"""Retrieve genomic coordinates for genes from Ensembl.
def gene_coordinates(gene_list, org='hsapiens', gene_symbols=True, **kwargs):
"""
Get genomic coordinates for a list of genes.
Parameters:
- gene_list (list): List of gene identifiers
- org (str): Organism ('hsapiens', 'mmusculus', etc.)
- gene_symbols (bool): Whether input are gene symbols (vs Ensembl IDs)
- **kwargs: Additional query parameters
Returns:
DataFrame: Genomic coordinates with chromosome, start, end positions
"""Retrieve lists of mitochondrial genes for QC filtering.
def mitochondrial_genes(org='hsapiens', gene_symbols=True, **kwargs):
"""
Get list of mitochondrial genes for given organism.
Parameters:
- org (str): Organism ('hsapiens', 'mmusculus', etc.)
- gene_symbols (bool): Return gene symbols (vs Ensembl IDs)
- **kwargs: Additional query parameters
Returns:
list: List of mitochondrial gene identifiers
"""import scanpy as sc
import pandas as pd
# Get basic gene information
gene_list = ['CD3D', 'CD4', 'CD8A', 'IL2', 'IFNG']
annotations = sc.queries.biomart_annotations(
org='hsapiens',
attrs=['ensembl_gene_id', 'external_gene_name', 'description', 'gene_biotype'],
values=gene_list
)
print(annotations.head())# Get marker genes from clustering
sc.tl.rank_genes_groups(adata, 'leiden', method='wilcoxon')
# Extract top marker genes for cluster 0
marker_genes = sc.get.rank_genes_groups_df(adata, group='0').head(100)['names'].tolist()
# Perform enrichment analysis
enrichment_results = sc.queries.enrich(
gene_list=marker_genes,
organism='hsapiens',
sources=['GO:BP', 'GO:MF', 'KEGG', 'REACTOME']
)
# Show top enriched terms
top_terms = enrichment_results.head(10)
print(top_terms[['native', 'name', 'p_value', 'term_size']])# Get coordinates for genes of interest
genes_of_interest = ['TP53', 'MYC', 'EGFR', 'VEGFA']
coordinates = sc.queries.gene_coordinates(
gene_list=genes_of_interest,
org='hsapiens'
)
print(coordinates)
# Use coordinates for downstream analysis
for _, gene_info in coordinates.iterrows():
print(f"{gene_info['external_gene_name']}: {gene_info['chromosome_name']}:{gene_info['start_position']}-{gene_info['end_position']}")# Get mitochondrial genes for human
mt_genes = sc.queries.mitochondrial_genes(org='hsapiens')
print(f"Found {len(mt_genes)} mitochondrial genes")
# Mark mitochondrial genes in AnnData object
adata.var['mt'] = adata.var_names.isin(mt_genes)
# Calculate mitochondrial gene percentage
sc.pp.calculate_qc_metrics(adata, percent_top=None, log1p=False, inplace=True)
# Filter cells with high mitochondrial content
adata = adata[adata.obs.pct_counts_mt < 20, :]# Human-mouse gene mapping
human_genes = ['CD3D', 'CD4', 'CD8A']
mouse_genes = ['Cd3d', 'Cd4', 'Cd8a']
# Get human annotations
human_annotations = sc.queries.biomart_annotations(
org='hsapiens',
attrs=['ensembl_gene_id', 'external_gene_name', 'mmusculus_homolog_ensembl_gene'],
values=human_genes
)
# Get mouse annotations
mouse_annotations = sc.queries.biomart_annotations(
org='mmusculus',
attrs=['ensembl_gene_id', 'external_gene_name', 'hsapiens_homolog_ensembl_gene'],
values=mouse_genes
)
print("Human-mouse homolog mapping:")
print(human_annotations[['external_gene_name', 'mmusculus_homolog_ensembl_gene']])def annotate_gene_list(gene_list, organism='hsapiens'):
"""Comprehensive gene annotation pipeline."""
# Get basic annotations
basic_info = sc.queries.biomart_annotations(
org=organism,
attrs=[
'ensembl_gene_id',
'external_gene_name',
'description',
'gene_biotype',
'chromosome_name',
'start_position',
'end_position'
],
values=gene_list
)
# Perform enrichment analysis
enrichment = sc.queries.enrich(
gene_list=gene_list,
organism=organism,
sources=['GO:BP', 'GO:MF', 'GO:CC', 'KEGG']
)
# Get mitochondrial gene status
mt_genes = sc.queries.mitochondrial_genes(org=organism)
basic_info['is_mitochondrial'] = basic_info['external_gene_name'].isin(mt_genes)
return {
'annotations': basic_info,
'enrichment': enrichment,
'mt_genes': mt_genes
}
# Use the pipeline
results = annotate_gene_list(['CD3D', 'CD4', 'CD8A', 'IL2', 'IFNG'])# Complete workflow: clustering -> marker genes -> annotation -> enrichment
def analyze_cluster_markers(adata, cluster_key='leiden', n_genes=50):
"""Analyze cluster markers with comprehensive annotation."""
# Find marker genes
sc.tl.rank_genes_groups(adata, cluster_key, method='wilcoxon')
results = {}
for cluster in adata.obs[cluster_key].unique():
# Get top marker genes
marker_df = sc.get.rank_genes_groups_df(adata, group=cluster)
top_markers = marker_df.head(n_genes)['names'].tolist()
# Annotate markers
annotations = sc.queries.biomart_annotations(
org='hsapiens',
attrs=['external_gene_name', 'description', 'gene_biotype'],
values=top_markers
)
# Enrichment analysis
enrichment = sc.queries.enrich(
gene_list=top_markers,
organism='hsapiens',
sources=['GO:BP', 'KEGG'],
user_threshold=0.01
)
results[cluster] = {
'markers': top_markers,
'annotations': annotations,
'enrichment': enrichment.head(10) if not enrichment.empty else None
}
return results
# Run comprehensive analysis
cluster_analysis = analyze_cluster_markers(adata, n_genes=30)
# Print results for each cluster
for cluster, data in cluster_analysis.items():
print(f"\nCluster {cluster}:")
print(f"Top markers: {', '.join(data['markers'][:10])}")
if data['enrichment'] is not None:
print("Top enriched pathways:")
for _, pathway in data['enrichment'].head(3).iterrows():
print(f" - {pathway['name']} (p={pathway['p_value']:.2e})")def robust_query(gene_list, max_retries=3, delay=1):
"""Query with retry logic and error handling."""
import time
for attempt in range(max_retries):
try:
return sc.queries.biomart_annotations(
org='hsapiens',
attrs=['external_gene_name', 'description'],
values=gene_list
)
except Exception as e:
if attempt < max_retries - 1:
print(f"Query failed (attempt {attempt + 1}): {e}")
time.sleep(delay * (2 ** attempt)) # Exponential backoff
else:
raise
# Use robust querying
annotations = robust_query(['CD3D', 'CD4', 'CD8A'])# Integrate query results with AnnData object
def add_gene_annotations(adata, organism='hsapiens'):
"""Add gene annotations to AnnData var."""
gene_list = adata.var_names.tolist()
# Get annotations
annotations = sc.queries.biomart_annotations(
org=organism,
attrs=['external_gene_name', 'description', 'gene_biotype', 'chromosome_name'],
values=gene_list
)
# Merge with existing var data
annotations.index = annotations['external_gene_name']
# Add to adata.var
for col in ['description', 'gene_biotype', 'chromosome_name']:
if col in annotations.columns:
adata.var[col] = annotations.loc[adata.var_names, col].fillna('Unknown')
# Mark mitochondrial genes
mt_genes = sc.queries.mitochondrial_genes(org=organism)
adata.var['mitochondrial'] = adata.var_names.isin(mt_genes)
return adata
# Add annotations to your data
adata = add_gene_annotations(adata)Common organism codes for queries:
'hsapiens' - Human'mmusculus' - Mouse'drerio' - Zebrafish'dmelanogaster' - Fruit fly'celegans' - C. elegans'rnorvegicus' - Rat'scerevisiae' - YeastThe queries module requires internet connectivity and may depend on:
biomart - For Ensembl queriesgprofiler-official - For enrichment analysisrequests - For API callspandas - For data handlingInstall additional dependencies if needed:
pip install biomart gprofiler-official requestsInstall with Tessl CLI
npx tessl i tessl/pypi-scanpy