Python API for efficient storage and retrieval of single-cell data using TileDB
Query builders and indexing utilities for efficient data retrieval from SOMA objects. These tools enable filtering, subsetting, and indexing operations on single-cell datasets at scale.
A powerful query builder for Experiments that provides methods to query observations, variables, and measurements with efficient filtering and retrieval.
class ExperimentAxisQuery:
def obs(self, *, column_names=None, batch_size=None, partitions=None, platform_config=None):
"""
Query observations (cells) from the experiment.
Parameters:
- column_names: list of str, specific observation columns to retrieve
- batch_size: int, number of observations per batch
- partitions: Partitions object for parallel reading
- platform_config: TileDB-specific configuration options
Returns:
Iterator of Arrow tables containing observation data
"""
def var(self, *, column_names=None, batch_size=None, partitions=None, platform_config=None):
"""
Query variables (genes/features) from the experiment.
Parameters:
- column_names: list of str, specific variable columns to retrieve
- batch_size: int, number of variables per batch
- partitions: Partitions object for parallel reading
- platform_config: TileDB-specific configuration options
Returns:
Iterator of Arrow tables containing variable data
"""
def X(self, layer_name, *, batch_size=None, partitions=None, platform_config=None):
"""
Query measurement matrices (expression data).
Parameters:
- layer_name: str, name of the X layer to query
- batch_size: int, number of elements per batch
- partitions: Partitions object for parallel reading
- platform_config: TileDB-specific configuration options
Returns:
Iterator of sparse matrix data
"""
def to_anndata(self, *, X_layer_name=None, column_names=None, obsm_layers=None, varm_layers=None, obsp_layers=None, varp_layers=None):
"""
Convert query results to an AnnData object.
Parameters:
- X_layer_name: str, X layer to use as main matrix (None uses first available)
- column_names: dict, column names to include for obs/var
- obsm_layers: list of str, obsm layers to include
- varm_layers: list of str, varm layers to include
- obsp_layers: list of str, obsp layers to include
- varp_layers: list of str, varp layers to include
Returns:
AnnData object with query results
"""

Axis queries are created through the axis_query method on Experiments:
import tiledbsoma
with tiledbsoma.open("experiment.soma") as exp:
# Create basic query
query = exp.axis_query("RNA")
# Create query with observation filtering
query = exp.axis_query(
"RNA",
obs_query=tiledbsoma.AxisQuery(
value_filter="cell_type == 'T-cell' and n_genes > 1000"
)
)
# Create query with variable filtering
query = exp.axis_query(
"RNA",
var_query=tiledbsoma.AxisQuery(
value_filter="feature_type == 'Gene Expression'"
)
)
# Create query with coordinate selection
query = exp.axis_query(
"RNA",
obs_query=tiledbsoma.AxisQuery(coords=([0, 1, 2, 10, 50],)),  # coords is a tuple: one selection per dimension
var_query=tiledbsoma.AxisQuery(coords=(slice(0, 100),))  # slice bounds are inclusive
)

import tiledbsoma
import pandas as pd
# Query specific cell types
with tiledbsoma.open("pbmc_experiment.soma") as exp:
# Create query for T cells
t_cell_query = exp.axis_query(
"RNA",
obs_query=tiledbsoma.AxisQuery(
value_filter="cell_type in ['CD4 T cells', 'CD8 T cells']"
)
)
# Get observation metadata
obs_data = t_cell_query.obs(
column_names=["cell_type", "donor_id", "n_genes", "total_counts"]
).concat()
print(f"T cells found: {len(obs_data)}")
# Get variable information
var_data = t_cell_query.var(
column_names=["feature_name", "feature_type"]
).concat()
print(f"Genes: {len(var_data)}")
# Get expression matrix
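for batch in t_cell_query.X("data").tables():  # X() returns a read handle; .tables() yields Arrow tables
coordinates = batch.select(["soma_dim_0", "soma_dim_1"]).to_pandas() # cell_id, gene_id
values = batch["soma_data"].to_pandas() # expression values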
print(f"Expression batch: {len(values)} non-zero values")
# Convert to AnnData for downstream analysis
t_cell_adata = t_cell_query.to_anndata(
X_layer_name="data",
obsm_layers=["X_pca", "X_umap"],
column_names={
"obs": ["cell_type", "donor_id"],
"var": ["feature_name", "highly_variable"]
}
)
print(f"AnnData shape: {t_cell_adata.shape}")
# Query with coordinate-based selection
with tiledbsoma.open("experiment.soma") as exp:
# Select the first 1000 cells and first 2000 genes by soma_joinid (slice bounds are inclusive)
subset_query = exp.axis_query(
"RNA",
obs_query=tiledbsoma.AxisQuery(coords=(slice(0, 999),)),
var_query=tiledbsoma.AxisQuery(coords=(slice(0, 1999),))
)
# Process in batches
batch_size = 10000
for obs_batch in subset_query.obs(batch_size=batch_size):
obs_df = obs_batch.to_pandas()
print(f"Processing {len(obs_df)} observations")
# Process batch...
# Complex filtering query
with tiledbsoma.open("experiment.soma") as exp:
# Query high-quality cells with specific markers
quality_query = exp.axis_query(
"RNA",
obs_query=tiledbsoma.AxisQuery(
value_filter="""
n_genes >= 500 and n_genes <= 5000 and
total_counts >= 1000 and
pct_counts_mitochondrial <= 20 and
tissue == 'brain'
"""
),
var_query=tiledbsoma.AxisQuery(
value_filter="highly_variable == True and feature_type == 'Gene Expression'"
)
)
# Convert to AnnData with all available layers
brain_adata = quality_query.to_anndata(
X_layer_name="normalized",
obsm_layers=None, # Include all obsm layers
varm_layers=None # Include all varm layers
)

The AxisQuery class provides flexible query specification for coordinates and filtering.
class AxisQuery:
def __init__(self, *, coords=None, value_filter=None):
"""
Create an axis query specification.
Parameters:
- coords: coordinate selection (slice, list, or array)
- value_filter: str, filter expression for attribute values
"""

import tiledbsoma
# Various coordinate selection patterns
axis_queries = [
# Select specific indices (coords is a tuple with one selection per dimension)
tiledbsoma.AxisQuery(coords=([0, 5, 10, 15, 20],)),
# Select range with slice (both bounds are inclusive)
tiledbsoma.AxisQuery(coords=(slice(100, 500),)),
# Select every 10th element (slices do not accept a step, so list the indices explicitly)
tiledbsoma.AxisQuery(coords=(list(range(0, 1000, 10)),)),
# Select all (equivalent to no coordinate filter)
tiledbsoma.AxisQuery(coords=(slice(None),)),
]
# Value filter examples
filter_queries = [
# Numeric comparisons
tiledbsoma.AxisQuery(value_filter="n_genes > 1000"),
# String matching
tiledbsoma.AxisQuery(value_filter="cell_type == 'B cells'"),
# Multiple conditions
tiledbsoma.AxisQuery(value_filter="n_genes > 500 and total_counts < 10000"),
# Set membership
tiledbsoma.AxisQuery(value_filter="donor_id in ['D1', 'D2', 'D3']"),
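# Prefix/pattern matching (e.g. startswith) is not part of the value_filter syntax; list the values explicitly
tiledbsoma.AxisQuery(value_filter="feature_name in ['MT-CO1', 'MT-CO2', 'MT-ND1']"),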
]

A re-indexer for unique integer indices, compatible with Pandas Index.get_indexer functionality. Useful for mapping between different index spaces efficiently.
class IntIndexer:
def __init__(self, data, *, context=None):
"""
Initialize IntIndexer with integer keys.
Parameters:
- data: array-like of unique integers to index
- context: TileDB context for the operation
"""
def get_indexer(self, target):
"""
Compute underlying indices for target data.
Parameters:
- target: array-like of integers to find indices for
Returns:
numpy array of indices, with -1 for missing values
"""

import tiledbsoma
import numpy as np
# Create indexer for soma_joinid values
original_ids = np.array([0, 5, 10, 15, 20, 25, 30])
indexer = tiledbsoma.IntIndexer(original_ids)
# Find positions of specific IDs
target_ids = np.array([5, 15, 99, 20]) # 99 doesn't exist
positions = indexer.get_indexer(target_ids)
print(positions) # [1, 3, -1, 4] (99 maps to -1)
# Use with SOMA data
with tiledbsoma.open("experiment.soma") as exp:
# Get all observation IDs
obs_ids = exp.obs.read(column_names=["soma_joinid"]).concat()["soma_joinid"].to_numpy()
# Create indexer
obs_indexer = tiledbsoma.IntIndexer(obs_ids)
# Map external IDs to SOMA positions
external_ids = np.array([100, 200, 300, 400])
soma_positions = obs_indexer.get_indexer(external_ids)
# Positions index into obs_ids; map the hits back to soma_joinid values for coordinate-based queries
valid_positions = soma_positions[soma_positions >= 0]
if len(valid_positions) > 0:
query = exp.axis_query(
"RNA",
obs_query=tiledbsoma.AxisQuery(coords=(obs_ids[valid_positions],))
)

Utility function for building indices on integer arrays.
def tiledbsoma_build_index(data, *, context=None):
"""
Build index for integer array.
Parameters:
- data: array-like of integers to index
- context: TileDB context for the operation
Returns:
Built index structure for efficient lookups
"""

import tiledbsoma
import numpy as np
# Build index for large ID array
large_id_array = np.random.randint(0, 1000000, size=100000)
index = tiledbsoma.tiledbsoma_build_index(large_id_array)
# Use index for efficient lookups
# (specific usage depends on implementation details)

import tiledbsoma
# Efficient batch processing for large queries
with tiledbsoma.open("large_experiment.soma") as exp:
query = exp.axis_query("RNA")
# Process observations in batches
batch_size = 1000
total_processed = 0
for obs_batch in query.obs(batch_size=batch_size):
obs_df = obs_batch.to_pandas()
total_processed += len(obs_df)
# Process batch
print(f"Processed {total_processed} observations")
# Your analysis code here...

import tiledbsoma
# Use partitions for parallel processing
with tiledbsoma.open("experiment.soma") as exp:
query = exp.axis_query("RNA")
# Create partitions for parallel execution
partitions = tiledbsoma.Partitions(n_partitions=4)
# Process partitions in parallel (conceptual - actual implementation may vary)
for partition_id in range(partitions.n_partitions):
obs_data = query.obs(partitions=partitions.get_partition(partition_id))
# Process partition...

import tiledbsoma
# Memory-efficient processing of large datasets
with tiledbsoma.open("experiment.soma") as exp:
# Query only needed columns
query = exp.axis_query(
"RNA",
obs_query=tiledbsoma.AxisQuery(
value_filter="quality_score > 0.8"
)
)
# Stream data without loading everything into memory
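for expr_batch in query.X("data", batch_size=5000).tables():  # .tables() yields Arrow tables
# Process expression batch (columns: soma_dim_0, soma_dim_1, soma_data)
coords = expr_batch.select(["soma_dim_0", "soma_dim_1"])
values = expr_batch["soma_data"]
# Compute statistics, etc. without storing full dataset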
print(f"Batch non-zero values: {len(values)}")

import tiledbsoma
import scanpy as sc
# Integrated analysis workflow
with tiledbsoma.open("experiment.soma") as exp:
# Query high-quality cells
hq_query = exp.axis_query(
"RNA",
obs_query=tiledbsoma.AxisQuery(
value_filter="n_genes > 200 and pct_counts_mitochondrial < 20"
),
var_query=tiledbsoma.AxisQuery(
value_filter="n_cells > 3" # Genes expressed in at least 3 cells
)
)
# Convert to AnnData for Scanpy analysis
adata = hq_query.to_anndata(X_layer_name="raw")
# Standard single-cell analysis
sc.pp.normalize_total(adata, target_sum=1e4)
sc.pp.log1p(adata)
sc.pp.highly_variable_genes(adata, min_mean=0.0125, max_mean=3, min_disp=0.5)
# Continue with downstream analysis...
print(f"Processed {adata.n_obs} cells and {adata.n_vars} genes")

This query and indexing functionality provides the foundation for efficient, scalable analysis of single-cell datasets stored in SOMA format.
Install with Tessl CLI
npx tessl i tessl/pypi-tiledbsoma