Python API for efficient storage and retrieval of single-cell data using TileDB
TileDB-SOMA provides specialized data structures designed for single-cell analysis workflows. These include Experiments, which hold annotated measurement matrices, and Measurements, which group observations with a common set of variables. Both follow established patterns from single-cell analysis tools such as AnnData and Seurat.
A specialized Collection that represents an annotated 2-D matrix of measurements, typically for multimodal single-cell datasets. Experiments organize single-cell data into observations (cells), measurements (assays), and optional spatial information.
class Experiment(Collection):
    """API outline of the Experiment collection (attributes and query entry point)."""

    obs: DataFrame  # Primary observations annotations (cell metadata)
    ms: Collection  # Named measurements collection (assays)
    spatial: Collection  # Spatial scenes collection (experimental)

    def axis_query(self, measurement_name, *, obs_query=None, var_query=None):
        """
        Create an axis query for efficient data retrieval.

        Parameters:
        - measurement_name: str, name of measurement to query
        - obs_query: AxisQuery, query specification for observations
        - var_query: AxisQuery, query specification for variables

        Returns:
        ExperimentAxisQuery instance for data retrieval
        """


# The Experiment structure follows the pattern:
#
# - obs: DataFrame containing cell/observation metadata (cell types, treatments, etc.)
# - ms: Collection of Measurement objects, each representing a different assay or data modality
# - spatial: Collection of Scene objects for spatial single-cell data (experimental feature)

import tiledbsoma
import tiledbsoma.io as soma_io
import pyarrow as pa
# Example: ingest an AnnData object into a SOMA experiment, then read it back.

# Create experiment from AnnData
experiment_uri = "single_cell_experiment.soma"
soma_io.from_anndata(
    experiment_uri,  # destination URI is the first positional argument
    adata,  # AnnData object
    measurement_name="RNA",
)

# Open and explore experiment
with tiledbsoma.open(experiment_uri) as exp:
    # Access cell metadata
    print("Cell types:")
    print(exp.obs.read(column_names=["cell_type"]).concat().to_pandas())

    # Access RNA measurement
    rna_measurement = exp.ms["RNA"]

    # Build a query on the RNA measurement; with no obs_query/var_query
    # every cell and gene is selected.
    query = exp.axis_query("RNA")

    # Get data as AnnData. The X layer is chosen with X_name, and the obs
    # columns to load are listed under the "obs" key of column_names.
    adata_subset = query.to_anndata(
        X_name="data",
        column_names={"obs": ["cell_type", "tissue"]},
    )

# A specialized Collection that represents a set of observations with
# measurements on a common set of annotated variables (features). Each
# Measurement corresponds to a single assay or data modality within an
# Experiment.
class Measurement(Collection):
    """API outline of the Measurement collection (one assay / data modality)."""

    var: DataFrame  # Variable annotations (gene/feature metadata)
    X: Collection[SparseNDArray]  # Feature values matrices (count data, normalized data, etc.)
    obsm: Collection[DenseNDArray]  # Dense observation annotations (embeddings, etc.)
    obsp: Collection[SparseNDArray]  # Sparse pairwise observation annotations (distances, graphs)
    varm: Collection[DenseNDArray]  # Dense variable annotations (gene loadings, etc.)
    varp: Collection[SparseNDArray]  # Sparse pairwise variable annotations (gene networks)


# The Measurement structure mirrors AnnData organization:
#
# - var: Gene/feature annotations (gene symbols, biotypes, etc.)
# - X: Collection of feature-by-observation matrices (raw counts, normalized, scaled)
# - obsm: Dense matrices associated with observations (PCA, UMAP embeddings)
# - obsp: Sparse matrices between observations (nearest neighbor graphs, distances)
# - varm: Dense matrices associated with variables (principal components, gene loadings)
# - varp: Sparse matrices between variables (gene regulatory networks, correlations)

import tiledbsoma
import numpy as np
import pyarrow as pa
# Example: inspect the RNA measurement of an existing experiment.

# Open an experiment and access RNA measurement
with tiledbsoma.open("experiment.soma") as exp:
    rna = exp.ms["RNA"]

    # Access gene annotations
    print("Gene information:")
    gene_info = rna.var.read(column_names=["feature_name", "feature_type"]).concat()
    print(gene_info.to_pandas().head())

    # Access raw count matrix
    raw_counts = rna.X["data"]
    print(f"Expression matrix shape: {raw_counts.shape}")
    print(f"Non-zero values: {raw_counts.nnz}")

    # Read expression data for specific genes.
    # SOMA slices include both endpoints, so 0..99 selects exactly 100 items.
    gene_slice = slice(0, 99)  # First 100 genes
    cell_slice = slice(0, 999)  # First 1000 cells

    # A sparse read is consumed through .tables(); each batch is an Arrow
    # table with the columns soma_dim_0, soma_dim_1 and soma_data.
    for batch in raw_counts.read(coords=(cell_slice, gene_slice)).tables():
        coords = batch.select(["soma_dim_0", "soma_dim_1"]).to_pandas()
        values = batch["soma_data"].to_pandas()
        print(f"Expression batch: {len(values)} non-zero values")

    # Access cell embeddings (if available). Check that the obsm collection
    # itself exists before looking for a specific embedding inside it.
    if "obsm" in rna and "X_pca" in rna.obsm:
        pca_embeddings = rna.obsm["X_pca"]
        pca_data = pca_embeddings.read().to_numpy()
        print(f"PCA embeddings shape: {pca_data.shape}")

import tiledbsoma
import pyarrow as pa
# Example: build an empty experiment layout (obs, ms/RNA with var, X, obsm).

# Create experiment structure
with tiledbsoma.Experiment.create("my_experiment.soma") as exp:
    # Create observations DataFrame (cell metadata)
    obs_schema = pa.schema([
        ("soma_joinid", pa.int64()),
        ("cell_type", pa.string()),
        ("sample_id", pa.string()),
        ("n_genes", pa.int32()),
        ("total_counts", pa.int32()),
    ])
    exp.add_new_dataframe("obs", schema=obs_schema)

    # Create measurements collection
    measurements = exp.add_new_collection("ms")

    # Create RNA measurement. The kind must be given explicitly: without it a
    # plain Collection is created, which has no var/X/obsm accessors.
    with measurements.add_new_collection("RNA", tiledbsoma.Measurement) as rna:
        # Variable annotations (gene metadata)
        var_schema = pa.schema([
            ("soma_joinid", pa.int64()),
            ("feature_name", pa.string()),
            ("feature_type", pa.string()),
            ("chromosome", pa.string()),
        ])
        rna.add_new_dataframe("var", schema=var_schema)

        # Expression matrices collection
        x_layers = rna.add_new_collection("X")
        x_layers.add_new_sparse_ndarray(
            "data",
            type=pa.int32(),
            shape=(10000, 2000),  # 10k cells, 2k genes
        )

        # Dense observation matrices (embeddings, etc.)
        obsm = rna.add_new_collection("obsm")
        obsm.add_new_dense_ndarray(
            "X_pca",
            type=pa.float64(),
            shape=(10000, 50),  # PCA coordinates
        )

import tiledbsoma
import numpy as np
import pyarrow as pa
# Example: populate obs, var and the "data" X layer of an existing experiment.

# Open experiment and add data
with tiledbsoma.open("my_experiment.soma", mode="w") as exp:
    n_cells = 1000
    n_genes = 2000

    # Add cell metadata. The integer columns are generated as int32 so they
    # match the int32 fields the obs schema was created with.
    cell_data = pa.table({
        "soma_joinid": np.arange(n_cells, dtype=np.int64),
        "cell_type": ["T-cell"] * 300 + ["B-cell"] * 200 + ["NK-cell"] * 500,
        "sample_id": ["Sample1"] * 500 + ["Sample2"] * 500,
        "n_genes": np.random.randint(500, 2000, n_cells, dtype=np.int32),
        "total_counts": np.random.randint(1000, 10000, n_cells, dtype=np.int32),
    })
    exp.obs.write(cell_data)

    # Add gene metadata
    gene_data = pa.table({
        "soma_joinid": np.arange(n_genes, dtype=np.int64),
        "feature_name": [f"Gene_{i}" for i in range(n_genes)],
        "feature_type": ["Gene"] * n_genes,
        "chromosome": [f"chr{i % 22 + 1}" for i in range(n_genes)],
    })
    exp.ms["RNA"].var.write(gene_data)

    # Add sparse expression data.
    # Draw distinct flattened positions so that no (cell, gene) coordinate is
    # written twice, then split them back into row and column indices.
    n_nonzero = 50000
    flat_positions = np.random.choice(n_cells * n_genes, size=n_nonzero, replace=False)
    cell_ids, gene_ids = np.divmod(flat_positions, n_genes)
    # Poisson-distributed counts, stored as int32 to match the X layer type
    counts = np.random.poisson(3, n_nonzero).astype(np.int32)

    # A sparse write takes ONE COO table: one soma_dim_N column per dimension
    # plus the soma_data column holding the values.
    expression = pa.table({
        "soma_dim_0": cell_ids,
        "soma_dim_1": gene_ids,
        "soma_data": counts,
    })
    exp.ms["RNA"].X["data"].write(expression)

import tiledbsoma
# Example: explore an existing multi-modal experiment (RNA + ATAC).

# Open read-only (the default mode): everything below only reads data.
with tiledbsoma.open("multimodal_experiment.soma") as exp:
    # Both measurements share the same observations (cells)
    # But have different variables (genes vs peaks)

    # RNA measurement
    rna = exp.ms["RNA"]
    print(f"RNA genes: {rna.var.count}")  # count is a property, not a method
    print(f"RNA expression shape: {rna.X['data'].shape}")

    # ATAC measurement
    atac = exp.ms["ATAC"]
    print(f"ATAC peaks: {atac.var.count}")
    print(f"ATAC accessibility shape: {atac.X['data'].shape}")

    # Query both modalities for the same cells. coords holds one entry per
    # indexed dimension, so the list of cell ids is wrapped in a 1-tuple.
    first_cells = tiledbsoma.AxisQuery(coords=([0, 1, 2],))
    rna_query = exp.axis_query("RNA", obs_query=first_cells)
    atac_query = exp.axis_query("ATAC", obs_query=first_cells)

    # Get data for first 3 cells (the X layer name is a required argument)
    rna_data = rna_query.to_anndata(X_name="data")
    atac_data = atac_query.to_anndata(X_name="data")

# The most common workflow involves converting between SOMA and AnnData formats:
# Example: round trip between AnnData and SOMA.
import scanpy as sc
import tiledbsoma  # needed for tiledbsoma.open(); the aliased import below does not bind this name
import tiledbsoma.io as soma_io

# Load AnnData and convert to SOMA (destination URI first, then the AnnData)
adata = sc.datasets.pbmc3k()
soma_io.from_anndata("pbmc3k.soma", adata, measurement_name="RNA")

# Work with SOMA format
with tiledbsoma.open("pbmc3k.soma") as exp:
    # Perform queries, access subsets
    query = exp.axis_query("RNA")

    # Convert back to AnnData for analysis. from_anndata stores X under the
    # layer name "data" unless told otherwise.
    adata_subset = query.to_anndata(X_name="data")

    # Save as H5AD while the experiment is still open
    soma_io.to_h5ad(exp, "output.h5ad", measurement_name="RNA")

# This integration allows seamless use of SOMA's scalable storage with
# existing single-cell analysis workflows in Python.
Install with Tessl CLI
npx tessl i tessl/pypi-tiledbsoma