Python API for efficient storage and retrieval of single-cell data using TileDB.

Install (spec tooling): npx @tessl/cli install tessl/pypi-tiledbsoma@1.17.0

A Python implementation of the SOMA (Stack of Matrices, Annotated) API using TileDB Embedded for efficient storage and retrieval of single-cell data. TileDB-SOMA provides scalable data structures for storing and querying larger-than-memory datasets in both cloud and local systems, with specialized support for single-cell biology workflows.

Install: pip install tiledbsoma

Import: import tiledbsoma

Common patterns for data structures:
from tiledbsoma import (
Collection, DataFrame, SparseNDArray, DenseNDArray,
Experiment, Measurement, open
)

For I/O operations:

import tiledbsoma.io as soma_io

import tiledbsoma
import numpy as np
import pyarrow as pa
# Example: create, write, and read back a DataFrame of single-cell observations.
# Every SOMA DataFrame schema must include the int64 "soma_joinid" column.
schema = pa.schema([
    ("soma_joinid", pa.int64()),
    ("cell_type", pa.string()),
    ("tissue", pa.string()),
    ("donor_id", pa.string()),
])

# Create the on-disk DataFrame and write an Arrow table into it.
with tiledbsoma.DataFrame.create("obs.soma", schema=schema) as obs_df:
    data = pa.table({
        "soma_joinid": [0, 1, 2, 3],
        "cell_type": ["T-cell", "B-cell", "Neuron", "Astrocyte"],
        "tissue": ["blood", "blood", "brain", "brain"],
        "donor_id": ["D1", "D1", "D2", "D2"],
    })
    obs_df.write(data)

# Read data back: read() returns an iterator of Arrow batches; concat()
# materializes them into a single in-memory table.
with tiledbsoma.open("obs.soma") as obs_df:
    data = obs_df.read().concat()
    print(data.to_pandas())
# Example: create a SparseNDArray holding a cells-by-genes expression matrix.
with tiledbsoma.SparseNDArray.create(
    "X.soma",
    type=pa.float32(),
    shape=(1000, 2000),  # 1000 cells, 2000 genes
) as X_array:
    # Sparse COO data: (cell index, gene index, expression value).
    coordinates = pa.table({
        "soma_dim_0": [0, 0, 1, 1, 2],  # cell indices
        "soma_dim_1": [5, 100, 5, 200, 300],  # gene indices
    })
    values = pa.table({
        "soma_data": [1.5, 2.3, 0.8, 3.1, 1.2],  # expression values
    })
    X_array.write((coordinates, values))

TileDB-SOMA follows a hierarchical object model designed for single-cell data analysis; every DataFrame carries a required soma_joinid column. The library uses Apache Arrow for in-memory data representation and TileDB for persistent storage, enabling efficient operations on larger-than-memory datasets with support for cloud storage backends.
Fundamental SOMA data types including Collections for hierarchical organization, DataFrames for tabular data, and sparse/dense N-dimensional arrays for numerical data storage.
class Collection:
    """SOMA Collection: a string-keyed container of other SOMA objects."""

    @classmethod
    def create(cls, uri, *, platform_config=None, context=None, tiledb_timestamp=None): ...

    # Create a new child Collection under this one, registered at `key`.
    def add_new_collection(self, key, **kwargs): ...

    # Create a new child DataFrame under this one, registered at `key`.
    def add_new_dataframe(self, key, **kwargs): ...
class DataFrame:
    """SOMA DataFrame: multi-column tabular data keyed by soma_joinid."""

    @classmethod
    def create(cls, uri, *, schema, domain=None, platform_config=None, context=None, tiledb_timestamp=None): ...

    # Read rows, optionally restricted by coordinates / value filter / columns.
    def read(self, coords=(), value_filter=None, column_names=None, result_order=None, batch_size=None, partitions=None, platform_config=None): ...

    # Write an Arrow table of values into the DataFrame.
    def write(self, values, platform_config=None): ...
class SparseNDArray:
    """SOMA sparse N-dimensional array of a single primitive type."""

    @classmethod
    def create(cls, uri, *, type, shape, platform_config=None, context=None, tiledb_timestamp=None): ...

    # Read cells, optionally restricted to the given coordinates.
    def read(self, coords=(), result_order=None, batch_size=None, partitions=None, platform_config=None): ...

    # Write sparse cell data into the array.
    def write(self, values, platform_config=None): ...
class DenseNDArray:
    """SOMA dense N-dimensional array of a single primitive type."""

    @classmethod
    def create(cls, uri, *, type, shape, platform_config=None, context=None, tiledb_timestamp=None): ...

    # Read a dense sub-region, optionally restricted by coordinates.
    def read(self, coords=(), result_order=None, batch_size=None, partitions=None, platform_config=None): ...
    def write(self, coords, values, platform_config=None): ...

Specialized data structures for single-cell analysis including Experiments for annotated measurement matrices and Measurements for grouping observations with variables.
class Experiment(Collection):
    """Top-level SOMA container representing one single-cell experiment."""

    obs: DataFrame  # Primary observations annotations
    ms: Collection  # Named measurements collection
    spatial: Collection  # Spatial scenes collection

    # Build an ExperimentAxisQuery over one measurement, filtered on obs/var axes.
    def axis_query(self, measurement_name, *, obs_query=None, var_query=None): ...
class Measurement(Collection):
    """Groups the variables and matrices of one measurement modality."""

    var: DataFrame  # Variable annotations
    X: Collection[SparseNDArray]  # Feature values matrices
    obsm: Collection[DenseNDArray]  # Dense observation annotations
    obsp: Collection[SparseNDArray]  # Sparse pairwise observation annotations

Experimental spatial data structures for storing and analyzing spatial single-cell data, including geometry dataframes, point clouds, multiscale images, and spatial scenes.
class GeometryDataFrame(DataFrame):
    """Experimental: DataFrame whose rows carry geometry in a coordinate space."""

    @classmethod
    def create(cls, uri, *, schema, coordinate_space=("x", "y"), domain=None, platform_config=None, context=None, tiledb_timestamp=None): ...
class PointCloudDataFrame(DataFrame):
    """Experimental: DataFrame of points located in a coordinate space."""

    @classmethod
    def create(cls, uri, *, schema, coordinate_space=("x", "y"), domain=None, platform_config=None, context=None, tiledb_timestamp=None): ...
class Scene(Collection):
    """Experimental: spatial scene grouping images with located obs/var data."""

    img: Collection  # Image collection
    obsl: Collection  # Observation location collection
    varl: Collection  # Variable location collection

Comprehensive ingestion and outgestion functions for converting between SOMA format and popular single-cell data formats like AnnData and H5AD files.
# Ingest an in-memory AnnData object into a SOMA Experiment at `uri`; returns per the ingest mode.
def from_anndata(anndata, uri, *, measurement_name="RNA", obs_id_name="obs_id", var_id_name="var_id", X_layer_name=None, obsm_layers=None, varm_layers=None, obsp_layers=None, varp_layers=None, uns_keys=None, ingest_mode="write", registration_mapping=None, context=None, platform_config=None, additional_metadata=None): ...
# Outgest a SOMA Experiment (optionally filtered/sliced on obs and var axes) back to an AnnData object.
def to_anndata(experiment, *, measurement_name="RNA", X_layer_name=None, obsm_layers=None, varm_layers=None, obsp_layers=None, varp_layers=None, obs_coords=None, var_coords=None, obs_value_filter=None, var_value_filter=None, obs_column_names=None, var_column_names=None, batch_size=None, context=None): ...
def from_h5ad(h5ad_file_path, output_path, *, measurement_name="RNA", ...): ...

ID mapping utilities for multi-file append-mode ingestion, supporting soma_joinid remapping and string-to-integer label mapping across multiple input files.
class AxisAmbientLabelMapping:
    """Tracks the mapping of input ID-column values to SOMA join IDs for one axis."""

    def __init__(self, *, field_name: str, joinid_map: pd.DataFrame, enum_values: dict):
        """
        Tracks mapping of input data ID-column names to SOMA join IDs.

        Parameters:
        - field_name: str, name of the ID column
        - joinid_map: pd.DataFrame, mapping from ID to soma_joinid
        - enum_values: dict, categorical type mappings
        """
class ExperimentAmbientLabelMapping:
    """Bundles the per-axis ID mappings for a whole experiment."""

    obs: AxisAmbientLabelMapping  # Observation ID mappings
    var: dict[str, AxisAmbientLabelMapping]  # Variable ID mappings per measurement
class AxisIDMapping:
    """Offset-to-joinid mapping for a single input file on one axis."""

    def __init__(self, id_map: dict[int, int]):
        """
        Offset-to-joinid mappings for individual input files.

        Parameters:
        - id_map: dict, mapping from input offsets to SOMA join IDs
        """
class ExperimentIDMapping:
    """Bundles per-axis offset-to-joinid mappings for one input file."""

    obs: AxisIDMapping  # Observation ID mapping
    var: dict[str, AxisIDMapping]  # Variable ID mappings per measurement
def get_dataframe_values(df: DataFrame, *, ids: npt.NDArray[np.int64], col_name: str):
    """Get values from DataFrame for specified IDs and column"""

Query builders and indexing utilities for efficient data retrieval from SOMA objects, including experiment axis queries and integer indexing.
class ExperimentAxisQuery:
    """Lazy query over one measurement of an Experiment, filtered on obs/var axes."""

    # Read the selected observation annotations.
    def obs(self, *, column_names=None, batch_size=None, partitions=None, platform_config=None): ...

    # Read the selected variable annotations.
    def var(self, *, column_names=None, batch_size=None, partitions=None, platform_config=None): ...

    # Read the named X layer restricted to the query's obs/var selection.
    def X(self, layer_name, *, batch_size=None, partitions=None, platform_config=None): ...

    # Materialize the query result as an AnnData object.
    def to_anndata(self, *, X_layer_name=None, column_names=None, obsm_layers=None, varm_layers=None, obsp_layers=None, varp_layers=None): ...
class IntIndexer:
    """Integer indexer mapping arbitrary int64 keys to positional indices."""

    def __init__(self, data, *, context=None): ...
    def get_indexer(self, target): ...

Advanced query condition system for attribute filtering with support for complex Boolean expressions and membership operations.
class QueryCondition:
    """Attribute-filter condition compiled from a TileDB query expression."""

    def __init__(self, expression: str):
        """
        Create a query condition for filtering SOMA objects.

        Parameters:
        - expression: str, Boolean expression using TileDB query syntax

        Supports:
        - Comparison operators: <, >, <=, >=, ==, !=
        - Boolean operators: and, or, &, |
        - Membership operator: in
        - Attribute casting: attr("column_name")
        - Value casting: val(value)
        """
    def init_query_condition(self, schema, query_attrs):
        """Initialize the query condition with schema and attributes"""

Configuration classes for TileDB context management and platform-specific options for creating and writing SOMA objects.
class SOMATileDBContext:
    """Shared TileDB context/configuration passed to SOMA open/create calls."""

    def __init__(self, config=None): ...
class TileDBCreateOptions:
    """Platform-specific options applied when creating SOMA objects."""

    def __init__(self, **kwargs): ...
class TileDBWriteOptions:
    def __init__(self, **kwargs): ...

class CoordinateSpace:
    """Defines coordinate space for spatial data"""
class AffineTransform:
    """Affine coordinate transformation"""

class IdentityTransform:
    """Identity coordinate transformation"""

class ScaleTransform:
    """Scale coordinate transformation"""
class UniformScaleTransform:
    """Uniform scale coordinate transformation"""

SOMA_JOINID: str = "soma_joinid"  # Required DataFrame column name

class SOMAError(Exception):
    """Base exception class for all SOMA-specific errors"""
class DoesNotExistError(SOMAError):
    """Raised when requested SOMA object does not exist"""

class AlreadyExistsError(SOMAError):
    """Raised when attempting to create object that already exists"""
class NotCreateableError(SOMAError):
    """Raised when object cannot be created"""

def open(uri, mode="r", *, soma_type=None, context=None, tiledb_timestamp=None):
    """Opens any SOMA object at URI"""
def get_implementation() -> str:
    """Returns implementation name ('python-tiledb')"""

def get_implementation_version() -> str:
    """Returns package version"""
def show_package_versions() -> None:
"""Prints version information for all dependencies"""def tiledbsoma_stats_json() -> str:
"""Return TileDB-SOMA statistics as JSON string"""
def tiledbsoma_stats_as_py() -> list:
    """Return TileDB-SOMA statistics as Python objects"""

def tiledbsoma_stats_enable() -> None:
    """Enable TileDB statistics collection"""

def tiledbsoma_stats_disable() -> None:
    """Disable TileDB statistics collection"""

def tiledbsoma_stats_reset() -> None:
    """Reset TileDB statistics"""
def tiledbsoma_stats_dump() -> None:
"""Dump TileDB statistics to stdout"""import tiledbsoma.logging
def warning() -> None:
"""Set logging level to WARNING"""
def info() -> None:
"""Set logging level to INFO with progress indicators"""
def debug() -> None:
"""Set logging level to DEBUG with detailed progress"""
def log_io_same(message: str) -> None:
"""Log message to both INFO and DEBUG levels"""
def log_io(info_message: str | None, debug_message: str) -> None:
"""Log different messages at INFO and DEBUG levels"""