Python API for efficient storage and retrieval of single-cell data using TileDB
Experimental spatial data structures for storing and analyzing spatial single-cell data. These include geometry dataframes for complex shapes, point clouds for coordinate data, multiscale images for microscopy data, and spatial scenes for organizing spatial assets with shared coordinate systems.
Note: All spatial data types are marked as "Lifecycle: experimental" and may undergo significant changes.
A specialized DataFrame for storing complex geometries such as polygons, lines, and multipoints with spatial indexing capabilities. Designed for representing cell boundaries, tissue regions, and other complex spatial features.
class GeometryDataFrame(DataFrame):
    """DataFrame specialization for complex geometries such as polygons, lines, and multipoints."""

    @classmethod
    def create(
        cls,
        uri,
        *,
        schema,
        coordinate_space=("x", "y"),
        domain=None,
        platform_config=None,
        context=None,
        tiledb_timestamp=None,
    ):
        """
        Create a new GeometryDataFrame.

        Parameters:
        - uri: str, URI for the geometry dataframe
        - schema: pyarrow.Schema, column schema including soma_joinid and geometry columns
        - coordinate_space: tuple of str, names of coordinate dimensions (default: ("x", "y"))
        - domain: list of tuples, domain bounds for each dimension
        - platform_config: TileDB-specific configuration options
        - context: TileDB context for the operation
        - tiledb_timestamp: Timestamp for temporal queries

        Returns:
        GeometryDataFrame instance

        Note:
        The schema must include a geometry column containing spatial data in a
        format compatible with spatial operations.
        """
import tiledbsoma
import pyarrow as pa
import numpy as np

# Define schema for cell boundaries
geometry_schema = pa.schema([
    ("soma_joinid", pa.int64()),
    ("cell_id", pa.string()),
    ("soma_geometry", pa.binary()),  # Geometry data (e.g., WKB format)
    ("area", pa.float64()),
    ("perimeter", pa.float64()),
    ("tissue_region", pa.string()),
])

# Create geometry dataframe for cell boundaries
with tiledbsoma.GeometryDataFrame.create(
    "cell_boundaries.soma",
    schema=geometry_schema,
    coordinate_space=("x", "y"),
) as geom_df:
    # Example polygon data (simplified).
    # NOTE(review): the byte strings below are placeholders, not valid WKB;
    # substitute real encoded geometries before running this against a store.
    geometry_data = pa.table({
        "soma_joinid": [0, 1, 2],
        "cell_id": ["cell_001", "cell_002", "cell_003"],
        "soma_geometry": [b"polygon_wkb_data_1", b"polygon_wkb_data_2", b"polygon_wkb_data_3"],
        "area": [25.5, 32.1, 28.7],
        "perimeter": [18.2, 20.8, 19.5],
        "tissue_region": ["cortex", "cortex", "hippocampus"],
    })
    geom_df.write(geometry_data)

# Query geometries by region
with tiledbsoma.open("cell_boundaries.soma") as geom_df:
    # Restrict both the rows (value_filter) and the columns returned.
    cortex_cells = geom_df.read(
        value_filter="tissue_region == 'cortex'",
        column_names=["soma_joinid", "cell_id", "area"],
    ).concat()
    print(cortex_cells.to_pandas())

A specialized DataFrame for storing point collections in multi-dimensional space with spatial indexing. Ideal for storing subcellular locations, molecular coordinates, and other point-based spatial data.
class PointCloudDataFrame(DataFrame):
    """DataFrame specialization for collections of points in multi-dimensional space."""

    @classmethod
    def create(
        cls,
        uri,
        *,
        schema,
        coordinate_space=("x", "y"),
        domain=None,
        platform_config=None,
        context=None,
        tiledb_timestamp=None,
    ):
        """
        Create a new PointCloudDataFrame.

        Parameters:
        - uri: str, URI for the point cloud dataframe
        - schema: pyarrow.Schema, column schema including soma_joinid and coordinate columns
        - coordinate_space: tuple of str, names of coordinate dimensions (default: ("x", "y"))
        - domain: list of tuples, domain bounds for each dimension
        - platform_config: TileDB-specific configuration options
        - context: TileDB context for the operation
        - tiledb_timestamp: Timestamp for temporal queries

        Returns:
        PointCloudDataFrame instance

        Note:
        The schema should include coordinate columns matching the
        coordinate_space specification.
        """
import tiledbsoma
import pyarrow as pa
import numpy as np

# Define schema for molecule coordinates
point_schema = pa.schema([
    ("soma_joinid", pa.int64()),
    ("x", pa.float64()),  # X coordinate
    ("y", pa.float64()),  # Y coordinate
    ("z", pa.float64()),  # Z coordinate (optional)
    ("gene", pa.string()),
    ("cell_id", pa.string()),
    ("intensity", pa.float32()),
])

# Create point cloud for single-molecule FISH data
with tiledbsoma.PointCloudDataFrame.create(
    "molecule_locations.soma",
    schema=point_schema,
    coordinate_space=("x", "y", "z"),
) as point_df:
    # Generate synthetic molecule locations
    n_molecules = 10000
    np.random.seed(42)
    molecule_data = pa.table({
        # Explicit int64 so the column type matches the declared schema.
        "soma_joinid": np.arange(n_molecules, dtype=np.int64),
        "x": np.random.uniform(0, 1000, n_molecules),
        "y": np.random.uniform(0, 1000, n_molecules),
        "z": np.random.uniform(0, 10, n_molecules),
        "gene": np.random.choice(["GAPDH", "ACTB", "CD3D", "CD79A"], n_molecules),
        "cell_id": [f"cell_{i//50}" for i in range(n_molecules)],
        # NumPy produces float64 here; the schema declares float32, so cast
        # before building the table to avoid a type mismatch on write.
        "intensity": np.random.exponential(100, n_molecules).astype(np.float32),
    })
    point_df.write(molecule_data)

# Query molecules by gene and spatial region
with tiledbsoma.open("molecule_locations.soma") as point_df:
    # Find GAPDH molecules in specific region
    gapdh_molecules = point_df.read(
        value_filter="gene == 'GAPDH' and x >= 100 and x <= 200 and y >= 100 and y <= 200",
        column_names=["x", "y", "z", "intensity"],
    ).concat()
    print(f"GAPDH molecules in region: {len(gapdh_molecules)}")

A Collection of images at multiple resolution levels with consistent channels and axis order. Designed for storing and accessing microscopy data at different scales, enabling efficient visualization and analysis of large images.
class MultiscaleImage(Collection):
    """Collection of images at multiple resolution levels sharing channels and axis order."""

    @classmethod
    def create(
        cls,
        uri,
        *,
        type,
        reference_level_shape,
        axis_names=("c", "y", "x"),
        coordinate_space=None,
        platform_config=None,
        context=None,
        tiledb_timestamp=None,
    ):
        """
        Create a new MultiscaleImage.

        Parameters:
        - uri: str, URI for the multiscale image
        - type: pyarrow data type for image pixels
        - reference_level_shape: tuple of int, shape of the highest resolution level
        - axis_names: tuple of str, names for image axes (default: ("c", "y", "x"))
        - coordinate_space: coordinate space specification (optional)
        - platform_config: TileDB-specific configuration options
        - context: TileDB context for the operation
        - tiledb_timestamp: Timestamp for temporal queries

        Returns:
        MultiscaleImage instance
        """

    def levels(self):
        """
        Get available resolution levels.

        Returns:
        list of str: Level names (e.g., ["0", "1", "2"])
        """

    def level_shape(self, level):
        """
        Get shape of specific resolution level.

        Parameters:
        - level: str, level name

        Returns:
        tuple of int: Shape of the specified level
        """


import tiledbsoma
import pyarrow as pa
import numpy as np

# Create multiscale image for microscopy data
with tiledbsoma.MultiscaleImage.create(
    "tissue_image.soma",
    type=pa.uint16(),
    reference_level_shape=(3, 2048, 2048),  # 3 channels, 2048x2048 pixels
    axis_names=("c", "y", "x"),
) as ms_image:
    # Add multiple resolution levels. Each level halves the spatial extent of
    # the previous one while keeping the 3-channel axis.
    level_shapes = {
        "0": (3, 2048, 2048),  # Level 0: Full resolution
        "1": (3, 1024, 1024),  # Level 1: Half resolution
        "2": (3, 512, 512),  # Level 2: Quarter resolution
    }
    # NOTE(review): confirm against the installed tiledbsoma release that
    # MultiscaleImage exposes add_new_dense_ndarray; some releases provide
    # add_new_level(...) for this purpose instead.
    for level_name, level_shape in level_shapes.items():
        ms_image.add_new_dense_ndarray(
            level_name,
            type=pa.uint16(),
            shape=level_shape,
        )

# Access different resolution levels
with tiledbsoma.open("tissue_image.soma") as ms_image:
    print(f"Available levels: {list(ms_image.keys())}")

    # Read low-resolution version for overview
    low_res = ms_image["2"].read().to_numpy()
    print(f"Low resolution shape: {low_res.shape}")

    # Read high-resolution region of interest.
    # NOTE(review): SOMA read coordinates are documented as inclusive on both
    # ends, so slice(500, 600) would select 101 indices per axis — confirm
    # this is the intended ROI size.
    roi = ms_image["0"].read(coords=(slice(None), slice(500, 600), slice(500, 600)))
    print(f"High-res ROI shape: {roi.to_numpy().shape}")

A Collection that organizes spatial assets sharing a coordinate space. Scenes group related spatial data including images, observation locations, and variable locations, providing a unified coordinate system for spatial analysis.
class Scene(Collection):
    """Collection that groups spatial assets sharing one coordinate space."""

    img: Collection  # Image collection (MultiscaleImage objects)
    obsl: Collection  # Observation location collection (PointCloudDataFrame, GeometryDataFrame)
    varl: Collection  # Variable location collection (spatial features)

    @classmethod
    def create(
        cls,
        uri,
        *,
        coordinate_space=None,
        platform_config=None,
        context=None,
        tiledb_timestamp=None,
    ):
        """
        Create a new Scene.

        Parameters:
        - uri: str, URI for the scene
        - coordinate_space: coordinate space specification defining spatial reference
        - platform_config: TileDB-specific configuration options
        - context: TileDB context for the operation
        - tiledb_timestamp: Timestamp for temporal queries

        Returns:
        Scene instance
        """


import tiledbsoma
import pyarrow as pa

# Create a spatial scene for tissue analysis
with tiledbsoma.Scene.create("tissue_scene.soma") as scene:
    # Add image collection
    scene.add_new_collection("img")

    # Add observation locations (cell centers)
    scene.add_new_collection("obsl")

    # Add variable locations (gene expression locations)
    scene.add_new_collection("varl")

    # Add H&E staining image.
    # NOTE(review): confirm the call form against the installed tiledbsoma
    # release; some releases expose this as a Scene method taking the
    # sub-collection name ("img") as its first argument.
    he_image = scene.img.add_new_multiscale_image(
        "HE_stain",
        type=pa.uint8(),
        reference_level_shape=(3, 4096, 4096),
        axis_names=("c", "y", "x"),
    )

    # Add cell center locations
    cell_schema = pa.schema([
        ("soma_joinid", pa.int64()),
        ("x", pa.float64()),
        ("y", pa.float64()),
        ("cell_type", pa.string()),
    ])
    cell_locations = scene.obsl.add_new_point_cloud_dataframe(
        "cell_centers",
        schema=cell_schema,
        coordinate_space=("x", "y"),
    )

# Access scene components
with tiledbsoma.open("tissue_scene.soma") as scene:
    # Access H&E image.
    # NOTE(review): SOMA slices are documented as inclusive on both ends, so
    # slice(0, 500) would cover 501 pixels per axis — confirm intended extent.
    he_stain = scene.img["HE_stain"]
    image_data = he_stain["0"].read(coords=(slice(None), slice(0, 500), slice(0, 500)))

    # Access cell locations overlapping with image region
    cell_centers = scene.obsl["cell_centers"]
    cells_in_region = cell_centers.read(
        value_filter="x >= 0 and x <= 500 and y >= 0 and y <= 500",
    ).concat()
    print(f"Cells in image region: {len(cells_in_region)}")

Spatial data types support coordinate system definitions and transformations for aligning data from different sources.
# Coordinate system types (imported from somacore)
class CoordinateSpace:
    """Defines coordinate space for spatial data"""


class AffineTransform:
    """Affine coordinate transformation matrix"""


class IdentityTransform:
    """Identity transformation (no change)"""


class ScaleTransform:
    """Scale transformation with per-axis scaling factors"""


class UniformScaleTransform:
    """Uniform scaling transformation"""


import tiledbsoma
from tiledbsoma import CoordinateSpace, AffineTransform
import pyarrow as pa

# Define coordinate space with transformation.
# NOTE(review): CoordinateSpace is built from named axes, not from
# (name, bounds) pairs; the 0-1000 micron extent of each axis is a property of
# the data domain rather than of the coordinate space — confirm against the
# somacore documentation for the installed release.
coord_space = CoordinateSpace.from_axis_names(("x", "y"))  # X and Y axes, in microns

# Schema for this example; a geometry dataframe needs a geometry column.
cell_schema = pa.schema([
    ("soma_joinid", pa.int64()),
    ("soma_geometry", pa.binary()),  # Geometry data (e.g., WKB format)
    ("cell_type", pa.string()),
])

# Create geometry dataframe with coordinate space
with tiledbsoma.GeometryDataFrame.create(
    "cells_with_coords.soma",
    schema=cell_schema,
    coordinate_space=coord_space,
) as geom_df:
    # Data is stored in the defined coordinate space
    print(geom_df.coordinate_space)
    pass

The spatial data types are designed to integrate with spatial analysis workflows:
import tiledbsoma

# Load spatial experiment
with tiledbsoma.open("spatial_experiment.soma") as exp:
    # Access spatial scene
    scene = exp.spatial["tissue_section_1"]

    # Get cell locations and expression data
    cell_locations = scene.obsl["cell_centers"]
    rna_data = exp.ms["RNA"]

    # Spatial analysis workflow:
    # 1. Load cell coordinates
    coords = cell_locations.read().concat().to_pandas()

    # 2. Load expression data for same cells.
    # The query holds resources, so it is scoped with its own context manager
    # and runs while the experiment is still open.
    with exp.axis_query("RNA") as query:
        # to_anndata needs the name of the X layer to export.
        # NOTE(review): assumes the layer is called "data"; adjust to the
        # layer names actually present in this experiment.
        expression = query.to_anndata(X_name="data")

    # 3. Combine for spatial analysis
    # (e.g., spatial statistics, neighborhood analysis)

This spatial data support enables TileDB-SOMA to handle complex spatial single-cell datasets including spatial transcriptomics, spatial proteomics, and multiplexed imaging data.
Install with Tessl CLI
npx tessl i tessl/pypi-tiledbsoma