0
# Single-Cell Biology Support
1
2
Data structures designed for single-cell analysis workflows. These include Experiments, which hold annotated measurement matrices, and Measurements, which group observations with a common set of annotated variables. Both follow established patterns from single-cell analysis tools such as AnnData and Seurat.
3
4
## Capabilities
5
6
### Experiment
7
8
A specialized Collection that represents an annotated 2-D matrix of measurements, typically for multimodal single-cell datasets. Experiments organize single-cell data into observations (cells), measurements (assays), and optional spatial information.
9
10
```python { .api }
11
class Experiment(Collection):
12
obs: DataFrame # Primary observations annotations (cell metadata)
13
ms: Collection # Named measurements collection (assays)
14
spatial: Collection # Spatial scenes collection (experimental)
15
16
def axis_query(self, measurement_name, *, obs_query=None, var_query=None):
17
"""
18
Create an axis query for efficient data retrieval.
19
20
Parameters:
21
- measurement_name: str, name of measurement to query
22
- obs_query: AxisQuery, query specification for observations
23
- var_query: AxisQuery, query specification for variables
24
25
Returns:
26
ExperimentAxisQuery instance for data retrieval
27
"""
28
```
29
30
The Experiment structure follows the pattern:
31
- `obs`: DataFrame containing cell/observation metadata (cell types, treatments, etc.)
32
- `ms`: Collection of Measurement objects, each representing a different assay or data modality
33
- `spatial`: Collection of Scene objects for spatial single-cell data (experimental feature)
34
35
#### Usage Example
36
37
```python
38
import tiledbsoma
39
import tiledbsoma.io as soma_io
40
import pyarrow as pa
41
42
# Create experiment from AnnData
43
experiment_uri = "single_cell_experiment.soma"
44
soma_io.from_anndata(
45
adata, # AnnData object
46
experiment_uri,
47
measurement_name="RNA"
48
)
49
50
# Open and explore experiment
51
with tiledbsoma.open(experiment_uri) as exp:
52
# Access cell metadata
53
print("Cell types:")
54
print(exp.obs.read(column_names=["cell_type"]).concat().to_pandas())
55
56
# Access RNA measurement
57
rna_measurement = exp.ms["RNA"]
58
59
# Query specific cells and genes
60
query = exp.axis_query("RNA")
61
62
# Get data as AnnData
63
adata_subset = query.to_anndata(
64
X_layer_name="data",
65
obs_column_names=["cell_type", "tissue"]
66
)
67
```
68
69
### Measurement
70
71
A specialized Collection that represents a set of observations with measurements on a common set of annotated variables (features). Each Measurement corresponds to a single assay or data modality within an Experiment.
72
73
```python { .api }
74
class Measurement(Collection):
75
var: DataFrame # Variable annotations (gene/feature metadata)
76
X: Collection[SparseNDArray] # Feature values matrices (count data, normalized data, etc.)
77
obsm: Collection[DenseNDArray] # Dense observation annotations (embeddings, etc.)
78
obsp: Collection[SparseNDArray] # Sparse pairwise observation annotations (distances, graphs)
79
varm: Collection[DenseNDArray] # Dense variable annotations (gene loadings, etc.)
80
varp: Collection[SparseNDArray] # Sparse pairwise variable annotations (gene networks)
81
```
82
83
The Measurement structure mirrors AnnData organization:
84
- `var`: Gene/feature annotations (gene symbols, biotypes, etc.)
85
- `X`: Collection of observation-by-feature matrices (raw counts, normalized, scaled)
86
- `obsm`: Dense matrices associated with observations (PCA, UMAP embeddings)
87
- `obsp`: Sparse matrices between observations (nearest neighbor graphs, distances)
88
- `varm`: Dense matrices associated with variables (principal components, gene loadings)
89
- `varp`: Sparse matrices between variables (gene regulatory networks, correlations)
90
91
#### Usage Example
92
93
```python
94
import tiledbsoma
95
import numpy as np
96
import pyarrow as pa
97
98
# Open an experiment and access RNA measurement
99
with tiledbsoma.open("experiment.soma") as exp:
100
rna = exp.ms["RNA"]
101
102
# Access gene annotations
103
print("Gene information:")
104
gene_info = rna.var.read(column_names=["feature_name", "feature_type"]).concat()
105
print(gene_info.to_pandas().head())
106
107
# Access raw count matrix
108
raw_counts = rna.X["data"]
109
print(f"Expression matrix shape: {raw_counts.shape}")
110
print(f"Non-zero values: {raw_counts.nnz}")
111
112
# Read expression data for a subset of cells and genes
113
gene_slice = slice(0, 100) # First 100 genes
114
cell_slice = slice(0, 1000) # First 1000 cells
115
116
for batch in raw_counts.read(coords=(cell_slice, gene_slice)):
117
coords = batch.coords().to_pandas()
118
values = batch.values().to_pandas()
119
print(f"Expression batch: {len(values)} non-zero values")
120
121
# Access cell embeddings (if available)
122
if "X_pca" in rna.obsm:
123
pca_embeddings = rna.obsm["X_pca"]
124
pca_data = pca_embeddings.read().to_numpy()
125
print(f"PCA embeddings shape: {pca_data.shape}")
126
```
127
128
### Creating Single-Cell Data Structures
129
130
#### Creating Experiments Manually
131
132
```python
133
import tiledbsoma
134
import pyarrow as pa
135
136
# Create experiment structure
137
with tiledbsoma.Experiment.create("my_experiment.soma") as exp:
138
# Create observations DataFrame (cell metadata)
139
obs_schema = pa.schema([
140
("soma_joinid", pa.int64()),
141
("cell_type", pa.string()),
142
("sample_id", pa.string()),
143
("n_genes", pa.int32()),
144
("total_counts", pa.int32())
145
])
146
exp.add_new_dataframe("obs", schema=obs_schema)
147
148
# Create measurements collection
149
exp.add_new_collection("ms")
150
151
# Create RNA measurement
152
with exp.ms.add_new_collection("RNA") as rna:
153
# Variable annotations (gene metadata)
154
var_schema = pa.schema([
155
("soma_joinid", pa.int64()),
156
("feature_name", pa.string()),
157
("feature_type", pa.string()),
158
("chromosome", pa.string())
159
])
160
rna.add_new_dataframe("var", schema=var_schema)
161
162
# Expression matrices collection
163
rna.add_new_collection("X")
164
rna.X.add_new_sparse_ndarray(
165
"data",
166
type=pa.int32(),
167
shape=(10000, 2000) # 10k cells, 2k genes
168
)
169
170
# Dense observation matrices (embeddings, etc.)
171
rna.add_new_collection("obsm")
172
rna.obsm.add_new_dense_ndarray(
173
"X_pca",
174
type=pa.float64(),
175
shape=(10000, 50) # PCA coordinates
176
)
177
```
178
179
#### Adding Data to Measurements
180
181
```python
182
import tiledbsoma
183
import numpy as np
184
import pyarrow as pa
185
186
# Open experiment and add data
187
with tiledbsoma.open("my_experiment.soma", mode="w") as exp:
188
# Add cell metadata
189
cell_data = pa.table({
190
"soma_joinid": range(1000),
191
"cell_type": ["T-cell"] * 300 + ["B-cell"] * 200 + ["NK-cell"] * 500,
192
"sample_id": ["Sample1"] * 500 + ["Sample2"] * 500,
193
"n_genes": np.random.randint(500, 2000, 1000),
194
"total_counts": np.random.randint(1000, 10000, 1000)
195
})
196
exp.obs.write(cell_data)
197
198
# Add gene metadata
199
gene_data = pa.table({
200
"soma_joinid": range(2000),
201
"feature_name": [f"Gene_{i}" for i in range(2000)],
202
"feature_type": ["Gene"] * 2000,
203
"chromosome": [f"chr{i%22+1}" for i in range(2000)]
204
})
205
exp.ms["RNA"].var.write(gene_data)
206
207
# Add sparse expression data
208
n_nonzero = 50000
209
cell_ids = np.random.randint(0, 1000, n_nonzero)
210
gene_ids = np.random.randint(0, 2000, n_nonzero)
211
counts = np.random.poisson(3, n_nonzero) # Poisson-distributed counts
212
213
coords = pa.table({
214
"soma_dim_0": cell_ids,
215
"soma_dim_1": gene_ids
216
})
217
values = pa.table({
218
"soma_data": counts
219
})
220
221
exp.ms["RNA"].X["data"].write((coords, values))
222
```
223
224
### Working with Multiple Measurements
225
226
```python
227
import tiledbsoma
228
229
# Open an existing multi-modal experiment (RNA + ATAC)
230
with tiledbsoma.open("multimodal_experiment.soma", mode="w") as exp:
231
232
# Both measurements share the same observations (cells)
233
# But have different variables (genes vs peaks)
234
235
# RNA measurement
236
rna = exp.ms["RNA"]
237
print(f"RNA genes: {rna.var.count()}")
238
print(f"RNA expression shape: {rna.X['data'].shape}")
239
240
# ATAC measurement
241
atac = exp.ms["ATAC"]
242
print(f"ATAC peaks: {atac.var.count()}")
243
print(f"ATAC accessibility shape: {atac.X['data'].shape}")
244
245
# Query both modalities for the same cells
246
rna_query = exp.axis_query("RNA", obs_query=tiledbsoma.AxisQuery(coords=[0, 1, 2]))
247
atac_query = exp.axis_query("ATAC", obs_query=tiledbsoma.AxisQuery(coords=[0, 1, 2]))
248
249
# Get data for first 3 cells
250
rna_data = rna_query.to_anndata()
251
atac_data = atac_query.to_anndata()
252
```
253
254
## Integration with Popular Tools
255
256
### AnnData Conversion
257
258
The most common workflow involves converting between SOMA and AnnData formats:
259
260
```python
261
import tiledbsoma.io as soma_io
262
import scanpy as sc
263
264
# Load AnnData and convert to SOMA
265
adata = sc.datasets.pbmc3k()
266
soma_io.from_anndata(adata, "pbmc3k.soma", measurement_name="RNA")
267
268
# Work with SOMA format
269
with tiledbsoma.open("pbmc3k.soma") as exp:
270
# Perform queries, access subsets
271
query = exp.axis_query("RNA")
272
273
# Convert back to AnnData for analysis
274
adata_subset = query.to_anndata(X_layer_name="X")
275
276
# Save as H5AD
277
soma_io.to_h5ad(exp, "output.h5ad", measurement_name="RNA")
278
```
279
280
This integration allows seamless use of SOMA's scalable storage with existing single-cell analysis workflows in Python.