0
# Query and Indexing
1
2
Query builders and indexing utilities for efficient data retrieval from SOMA objects. These tools enable filtering, subsetting, and indexing operations on single-cell datasets at scale.
3
4
## Capabilities
5
6
### ExperimentAxisQuery
7
8
A query builder for Experiments. It exposes methods to read the observations (`obs`), variables (`var`), and measurement matrices (`X`) selected by the query, and to export the selection as an AnnData object (`to_anndata`).
9
10
```python { .api }
11
class ExperimentAxisQuery:
12
def obs(self, *, column_names=None, batch_size=None, partitions=None, platform_config=None):
13
"""
14
Query observations (cells) from the experiment.
15
16
Parameters:
17
- column_names: list of str, specific observation columns to retrieve
18
- batch_size: int, number of observations per batch
19
- partitions: Partitions object for parallel reading
20
- platform_config: TileDB-specific configuration options
21
22
Returns:
23
Iterator of Arrow tables containing observation data
24
"""
25
26
def var(self, *, column_names=None, batch_size=None, partitions=None, platform_config=None):
27
"""
28
Query variables (genes/features) from the experiment.
29
30
Parameters:
31
- column_names: list of str, specific variable columns to retrieve
32
- batch_size: int, number of variables per batch
33
- partitions: Partitions object for parallel reading
34
- platform_config: TileDB-specific configuration options
35
36
Returns:
37
Iterator of Arrow tables containing variable data
38
"""
39
40
def X(self, layer_name, *, batch_size=None, partitions=None, platform_config=None):
41
"""
42
Query measurement matrices (expression data).
43
44
Parameters:
45
- layer_name: str, name of the X layer to query
46
- batch_size: int, number of elements per batch
47
- partitions: Partitions object for parallel reading
48
- platform_config: TileDB-specific configuration options
49
50
Returns:
51
Iterator of sparse matrix data
52
"""
53
54
def to_anndata(self, *, X_layer_name=None, column_names=None, obsm_layers=None, varm_layers=None, obsp_layers=None, varp_layers=None):
55
"""
56
Convert query results to an AnnData object.
57
58
Parameters:
59
- X_layer_name: str, X layer to use as main matrix (None uses first available)
60
- column_names: dict, column names to include for obs/var
61
- obsm_layers: list of str, obsm layers to include
62
- varm_layers: list of str, varm layers to include
63
- obsp_layers: list of str, obsp layers to include
64
- varp_layers: list of str, varp layers to include
65
66
Returns:
67
AnnData object with query results
68
"""
69
```
70
71
#### Creating Axis Queries
72
73
Axis queries are created through the `axis_query` method on Experiments:
74
75
```python
76
import tiledbsoma
77
78
with tiledbsoma.open("experiment.soma") as exp:
79
# Create basic query
80
query = exp.axis_query("RNA")
81
82
# Create query with observation filtering
83
query = exp.axis_query(
84
"RNA",
85
obs_query=tiledbsoma.AxisQuery(
86
value_filter="cell_type == 'T-cell' and n_genes > 1000"
87
)
88
)
89
90
# Create query with variable filtering
91
query = exp.axis_query(
92
"RNA",
93
var_query=tiledbsoma.AxisQuery(
94
value_filter="feature_type == 'Gene Expression'"
95
)
96
)
97
98
# Create query with coordinate selection
99
query = exp.axis_query(
100
"RNA",
101
obs_query=tiledbsoma.AxisQuery(coords=[0, 1, 2, 10, 50]),
102
var_query=tiledbsoma.AxisQuery(coords=slice(0, 100))
103
)
104
```
105
106
#### Usage Examples
107
108
```python
109
import tiledbsoma
110
import pandas as pd
111
112
# Query specific cell types
113
with tiledbsoma.open("pbmc_experiment.soma") as exp:
114
# Create query for T cells
115
t_cell_query = exp.axis_query(
116
"RNA",
117
obs_query=tiledbsoma.AxisQuery(
118
value_filter="cell_type in ['CD4 T cells', 'CD8 T cells']"
119
)
120
)
121
122
# Get observation metadata
123
obs_data = t_cell_query.obs(
124
column_names=["cell_type", "donor_id", "n_genes", "total_counts"]
125
).concat()
126
print(f"T cells found: {len(obs_data)}")
127
128
# Get variable information
129
var_data = t_cell_query.var(
130
column_names=["feature_name", "feature_type"]
131
).concat()
132
print(f"Genes: {len(var_data)}")
133
134
# Get expression matrix
135
for batch in t_cell_query.X("data"):
136
coordinates = batch.coords().to_pandas() # cell_id, gene_id
137
values = batch.values().to_pandas() # expression values
138
print(f"Expression batch: {len(values)} non-zero values")
139
140
# Convert to AnnData for downstream analysis
141
t_cell_adata = t_cell_query.to_anndata(
142
X_layer_name="data",
143
obsm_layers=["X_pca", "X_umap"],
144
column_names={
145
"obs": ["cell_type", "donor_id"],
146
"var": ["feature_name", "highly_variable"]
147
}
148
)
149
print(f"AnnData shape: {t_cell_adata.shape}")
150
151
# Query with coordinate-based selection
152
with tiledbsoma.open("experiment.soma") as exp:
153
# Select cells and genes by coordinate range (positional selection, not a highly-variable-gene ranking)
154
subset_query = exp.axis_query(
155
"RNA",
156
obs_query=tiledbsoma.AxisQuery(coords=slice(0, 1000)),
157
var_query=tiledbsoma.AxisQuery(coords=slice(0, 2000))
158
)
159
160
# Process in batches
161
batch_size = 10000
162
for obs_batch in subset_query.obs(batch_size=batch_size):
163
obs_df = obs_batch.to_pandas()
164
print(f"Processing {len(obs_df)} observations")
165
166
# Process batch...
167
168
# Complex filtering query
169
with tiledbsoma.open("experiment.soma") as exp:
170
# Query high-quality brain cells and highly variable gene-expression features
171
quality_query = exp.axis_query(
172
"RNA",
173
obs_query=tiledbsoma.AxisQuery(
174
value_filter="""
175
n_genes >= 500 and n_genes <= 5000 and
176
total_counts >= 1000 and
177
pct_counts_mitochondrial <= 20 and
178
tissue == 'brain'
179
"""
180
),
181
var_query=tiledbsoma.AxisQuery(
182
value_filter="highly_variable == True and feature_type == 'Gene Expression'"
183
)
184
)
185
186
# Convert to AnnData with all available layers
187
brain_adata = quality_query.to_anndata(
188
X_layer_name="normalized",
189
obsm_layers=None, # Include all obsm layers
190
varm_layers=None # Include all varm layers
191
)
192
```
193
194
### AxisQuery Specification
195
196
The `AxisQuery` class specifies which elements along a single axis to select: by coordinates (`coords`), by a filter expression on attribute values (`value_filter`), or both.
197
198
```python { .api }
199
class AxisQuery:
200
def __init__(self, *, coords=None, value_filter=None):
201
"""
202
Create an axis query specification.
203
204
Parameters:
205
- coords: coordinate selection (slice, list, or array)
206
- value_filter: str, filter expression for attribute values
207
"""
208
```
209
210
#### Coordinate Selection Examples
211
212
```python
213
import tiledbsoma
214
215
# Various coordinate selection patterns
216
axis_queries = [
217
# Select specific indices
218
tiledbsoma.AxisQuery(coords=[0, 5, 10, 15, 20]),
219
220
# Select range with slice
221
tiledbsoma.AxisQuery(coords=slice(100, 500)),
222
223
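# Select with step (NOTE: SOMA coordinate slices may reject a step value; confirm against the tiledbsoma read documentation)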
224
tiledbsoma.AxisQuery(coords=slice(0, 1000, 10)), # Every 10th element
225
226
# Select all (equivalent to no coordinate filter)
227
tiledbsoma.AxisQuery(coords=slice(None)),
228
]
229
230
# Value filter examples
231
filter_queries = [
232
# Numeric comparisons
233
tiledbsoma.AxisQuery(value_filter="n_genes > 1000"),
234
235
# String matching
236
tiledbsoma.AxisQuery(value_filter="cell_type == 'B cells'"),
237
238
# Multiple conditions
239
tiledbsoma.AxisQuery(value_filter="n_genes > 500 and total_counts < 10000"),
240
241
# Set membership
242
tiledbsoma.AxisQuery(value_filter="donor_id in ['D1', 'D2', 'D3']"),
243
244
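# Pattern matching (NOTE: confirm that value_filter supports `startswith`; it may accept only comparison, `and`/`or`, and `in` operators)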
245
tiledbsoma.AxisQuery(value_filter="feature_name startswith 'MT-'"),
246
]
247
```
248
249
### IntIndexer
250
251
A re-indexer for unique integer indices, compatible with the pandas `Index.get_indexer` method. It is useful for efficiently mapping values between different index spaces.
252
253
```python { .api }
254
class IntIndexer:
255
def __init__(self, data, *, context=None):
256
"""
257
Initialize IntIndexer with integer keys.
258
259
Parameters:
260
- data: array-like of unique integers to index
261
- context: TileDB context for the operation
262
"""
263
264
def get_indexer(self, target):
265
"""
266
Compute underlying indices for target data.
267
268
Parameters:
269
- target: array-like of integers to find indices for
270
271
Returns:
272
numpy array of indices, with -1 for missing values
273
"""
274
```
275
276
#### Usage Example
277
278
```python
279
import tiledbsoma
280
import numpy as np
281
282
# Create indexer for soma_joinid values
283
original_ids = np.array([0, 5, 10, 15, 20, 25, 30])
284
indexer = tiledbsoma.IntIndexer(original_ids)
285
286
# Find positions of specific IDs
287
target_ids = np.array([5, 15, 99, 20]) # 99 doesn't exist
288
positions = indexer.get_indexer(target_ids)
289
print(positions) # [1, 3, -1, 4] (99 maps to -1)
290
291
# Use with SOMA data
292
with tiledbsoma.open("experiment.soma") as exp:
293
# Get all observation IDs
294
obs_ids = exp.obs.read(column_names=["soma_joinid"]).concat()["soma_joinid"].to_numpy()
295
296
# Create indexer
297
obs_indexer = tiledbsoma.IntIndexer(obs_ids)
298
299
# Map external IDs to SOMA positions
300
external_ids = np.array([100, 200, 300, 400])
301
soma_positions = obs_indexer.get_indexer(external_ids)
302
303
# Use positions for coordinate-based queries (NOTE: assumes soma_joinid values equal row positions; confirm for your data)
304
valid_positions = soma_positions[soma_positions >= 0]
305
if len(valid_positions) > 0:
306
query = exp.axis_query(
307
"RNA",
308
obs_query=tiledbsoma.AxisQuery(coords=valid_positions)
309
)
310
```
311
312
### Index Building Function
313
314
Utility function for building indices on integer arrays.
315
316
```python { .api }
317
def tiledbsoma_build_index(data, *, context=None):
318
"""
319
Build index for integer array.
320
321
Parameters:
322
- data: array-like of integers to index
323
- context: TileDB context for the operation
324
325
Returns:
326
Built index structure for efficient lookups
327
"""
328
```
329
330
#### Usage Example
331
332
```python
333
import tiledbsoma
334
import numpy as np
335
336
# Build index for large ID array
337
large_id_array = np.random.randint(0, 1000000, size=100000)
338
index = tiledbsoma.tiledbsoma_build_index(large_id_array)
339
340
# Use index for efficient lookups
341
# (specific usage depends on implementation details)
342
```
343
344
### Query Performance Optimization
345
346
#### Batch Processing
347
348
```python
349
import tiledbsoma
350
351
# Efficient batch processing for large queries
352
with tiledbsoma.open("large_experiment.soma") as exp:
353
query = exp.axis_query("RNA")
354
355
# Process observations in batches
356
batch_size = 1000
357
total_processed = 0
358
359
for obs_batch in query.obs(batch_size=batch_size):
360
obs_df = obs_batch.to_pandas()
361
total_processed += len(obs_df)
362
363
# Process batch
364
print(f"Processed {total_processed} observations")
365
366
# Your analysis code here...
367
```
368
369
#### Parallel Processing
370
371
```python
372
import tiledbsoma
373
374
# Use partitions for parallel processing
375
with tiledbsoma.open("experiment.soma") as exp:
376
query = exp.axis_query("RNA")
377
378
# Create partitions for parallel execution
379
partitions = tiledbsoma.Partitions(n_partitions=4)
380
381
# Iterate over the partitions (conceptual and shown sequentially; dispatch each partition to its own worker to run in parallel)
382
for partition_id in range(partitions.n_partitions):
383
obs_data = query.obs(partitions=partitions.get_partition(partition_id))
384
# Process partition...
385
```
386
387
#### Memory-Efficient Queries
388
389
```python
390
import tiledbsoma
391
392
# Memory-efficient processing of large datasets
393
with tiledbsoma.open("experiment.soma") as exp:
394
# Restrict the query to cells that pass the quality filter
395
query = exp.axis_query(
396
"RNA",
397
obs_query=tiledbsoma.AxisQuery(
398
value_filter="quality_score > 0.8"
399
)
400
)
401
402
# Stream data without loading everything into memory
403
for expr_batch in query.X("data", batch_size=5000):
404
# Process expression batch
405
coords = expr_batch.coords()
406
values = expr_batch.values()
407
408
# Compute statistics, etc. without storing full dataset
409
print(f"Batch non-zero values: {len(values)}")
410
```
411
412
### Integration with Analysis Workflows
413
414
```python
415
import tiledbsoma
416
import scanpy as sc
417
418
# Integrated analysis workflow
419
with tiledbsoma.open("experiment.soma") as exp:
420
# Query high-quality cells
421
hq_query = exp.axis_query(
422
"RNA",
423
obs_query=tiledbsoma.AxisQuery(
424
value_filter="n_genes > 200 and pct_counts_mitochondrial < 20"
425
),
426
var_query=tiledbsoma.AxisQuery(
427
value_filter="n_cells > 3" # Genes expressed in at least 3 cells
428
)
429
)
430
431
# Convert to AnnData for Scanpy analysis
432
adata = hq_query.to_anndata(X_layer_name="raw")
433
434
# Standard single-cell analysis
435
sc.pp.normalize_total(adata, target_sum=1e4)
436
sc.pp.log1p(adata)
437
sc.pp.highly_variable_genes(adata, min_mean=0.0125, max_mean=3, min_disp=0.5)
438
439
# Continue with downstream analysis...
440
print(f"Processed {adata.n_obs} cells and {adata.n_vars} genes")
441
```
442
443
This query and indexing functionality provides the foundation for efficient, scalable analysis of single-cell datasets stored in SOMA format.