Tessl Tile for pypi/tiledbsoma@1.17.0

or run

npx @tessl/cli init

Version

Tile

Overview

Evals

Files

docs

configuration.md core-data-structures.md data-io.md index.md query-indexing.md single-cell-biology.md spatial-data.md

docs/query-indexing.md

0
# Query and Indexing
1

2
Query builders and indexing utilities for efficient data retrieval from SOMA objects. These tools enable filtering, subsetting, and indexing operations on single-cell datasets at scale.
3

4
## Capabilities
5

6
### ExperimentAxisQuery
7

8
A powerful query builder for Experiments that provides methods to query observations, variables, and measurements with efficient filtering and retrieval.
9

10
```python { .api }
11
class ExperimentAxisQuery:
12
    def obs(self, *, column_names=None, batch_size=None, partitions=None, platform_config=None):
13
        """
14
        Query observations (cells) from the experiment.
15
        
16
        Parameters:
17
        - column_names: list of str, specific observation columns to retrieve
18
        - batch_size: int, number of observations per batch
19
        - partitions: Partitions object for parallel reading
20
        - platform_config: TileDB-specific configuration options
21
        
22
        Returns:
23
        Iterator of Arrow tables containing observation data
24
        """
25
    
26
    def var(self, *, column_names=None, batch_size=None, partitions=None, platform_config=None):
27
        """
28
        Query variables (genes/features) from the experiment.
29
        
30
        Parameters:
31
        - column_names: list of str, specific variable columns to retrieve
32
        - batch_size: int, number of variables per batch
33
        - partitions: Partitions object for parallel reading
34
        - platform_config: TileDB-specific configuration options
35
        
36
        Returns:
37
        Iterator of Arrow tables containing variable data
38
        """
39
    
40
    def X(self, layer_name, *, batch_size=None, partitions=None, platform_config=None):
41
        """
42
        Query measurement matrices (expression data).
43
        
44
        Parameters:
45
        - layer_name: str, name of the X layer to query
46
        - batch_size: int, number of elements per batch
47
        - partitions: Partitions object for parallel reading
48
        - platform_config: TileDB-specific configuration options
49
        
50
        Returns:
51
        Iterator of sparse matrix data
52
        """
53
    
54
    def to_anndata(self, *, X_layer_name=None, column_names=None, obsm_layers=None, varm_layers=None, obsp_layers=None, varp_layers=None):
55
        """
56
        Convert query results to an AnnData object.
57
        
58
        Parameters:
59
        - X_layer_name: str, X layer to use as main matrix (None uses first available)
60
        - column_names: dict, column names to include for obs/var
61
        - obsm_layers: list of str, obsm layers to include
62
        - varm_layers: list of str, varm layers to include
63
        - obsp_layers: list of str, obsp layers to include
64
        - varp_layers: list of str, varp layers to include
65
        
66
        Returns:
67
        AnnData object with query results
68
        """
69
```
70

71
#### Creating Axis Queries
72

73
Axis queries are created through the `axis_query` method on Experiments:
74

75
```python
76
import tiledbsoma
77

78
with tiledbsoma.open("experiment.soma") as exp:
79
    # Create basic query
80
    query = exp.axis_query("RNA")
81
    
82
    # Create query with observation filtering
83
    query = exp.axis_query(
84
        "RNA",
85
        obs_query=tiledbsoma.AxisQuery(
86
            value_filter="cell_type == 'T-cell' and n_genes > 1000"
87
        )
88
    )
89
    
90
    # Create query with variable filtering
91
    query = exp.axis_query(
92
        "RNA", 
93
        var_query=tiledbsoma.AxisQuery(
94
            value_filter="feature_type == 'Gene Expression'"
95
        )
96
    )
97
    
98
    # Create query with coordinate selection
99
    query = exp.axis_query(
100
        "RNA",
101
        obs_query=tiledbsoma.AxisQuery(coords=[0, 1, 2, 10, 50]),
102
        var_query=tiledbsoma.AxisQuery(coords=slice(0, 100))
103
    )
104
```
105

106
#### Usage Examples
107

108
```python
109
import tiledbsoma
110
import pandas as pd
111

112
# Query specific cell types
113
with tiledbsoma.open("pbmc_experiment.soma") as exp:
114
    # Create query for T cells
115
    t_cell_query = exp.axis_query(
116
        "RNA",
117
        obs_query=tiledbsoma.AxisQuery(
118
            value_filter="cell_type in ['CD4 T cells', 'CD8 T cells']"
119
        )
120
    )
121
    
122
    # Get observation metadata
123
    obs_data = t_cell_query.obs(
124
        column_names=["cell_type", "donor_id", "n_genes", "total_counts"]
125
    ).concat()
126
    print(f"T cells found: {len(obs_data)}")
127
    
128
    # Get variable information
129
    var_data = t_cell_query.var(
130
        column_names=["feature_name", "feature_type"]
131
    ).concat()
132
    print(f"Genes: {len(var_data)}")
133
    
134
    # Get expression matrix
135
    for batch in t_cell_query.X("data"):
136
        coordinates = batch.coords().to_pandas()  # cell_id, gene_id
137
        values = batch.values().to_pandas()       # expression values
138
        print(f"Expression batch: {len(values)} non-zero values")
139
    
140
    # Convert to AnnData for downstream analysis
141
    t_cell_adata = t_cell_query.to_anndata(
142
        X_layer_name="data",
143
        obsm_layers=["X_pca", "X_umap"],
144
        column_names={
145
            "obs": ["cell_type", "donor_id"],
146
            "var": ["feature_name", "highly_variable"]
147
        }
148
    )
149
    print(f"AnnData shape: {t_cell_adata.shape}")
150

151
# Query with coordinate-based selection
152
with tiledbsoma.open("experiment.soma") as exp:
153
    # Select a coordinate range of cells and genes by soma_joinid (this does not rank genes by variability)
154
    subset_query = exp.axis_query(
155
        "RNA",
156
        obs_query=tiledbsoma.AxisQuery(coords=slice(0, 1000)),
157
        var_query=tiledbsoma.AxisQuery(coords=slice(0, 2000))
158
    )
159
    
160
    # Process in batches
161
    batch_size = 10000
162
    for obs_batch in subset_query.obs(batch_size=batch_size):
163
        obs_df = obs_batch.to_pandas()
164
        print(f"Processing {len(obs_df)} observations")
165
        
166
        # Process batch...
167

168
# Complex filtering query
169
with tiledbsoma.open("experiment.soma") as exp:
170
    # Query high-quality cells with specific markers
171
    quality_query = exp.axis_query(
172
        "RNA",
173
        obs_query=tiledbsoma.AxisQuery(
174
            value_filter="""
175
            n_genes >= 500 and n_genes <= 5000 and
176
            total_counts >= 1000 and
177
            pct_counts_mitochondrial <= 20 and
178
            tissue == 'brain'
179
            """
180
        ),
181
        var_query=tiledbsoma.AxisQuery(
182
            value_filter="highly_variable == True and feature_type == 'Gene Expression'"
183
        )
184
    )
185
    
186
    # Convert to AnnData with all available layers
187
    brain_adata = quality_query.to_anndata(
188
        X_layer_name="normalized",
189
        obsm_layers=None,  # Include all obsm layers
190
        varm_layers=None   # Include all varm layers
191
    )
192
```
193

194
### AxisQuery Specification
195

196
The AxisQuery class provides flexible query specification for coordinates and filtering.
197

198
```python { .api }
199
class AxisQuery:
200
    def __init__(self, *, coords=None, value_filter=None):
201
        """
202
        Create an axis query specification.
203
        
204
        Parameters:
205
        - coords: coordinate selection (slice, list, or array)
206
        - value_filter: str, filter expression for attribute values
207
        """
208
```
209

210
#### Coordinate Selection Examples
211

212
```python
213
import tiledbsoma
214

215
# Various coordinate selection patterns
216
axis_queries = [
217
    # Select specific indices
218
    tiledbsoma.AxisQuery(coords=[0, 5, 10, 15, 20]),
219
    
220
    # Select range with slice
221
    tiledbsoma.AxisQuery(coords=slice(100, 500)),
222
    
223
    # Select with step
224
    tiledbsoma.AxisQuery(coords=slice(0, 1000, 10)),  # Every 10th element
225
    
226
    # Select all (equivalent to no coordinate filter)
227
    tiledbsoma.AxisQuery(coords=slice(None)),
228
]
229

230
# Value filter examples
231
filter_queries = [
232
    # Numeric comparisons
233
    tiledbsoma.AxisQuery(value_filter="n_genes > 1000"),
234
    
235
    # String matching
236
    tiledbsoma.AxisQuery(value_filter="cell_type == 'B cells'"),
237
    
238
    # Multiple conditions
239
    tiledbsoma.AxisQuery(value_filter="n_genes > 500 and total_counts < 10000"),
240
    
241
    # Set membership
242
    tiledbsoma.AxisQuery(value_filter="donor_id in ['D1', 'D2', 'D3']"),
243
    
244
    # Pattern matching
245
    tiledbsoma.AxisQuery(value_filter="feature_name startswith 'MT-'"),
246
]
247
```
248

249
### IntIndexer
250

251
A re-indexer for unique integer indices that mirrors the behavior of `pandas.Index.get_indexer`. It is useful for efficiently mapping values from one index space to positions in another.
252

253
```python { .api }
254
class IntIndexer:
255
    def __init__(self, data, *, context=None):
256
        """
257
        Initialize IntIndexer with integer keys.
258
        
259
        Parameters:
260
        - data: array-like of unique integers to index
261
        - context: TileDB context for the operation
262
        """
263
    
264
    def get_indexer(self, target):
265
        """
266
        Compute underlying indices for target data.
267
        
268
        Parameters:
269
        - target: array-like of integers to find indices for
270
        
271
        Returns:
272
        numpy array of indices, with -1 for missing values
273
        """
274
```
275

276
#### Usage Example
277

278
```python
279
import tiledbsoma
280
import numpy as np
281

282
# Create indexer for soma_joinid values
283
original_ids = np.array([0, 5, 10, 15, 20, 25, 30])
284
indexer = tiledbsoma.IntIndexer(original_ids)
285

286
# Find positions of specific IDs
287
target_ids = np.array([5, 15, 99, 20])  # 99 doesn't exist
288
positions = indexer.get_indexer(target_ids)
289
print(positions)  # [1, 3, -1, 4] (99 maps to -1)
290

291
# Use with SOMA data
292
with tiledbsoma.open("experiment.soma") as exp:
293
    # Get all observation IDs
294
    obs_ids = exp.obs.read(column_names=["soma_joinid"]).concat()["soma_joinid"].to_numpy()
295
    
296
    # Create indexer
297
    obs_indexer = tiledbsoma.IntIndexer(obs_ids)
298
    
299
    # Map external IDs to SOMA positions
300
    external_ids = np.array([100, 200, 300, 400])
301
    soma_positions = obs_indexer.get_indexer(external_ids)
302
    
303
    # Use positions for coordinate-based queries
304
    valid_positions = soma_positions[soma_positions >= 0]
305
    if len(valid_positions) > 0:
306
        query = exp.axis_query(
307
            "RNA",
308
            obs_query=tiledbsoma.AxisQuery(coords=valid_positions)
309
        )
310
```
311

312
### Index Building Function
313

314
Utility function for building indices on integer arrays.
315

316
```python { .api }
317
def tiledbsoma_build_index(data, *, context=None):
318
    """
319
    Build index for integer array.
320
    
321
    Parameters:
322
    - data: array-like of integers to index
323
    - context: TileDB context for the operation
324
    
325
    Returns:
326
    Built index structure for efficient lookups
327
    """
328
```
329

330
#### Usage Example
331

332
```python
333
import tiledbsoma
334
import numpy as np
335

336
# Build index for large ID array
337
large_id_array = np.random.randint(0, 1000000, size=100000)
338
index = tiledbsoma.tiledbsoma_build_index(large_id_array)
339

340
# Use index for efficient lookups
341
# (specific usage depends on implementation details)
342
```
343

344
### Query Performance Optimization
345

346
#### Batch Processing
347

348
```python
349
import tiledbsoma
350

351
# Efficient batch processing for large queries
352
with tiledbsoma.open("large_experiment.soma") as exp:
353
    query = exp.axis_query("RNA")
354
    
355
    # Process observations in batches
356
    batch_size = 1000
357
    total_processed = 0
358
    
359
    for obs_batch in query.obs(batch_size=batch_size):
360
        obs_df = obs_batch.to_pandas()
361
        total_processed += len(obs_df)
362
        
363
        # Process batch
364
        print(f"Processed {total_processed} observations")
365
        
366
        # Your analysis code here...
367
```
368

369
#### Parallel Processing
370

371
```python
372
import tiledbsoma
373

374
# Use partitions for parallel processing
375
with tiledbsoma.open("experiment.soma") as exp:
376
    query = exp.axis_query("RNA")
377
    
378
    # Create partitions for parallel execution
379
    partitions = tiledbsoma.Partitions(n_partitions=4)
380
    
381
    # Process partitions in parallel (conceptual - actual implementation may vary)
382
    for partition_id in range(partitions.n_partitions):
383
        obs_data = query.obs(partitions=partitions.get_partition(partition_id))
384
        # Process partition...
385
```
386

387
#### Memory-Efficient Queries
388

389
```python
390
import tiledbsoma
391

392
# Memory-efficient processing of large datasets
393
with tiledbsoma.open("experiment.soma") as exp:
394
    # Restrict the query to observations that pass a quality threshold
395
    query = exp.axis_query(
396
        "RNA",
397
        obs_query=tiledbsoma.AxisQuery(
398
            value_filter="quality_score > 0.8"
399
        )
400
    )
401
    
402
    # Stream data without loading everything into memory
403
    for expr_batch in query.X("data", batch_size=5000):
404
        # Process expression batch
405
        coords = expr_batch.coords()
406
        values = expr_batch.values()
407
        
408
        # Compute statistics, etc. without storing full dataset
409
        print(f"Batch non-zero values: {len(values)}")
410
```
411

412
### Integration with Analysis Workflows
413

414
```python
415
import tiledbsoma
416
import scanpy as sc
417

418
# Integrated analysis workflow
419
with tiledbsoma.open("experiment.soma") as exp:
420
    # Query high-quality cells
421
    hq_query = exp.axis_query(
422
        "RNA",
423
        obs_query=tiledbsoma.AxisQuery(
424
            value_filter="n_genes > 200 and pct_counts_mitochondrial < 20"
425
        ),
426
        var_query=tiledbsoma.AxisQuery(
427
            value_filter="n_cells > 3"  # Genes expressed in at least 3 cells
428
        )
429
    )
430
    
431
    # Convert to AnnData for Scanpy analysis
432
    adata = hq_query.to_anndata(X_layer_name="raw")
433
    
434
    # Standard single-cell analysis
435
    sc.pp.normalize_total(adata, target_sum=1e4)
436
    sc.pp.log1p(adata)
437
    sc.pp.highly_variable_genes(adata, min_mean=0.0125, max_mean=3, min_disp=0.5)
438
    
439
    # Continue with downstream analysis...
440
    print(f"Processed {adata.n_obs} cells and {adata.n_vars} genes")
441
```
442

443
This query and indexing functionality provides the foundation for efficient, scalable analysis of single-cell datasets stored in SOMA format.

Version

Tile

Files

docs/query-indexing.md

docs/query-indexing.md