or run

npx @tessl/cli init
Log in

Version

Tile

Overview

Evals

Files

Files

docs

configuration.md · core-data-structures.md · data-io.md · index.md · query-indexing.md · single-cell-biology.md · spatial-data.md

docs/query-indexing.md

0

# Query and Indexing

1

2

Query builders and indexing utilities for efficient data retrieval from SOMA objects. These tools enable filtering, subsetting, and indexing operations on single-cell datasets at scale.

3

4

## Capabilities

5

6

### ExperimentAxisQuery

7

8

A powerful query builder for Experiments that provides methods to query observations, variables, and measurements with efficient filtering and retrieval.

9

10

```python { .api }

11

class ExperimentAxisQuery:

12

def obs(self, *, column_names=None, batch_size=None, partitions=None, platform_config=None):

13

"""

14

Query observations (cells) from the experiment.

15

16

Parameters:

17

- column_names: list of str, specific observation columns to retrieve

18

- batch_size: int, number of observations per batch

19

- partitions: Partitions object for parallel reading

20

- platform_config: TileDB-specific configuration options

21

22

Returns:

23

Iterator of Arrow tables containing observation data

24

"""

25

26

def var(self, *, column_names=None, batch_size=None, partitions=None, platform_config=None):

27

"""

28

Query variables (genes/features) from the experiment.

29

30

Parameters:

31

- column_names: list of str, specific variable columns to retrieve

32

- batch_size: int, number of variables per batch

33

- partitions: Partitions object for parallel reading

34

- platform_config: TileDB-specific configuration options

35

36

Returns:

37

Iterator of Arrow tables containing variable data

38

"""

39

40

def X(self, layer_name, *, batch_size=None, partitions=None, platform_config=None):

41

"""

42

Query measurement matrices (expression data).

43

44

Parameters:

45

- layer_name: str, name of the X layer to query

46

- batch_size: int, number of elements per batch

47

- partitions: Partitions object for parallel reading

48

- platform_config: TileDB-specific configuration options

49

50

Returns:

51

Iterator of sparse matrix data

52

"""

53

54

def to_anndata(self, *, X_layer_name=None, column_names=None, obsm_layers=None, varm_layers=None, obsp_layers=None, varp_layers=None):

55

"""

56

Convert query results to an AnnData object.

57

58

Parameters:

59

- X_layer_name: str, X layer to use as main matrix (None uses first available)

60

- column_names: dict, column names to include for obs/var

61

- obsm_layers: list of str, obsm layers to include

62

- varm_layers: list of str, varm layers to include

63

- obsp_layers: list of str, obsp layers to include

64

- varp_layers: list of str, varp layers to include

65

66

Returns:

67

AnnData object with query results

68

"""

69

```

70

71

#### Creating Axis Queries

72

73

Axis queries are created through the `axis_query` method on Experiments:

74

75

```python

76

import tiledbsoma

77

78

with tiledbsoma.open("experiment.soma") as exp:

79

# Create basic query

80

query = exp.axis_query("RNA")

81

82

# Create query with observation filtering

83

query = exp.axis_query(

84

"RNA",

85

obs_query=tiledbsoma.AxisQuery(

86

value_filter="cell_type == 'T-cell' and n_genes > 1000"

87

)

88

)

89

90

# Create query with variable filtering

91

query = exp.axis_query(

92

"RNA",

93

var_query=tiledbsoma.AxisQuery(

94

value_filter="feature_type == 'Gene Expression'"

95

)

96

)

97

98

# Create query with coordinate selection

99

query = exp.axis_query(

100

"RNA",

101

obs_query=tiledbsoma.AxisQuery(coords=[0, 1, 2, 10, 50]),

102

var_query=tiledbsoma.AxisQuery(coords=slice(0, 100))

103

)

104

```

105

106

#### Usage Examples

107

108

```python

109

import tiledbsoma

110

import pandas as pd

111

112

# Query specific cell types

113

with tiledbsoma.open("pbmc_experiment.soma") as exp:

114

# Create query for T cells

115

t_cell_query = exp.axis_query(

116

"RNA",

117

obs_query=tiledbsoma.AxisQuery(

118

value_filter="cell_type in ['CD4 T cells', 'CD8 T cells']"

119

)

120

)

121

122

# Get observation metadata

123

obs_data = t_cell_query.obs(

124

column_names=["cell_type", "donor_id", "n_genes", "total_counts"]

125

).concat()

126

print(f"T cells found: {len(obs_data)}")

127

128

# Get variable information

129

var_data = t_cell_query.var(

130

column_names=["feature_name", "feature_type"]

131

).concat()

132

print(f"Genes: {len(var_data)}")

133

134

# Get expression matrix

135

for batch in t_cell_query.X("data"):

136

coordinates = batch.coords().to_pandas() # cell_id, gene_id

137

values = batch.values().to_pandas() # expression values

138

print(f"Expression batch: {len(values)} non-zero values")

139

140

# Convert to AnnData for downstream analysis

141

t_cell_adata = t_cell_query.to_anndata(

142

X_layer_name="data",

143

obsm_layers=["X_pca", "X_umap"],

144

column_names={

145

"obs": ["cell_type", "donor_id"],

146

"var": ["feature_name", "highly_variable"]

147

}

148

)

149

print(f"AnnData shape: {t_cell_adata.shape}")

150

151

# Query with coordinate-based selection

152

with tiledbsoma.open("experiment.soma") as exp:

153

# Select a coordinate range of cells and genes (coords select by position, not by gene variability)

154

subset_query = exp.axis_query(

155

"RNA",

156

obs_query=tiledbsoma.AxisQuery(coords=slice(0, 1000)),

157

var_query=tiledbsoma.AxisQuery(coords=slice(0, 2000))

158

)

159

160

# Process in batches

161

batch_size = 10000

162

for obs_batch in subset_query.obs(batch_size=batch_size):

163

obs_df = obs_batch.to_pandas()

164

print(f"Processing {len(obs_df)} observations")

165

166

# Process batch...

167

168

# Complex filtering query

169

with tiledbsoma.open("experiment.soma") as exp:

170

# Query high-quality cells with specific markers

171

quality_query = exp.axis_query(

172

"RNA",

173

obs_query=tiledbsoma.AxisQuery(

174

value_filter="""

175

n_genes >= 500 and n_genes <= 5000 and

176

total_counts >= 1000 and

177

pct_counts_mitochondrial <= 20 and

178

tissue == 'brain'

179

"""

180

),

181

var_query=tiledbsoma.AxisQuery(

182

value_filter="highly_variable == True and feature_type == 'Gene Expression'"

183

)

184

)

185

186

# Convert to AnnData with all available layers

187

brain_adata = quality_query.to_anndata(

188

X_layer_name="normalized",

189

obsm_layers=None, # Include all obsm layers

190

varm_layers=None # Include all varm layers

191

)

192

```

193

194

### AxisQuery Specification

195

196

The `AxisQuery` class specifies which elements of a single axis (observations or variables) to select: by coordinates, by a value filter on attribute values, or both.

197

198

```python { .api }

199

class AxisQuery:

200

def __init__(self, *, coords=None, value_filter=None):

201

"""

202

Create an axis query specification.

203

204

Parameters:

205

- coords: coordinate selection (slice, list, or array)

206

- value_filter: str, filter expression for attribute values

207

"""

208

```

209

210

#### Coordinate Selection Examples

211

212

```python

213

import tiledbsoma

214

215

# Various coordinate selection patterns

216

axis_queries = [

217

# Select specific indices

218

tiledbsoma.AxisQuery(coords=[0, 5, 10, 15, 20]),

219

220

# Select range with slice

221

tiledbsoma.AxisQuery(coords=slice(100, 500)),

222

223

# Select with step

224

tiledbsoma.AxisQuery(coords=slice(0, 1000, 10)), # Every 10th element

225

226

# Select all (equivalent to no coordinate filter)

227

tiledbsoma.AxisQuery(coords=slice(None)),

228

]

229

230

# Value filter examples

231

filter_queries = [

232

# Numeric comparisons

233

tiledbsoma.AxisQuery(value_filter="n_genes > 1000"),

234

235

# String matching

236

tiledbsoma.AxisQuery(value_filter="cell_type == 'B cells'"),

237

238

# Multiple conditions

239

tiledbsoma.AxisQuery(value_filter="n_genes > 500 and total_counts < 10000"),

240

241

# Set membership

242

tiledbsoma.AxisQuery(value_filter="donor_id in ['D1', 'D2', 'D3']"),

243

244

# Pattern matching

245

tiledbsoma.AxisQuery(value_filter="feature_name startswith 'MT-'"),

246

]

247

```

248

249

### IntIndexer

250

251

A re-indexer for unique integer indices, compatible with Pandas Index.get_indexer functionality. Useful for mapping between different index spaces efficiently.

252

253

```python { .api }

254

class IntIndexer:

255

def __init__(self, data, *, context=None):

256

"""

257

Initialize IntIndexer with integer keys.

258

259

Parameters:

260

- data: array-like of unique integers to index

261

- context: TileDB context for the operation

262

"""

263

264

def get_indexer(self, target):

265

"""

266

Compute underlying indices for target data.

267

268

Parameters:

269

- target: array-like of integers to find indices for

270

271

Returns:

272

numpy array of indices, with -1 for missing values

273

"""

274

```

275

276

#### Usage Example

277

278

```python

279

import tiledbsoma

280

import numpy as np

281

282

# Create indexer for soma_joinid values

283

original_ids = np.array([0, 5, 10, 15, 20, 25, 30])

284

indexer = tiledbsoma.IntIndexer(original_ids)

285

286

# Find positions of specific IDs

287

target_ids = np.array([5, 15, 99, 20]) # 99 doesn't exist

288

positions = indexer.get_indexer(target_ids)

289

print(positions) # [1, 3, -1, 4] (99 maps to -1)

290

291

# Use with SOMA data

292

with tiledbsoma.open("experiment.soma") as exp:

293

# Get all observation IDs

294

obs_ids = exp.obs.read(column_names=["soma_joinid"]).concat()["soma_joinid"].to_numpy()

295

296

# Create indexer

297

obs_indexer = tiledbsoma.IntIndexer(obs_ids)

298

299

# Map external IDs to SOMA positions

300

external_ids = np.array([100, 200, 300, 400])

301

soma_positions = obs_indexer.get_indexer(external_ids)

302

303

# Use positions for coordinate-based queries

304

valid_positions = soma_positions[soma_positions >= 0]

305

if len(valid_positions) > 0:

306

query = exp.axis_query(

307

"RNA",

308

obs_query=tiledbsoma.AxisQuery(coords=valid_positions)

309

)

310

```

311

312

### Index Building Function

313

314

Utility function for building indices on integer arrays.

315

316

```python { .api }

317

def tiledbsoma_build_index(data, *, context=None):

318

"""

319

Build index for integer array.

320

321

Parameters:

322

- data: array-like of integers to index

323

- context: TileDB context for the operation

324

325

Returns:

326

Built index structure for efficient lookups

327

"""

328

```

329

330

#### Usage Example

331

332

```python

333

import tiledbsoma

334

import numpy as np

335

336

# Build index for large ID array

337

large_id_array = np.random.randint(0, 1000000, size=100000)

338

index = tiledbsoma.tiledbsoma_build_index(large_id_array)

339

340

# Use index for efficient lookups

341

# (specific usage depends on implementation details)

342

```

343

344

### Query Performance Optimization

345

346

#### Batch Processing

347

348

```python

349

import tiledbsoma

350

351

# Efficient batch processing for large queries

352

with tiledbsoma.open("large_experiment.soma") as exp:

353

query = exp.axis_query("RNA")

354

355

# Process observations in batches

356

batch_size = 1000

357

total_processed = 0

358

359

for obs_batch in query.obs(batch_size=batch_size):

360

obs_df = obs_batch.to_pandas()

361

total_processed += len(obs_df)

362

363

# Process batch

364

print(f"Processed {total_processed} observations")

365

366

# Your analysis code here...

367

```

368

369

#### Parallel Processing

370

371

```python

372

import tiledbsoma

373

374

# Use partitions for parallel processing

375

with tiledbsoma.open("experiment.soma") as exp:

376

query = exp.axis_query("RNA")

377

378

# Create partitions for parallel execution

379

partitions = tiledbsoma.Partitions(n_partitions=4)

380

381

# Process each partition (shown sequentially here; dispatch each one to a separate worker for parallelism - conceptual, actual implementation may vary)

382

for partition_id in range(partitions.n_partitions):

383

obs_data = query.obs(partitions=partitions.get_partition(partition_id))

384

# Process partition...

385

```

386

387

#### Memory-Efficient Queries

388

389

```python

390

import tiledbsoma

391

392

# Memory-efficient processing of large datasets

393

with tiledbsoma.open("experiment.soma") as exp:

394

# Restrict the query to the cells of interest with a value filter

395

query = exp.axis_query(

396

"RNA",

397

obs_query=tiledbsoma.AxisQuery(

398

value_filter="quality_score > 0.8"

399

)

400

)

401

402

# Stream data without loading everything into memory

403

for expr_batch in query.X("data", batch_size=5000):

404

# Process expression batch

405

coords = expr_batch.coords()

406

values = expr_batch.values()

407

408

# Compute statistics, etc. without storing full dataset

409

print(f"Batch non-zero values: {len(values)}")

410

```

411

412

### Integration with Analysis Workflows

413

414

```python

415

import tiledbsoma

416

import scanpy as sc

417

418

# Integrated analysis workflow

419

with tiledbsoma.open("experiment.soma") as exp:

420

# Query high-quality cells

421

hq_query = exp.axis_query(

422

"RNA",

423

obs_query=tiledbsoma.AxisQuery(

424

value_filter="n_genes > 200 and pct_counts_mitochondrial < 20"

425

),

426

var_query=tiledbsoma.AxisQuery(

427

value_filter="n_cells > 3" # Genes expressed in at least 3 cells

428

)

429

)

430

431

# Convert to AnnData for Scanpy analysis

432

adata = hq_query.to_anndata(X_layer_name="raw")

433

434

# Standard single-cell analysis

435

sc.pp.normalize_total(adata, target_sum=1e4)

436

sc.pp.log1p(adata)

437

sc.pp.highly_variable_genes(adata, min_mean=0.0125, max_mean=3, min_disp=0.5)

438

439

# Continue with downstream analysis...

440

print(f"Processed {adata.n_obs} cells and {adata.n_vars} genes")

441

```

442

443

This query and indexing functionality provides the foundation for efficient, scalable analysis of single-cell datasets stored in SOMA format.