or run

npx @tessl/cli init
Log in

Version

Tile

Overview

Evals

Files

Files

docs

configuration.md · core-data-structures.md · data-io.md · index.md · query-indexing.md · single-cell-biology.md · spatial-data.md

docs/core-data-structures.md

# Core Data Structures

The fundamental SOMA data types that provide the building blocks for storing and organizing scientific data. These include Collections for hierarchical organization, DataFrames for tabular data, and sparse/dense N-dimensional arrays for numerical data storage.

## Capabilities

### Collection

A string-keyed container that can hold any SOMA object type, enabling hierarchical organization of data. Collections provide the foundation for complex data structures and can contain other collections, dataframes, or arrays.

```python { .api }
class Collection:
    @classmethod
    def create(cls, uri, *, platform_config=None, context=None, tiledb_timestamp=None):
        """
        Create a new Collection.

        Parameters:
        - uri: str, URI for the collection
        - platform_config: TileDB-specific configuration options
        - context: TileDB context for the operation
        - tiledb_timestamp: Timestamp for temporal queries

        Returns:
            Collection instance
        """

    def add_new_collection(self, key, **kwargs):
        """
        Add a new sub-collection.

        Parameters:
        - key: str, name for the new collection
        - **kwargs: Additional arguments passed to Collection.create()

        Returns:
            Collection instance
        """

    def add_new_dataframe(self, key, **kwargs):
        """
        Add a new DataFrame to the collection.

        Parameters:
        - key: str, name for the new dataframe
        - **kwargs: Additional arguments passed to DataFrame.create()

        Returns:
            DataFrame instance
        """

    def add_new_dense_ndarray(self, key, **kwargs):
        """
        Add a new DenseNDArray to the collection.

        Parameters:
        - key: str, name for the new array
        - **kwargs: Additional arguments passed to DenseNDArray.create()

        Returns:
            DenseNDArray instance
        """

    def add_new_sparse_ndarray(self, key, **kwargs):
        """
        Add a new SparseNDArray to the collection.

        Parameters:
        - key: str, name for the new array
        - **kwargs: Additional arguments passed to SparseNDArray.create()

        Returns:
            SparseNDArray instance
        """

    def members(self):
        """
        Get collection member names and types.

        Returns:
            dict: Mapping of member names to their SOMA types
        """

    def __getitem__(self, key):
        """
        Access collection members by key.

        Parameters:
        - key: str, member name

        Returns:
            SOMA object at the specified key
        """

    def keys(self):
        """
        Get collection member names.

        Returns:
            Iterator of member names
        """
```

#### Usage Example

```python
import tiledbsoma

# Create a root collection
with tiledbsoma.Collection.create("my_experiment.soma") as collection:
    # Add sub-collections for organization
    collection.add_new_collection("raw_data")
    collection.add_new_collection("processed_data")

    # Add data structures
    collection.add_new_dataframe("observations", schema=obs_schema)
    collection.add_new_sparse_ndarray("expression_matrix", type=pa.float32(), shape=(1000, 2000))

# Access collection members
with tiledbsoma.open("my_experiment.soma") as collection:
    obs_df = collection["observations"]
    expr_matrix = collection["expression_matrix"]
```

### DataFrame

A multi-column table with a user-defined Arrow schema. All DataFrames must contain a `soma_joinid` column of type int64, which serves as the primary index for joining with other data structures.

```python { .api }
class DataFrame:
    @classmethod
    def create(cls, uri, *, schema, domain=None, platform_config=None, context=None, tiledb_timestamp=None):
        """
        Create a new DataFrame.

        Parameters:
        - uri: str, URI for the dataframe
        - schema: pyarrow.Schema, column schema including soma_joinid
        - domain: list of tuples, domain bounds for each dimension (optional)
        - platform_config: TileDB-specific configuration options
        - context: TileDB context for the operation
        - tiledb_timestamp: Timestamp for temporal queries

        Returns:
            DataFrame instance
        """

    def read(self, coords=(), value_filter=None, column_names=None, result_order=None, batch_size=None, partitions=None, platform_config=None):
        """
        Read data from the DataFrame.

        Parameters:
        - coords: tuple, coordinate selection for soma_joinid
        - value_filter: str, filter expression for attribute values
        - column_names: list of str, specific columns to read
        - result_order: ResultOrder, result ordering preference
        - batch_size: int, number of rows per batch
        - partitions: Partitions object for parallel reading
        - platform_config: TileDB-specific configuration options

        Returns:
            Iterator of Arrow tables
        """

    def write(self, values, platform_config=None):
        """
        Write data to the DataFrame.

        Parameters:
        - values: pyarrow.Table, data to write
        - platform_config: TileDB-specific configuration options
        """

    def keys(self):
        """
        Get column names.

        Returns:
            list of str: Column names
        """

    def count(self):
        """
        Get the number of rows in the DataFrame.

        Returns:
            int: Number of rows
        """

    def domain(self):
        """
        Get the domain bounds for each dimension.

        Returns:
            tuple: Domain bounds (min, max) for soma_joinid
        """

    def tiledbsoma_upgrade_domain(self, newdomain, check_only=False):
        """
        Upgrade the domain bounds.

        Parameters:
        - newdomain: tuple, new domain bounds
        - check_only: bool, if True, only check if upgrade is possible

        Returns:
            bool: True if upgrade was successful or is possible
        """

    def tiledbsoma_resize_soma_joinid_shape(self, newshape, check_only=False):
        """
        Resize the soma_joinid dimension shape.

        Parameters:
        - newshape: int, new maximum soma_joinid value
        - check_only: bool, if True, only check if resize is possible

        Returns:
            bool: True if resize was successful or is possible
        """

    @property
    def schema(self):
        """
        Get the Arrow schema.

        Returns:
            pyarrow.Schema: The dataframe schema
        """

    def maxdomain(self):
        """
        Get the maximum domain bounds.

        Returns:
            tuple: Maximum domain bounds for each dimension
        """

    def index_column_names(self):
        """
        Get the names of index columns.

        Returns:
            tuple of str: Index column names
        """

    def get_enumeration_values(self, enum_name):
        """
        Get enumeration values for a categorical column.

        Parameters:
        - enum_name: str, name of the enumeration

        Returns:
            list: Enumeration values
        """

    def extend_enumeration_values(self, enum_name, new_values):
        """
        Extend enumeration with new values.

        Parameters:
        - enum_name: str, name of the enumeration
        - new_values: list, new values to add
        """

    def tiledbsoma_has_upgraded_domain(self):
        """
        Check if domain has been upgraded.

        Returns:
            bool: True if domain has been upgraded
        """

    def tiledbsoma_upgrade_soma_joinid_shape(self, newshape, check_only=False):
        """
        Upgrade soma_joinid dimension shape.

        Parameters:
        - newshape: int, new shape for soma_joinid dimension
        - check_only: bool, if True, only check if upgrade is possible

        Returns:
            bool or None: Result of upgrade operation
        """

    def change_domain(self, newdomain, check_only=False):
        """
        Change the domain configuration.

        Parameters:
        - newdomain: tuple, new domain bounds
        - check_only: bool, if True, only check if change is possible
        """
```

#### Usage Example

```python
import tiledbsoma
import pyarrow as pa

# Define schema with required soma_joinid column
schema = pa.schema([
    ("soma_joinid", pa.int64()),
    ("cell_type", pa.string()),
    ("tissue", pa.string()),
    ("donor_id", pa.string()),
    ("total_counts", pa.int32())
])

# Create and write data
with tiledbsoma.DataFrame.create("cell_metadata.soma", schema=schema) as df:
    data = pa.table({
        "soma_joinid": [0, 1, 2, 3, 4],
        "cell_type": ["T-cell", "B-cell", "Neuron", "Astrocyte", "Hepatocyte"],
        "tissue": ["blood", "blood", "brain", "brain", "liver"],
        "donor_id": ["D1", "D1", "D2", "D2", "D3"],
        "total_counts": [1500, 2000, 800, 1200, 1800]
    })
    df.write(data)

# Read with filtering
with tiledbsoma.open("cell_metadata.soma") as df:
    # Filter for brain tissue cells
    brain_cells = df.read(
        value_filter="tissue == 'brain'",
        column_names=["soma_joinid", "cell_type", "total_counts"]
    ).concat()
    print(brain_cells.to_pandas())
```

### SparseNDArray

A sparse N-dimensional array with offset (0-based) integer indexing. Dimensions are named `soma_dim_0`, `soma_dim_1`, etc., and stored values are named `soma_data`. Sparse arrays only store non-zero values, making them memory-efficient for data with many zeros.

```python { .api }
class SparseNDArray:
    @classmethod
    def create(cls, uri, *, type, shape, platform_config=None, context=None, tiledb_timestamp=None):
        """
        Create a new SparseNDArray.

        Parameters:
        - uri: str, URI for the array
        - type: pyarrow data type for stored values
        - shape: tuple of int, array dimensions
        - platform_config: TileDB-specific configuration options
        - context: TileDB context for the operation
        - tiledb_timestamp: Timestamp for temporal queries

        Returns:
            SparseNDArray instance
        """

    def read(self, coords=(), result_order=None, batch_size=None, partitions=None, platform_config=None):
        """
        Read data from the sparse array.

        Parameters:
        - coords: tuple of slices/arrays, coordinate selection for each dimension
        - result_order: ResultOrder, result ordering preference
        - batch_size: int, number of elements per batch
        - partitions: Partitions object for parallel reading
        - platform_config: TileDB-specific configuration options

        Returns:
            SparseNDArrayRead iterator
        """

    def write(self, values, platform_config=None):
        """
        Write sparse data to the array.

        Parameters:
        - values: tuple of (coordinates_table, values_table)
          - coordinates_table: pyarrow.Table with soma_dim_* columns
          - values_table: pyarrow.Table with soma_data column
        - platform_config: TileDB-specific configuration options
        """

    @property
    def shape(self):
        """
        Get array dimensions.

        Returns:
            tuple of int: Array shape
        """

    @property
    def nnz(self):
        """
        Get number of non-zero elements.

        Returns:
            int: Number of stored (non-zero) elements
        """

    @property
    def schema(self):
        """
        Get the Arrow schema for coordinates and values.

        Returns:
            pyarrow.Schema: Schema for the array data
        """
```

#### Usage Example

```python
import tiledbsoma
import pyarrow as pa
import numpy as np

# Create a sparse 2D array for gene expression (cells x genes)
with tiledbsoma.SparseNDArray.create(
    "expression_matrix.soma",
    type=pa.float32(),
    shape=(1000, 2000)  # 1000 cells, 2000 genes
) as sparse_array:

    # Generate sparse data (only non-zero expression values)
    np.random.seed(42)
    n_nonzero = 5000
    cell_ids = np.random.randint(0, 1000, n_nonzero)
    gene_ids = np.random.randint(0, 2000, n_nonzero)
    expression_values = np.random.exponential(2.0, n_nonzero)

    # Prepare coordinate and value tables
    coordinates = pa.table({
        "soma_dim_0": cell_ids,  # cell dimension
        "soma_dim_1": gene_ids   # gene dimension
    })
    values = pa.table({
        "soma_data": expression_values
    })

    # Write sparse data
    sparse_array.write((coordinates, values))

# Read sparse data back
with tiledbsoma.open("expression_matrix.soma") as sparse_array:
    print(f"Array shape: {sparse_array.shape}")
    print(f"Non-zero elements: {sparse_array.nnz}")

    # Read subset of data (first 100 cells, all genes)
    reader = sparse_array.read(coords=(slice(0, 100), slice(None)))
    for batch in reader:
        coords_df = batch.coords().to_pandas()
        values_df = batch.values().to_pandas()
        print(f"Batch: {len(coords_df)} non-zero values")
```

### DenseNDArray

A dense N-dimensional array with offset (0-based) integer indexing. Like sparse arrays, dimensions are named `soma_dim_0`, `soma_dim_1`, etc., and values are named `soma_data`. Dense arrays store values for all coordinate positions, making them suitable for data without sparsity.

```python { .api }
class DenseNDArray:
    @classmethod
    def create(cls, uri, *, type, shape, platform_config=None, context=None, tiledb_timestamp=None):
        """
        Create a new DenseNDArray.

        Parameters:
        - uri: str, URI for the array
        - type: pyarrow data type for stored values
        - shape: tuple of int, array dimensions
        - platform_config: TileDB-specific configuration options
        - context: TileDB context for the operation
        - tiledb_timestamp: Timestamp for temporal queries

        Returns:
            DenseNDArray instance
        """

    def read(self, coords=(), result_order=None, batch_size=None, partitions=None, platform_config=None):
        """
        Read data from the dense array.

        Parameters:
        - coords: tuple of slices/arrays, coordinate selection for each dimension
        - result_order: ResultOrder, result ordering preference
        - batch_size: int, number of elements per batch
        - partitions: Partitions object for parallel reading
        - platform_config: TileDB-specific configuration options

        Returns:
            pyarrow.Tensor with requested data
        """

    def write(self, coords, values, platform_config=None):
        """
        Write dense data to the array.

        Parameters:
        - coords: tuple of slices, coordinate region to write
        - values: numpy array or Arrow tensor with data to write
        - platform_config: TileDB-specific configuration options
        """

    @property
    def shape(self):
        """
        Get array dimensions.

        Returns:
            tuple of int: Array shape
        """

    @property
    def schema(self):
        """
        Get the Arrow schema for the array.

        Returns:
            pyarrow.Schema: Schema for the array data
        """
```

#### Usage Example

```python
import tiledbsoma
import pyarrow as pa
import numpy as np

# Create a dense 2D array for embedding coordinates
with tiledbsoma.DenseNDArray.create(
    "cell_embeddings.soma",
    type=pa.float64(),
    shape=(1000, 50)  # 1000 cells, 50 embedding dimensions
) as dense_array:

    # Generate embedding data (PCA coordinates)
    np.random.seed(42)
    embeddings = np.random.normal(0, 1, (1000, 50))

    # Write all data at once
    dense_array.write(
        coords=(slice(None), slice(None)),  # Write entire array
        values=embeddings
    )

# Read dense data back
with tiledbsoma.open("cell_embeddings.soma") as dense_array:
    print(f"Array shape: {dense_array.shape}")

    # Read subset (first 10 cells, first 5 dimensions)
    subset = dense_array.read(coords=(slice(0, 10), slice(0, 5)))
    print("First 10 cells, first 5 PCA dimensions:")
    print(subset.to_numpy())

    # Read specific cells by index
    cell_indices = [0, 50, 100, 200, 500]
    selected_cells = dense_array.read(coords=(cell_indices, slice(None)))
    print(f"Selected cells embedding shape: {selected_cells.to_numpy().shape}")
```

## Factory Function

```python { .api }
def open(uri, mode="r", *, soma_type=None, context=None, tiledb_timestamp=None):
    """
    Open any SOMA object at the specified URI.

    Parameters:
    - uri: str, URI of the SOMA object to open
    - mode: str, access mode ("r" for read, "w" for write)
    - soma_type: str, expected SOMA type (optional, auto-detected if not provided)
    - context: TileDB context for the operation
    - tiledb_timestamp: Timestamp for temporal queries

    Returns:
        SOMA object of the appropriate type (Collection, DataFrame, etc.)
    """
```