# Configuration and Options

Configuration classes for TileDB context management and platform-specific options for creating and writing SOMA objects. These classes provide fine-grained control over TileDB storage engine behavior and performance characteristics.

## Package Import

```python
from tiledbsoma.options import (
    SOMATileDBContext, TileDBCreateOptions, TileDBWriteOptions, ConfigDict
)
```

## Capabilities

### SOMATileDBContext

TileDB context configuration for SOMA operations. The context manages TileDB-specific settings including storage backends, memory limits, threading, and other platform configurations.

```python { .api }
class SOMATileDBContext:
    def __init__(self, config=None):
        """
        Initialize TileDB context for SOMA operations.

        Parameters:
        - config: dict or ConfigDict, TileDB configuration parameters
        """

    @property
    def config(self):
        """
        Get current TileDB configuration.

        Returns:
            ConfigDict: Current configuration settings
        """

    def __enter__(self):
        """Context manager entry."""
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        """Context manager exit."""
        pass
```

#### Usage Example

```python
import tiledbsoma
from tiledbsoma.options import SOMATileDBContext

# Basic context usage
with SOMATileDBContext() as ctx:
    with tiledbsoma.open("experiment.soma", context=ctx) as exp:
        # Operations use the specified context
        data = exp.obs.read().concat()

# Context with custom configuration
config = {
    "sm.memory_budget": 2**30,                    # 1GB memory budget
    "sm.tile_cache_size": 2**28,                  # 256MB tile cache
    "vfs.s3.max_parallel_ops": 16,                # S3 parallel operations
    "vfs.s3.multipart_part_size": 50*1024*1024    # 50MB parts
}

with SOMATileDBContext(config=config) as ctx:
    # Create objects with custom settings
    with tiledbsoma.DataFrame.create(
        "high_perf_dataframe.soma",
        schema=schema,
        context=ctx
    ) as df:
        df.write(data)
```

### ConfigDict

Configuration dictionary for TileDB context with structured parameter management.

```python { .api }
class ConfigDict:
    def __init__(self, config_dict=None):
        """
        Initialize configuration dictionary.

        Parameters:
        - config_dict: dict, initial configuration parameters
        """

    def __getitem__(self, key):
        """Get configuration parameter."""

    def __setitem__(self, key, value):
        """Set configuration parameter."""

    def update(self, other):
        """Update configuration with another dict or ConfigDict."""

    def items(self):
        """Iterate over configuration items."""
```

#### Usage Example

```python
from tiledbsoma.options import ConfigDict, SOMATileDBContext

# Create configuration
config = ConfigDict({
    "sm.memory_budget": 1024**3,         # 1GB
    "sm.tile_cache_size": 512*1024**2,   # 512MB
})

# Update configuration
config.update({
    "vfs.num_threads": 8,
    "sm.compute_concurrency_level": 4
})

# Use with context
with SOMATileDBContext(config=config) as ctx:
    # Context uses the configuration
    pass
```

### TileDBCreateOptions

Platform-specific options for creating TileDB objects. These options control storage layout, compression, encryption, and other creation-time parameters.

```python { .api }
class TileDBCreateOptions:
    def __init__(self, **kwargs):
        """
        Initialize TileDB creation options.

        Parameters:
        - **kwargs: TileDB-specific creation parameters

        Common Parameters:
        - tile_order: str, tile order ("row-major" or "col-major")
        - cell_order: str, cell order ("row-major" or "col-major")
        - capacity: int, tile capacity
        - sparse: bool, whether array is sparse
        - allows_duplicates: bool, whether duplicates are allowed
        - offsets_compression: str, compression for offsets
        - offsets_compression_level: int, compression level for offsets
        - validity_compression: str, compression for validity data
        - validity_compression_level: int, compression level for validity
        """

    def __getitem__(self, key):
        """Get creation option."""

    def __setitem__(self, key, value):
        """Set creation option."""

    def get(self, key, default=None):
        """Get creation option with default."""
```

#### Usage Example

```python
import tiledbsoma
from tiledbsoma.options import TileDBCreateOptions
import pyarrow as pa

# Create options for high-performance sparse array
create_opts = TileDBCreateOptions(
    capacity=100000,                 # Large tile capacity
    tile_order="row-major",
    cell_order="row-major",
    allows_duplicates=False,
    # Compression settings
    offsets_compression="lz4",
    offsets_compression_level=1,
    validity_compression="rle"       # Run-length encoding
)

# Use options when creating arrays
with tiledbsoma.SparseNDArray.create(
    "optimized_matrix.soma",
    type=pa.float32(),
    shape=(10000, 5000),
    platform_config=create_opts
) as array:
    # Array created with optimized settings
    pass

# Create options for dataframes
df_create_opts = TileDBCreateOptions(
    capacity=10000,
    tile_order="row-major",
    offsets_compression="zstd",
    offsets_compression_level=3
)

with tiledbsoma.DataFrame.create(
    "optimized_dataframe.soma",
    schema=schema,
    platform_config=df_create_opts
) as df:
    df.write(data)
```

### TileDBWriteOptions

Platform-specific options for writing to TileDB objects. These options control write behavior, memory usage, and performance characteristics during data ingestion.

```python { .api }
class TileDBWriteOptions:
    def __init__(self, **kwargs):
        """
        Initialize TileDB write options.

        Parameters:
        - **kwargs: TileDB-specific write parameters

        Common Parameters:
        - batch_size: int, number of elements per write batch
        - memory_budget: int, memory budget for writes in bytes
        - check_coord_dups: bool, check for coordinate duplicates
        - check_coord_oob: bool, check for out-of-bounds coordinates
        - dedup_coords: bool, deduplicate coordinates
        """

    def __getitem__(self, key):
        """Get write option."""

    def __setitem__(self, key, value):
        """Set write option."""

    def get(self, key, default=None):
        """Get write option with default."""
```

#### Usage Example

```python
import tiledbsoma
from tiledbsoma.options import TileDBWriteOptions
import numpy as np
import pyarrow as pa

# Write options for large bulk loads
write_opts = TileDBWriteOptions(
    batch_size=50000,           # Large batches
    memory_budget=2**30,        # 1GB memory budget
    check_coord_dups=False,     # Skip duplicate check for performance
    check_coord_oob=False,      # Skip bounds check for performance
    dedup_coords=False          # Skip deduplication
)

# Use write options for bulk data loading
with tiledbsoma.open("large_matrix.soma", mode="w") as array:
    # Generate large dataset
    n_nonzero = 1000000
    cell_ids = np.random.randint(0, 50000, n_nonzero)
    gene_ids = np.random.randint(0, 20000, n_nonzero)
    values = np.random.exponential(2.0, n_nonzero)

    coords = pa.table({
        "soma_dim_0": cell_ids,
        "soma_dim_1": gene_ids
    })
    data = pa.table({
        "soma_data": values
    })

    # Write with optimized settings
    array.write((coords, data), platform_config=write_opts)

# Write options for safety-first approach
safe_write_opts = TileDBWriteOptions(
    check_coord_dups=True,
    check_coord_oob=True,
    dedup_coords=True,
    memory_budget=512*1024**2   # 512MB
)

with tiledbsoma.open("safe_dataframe.soma", mode="w") as df:
    df.write(data, platform_config=safe_write_opts)
```

### Common Configuration Patterns

#### Cloud Storage Configuration

```python
from tiledbsoma.options import SOMATileDBContext, ConfigDict

# S3 configuration
s3_config = ConfigDict({
    # S3 settings
    "vfs.s3.aws_access_key_id": "your_access_key",
    "vfs.s3.aws_secret_access_key": "your_secret_key",
    "vfs.s3.region": "us-west-2",
    "vfs.s3.max_parallel_ops": 16,
    "vfs.s3.multipart_part_size": 100*1024*1024,  # 100MB
    "vfs.s3.use_virtual_addressing": "true",

    # Performance settings
    "sm.memory_budget": 4*1024**3,    # 4GB
    "sm.tile_cache_size": 1024**3,    # 1GB
    "vfs.num_threads": 16
})

with SOMATileDBContext(config=s3_config) as ctx:
    # Work with S3-stored data
    with tiledbsoma.open("s3://my-bucket/experiment.soma", context=ctx) as exp:
        data = exp.obs.read().concat()

# Azure Blob Storage configuration
azure_config = ConfigDict({
    "vfs.azure.storage_account_name": "myaccount",
    "vfs.azure.storage_account_key": "mykey",
    "vfs.azure.max_parallel_ops": 16,
    "sm.memory_budget": 2*1024**3,
    "vfs.num_threads": 8
})
```

#### High-Performance Local Storage

```python
from tiledbsoma.options import SOMATileDBContext, TileDBCreateOptions, TileDBWriteOptions

# High-performance local configuration
local_config = ConfigDict({
    "sm.memory_budget": 8*1024**3,      # 8GB memory
    "sm.tile_cache_size": 2*1024**3,    # 2GB cache
    "sm.compute_concurrency_level": 8,
    "vfs.num_threads": 16,
    "sm.io_concurrency_level": 4
})

# Optimized creation options
create_opts = TileDBCreateOptions(
    capacity=100000,
    tile_order="row-major",
    offsets_compression="lz4",
    offsets_compression_level=1
)

# Optimized write options
write_opts = TileDBWriteOptions(
    batch_size=100000,
    memory_budget=4*1024**3,
    check_coord_dups=False
)

# Combined usage
with SOMATileDBContext(config=local_config) as ctx:
    with tiledbsoma.SparseNDArray.create(
        "fast_array.soma",
        type=pa.float32(),
        shape=(100000, 50000),
        context=ctx,
        platform_config=create_opts
    ) as array:
        # Fast bulk loading
        array.write(data, platform_config=write_opts)
```

#### Memory-Constrained Configuration

```python
# Configuration for memory-limited environments
low_memory_config = ConfigDict({
    "sm.memory_budget": 256*1024**2,    # 256MB
    "sm.tile_cache_size": 64*1024**2,   # 64MB
    "vfs.num_threads": 2,
    "sm.compute_concurrency_level": 1
})

conservative_write_opts = TileDBWriteOptions(
    batch_size=1000,
    memory_budget=128*1024**2,          # 128MB
    check_coord_dups=True,
    check_coord_oob=True
)

with SOMATileDBContext(config=low_memory_config) as ctx:
    # Memory-efficient operations
    with tiledbsoma.open("data.soma", context=ctx) as obj:
        # Process in small batches
        for batch in obj.read(batch_size=1000):
            # Process batch
            pass
```

### Context Management Best Practices

```python
import tiledbsoma
from tiledbsoma.options import SOMATileDBContext

# Context sharing across operations
config = {"sm.memory_budget": 2*1024**3}

with SOMATileDBContext(config=config) as ctx:
    # Create experiment with shared context
    with tiledbsoma.Experiment.create("exp.soma", context=ctx) as exp:
        # All operations share the same context
        exp.add_new_dataframe("obs", schema=obs_schema)

        with exp.obs as obs_df:
            obs_df.write(obs_data)

    # Read operations with same context
    with tiledbsoma.open("exp.soma", context=ctx) as exp:
        data = exp.obs.read().concat()
```

This configuration system provides comprehensive control over TileDB-SOMA performance and behavior, enabling optimization for different use cases from high-throughput cloud deployments to memory-constrained local analysis.