# Configuration and Options

Configuration classes for TileDB context management and platform-specific options for creating and writing SOMA objects. These classes provide fine-grained control over TileDB storage engine behavior and performance characteristics.

## Package Import

```python
from tiledbsoma.options import (
    SOMATileDBContext, TileDBCreateOptions, TileDBWriteOptions, ConfigDict
)
```

## Capabilities

### SOMATileDBContext

TileDB context configuration for SOMA operations. The context manages TileDB-specific settings including storage backends, memory limits, threading, and other platform configurations.

```python { .api }
class SOMATileDBContext:
    def __init__(self, config=None):
        """
        Initialize TileDB context for SOMA operations.

        Parameters:
        - config: dict or ConfigDict, TileDB configuration parameters
        """

    @property
    def config(self):
        """
        Get current TileDB configuration.

        Returns:
            ConfigDict: Current configuration settings
        """

    def __enter__(self):
        """Context manager entry."""
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        """Context manager exit."""
        pass
```

#### Usage Example

```python
import tiledbsoma
from tiledbsoma.options import SOMATileDBContext

# Basic context usage
with SOMATileDBContext() as ctx:
    with tiledbsoma.open("experiment.soma", context=ctx) as exp:
        # Operations use the specified context
        data = exp.obs.read().concat()

# Context with custom configuration
config = {
    "sm.memory_budget": 2**30,                    # 1GB memory budget
    "sm.tile_cache_size": 2**28,                  # 256MB tile cache
    "vfs.s3.max_parallel_ops": 16,                # S3 parallel operations
    "vfs.s3.multipart_part_size": 50*1024*1024    # 50MB parts
}

with SOMATileDBContext(config=config) as ctx:
    # Create objects with custom settings
    with tiledbsoma.DataFrame.create(
        "high_perf_dataframe.soma",
        schema=schema,
        context=ctx
    ) as df:
        df.write(data)
```

### ConfigDict

Configuration dictionary for TileDB context with structured parameter management.

```python { .api }
class ConfigDict:
    def __init__(self, config_dict=None):
        """
        Initialize configuration dictionary.

        Parameters:
        - config_dict: dict, initial configuration parameters
        """

    def __getitem__(self, key):
        """Get configuration parameter."""

    def __setitem__(self, key, value):
        """Set configuration parameter."""

    def update(self, other):
        """Update configuration with another dict or ConfigDict."""

    def items(self):
        """Iterate over configuration items."""
```

#### Usage Example

```python
from tiledbsoma.options import ConfigDict, SOMATileDBContext

# Create configuration
config = ConfigDict({
    "sm.memory_budget": 1024**3,         # 1GB
    "sm.tile_cache_size": 512*1024**2,   # 512MB
})

# Update configuration
config.update({
    "vfs.num_threads": 8,
    "sm.compute_concurrency_level": 4
})

# Use with context
with SOMATileDBContext(config=config) as ctx:
    # Context uses the configuration
    pass
```

### TileDBCreateOptions

Platform-specific options for creating TileDB objects. These options control storage layout, compression, encryption, and other creation-time parameters.

```python { .api }
class TileDBCreateOptions:
    def __init__(self, **kwargs):
        """
        Initialize TileDB creation options.

        Parameters:
        - **kwargs: TileDB-specific creation parameters

        Common Parameters:
        - tile_order: str, tile order ("row-major" or "col-major")
        - cell_order: str, cell order ("row-major" or "col-major")
        - capacity: int, tile capacity
        - sparse: bool, whether array is sparse
        - allows_duplicates: bool, whether duplicates are allowed
        - offsets_compression: str, compression for offsets
        - offsets_compression_level: int, compression level for offsets
        - validity_compression: str, compression for validity data
        - validity_compression_level: int, compression level for validity
        """

    def __getitem__(self, key):
        """Get creation option."""

    def __setitem__(self, key, value):
        """Set creation option."""

    def get(self, key, default=None):
        """Get creation option with default."""
```

#### Usage Example

```python
import tiledbsoma
from tiledbsoma.options import TileDBCreateOptions
import pyarrow as pa

# Create options for high-performance sparse array
create_opts = TileDBCreateOptions(
    capacity=100000,                 # Large tile capacity
    tile_order="row-major",
    cell_order="row-major",
    allows_duplicates=False,
    # Compression settings
    offsets_compression="lz4",
    offsets_compression_level=1,
    validity_compression="rle"       # Run-length encoding
)

# Use options when creating arrays
with tiledbsoma.SparseNDArray.create(
    "optimized_matrix.soma",
    type=pa.float32(),
    shape=(10000, 5000),
    platform_config=create_opts
) as array:
    # Array created with optimized settings
    pass

# Create options for dataframes
df_create_opts = TileDBCreateOptions(
    capacity=10000,
    tile_order="row-major",
    offsets_compression="zstd",
    offsets_compression_level=3
)

with tiledbsoma.DataFrame.create(
    "optimized_dataframe.soma",
    schema=schema,
    platform_config=df_create_opts
) as df:
    df.write(data)
```

### TileDBWriteOptions

Platform-specific options for writing to TileDB objects. These options control write behavior, memory usage, and performance characteristics during data ingestion.

```python { .api }
class TileDBWriteOptions:
    def __init__(self, **kwargs):
        """
        Initialize TileDB write options.

        Parameters:
        - **kwargs: TileDB-specific write parameters

        Common Parameters:
        - batch_size: int, number of elements per write batch
        - memory_budget: int, memory budget for writes in bytes
        - check_coord_dups: bool, check for coordinate duplicates
        - check_coord_oob: bool, check for out-of-bounds coordinates
        - dedup_coords: bool, deduplicate coordinates
        """

    def __getitem__(self, key):
        """Get write option."""

    def __setitem__(self, key, value):
        """Set write option."""

    def get(self, key, default=None):
        """Get write option with default."""
```

#### Usage Example

```python
import tiledbsoma
from tiledbsoma.options import TileDBWriteOptions
import numpy as np
import pyarrow as pa

# Write options for large bulk loads
write_opts = TileDBWriteOptions(
    batch_size=50000,           # Large batches
    memory_budget=2**30,        # 1GB memory budget
    check_coord_dups=False,     # Skip duplicate check for performance
    check_coord_oob=False,      # Skip bounds check for performance
    dedup_coords=False          # Skip deduplication
)

# Use write options for bulk data loading
with tiledbsoma.open("large_matrix.soma", mode="w") as array:
    # Generate large dataset
    n_nonzero = 1000000
    cell_ids = np.random.randint(0, 50000, n_nonzero)
    gene_ids = np.random.randint(0, 20000, n_nonzero)
    values = np.random.exponential(2.0, n_nonzero)

    coords = pa.table({
        "soma_dim_0": cell_ids,
        "soma_dim_1": gene_ids
    })
    data = pa.table({
        "soma_data": values
    })

    # Write with optimized settings
    array.write((coords, data), platform_config=write_opts)

# Write options for safety-first approach
safe_write_opts = TileDBWriteOptions(
    check_coord_dups=True,
    check_coord_oob=True,
    dedup_coords=True,
    memory_budget=512*1024**2   # 512MB
)

with tiledbsoma.open("safe_dataframe.soma", mode="w") as df:
    df.write(data, platform_config=safe_write_opts)
```

### Common Configuration Patterns

#### Cloud Storage Configuration

```python
from tiledbsoma.options import SOMATileDBContext, ConfigDict

# S3 configuration
s3_config = ConfigDict({
    # S3 settings
    "vfs.s3.aws_access_key_id": "your_access_key",
    "vfs.s3.aws_secret_access_key": "your_secret_key",
    "vfs.s3.region": "us-west-2",
    "vfs.s3.max_parallel_ops": 16,
    "vfs.s3.multipart_part_size": 100*1024*1024,  # 100MB
    "vfs.s3.use_virtual_addressing": "true",

    # Performance settings
    "sm.memory_budget": 4*1024**3,    # 4GB
    "sm.tile_cache_size": 1024**3,    # 1GB
    "vfs.num_threads": 16
})

with SOMATileDBContext(config=s3_config) as ctx:
    # Work with S3-stored data
    with tiledbsoma.open("s3://my-bucket/experiment.soma", context=ctx) as exp:
        data = exp.obs.read().concat()

# Azure Blob Storage configuration
azure_config = ConfigDict({
    "vfs.azure.storage_account_name": "myaccount",
    "vfs.azure.storage_account_key": "mykey",
    "vfs.azure.max_parallel_ops": 16,
    "sm.memory_budget": 2*1024**3,
    "vfs.num_threads": 8
})
```

#### High-Performance Local Storage

```python
from tiledbsoma.options import SOMATileDBContext, TileDBCreateOptions, TileDBWriteOptions

# High-performance local configuration
local_config = ConfigDict({
    "sm.memory_budget": 8*1024**3,      # 8GB memory
    "sm.tile_cache_size": 2*1024**3,    # 2GB cache
    "sm.compute_concurrency_level": 8,
    "vfs.num_threads": 16,
    "sm.io_concurrency_level": 4
})

# Optimized creation options
create_opts = TileDBCreateOptions(
    capacity=100000,
    tile_order="row-major",
    offsets_compression="lz4",
    offsets_compression_level=1
)

# Optimized write options
write_opts = TileDBWriteOptions(
    batch_size=100000,
    memory_budget=4*1024**3,
    check_coord_dups=False
)

# Combined usage
with SOMATileDBContext(config=local_config) as ctx:
    with tiledbsoma.SparseNDArray.create(
        "fast_array.soma",
        type=pa.float32(),
        shape=(100000, 50000),
        context=ctx,
        platform_config=create_opts
    ) as array:
        # Fast bulk loading
        array.write(data, platform_config=write_opts)
```

#### Memory-Constrained Configuration

```python
# Configuration for memory-limited environments
low_memory_config = ConfigDict({
    "sm.memory_budget": 256*1024**2,    # 256MB
    "sm.tile_cache_size": 64*1024**2,   # 64MB
    "vfs.num_threads": 2,
    "sm.compute_concurrency_level": 1
})

conservative_write_opts = TileDBWriteOptions(
    batch_size=1000,
    memory_budget=128*1024**2,          # 128MB
    check_coord_dups=True,
    check_coord_oob=True
)

with SOMATileDBContext(config=low_memory_config) as ctx:
    # Memory-efficient operations
    with tiledbsoma.open("data.soma", context=ctx) as obj:
        # Process in small batches
        for batch in obj.read(batch_size=1000):
            # Process batch
            pass
```

### Context Management Best Practices

```python
import tiledbsoma
from tiledbsoma.options import SOMATileDBContext

# Context sharing across operations
config = {"sm.memory_budget": 2*1024**3}

with SOMATileDBContext(config=config) as ctx:
    # Create experiment with shared context
    with tiledbsoma.Experiment.create("exp.soma", context=ctx) as exp:
        # All operations share the same context
        exp.add_new_dataframe("obs", schema=obs_schema)

        with exp.obs as obs_df:
            obs_df.write(obs_data)

    # Read operations with same context
    with tiledbsoma.open("exp.soma", context=ctx) as exp:
        data = exp.obs.read().concat()
```

This configuration system provides comprehensive control over TileDB-SOMA performance and behavior, enabling optimization for different use cases from high-throughput cloud deployments to memory-constrained local analysis.