Python API for efficient storage and retrieval of single-cell data using TileDB
Configuration classes for TileDB context management and platform-specific options for creating and writing SOMA objects. These classes provide fine-grained control over TileDB storage engine behavior and performance characteristics.
# Import the configuration classes:
from tiledbsoma.options import (
    SOMATileDBContext,
    TileDBCreateOptions,
    TileDBWriteOptions,
    ConfigDict,
)

# TileDB context configuration for SOMA operations. The context manages
# TileDB-specific settings including storage backends, memory limits,
# threading, and other platform configurations.
class SOMATileDBContext:
    """TileDB context for SOMA operations.

    Manages TileDB-specific settings (storage backend, memory limits,
    threading) that are shared by every operation opened with this
    context. Usable as a context manager.
    """

    def __init__(self, config=None):
        """
        Initialize TileDB context for SOMA operations.

        Parameters:
        - config: dict or ConfigDict, TileDB configuration parameters
        """

    @property
    def config(self):
        """
        Get current TileDB configuration.

        Returns:
        ConfigDict: Current configuration settings
        """

    def __enter__(self):
        """Context manager entry."""
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        """Context manager exit."""
        pass


import tiledbsoma
from tiledbsoma.options import SOMATileDBContext

# Basic context usage
with SOMATileDBContext() as ctx:
    with tiledbsoma.open("experiment.soma", context=ctx) as exp:
        # Operations use the specified context
        data = exp.obs.read().concat()

# Context with custom configuration
config = {
    "sm.memory_budget": 2**30,                       # 1GB memory budget
    "sm.tile_cache_size": 2**28,                     # 256MB tile cache
    "vfs.s3.max_parallel_ops": 16,                   # S3 parallel operations
    "vfs.s3.multipart_part_size": 50 * 1024 * 1024,  # 50MB parts
}

with SOMATileDBContext(config=config) as ctx:
    # Create objects with custom settings
    with tiledbsoma.DataFrame.create(
        "high_perf_dataframe.soma",
        schema=schema,
        context=ctx,
    ) as df:
        df.write(data)

# Configuration dictionary for TileDB context with structured parameter
# management.
class ConfigDict:
    """Configuration dictionary for a TileDB context.

    Provides dict-like get/set/update/iterate access over TileDB
    configuration parameters.
    """

    def __init__(self, config_dict=None):
        """
        Initialize configuration dictionary.

        Parameters:
        - config_dict: dict, initial configuration parameters
        """

    def __getitem__(self, key):
        """Get configuration parameter."""

    def __setitem__(self, key, value):
        """Set configuration parameter."""

    def update(self, other):
        """Update configuration with another dict or ConfigDict."""

    def items(self):
        """Iterate over configuration items."""


from tiledbsoma.options import ConfigDict, SOMATileDBContext
# Create configuration
config = ConfigDict({
    "sm.memory_budget": 1024**3,          # 1GB
    "sm.tile_cache_size": 512 * 1024**2,  # 512MB
})

# Update configuration
config.update({
    "vfs.num_threads": 8,
    "sm.compute_concurrency_level": 4,
})

# Use with context
with SOMATileDBContext(config=config) as ctx:
    # Context uses the configuration
    pass

# Platform-specific options for creating TileDB objects. These options
# control storage layout, compression, encryption, and other
# creation-time parameters.
class TileDBCreateOptions:
    """Platform-specific options applied when creating TileDB objects."""

    def __init__(self, **kwargs):
        """
        Initialize TileDB creation options.

        Parameters:
        - **kwargs: TileDB-specific creation parameters

        Common Parameters:
        - tile_order: str, tile order ("row-major" or "col-major")
        - cell_order: str, cell order ("row-major" or "col-major")
        - capacity: int, tile capacity
        - sparse: bool, whether array is sparse
        - allows_duplicates: bool, whether duplicates are allowed
        - offsets_compression: str, compression for offsets
        - offsets_compression_level: int, compression level for offsets
        - validity_compression: str, compression for validity data
        - validity_compression_level: int, compression level for validity
        """

    def __getitem__(self, key):
        """Get creation option."""

    def __setitem__(self, key, value):
        """Set creation option."""

    def get(self, key, default=None):
        """Get creation option with default."""


import tiledbsoma
from tiledbsoma.options import TileDBCreateOptions
import pyarrow as pa

# Create options for high-performance sparse array
create_opts = TileDBCreateOptions(
    capacity=100000,  # Large tile capacity
    tile_order="row-major",
    cell_order="row-major",
    allows_duplicates=False,
    # Compression settings
    offsets_compression="lz4",
    offsets_compression_level=1,
    validity_compression="rle",  # Run-length encoding
)

# Use options when creating arrays
with tiledbsoma.SparseNDArray.create(
    "optimized_matrix.soma",
    type=pa.float32(),
    shape=(10000, 5000),
    platform_config=create_opts,
) as array:
    # Array created with optimized settings
    pass

# Create options for dataframes
df_create_opts = TileDBCreateOptions(
    capacity=10000,
    tile_order="row-major",
    offsets_compression="zstd",
    offsets_compression_level=3,
)

with tiledbsoma.DataFrame.create(
    "optimized_dataframe.soma",
    schema=schema,
    platform_config=df_create_opts,
) as df:
    df.write(data)

# Platform-specific options for writing to TileDB objects. These options
# control write behavior, memory usage, and performance characteristics
# during data ingestion.
class TileDBWriteOptions:
    """Platform-specific options applied when writing to TileDB objects."""

    def __init__(self, **kwargs):
        """
        Initialize TileDB write options.

        Parameters:
        - **kwargs: TileDB-specific write parameters

        Common Parameters:
        - batch_size: int, number of elements per write batch
        - memory_budget: int, memory budget for writes in bytes
        - check_coord_dups: bool, check for coordinate duplicates
        - check_coord_oob: bool, check for out-of-bounds coordinates
        - dedup_coords: bool, deduplicate coordinates
        """

    def __getitem__(self, key):
        """Get write option."""

    def __setitem__(self, key, value):
        """Set write option."""

    def get(self, key, default=None):
        """Get write option with default."""


import tiledbsoma
from tiledbsoma.options import TileDBWriteOptions
import numpy as np
import pyarrow as pa

# Write options for large bulk loads
write_opts = TileDBWriteOptions(
    batch_size=50000,        # Large batches
    memory_budget=2**30,     # 1GB memory budget
    check_coord_dups=False,  # Skip duplicate check for performance
    check_coord_oob=False,   # Skip bounds check for performance
    dedup_coords=False,      # Skip deduplication
)

# Use write options for bulk data loading
with tiledbsoma.open("large_matrix.soma", mode="w") as array:
    # Generate large dataset
    n_nonzero = 1000000
    cell_ids = np.random.randint(0, 50000, n_nonzero)
    gene_ids = np.random.randint(0, 20000, n_nonzero)
    values = np.random.exponential(2.0, n_nonzero)
    coords = pa.table({
        "soma_dim_0": cell_ids,
        "soma_dim_1": gene_ids,
    })
    data = pa.table({
        "soma_data": values,
    })
    # Write with optimized settings
    array.write((coords, data), platform_config=write_opts)

# Write options for safety-first approach
safe_write_opts = TileDBWriteOptions(
    check_coord_dups=True,
    check_coord_oob=True,
    dedup_coords=True,
    memory_budget=512 * 1024**2,  # 512MB
)

with tiledbsoma.open("safe_dataframe.soma", mode="w") as df:
    df.write(data, platform_config=safe_write_opts)

from tiledbsoma.options import SOMATileDBContext, ConfigDict
# S3 configuration
s3_config = ConfigDict({
    # S3 settings
    "vfs.s3.aws_access_key_id": "your_access_key",
    "vfs.s3.aws_secret_access_key": "your_secret_key",
    "vfs.s3.region": "us-west-2",
    "vfs.s3.max_parallel_ops": 16,
    "vfs.s3.multipart_part_size": 100 * 1024 * 1024,  # 100MB
    "vfs.s3.use_virtual_addressing": "true",
    # Performance settings
    "sm.memory_budget": 4 * 1024**3,  # 4GB
    "sm.tile_cache_size": 1024**3,    # 1GB
    "vfs.num_threads": 16,
})

with SOMATileDBContext(config=s3_config) as ctx:
    # Work with S3-stored data
    with tiledbsoma.open("s3://my-bucket/experiment.soma", context=ctx) as exp:
        data = exp.obs.read().concat()

# Azure Blob Storage configuration
azure_config = ConfigDict({
    "vfs.azure.storage_account_name": "myaccount",
    "vfs.azure.storage_account_key": "mykey",
    "vfs.azure.max_parallel_ops": 16,
    "sm.memory_budget": 2 * 1024**3,
    "vfs.num_threads": 8,
})

# ConfigDict added to this import: the example that follows constructs one.
from tiledbsoma.options import (
    SOMATileDBContext, TileDBCreateOptions, TileDBWriteOptions, ConfigDict
)
# High-performance local configuration
local_config = ConfigDict({
    "sm.memory_budget": 8 * 1024**3,    # 8GB memory
    "sm.tile_cache_size": 2 * 1024**3,  # 2GB cache
    "sm.compute_concurrency_level": 8,
    "vfs.num_threads": 16,
    "sm.io_concurrency_level": 4,
})

# Optimized creation options
create_opts = TileDBCreateOptions(
    capacity=100000,
    tile_order="row-major",
    offsets_compression="lz4",
    offsets_compression_level=1,
)

# Optimized write options
write_opts = TileDBWriteOptions(
    batch_size=100000,
    memory_budget=4 * 1024**3,
    check_coord_dups=False,
)

# Combined usage
with SOMATileDBContext(config=local_config) as ctx:
    with tiledbsoma.SparseNDArray.create(
        "fast_array.soma",
        type=pa.float32(),
        shape=(100000, 50000),
        context=ctx,
        platform_config=create_opts,
    ) as array:
        # Fast bulk loading
        array.write(data, platform_config=write_opts)

# Configuration for memory-limited environments
low_memory_config = ConfigDict({
    "sm.memory_budget": 256 * 1024**2,   # 256MB
    "sm.tile_cache_size": 64 * 1024**2,  # 64MB
    "vfs.num_threads": 2,
    "sm.compute_concurrency_level": 1,
})

conservative_write_opts = TileDBWriteOptions(
    batch_size=1000,
    memory_budget=128 * 1024**2,  # 128MB
    check_coord_dups=True,
    check_coord_oob=True,
)

with SOMATileDBContext(config=low_memory_config) as ctx:
    # Memory-efficient operations
    with tiledbsoma.open("data.soma", context=ctx) as obj:
        # Process in small batches
        for batch in obj.read(batch_size=1000):
            # Process batch
            pass


import tiledbsoma
from tiledbsoma.options import SOMATileDBContext

# Context sharing across operations
config = {"sm.memory_budget": 2 * 1024**3}

with SOMATileDBContext(config=config) as ctx:
    # Create experiment with shared context
    with tiledbsoma.Experiment.create("exp.soma", context=ctx) as exp:
        # All operations share the same context
        exp.add_new_dataframe("obs", schema=obs_schema)
        with exp.obs as obs_df:
            obs_df.write(obs_data)
    # Read operations with same context
    with tiledbsoma.open("exp.soma", context=ctx) as exp:
        data = exp.obs.read().concat()

# This configuration system provides comprehensive control over TileDB-SOMA
# performance and behavior, enabling optimization for different use cases
# from high-throughput cloud deployments to memory-constrained local
# analysis.
Install with Tessl CLI
npx tessl i tessl/pypi-tiledbsoma