CtrlK
Blog · Docs · Log in · Get started
Tessl Logo

tessl/pypi-tiledbsoma

Python API for efficient storage and retrieval of single-cell data using TileDB

Overview
Eval results
Files

docs/core-data-structures.md

Core Data Structures

The fundamental SOMA data types that provide the building blocks for storing and organizing scientific data. These include Collections for hierarchical organization, DataFrames for tabular data, and sparse/dense N-dimensional arrays for numerical data storage.

Capabilities

Collection

A string-keyed container that can hold any SOMA object type, enabling hierarchical organization of data. Collections provide the foundation for complex data structures and can contain other collections, dataframes, or arrays.

class Collection:
    """A string-keyed container that can hold any SOMA object type.

    Collections enable hierarchical organization of data: they may contain
    other collections, dataframes, or sparse/dense N-dimensional arrays.
    """

    @classmethod
    def create(cls, uri: str, *, platform_config=None, context=None, tiledb_timestamp=None) -> "Collection":
        """
        Create a new Collection.

        Parameters:
        - uri: str, URI for the collection
        - platform_config: TileDB-specific configuration options
        - context: TileDB context for the operation
        - tiledb_timestamp: Timestamp for temporal queries

        Returns:
        Collection instance
        """

    def add_new_collection(self, key: str, **kwargs) -> "Collection":
        """
        Add a new sub-collection.

        Parameters:
        - key: str, name for the new collection
        - **kwargs: Additional arguments passed to Collection.create()

        Returns:
        Collection instance (the newly created sub-collection)
        """

    def add_new_dataframe(self, key: str, **kwargs):
        """
        Add a new DataFrame to the collection.

        Parameters:
        - key: str, name for the new dataframe
        - **kwargs: Additional arguments passed to DataFrame.create()

        Returns:
        DataFrame instance
        """

    def add_new_dense_ndarray(self, key: str, **kwargs):
        """
        Add a new DenseNDArray to the collection.

        Parameters:
        - key: str, name for the new array
        - **kwargs: Additional arguments passed to DenseNDArray.create()

        Returns:
        DenseNDArray instance
        """

    def add_new_sparse_ndarray(self, key: str, **kwargs):
        """
        Add a new SparseNDArray to the collection.

        Parameters:
        - key: str, name for the new array
        - **kwargs: Additional arguments passed to SparseNDArray.create()

        Returns:
        SparseNDArray instance
        """

    def members(self) -> dict:
        """
        Get collection member names and types.

        Returns:
        dict: Mapping of member names to their SOMA types
        """

    def __getitem__(self, key: str):
        """
        Access collection members by key (``collection["name"]``).

        Parameters:
        - key: str, member name

        Returns:
        SOMA object at the specified key
        """

    def keys(self):
        """
        Get collection member names.

        Returns:
        Iterator of member names
        """

Usage Example

import tiledbsoma
import pyarrow as pa  # required: pa.schema/pa.float32 are used below

# Schema for the observations dataframe; the int64 soma_joinid column is
# mandatory for every SOMA DataFrame (it was previously undefined here).
obs_schema = pa.schema([
    ("soma_joinid", pa.int64()),
    ("cell_type", pa.string()),
])

# Create a root collection
with tiledbsoma.Collection.create("my_experiment.soma") as collection:
    # Add sub-collections for organization
    collection.add_new_collection("raw_data")
    collection.add_new_collection("processed_data")

    # Add data structures
    collection.add_new_dataframe("observations", schema=obs_schema)
    collection.add_new_sparse_ndarray("expression_matrix", type=pa.float32(), shape=(1000, 2000))

# Access collection members (open() defaults to read mode)
with tiledbsoma.open("my_experiment.soma") as collection:
    obs_df = collection["observations"]
    expr_matrix = collection["expression_matrix"]

DataFrame

A multi-column table with a user-defined Arrow schema. All DataFrames must contain a soma_joinid column of type int64, which serves as the primary index for joining with other data structures.

class DataFrame:
    """A multi-column table with a user-defined Arrow schema.

    Every DataFrame must contain a ``soma_joinid`` column of type int64,
    which serves as the primary index for joining with other SOMA objects.
    """

    @classmethod
    def create(cls, uri: str, *, schema, domain=None, platform_config=None, context=None, tiledb_timestamp=None) -> "DataFrame":
        """
        Create a new DataFrame.

        Parameters:
        - uri: str, URI for the dataframe
        - schema: pyarrow.Schema, column schema including soma_joinid
        - domain: list of tuples, domain bounds for each dimension (optional)
        - platform_config: TileDB-specific configuration options
        - context: TileDB context for the operation
        - tiledb_timestamp: Timestamp for temporal queries

        Returns:
        DataFrame instance
        """

    def read(self, coords=(), value_filter=None, column_names=None, result_order=None, batch_size=None, partitions=None, platform_config=None):
        """
        Read data from the DataFrame.

        Parameters:
        - coords: tuple, coordinate selection for soma_joinid
        - value_filter: str, filter expression for attribute values
        - column_names: list of str, specific columns to read
        - result_order: ResultOrder, result ordering preference
        - batch_size: int, number of rows per batch
        - partitions: Partitions object for parallel reading
        - platform_config: TileDB-specific configuration options

        Returns:
        Iterator of Arrow tables (use ``.concat()`` to combine batches)
        """

    def write(self, values, platform_config=None) -> None:
        """
        Write data to the DataFrame.

        Parameters:
        - values: pyarrow.Table, data to write
        - platform_config: TileDB-specific configuration options
        """

    def keys(self) -> list:
        """
        Get column names.

        Returns:
        list of str: Column names
        """

    def count(self) -> int:
        """
        Get the number of rows in the DataFrame.

        Returns:
        int: Number of rows
        """

    def domain(self) -> tuple:
        """
        Get the domain bounds for each dimension.

        Returns:
        tuple: Domain bounds (min, max) for soma_joinid
        """

    def tiledbsoma_upgrade_domain(self, newdomain, check_only: bool = False) -> bool:
        """
        Upgrade the domain bounds.

        Parameters:
        - newdomain: tuple, new domain bounds
        - check_only: bool, if True, only check if upgrade is possible

        Returns:
        bool: True if upgrade was successful or is possible
        """

    def tiledbsoma_resize_soma_joinid_shape(self, newshape, check_only: bool = False) -> bool:
        """
        Resize the soma_joinid dimension shape.

        Parameters:
        - newshape: int, new maximum soma_joinid value
        - check_only: bool, if True, only check if resize is possible

        Returns:
        bool: True if resize was successful or is possible
        """

    @property
    def schema(self):
        """
        Get the Arrow schema.

        Returns:
        pyarrow.Schema: The dataframe schema
        """

    def maxdomain(self) -> tuple:
        """
        Get the maximum domain bounds.

        Returns:
        tuple: Maximum domain bounds for each dimension
        """

    def index_column_names(self) -> tuple:
        """
        Get the names of index columns.

        Returns:
        tuple of str: Index column names
        """

    def get_enumeration_values(self, enum_name: str) -> list:
        """
        Get enumeration values for a categorical column.

        Parameters:
        - enum_name: str, name of the enumeration

        Returns:
        list: Enumeration values
        """

    def extend_enumeration_values(self, enum_name: str, new_values) -> None:
        """
        Extend enumeration with new values.

        Parameters:
        - enum_name: str, name of the enumeration
        - new_values: list, new values to add
        """

    def tiledbsoma_has_upgraded_domain(self) -> bool:
        """
        Check if domain has been upgraded.

        Returns:
        bool: True if domain has been upgraded
        """

    def tiledbsoma_upgrade_soma_joinid_shape(self, newshape, check_only: bool = False):
        """
        Upgrade soma_joinid dimension shape.

        Parameters:
        - newshape: int, new shape for soma_joinid dimension
        - check_only: bool, if True, only check if upgrade is possible

        Returns:
        bool or None: Result of upgrade operation
        """

    def change_domain(self, newdomain, check_only: bool = False):
        """
        Change the domain configuration.

        Parameters:
        - newdomain: tuple, new domain bounds
        - check_only: bool, if True, only check if change is possible
        """

Usage Example

import tiledbsoma
import pyarrow as pa

# Arrow schema for the cell-metadata table; the int64 soma_joinid column
# is mandatory for every SOMA DataFrame.
cell_schema = pa.schema([
    ("soma_joinid", pa.int64()),
    ("cell_type", pa.string()),
    ("tissue", pa.string()),
    ("donor_id", pa.string()),
    ("total_counts", pa.int32()),
])

# Create the dataframe and populate it with a single Arrow table.
with tiledbsoma.DataFrame.create("cell_metadata.soma", schema=cell_schema) as frame:
    batch = pa.table({
        "soma_joinid": [0, 1, 2, 3, 4],
        "cell_type": ["T-cell", "B-cell", "Neuron", "Astrocyte", "Hepatocyte"],
        "tissue": ["blood", "blood", "brain", "brain", "liver"],
        "donor_id": ["D1", "D1", "D2", "D2", "D3"],
        "total_counts": [1500, 2000, 800, 1200, 1800],
    })
    frame.write(batch)

# Reopen and pull back only the brain-tissue rows, projected to three columns.
with tiledbsoma.open("cell_metadata.soma") as frame:
    brain_subset = frame.read(
        value_filter="tissue == 'brain'",
        column_names=["soma_joinid", "cell_type", "total_counts"],
    ).concat()
    print(brain_subset.to_pandas())

SparseNDArray

A sparse N-dimensional array with offset (0-based) integer indexing. Dimensions are named soma_dim_0, soma_dim_1, etc., and stored values are named soma_data. Sparse arrays only store non-zero values, making them memory-efficient for data with many zeros.

class SparseNDArray:
    """A sparse N-dimensional array with 0-based integer indexing.

    Dimensions are named ``soma_dim_0``, ``soma_dim_1``, etc., and stored
    values are named ``soma_data``. Only non-zero values are stored, making
    sparse arrays memory-efficient for data with many zeros.
    """

    @classmethod
    def create(cls, uri: str, *, type, shape, platform_config=None, context=None, tiledb_timestamp=None) -> "SparseNDArray":
        """
        Create a new SparseNDArray.

        Parameters:
        - uri: str, URI for the array
        - type: pyarrow data type for stored values
        - shape: tuple of int, array dimensions
        - platform_config: TileDB-specific configuration options
        - context: TileDB context for the operation
        - tiledb_timestamp: Timestamp for temporal queries

        Returns:
        SparseNDArray instance
        """

    def read(self, coords=(), result_order=None, batch_size=None, partitions=None, platform_config=None):
        """
        Read data from the sparse array.

        Parameters:
        - coords: tuple of slices/arrays, coordinate selection for each dimension
        - result_order: ResultOrder, result ordering preference
        - batch_size: int, number of elements per batch
        - partitions: Partitions object for parallel reading
        - platform_config: TileDB-specific configuration options

        Returns:
        SparseNDArrayRead iterator (batches expose ``coords()`` and ``values()``)
        """

    def write(self, values, platform_config=None) -> None:
        """
        Write sparse data to the array.

        Parameters:
        - values: tuple of (coordinates_table, values_table)
          - coordinates_table: pyarrow.Table with soma_dim_* columns
          - values_table: pyarrow.Table with soma_data column
        - platform_config: TileDB-specific configuration options
        """

    @property
    def shape(self) -> tuple:
        """
        Get array dimensions.

        Returns:
        tuple of int: Array shape
        """

    @property
    def nnz(self) -> int:
        """
        Get number of non-zero elements.

        Returns:
        int: Number of stored (non-zero) elements
        """

    @property
    def schema(self):
        """
        Get the Arrow schema for coordinates and values.

        Returns:
        pyarrow.Schema: Schema for the array data
        """

Usage Example

import tiledbsoma
import pyarrow as pa
import numpy as np

# Sparse 2D array holding gene expression values (cells x genes).
with tiledbsoma.SparseNDArray.create(
    "expression_matrix.soma",
    type=pa.float32(),
    shape=(1000, 2000),  # 1000 cells, 2000 genes
) as matrix:

    # Simulate non-zero expression entries at random coordinates.
    # NOTE: the three draws below must stay in this order so the fixed
    # seed reproduces the same data.
    np.random.seed(42)
    n_entries = 5000
    row_idx = np.random.randint(0, 1000, n_entries)   # cell dimension
    col_idx = np.random.randint(0, 2000, n_entries)   # gene dimension
    magnitudes = np.random.exponential(2.0, n_entries)

    # SOMA sparse writes take a (coordinates, values) pair of Arrow tables.
    coord_tbl = pa.table({
        "soma_dim_0": row_idx,
        "soma_dim_1": col_idx,
    })
    value_tbl = pa.table({
        "soma_data": magnitudes,
    })
    matrix.write((coord_tbl, value_tbl))

# Reopen, inspect metadata, and stream back a slice.
with tiledbsoma.open("expression_matrix.soma") as matrix:
    print(f"Array shape: {matrix.shape}")
    print(f"Non-zero elements: {matrix.nnz}")

    # First 100 cells across every gene, delivered in batches.
    for batch in matrix.read(coords=(slice(0, 100), slice(None))):
        cdf = batch.coords().to_pandas()
        vdf = batch.values().to_pandas()
        print(f"Batch: {len(cdf)} non-zero values")

DenseNDArray

A dense N-dimensional array with offset (0-based) integer indexing. Like sparse arrays, dimensions are named soma_dim_0, soma_dim_1, etc., and values are named soma_data. Dense arrays store values for all coordinate positions, making them suitable for data without sparsity.

class DenseNDArray:
    """A dense N-dimensional array with 0-based integer indexing.

    As with sparse arrays, dimensions are named ``soma_dim_0``,
    ``soma_dim_1``, etc., and values are named ``soma_data``. Dense arrays
    store a value for every coordinate position, making them suitable for
    data without sparsity.
    """

    @classmethod
    def create(cls, uri: str, *, type, shape, platform_config=None, context=None, tiledb_timestamp=None) -> "DenseNDArray":
        """
        Create a new DenseNDArray.

        Parameters:
        - uri: str, URI for the array
        - type: pyarrow data type for stored values
        - shape: tuple of int, array dimensions
        - platform_config: TileDB-specific configuration options
        - context: TileDB context for the operation
        - tiledb_timestamp: Timestamp for temporal queries

        Returns:
        DenseNDArray instance
        """

    def read(self, coords=(), result_order=None, batch_size=None, partitions=None, platform_config=None):
        """
        Read data from the dense array.

        Parameters:
        - coords: tuple of slices/arrays, coordinate selection for each dimension
        - result_order: ResultOrder, result ordering preference
        - batch_size: int, number of elements per batch
        - partitions: Partitions object for parallel reading
        - platform_config: TileDB-specific configuration options

        Returns:
        pyarrow.Tensor with requested data (convert with ``.to_numpy()``)
        """

    def write(self, coords, values, platform_config=None) -> None:
        """
        Write dense data to the array.

        Parameters:
        - coords: tuple of slices, coordinate region to write
        - values: numpy array or Arrow tensor with data to write
        - platform_config: TileDB-specific configuration options
        """

    @property
    def shape(self) -> tuple:
        """
        Get array dimensions.

        Returns:
        tuple of int: Array shape
        """

    @property
    def schema(self):
        """
        Get the Arrow schema for the array.

        Returns:
        pyarrow.Schema: Schema for the array data
        """

Usage Example

import tiledbsoma
import pyarrow as pa
import numpy as np

# Dense 2D array of per-cell embedding coordinates.
with tiledbsoma.DenseNDArray.create(
    "cell_embeddings.soma",
    type=pa.float64(),
    shape=(1000, 50),  # 1000 cells, 50 embedding dimensions
) as arr:

    # Simulated PCA coordinates, reproducible via a fixed seed.
    np.random.seed(42)
    pca_coords = np.random.normal(0, 1, (1000, 50))

    # A coords pair of (slice(None), slice(None)) targets the whole array.
    arr.write(
        coords=(slice(None), slice(None)),
        values=pca_coords,
    )

# Reopen and read subsets back.
with tiledbsoma.open("cell_embeddings.soma") as arr:
    print(f"Array shape: {arr.shape}")

    # Contiguous window: first 10 cells, first 5 dimensions.
    window = arr.read(coords=(slice(0, 10), slice(0, 5)))
    print("First 10 cells, first 5 PCA dimensions:")
    print(window.to_numpy())

    # Point selection: arbitrary cell indices, all dimensions.
    picks = [0, 50, 100, 200, 500]
    chosen = arr.read(coords=(picks, slice(None)))
    print(f"Selected cells embedding shape: {chosen.to_numpy().shape}")

Factory Function

def open(uri: str, mode: str = "r", *, soma_type=None, context=None, tiledb_timestamp=None):
    """
    Open any SOMA object at the specified URI.

    Factory function: the concrete return type is determined by what is
    stored at ``uri`` (auto-detected unless ``soma_type`` is given).

    Parameters:
    - uri: str, URI of the SOMA object to open
    - mode: str, access mode ("r" for read, "w" for write)
    - soma_type: str, expected SOMA type (optional, auto-detected if not provided)
    - context: TileDB context for the operation
    - tiledb_timestamp: Timestamp for temporal queries

    Returns:
    SOMA object of the appropriate type (Collection, DataFrame, etc.)
    """

Install with Tessl CLI

npx tessl i tessl/pypi-tiledbsoma

docs

configuration.md

core-data-structures.md

data-io.md

index.md

query-indexing.md

single-cell-biology.md

spatial-data.md

tile.json