tessl/pypi-tiledbsoma

Python API for efficient storage and retrieval of single-cell data using TileDB

Overview

Eval results

Files

Query and Indexing

Name: tessl/pypi-tiledbsoma
Author: tessl

Query builders and indexing utilities for efficient data retrieval from SOMA objects. These tools enable filtering, subsetting, and indexing operations on single-cell datasets at scale.

Capabilities

ExperimentAxisQuery

A powerful query builder for Experiments that provides methods to query observations, variables, and measurements with efficient filtering and retrieval.

class ExperimentAxisQuery:
    """Query over one measurement of an Experiment, restricted along obs and var.

    Instances are obtained from ``Experiment.axis_query``; the methods below
    read the obs/var metadata and the X layers selected by the query.
    """

    def obs(self, *, column_names=None, batch_size=None, partitions=None, platform_config=None):
        """
        Query observations (cells) from the experiment.

        Parameters:
        - column_names: list of str, specific observation columns to retrieve
        - batch_size: batch-size hint for the read
          (NOTE(review): upstream describes this as currently unused -- confirm)
        - partitions: partitioned-read specification
          (NOTE(review): upstream may reject partitioned reads -- confirm)
        - platform_config: TileDB-specific configuration options

        Returns:
        Iterator of Arrow tables containing observation data; call
        ``.concat()`` on it to obtain a single table
        """

    def var(self, *, column_names=None, batch_size=None, partitions=None, platform_config=None):
        """
        Query variables (genes/features) from the experiment.

        Parameters:
        - column_names: list of str, specific variable columns to retrieve
        - batch_size: batch-size hint for the read
          (NOTE(review): upstream describes this as currently unused -- confirm)
        - partitions: partitioned-read specification
          (NOTE(review): upstream may reject partitioned reads -- confirm)
        - platform_config: TileDB-specific configuration options

        Returns:
        Iterator of Arrow tables containing variable data; call
        ``.concat()`` on it to obtain a single table
        """

    def X(self, layer_name, *, batch_size=None, partitions=None, platform_config=None):
        """
        Query measurement matrices (expression data).

        Parameters:
        - layer_name: str, name of the X layer to query
        - batch_size: batch-size hint for the read
          (NOTE(review): upstream describes this as currently unused -- confirm)
        - partitions: partitioned-read specification
          (NOTE(review): upstream may reject partitioned reads -- confirm)
        - platform_config: TileDB-specific configuration options

        Returns:
        A sparse reader, not a plain iterator. Call ``.tables()`` on it to
        iterate Arrow tables with ``soma_dim_0`` (obs joinid), ``soma_dim_1``
        (var joinid) and ``soma_data`` (value) columns, or ``.coos()`` for
        Arrow sparse COO tensors.
        """

    def to_anndata(self, X_name, *, column_names=None, X_layers=(), obsm_layers=(), varm_layers=(), obsp_layers=(), varp_layers=(), drop_levels=False):
        """
        Convert query results to an AnnData object.

        Parameters:
        - X_name: str, X layer to use as the main matrix (required)
        - column_names: dict with optional "obs" and "var" keys listing the
          columns to include for each axis
        - X_layers: list of str, additional X layers to load as AnnData layers
        - obsm_layers: list of str, obsm layers to include (default: none)
        - varm_layers: list of str, varm layers to include (default: none)
        - obsp_layers: list of str, obsp layers to include (default: none)
        - varp_layers: list of str, varp layers to include (default: none)
        - drop_levels: bool, drop unused categories from categorical columns

        Returns:
        AnnData object with query results
        """

Creating Axis Queries

Axis queries are created through the axis_query method on Experiments:

import tiledbsoma

with tiledbsoma.open("experiment.soma") as exp:
    # Create basic query (all observations and variables of the measurement)
    query = exp.axis_query("RNA")
    
    # Create query with observation filtering
    query = exp.axis_query(
        "RNA",
        obs_query=tiledbsoma.AxisQuery(
            value_filter="cell_type == 'T-cell' and n_genes > 1000"
        )
    )
    
    # Create query with variable filtering
    query = exp.axis_query(
        "RNA", 
        var_query=tiledbsoma.AxisQuery(
            value_filter="feature_type == 'Gene Expression'"
        )
    )
    
    # Create query with coordinate selection.
    # coords is a sequence with one entry per indexed dimension, so a single
    # list or slice must be wrapped in a tuple. SOMA slices include both
    # endpoints: slice(0, 99) selects soma_joinid 0 through 99 (100 rows).
    query = exp.axis_query(
        "RNA",
        obs_query=tiledbsoma.AxisQuery(coords=([0, 1, 2, 10, 50],)),
        var_query=tiledbsoma.AxisQuery(coords=(slice(0, 99),))
    )

Usage Examples

import tiledbsoma
import pandas as pd

# Query specific cell types
with tiledbsoma.open("pbmc_experiment.soma") as exp:
    # Create query for T cells
    t_cell_query = exp.axis_query(
        "RNA",
        obs_query=tiledbsoma.AxisQuery(
            value_filter="cell_type in ['CD4 T cells', 'CD8 T cells']"
        )
    )
    
    # Get observation metadata
    obs_data = t_cell_query.obs(
        column_names=["cell_type", "donor_id", "n_genes", "total_counts"]
    ).concat()
    print(f"T cells found: {len(obs_data)}")
    
    # Get variable information
    var_data = t_cell_query.var(
        column_names=["feature_name", "feature_type"]
    ).concat()
    print(f"Genes: {len(var_data)}")
    
    # Get expression matrix. X() returns a sparse reader; .tables() iterates
    # Arrow tables with soma_dim_0 / soma_dim_1 / soma_data columns.
    for batch in t_cell_query.X("data").tables():
        coordinates = batch.select(["soma_dim_0", "soma_dim_1"]).to_pandas()  # obs joinid, var joinid
        values = batch["soma_data"].to_pandas()                               # expression values
        print(f"Expression batch: {len(values)} non-zero values")
    
    # Convert to AnnData for downstream analysis
    t_cell_adata = t_cell_query.to_anndata(
        X_name="data",
        obsm_layers=["X_pca", "X_umap"],
        column_names={
            "obs": ["cell_type", "donor_id"],
            "var": ["feature_name", "highly_variable"]
        }
    )
    print(f"AnnData shape: {t_cell_adata.shape}")

# Query with coordinate-based selection
with tiledbsoma.open("experiment.soma") as exp:
    # Select obs soma_joinid 0-999 and var soma_joinid 0-1999.
    # SOMA slices include both endpoints, and coords takes one entry per
    # indexed dimension (hence the one-element tuples).
    subset_query = exp.axis_query(
        "RNA",
        obs_query=tiledbsoma.AxisQuery(coords=(slice(0, 999),)),
        var_query=tiledbsoma.AxisQuery(coords=(slice(0, 1999),))
    )
    
    # Process in batches
    batch_size = 10000
    for obs_batch in subset_query.obs(batch_size=batch_size):
        obs_df = obs_batch.to_pandas()
        print(f"Processing {len(obs_df)} observations")
        
        # Process batch...

# Complex filtering query
with tiledbsoma.open("experiment.soma") as exp:
    # Query high-quality cells with specific markers
    quality_query = exp.axis_query(
        "RNA",
        obs_query=tiledbsoma.AxisQuery(
            value_filter="""
            n_genes >= 500 and n_genes <= 5000 and
            total_counts >= 1000 and
            pct_counts_mitochondrial <= 20 and
            tissue == 'brain'
            """
        ),
        var_query=tiledbsoma.AxisQuery(
            value_filter="highly_variable == True and feature_type == 'Gene Expression'"
        )
    )
    
    # Convert to AnnData with all available obsm/varm layers.
    # The *_layers arguments take explicit names (None is not "all"), so
    # list whatever the measurement actually contains.
    rna = exp.ms["RNA"]
    brain_adata = quality_query.to_anndata(
        X_name="normalized",
        obsm_layers=list(rna.obsm.keys()) if "obsm" in rna else [],
        varm_layers=list(rna.varm.keys()) if "varm" in rna else []
    )

AxisQuery Specification

The AxisQuery class provides flexible query specification for coordinates and filtering.

class AxisQuery:
    """Specification of which rows to select along one axis (obs or var)."""

    def __init__(self, *, coords=None, value_filter=None):
        """
        Create an axis query specification.
        
        Parameters:
        - coords: coordinate selection (slices, lists, or arrays of
          soma_joinid values). NOTE(review): the upstream SOMA reference
          expects a sequence with one entry per indexed dimension, e.g.
          ``(slice(0, 99),)`` or ``([0, 5, 10],)``, treats slices as
          inclusive on both ends, and does not support slice steps --
          confirm against the installed version.
        - value_filter: str, filter expression for attribute values
          (comparisons combined with ``and``/``or``, and ``in`` for set
          membership, as used in the examples on this page)
        """

Coordinate Selection Examples

import tiledbsoma

# Various coordinate selection patterns.
# coords holds one entry per indexed dimension, so each selection is wrapped
# in a one-element tuple. SOMA slices include both endpoints.
axis_queries = [
    # Select specific soma_joinid values
    tiledbsoma.AxisQuery(coords=([0, 5, 10, 15, 20],)),
    
    # Select range with slice (100 through 500, inclusive)
    tiledbsoma.AxisQuery(coords=(slice(100, 500),)),
    
    # Select with step: slice steps are not supported, so enumerate the IDs
    tiledbsoma.AxisQuery(coords=(list(range(0, 1000, 10)),)),  # Every 10th element
    
    # Select all (equivalent to no coordinate filter)
    tiledbsoma.AxisQuery(coords=(slice(None),)),
]

# Value filter examples
filter_queries = [
    # Numeric comparisons
    tiledbsoma.AxisQuery(value_filter="n_genes > 1000"),
    
    # String matching
    tiledbsoma.AxisQuery(value_filter="cell_type == 'B cells'"),
    
    # Multiple conditions
    tiledbsoma.AxisQuery(value_filter="n_genes > 500 and total_counts < 10000"),
    
    # Set membership
    tiledbsoma.AxisQuery(value_filter="donor_id in ['D1', 'D2', 'D3']"),
    
    # Pattern matching: value_filter has no startswith/regex operator, so
    # enumerate the wanted values here, or read the column and filter in
    # pandas, e.g. df[df["feature_name"].str.startswith("MT-")]
    tiledbsoma.AxisQuery(value_filter="feature_name in ['MT-CO1', 'MT-CO2', 'MT-ND1']"),
]

IntIndexer

A re-indexer for unique integer keys, modelled on pandas `Index.get_indexer`: it maps each target value to its position in the original key array, or to -1 when the value is absent. Useful for mapping between different index spaces efficiently.

class IntIndexer:
    """Lookup structure mapping unique integer keys to their positions."""

    def __init__(self, data, *, context=None):
        """
        Initialize IntIndexer with integer keys.
        
        Parameters:
        - data: array-like of unique integers to index; a key's position in
          this array is the value later returned for it by get_indexer
        - context: TileDB context for the operation (optional)
        """
    
    def get_indexer(self, target):
        """
        Compute underlying indices for target data.
        
        Parameters:
        - target: array-like of integers to find indices for
        
        Returns:
        numpy array with one entry per target value: the position of that
        value in the indexed data, or -1 for missing values
        """

Usage Example

import tiledbsoma
import numpy as np

# Build an indexer over a set of soma_joinid values
known_ids = np.array([0, 5, 10, 15, 20, 25, 30])
id_indexer = tiledbsoma.IntIndexer(known_ids)

# Look up where each requested ID sits within known_ids
wanted_ids = np.array([5, 15, 99, 20])  # 99 is not one of the keys
lookup = id_indexer.get_indexer(wanted_ids)
print(lookup)  # [1, 3, -1, 4] (99 maps to -1)

# Use with SOMA data
with tiledbsoma.open("experiment.soma") as exp:
    # Get all observation IDs
    obs_ids = exp.obs.read(column_names=["soma_joinid"]).concat()["soma_joinid"].to_numpy()
    
    # Create indexer
    obs_indexer = tiledbsoma.IntIndexer(obs_ids)
    
    # Map external IDs (soma_joinid values) to their row positions in obs_ids
    external_ids = np.array([100, 200, 300, 400])
    soma_positions = obs_indexer.get_indexer(external_ids)
    
    # Positions are offsets into obs_ids, not soma_joinid values, so they must
    # not be used as query coordinates. Query with the IDs that were found.
    found = soma_positions >= 0
    valid_positions = soma_positions[found]   # row offsets, e.g. for slicing obs_ids-aligned arrays
    present_ids = external_ids[found]         # soma_joinid values that exist in obs
    if len(present_ids) > 0:
        query = exp.axis_query(
            "RNA",
            # coords takes one entry per indexed dimension
            obs_query=tiledbsoma.AxisQuery(coords=(present_ids,))
        )

Index Building Function

Utility function for building indices on integer arrays.

def tiledbsoma_build_index(data, *, context=None):
    """
    Build index for integer array.
    
    Parameters:
    - data: array-like of integers to index
      (NOTE(review): IntIndexer on this page requires unique integers;
      presumably the same holds here -- confirm)
    - context: TileDB context for the operation (optional)
    
    Returns:
    Built index structure for efficient lookups
    (NOTE(review): presumably exposes get_indexer like IntIndexer -- confirm)
    """

Usage Example

import tiledbsoma
import numpy as np

# Build index for large ID array.
# Keys must be unique, so sample without replacement (randint would repeat values).
large_id_array = np.random.default_rng().choice(1_000_000, size=100_000, replace=False)
index = tiledbsoma.tiledbsoma_build_index(large_id_array)

# Use index for efficient lookups: position of each target in large_id_array,
# -1 if absent (same get_indexer contract as IntIndexer)
positions = index.get_indexer(large_id_array[:5])

Query Performance Optimization

Batch Processing

import tiledbsoma

# Walk a large query one observation batch at a time
with tiledbsoma.open("large_experiment.soma") as exp:
    query = exp.axis_query("RNA")

    # Running count of observations handled so far
    total_processed = 0
    batch_size = 1000

    obs_batches = query.obs(batch_size=batch_size)
    for obs_batch in obs_batches:
        # Each batch is an Arrow table; convert it before analysis
        obs_df = obs_batch.to_pandas()
        total_processed = total_processed + len(obs_df)

        # Report progress
        print(f"Processed {total_processed} observations")

        # Your analysis code here...

Parallel Processing

import tiledbsoma

# Split a query into independent chunks for parallel processing
with tiledbsoma.open("experiment.soma") as exp:
    query = exp.axis_query("RNA")
    
    # Partition the selected observations by soma_joinid
    obs_ids = query.obs_joinids().to_numpy()
    n_partitions = 4
    chunk_len = max(1, -(-len(obs_ids) // n_partitions))  # ceiling division; never 0
    
    # Each chunk is an independent query, so the loop body can be handed to a
    # thread or process pool (conceptual - shown sequentially here)
    for start in range(0, len(obs_ids), chunk_len):
        part_ids = obs_ids[start:start + chunk_len]
        part_query = exp.axis_query(
            "RNA",
            obs_query=tiledbsoma.AxisQuery(coords=(part_ids,))
        )
        obs_data = part_query.obs().concat()
        # Process partition...

Memory-Efficient Queries

import tiledbsoma

# Memory-efficient processing of large datasets
with tiledbsoma.open("experiment.soma") as exp:
    # Restrict the query to the observations you need
    query = exp.axis_query(
        "RNA",
        obs_query=tiledbsoma.AxisQuery(
            value_filter="quality_score > 0.8"
        )
    )
    
    # Stream data without loading everything into memory.
    # X() returns a sparse reader; .tables() yields one Arrow table per chunk.
    for expr_batch in query.X("data", batch_size=5000).tables():
        # Process expression batch
        obs_coords = expr_batch["soma_dim_0"]   # obs soma_joinid per value
        var_coords = expr_batch["soma_dim_1"]   # var soma_joinid per value
        values = expr_batch["soma_data"]
        
        # Compute statistics, etc. without storing full dataset
        print(f"Batch non-zero values: {len(values)}")

Integration with Analysis Workflows

import tiledbsoma
import scanpy as sc

# Integrated analysis workflow
with tiledbsoma.open("experiment.soma") as exp:
    # Query high-quality cells
    hq_query = exp.axis_query(
        "RNA",
        obs_query=tiledbsoma.AxisQuery(
            value_filter="n_genes > 200 and pct_counts_mitochondrial < 20"
        ),
        var_query=tiledbsoma.AxisQuery(
            value_filter="n_cells >= 3"  # Genes expressed in at least 3 cells
        )
    )
    
    # Convert to AnnData for Scanpy analysis (X_name selects the main matrix)
    adata = hq_query.to_anndata(X_name="raw")
    
    # Standard single-cell analysis
    sc.pp.normalize_total(adata, target_sum=1e4)
    sc.pp.log1p(adata)
    sc.pp.highly_variable_genes(adata, min_mean=0.0125, max_mean=3, min_disp=0.5)
    
    # Continue with downstream analysis...
    print(f"Processed {adata.n_obs} cells and {adata.n_vars} genes")

This query and indexing functionality provides the foundation for efficient, scalable analysis of single-cell datasets stored in SOMA format.

Install with Tessl CLI