CtrlK
Blog · Docs · Log in · Get started
Tessl Logo

tessl/pypi-deeplake

Database for AI powered by a storage format optimized for deep-learning applications.

75

1.59x

Evaluation 75%

1.59x

Agent success when using this tile

Overview
Eval results
Files

docs/query-system.md

Query System

TQL (Tensor Query Language) provides SQL-like syntax optimized for tensor operations, enabling complex data filtering, aggregation, and transformation across datasets. The query system supports both immediate execution and prepared statements for parameterized queries.

Capabilities

Query Execution

Execute TQL queries with immediate results or asynchronous processing for large datasets.

def query(query: str, token: Optional[str] = None, creds: Optional[Dict[str, str]] = None) -> DatasetView:
    """
    Execute a TQL query and return its results immediately.

    Parameters:
    - query: TQL query string (SQL-like syntax; see the TQL overview above)
    - token: Activeloop authentication token (presumably falls back to stored
      credentials when omitted — confirm)
    - creds: Storage credentials dictionary for the underlying object store

    Returns:
    DatasetView: Query result view (supports nested queries, summaries,
    and batch iteration — see the DatasetView class below)
    """

def query_async(query: str, token: Optional[str] = None, creds: Optional[Dict[str, str]] = None) -> Future[DatasetView]:
    """
    Execute a TQL query asynchronously; intended for large datasets where
    immediate execution would block.

    Parameters:
    - query: TQL query string
    - token: Activeloop authentication token
    - creds: Storage credentials dictionary

    Returns:
    Future[DatasetView]: Future resolving to the query result view
    """

Prepared Queries

Create prepared statements for efficient execution of parameterized queries with variable substitution.

def prepare_query(query: str, token: Optional[str] = None, creds: Optional[Dict[str, str]] = None) -> Executor:
    """
    Prepare a parameterized query for efficient repeated execution.

    Parameters:
    - query: TQL query string with `$name` parameter placeholders
      (see the "Parameterized Queries" usage example)
    - token: Activeloop authentication token
    - creds: Storage credentials dictionary

    Returns:
    Executor: Prepared query executor; bind values via its run_* methods
    """

class Executor:
    """
    Prepared query executor for parameterized queries.

    Created by prepare_query(); holds a TQL query with `$name` placeholders
    and executes it with concrete values via the run_* methods.
    """

    def get_query_string(self) -> str:
        """
        Get the prepared query string.

        Returns:
        str: Original query string with parameter placeholders
        """

    def run_single(self, parameters: Dict[str, Any]) -> DatasetView:
        """
        Execute the prepared query with a single parameter set.

        Parameters:
        - parameters: Dictionary mapping placeholder names (without the
          leading `$`) to values

        Returns:
        DatasetView: Query result view
        """

    def run_single_async(self, parameters: Dict[str, Any]) -> Future[DatasetView]:
        """
        Execute the prepared query asynchronously with a single parameter set.

        Parameters:
        - parameters: Dictionary mapping placeholder names to values

        Returns:
        Future[DatasetView]: Future resolving to the query result view
        """

    def run_batch(self, parameters: List[Dict[str, Any]]) -> List[DatasetView]:
        """
        Execute the prepared query once per parameter set.

        Parameters:
        - parameters: List of parameter dictionaries

        Returns:
        List[DatasetView]: One result view per parameter dictionary,
        in the same order
        """

    def run_batch_async(self, parameters: List[Dict[str, Any]]) -> Future[List[DatasetView]]:
        """
        Execute the prepared query asynchronously with multiple parameter sets.

        Parameters:
        - parameters: List of parameter dictionaries

        Returns:
        Future[List[DatasetView]]: Future resolving to the list of result views
        """

Query Analysis

Analyze and explain query execution plans for optimization and debugging.

def explain_query(query: str, token: Optional[str] = None, creds: Optional[Dict[str, str]] = None) -> ExplainQueryResult:
    """
    Explain the execution plan of a TQL query, for optimization and debugging.

    Parameters:
    - query: TQL query string to analyze
    - token: Activeloop authentication token
    - creds: Storage credentials dictionary

    Returns:
    ExplainQueryResult: Query execution plan; printable (str) or
    structured (to_dict)
    """

class ExplainQueryResult:
    """
    Query execution plan and analysis, returned by explain_query().
    """

    def __str__(self) -> str:
        """
        Get a human-readable explanation of the query plan.

        Returns:
        str: Formatted query execution plan
        """

    def to_dict(self) -> Dict[str, Any]:
        """
        Get the query plan as structured data.

        Returns:
        Dict[str, Any]: Dictionary containing execution plan details
        (exact keys are plan-dependent — inspect the result)
        """

Dataset View Operations

DatasetView objects provide additional query and analysis capabilities on query results.

class DatasetView:
    """
    Query result view with additional query capabilities.

    Returned by query()/Executor.run_*; supports nested querying,
    prepared queries, plan explanation, summaries, and batch iteration.
    """

    # Column/tensor schema of this view (SchemaView is declared elsewhere).
    schema: SchemaView

    def query(self, query: str) -> DatasetView:
        """
        Execute a nested query on this view.

        Nested queries omit the FROM clause — they run against this view's
        rows (see the "Nested Queries and Views" usage example).

        Parameters:
        - query: TQL query string

        Returns:
        DatasetView: Nested query result view
        """

    def prepare_query(self, query: str) -> Executor:
        """
        Prepare a parameterized query scoped to this view.

        Parameters:
        - query: TQL query string with `$name` parameter placeholders

        Returns:
        Executor: Prepared query executor
        """

    def explain_query(self, query: str) -> ExplainQueryResult:
        """
        Explain the execution plan of a query run against this view.

        Parameters:
        - query: TQL query string to analyze

        Returns:
        ExplainQueryResult: Query execution plan and statistics
        """

    def summary(self) -> str:
        """
        Get summary statistics of the dataset view.

        Returns:
        str: Summary statistics including row count, column info, etc.
        """

    def batches(self, batch_size: int = 1) -> Iterator[Dict[str, Any]]:
        """
        Iterate over the view's data in fixed-size batches.

        Parameters:
        - batch_size: Number of rows per batch (default 1)

        Returns:
        Iterator[Dict[str, Any]]: Iterator yielding batches as dictionaries
        """

TQL Function Registration

Register custom Python functions for use in TQL queries with automatic type inference.

def register_function(function: Callable) -> None:
    """
    Register a Python function for use in TQL queries.

    The function becomes callable from TQL under its own Python name
    (see the "Custom Function Registration" usage example).

    Parameters:
    - function: Python function to register
    """

def get_max_num_parallel_queries() -> int:
    """
    Get the maximum number of queries allowed to run in parallel.

    Returns:
    int: Current parallel query limit
    """

def set_max_num_parallel_queries(num: int) -> None:
    """
    Set the maximum number of queries allowed to run in parallel.

    Parameters:
    - num: New parallel query limit
    """

Usage Examples

Basic Queries

import deeplake

# Simple SELECT query.
# Datasets are addressed by storage URL; TQL uses `==` for equality tests.
results = deeplake.query('SELECT * FROM "s3://my-bucket/dataset" WHERE label == "cat"')

# Access query results — the returned view supports len() and row iteration.
print(f"Found {len(results)} cat images")
for row in results:
    print(f"Image: {row['image_path']}, Label: {row['label']}")

# Query with aggregation (COUNT/GROUP BY work as in SQL)
stats = deeplake.query('SELECT label, COUNT(*) as count FROM "s3://my-bucket/dataset" GROUP BY label')
for row in stats:
    print(f"Label: {row['label']}, Count: {row['count']}")

# Query with filtering and ordering; triple-quoted strings allow
# readable multi-line queries.
high_confidence = deeplake.query('''
    SELECT image_path, confidence 
    FROM "s3://my-bucket/dataset" 
    WHERE confidence > 0.9 
    ORDER BY confidence DESC 
    LIMIT 10
''')

Parameterized Queries

# Prepare parameterized query.
# $label and $min_confidence are placeholders bound at execution time.
executor = deeplake.prepare_query('''
    SELECT * FROM "s3://my-bucket/dataset" 
    WHERE label == $label AND confidence > $min_confidence
''')

# Execute with different parameters (dict keys omit the leading $).
cats = executor.run_single({"label": "cat", "min_confidence": 0.8})
dogs = executor.run_single({"label": "dog", "min_confidence": 0.8})

# Batch execution: one result view per parameter dictionary, in order.
params_list = [
    {"label": "cat", "min_confidence": 0.9},
    {"label": "dog", "min_confidence": 0.9},
    {"label": "bird", "min_confidence": 0.9}
]
results_list = executor.run_batch(params_list)

for i, results in enumerate(results_list):
    label = params_list[i]["label"]
    print(f"High confidence {label} images: {len(results)}")

Advanced TQL Features

# Complex filtering with multiple conditions (IN lists, chained AND predicates)
complex_query = deeplake.query('''
    SELECT image_path, embeddings, metadata 
    FROM "s3://my-bucket/dataset" 
    WHERE label IN ("cat", "dog") 
    AND confidence > 0.85 
    AND width > 224 
    AND height > 224
''')

# Similarity search using embedding vectors.
# Fix: deeplake.query() takes no `parameters` argument (see its signature in
# the Query Execution section) — parameterized queries must go through
# prepare_query() and Executor.run_single().
similarity_search = deeplake.prepare_query('''
    SELECT image_path, 
           COSINE_SIMILARITY(embeddings, $target_embedding) as similarity
    FROM "s3://my-bucket/dataset" 
    WHERE COSINE_SIMILARITY(embeddings, $target_embedding) > 0.8
    ORDER BY similarity DESC
''')
similar_images = similarity_search.run_single({"target_embedding": target_vector})

# Text search in descriptions
text_results = deeplake.query('''
    SELECT * FROM "s3://my-bucket/dataset" 
    WHERE CONTAINS(description, "outdoor scene")
''')

# Geospatial queries
location_results = deeplake.query('''
    SELECT * FROM "s3://my-bucket/dataset" 
    WHERE latitude BETWEEN 40.0 AND 41.0 
    AND longitude BETWEEN -74.0 AND -73.0
''')

Query Analysis and Optimization

# Analyze query performance
query_str = 'SELECT * FROM "s3://my-bucket/dataset" WHERE confidence > 0.9'
explanation = deeplake.explain_query(query_str)

# ExplainQueryResult.__str__ gives a human-readable plan.
print("Query Plan:")
print(explanation)

# Get structured execution plan.
# NOTE(review): the exact to_dict() keys ('estimated_rows', 'uses_index')
# are assumed here — confirm against ExplainQueryResult's actual output;
# .get() with a default keeps this safe either way.
plan_dict = explanation.to_dict()
print(f"Estimated rows: {plan_dict.get('estimated_rows', 'unknown')}")
print(f"Index usage: {plan_dict.get('uses_index', 'unknown')}")

# Query optimization suggestions
if not plan_dict.get('uses_index', False):
    print("Consider creating an index on 'confidence' column for better performance")

Nested Queries and Views

# Create initial view
base_view = deeplake.query('SELECT * FROM "s3://my-bucket/dataset" WHERE split == "train"')

# Query on the view — nested queries omit FROM; they run against the view's rows
filtered_view = base_view.query('SELECT * WHERE confidence > 0.9')

# Further nested query; views chain, so each stage narrows the previous result
final_results = filtered_view.query('SELECT image_path, label ORDER BY confidence DESC LIMIT 100')

print(f"Top 100 high-confidence training images: {len(final_results)}")

Custom Function Registration

import numpy as np

# Register custom function for TQL
def normalize_scores(scores):
    """
    Normalize confidence scores to the 0-1 range via min-max scaling.

    Parameters:
    - scores: sequence of numeric confidence values

    Returns:
    numpy.ndarray: scores rescaled so the minimum maps to 0.0 and the
    maximum to 1.0. Degenerate input (empty or all-equal) yields zeros —
    the original formula divided by zero (NaN) or raised on empty input.
    """
    scores_array = np.asarray(scores, dtype=float)
    span = (scores_array.max() - scores_array.min()) if scores_array.size else 0.0
    if span == 0.0:
        # Empty or constant input: no spread to normalize over.
        return np.zeros_like(scores_array)
    return (scores_array - scores_array.min()) / span

# Register the function so TQL can call it by its Python name.
deeplake.tql.register_function(normalize_scores)

# Use the custom function in a query (referenced by the registered name)
normalized_results = deeplake.query('''
    SELECT image_path, 
           normalize_scores(confidence) as normalized_confidence
    FROM "s3://my-bucket/dataset"
    ORDER BY normalized_confidence DESC
''')

Async Query Execution

import asyncio

async def process_multiple_queries():
    """Run several label queries concurrently and report result counts."""
    queries = [
        'SELECT * FROM "s3://my-bucket/dataset" WHERE label == "cat"',
        'SELECT * FROM "s3://my-bucket/dataset" WHERE label == "dog"',
        'SELECT * FROM "s3://my-bucket/dataset" WHERE label == "bird"'
    ]

    # Kick off every query before awaiting any, so they run concurrently.
    tasks = [deeplake.query_async(query) for query in queries]
    results = await asyncio.gather(*tasks)

    for i, result in enumerate(results):
        # Extract the quoted label literal from the query text for reporting.
        # Fix: this value was previously computed but never used in the output.
        label = queries[i].split('"')[3]
        print(f"Query {i+1} ({label}) returned {len(result)} results")

    return results

# Run async queries
results = asyncio.run(process_multiple_queries())

Performance Tuning

# Set maximum parallel queries for performance tuning.
# NOTE(review): the examples access these via deeplake.tql.*, while the
# capability section above lists them unqualified — confirm the namespace.
current_max = deeplake.tql.get_max_num_parallel_queries()
print(f"Current max parallel queries: {current_max}")

# Increase for high-performance systems
deeplake.tql.set_max_num_parallel_queries(8)

# Query with wall-clock performance monitoring
import time

start_time = time.time()
large_results = deeplake.query('''
    SELECT * FROM "s3://my-bucket/large_dataset" 
    WHERE embedding_magnitude > 0.5
''')
end_time = time.time()

print(f"Query executed in {end_time - start_time:.2f} seconds")
print(f"Returned {len(large_results)} results")

Install with Tessl CLI

npx tessl i tessl/pypi-deeplake

docs

data-access.md

data-import-export.md

dataset-management.md

error-handling.md

framework-integration.md

index.md

query-system.md

schema-templates.md

storage-system.md

type-system.md

version-control.md

tile.json