# Query System

TQL (Tensor Query Language) provides SQL-like syntax optimized for tensor operations, enabling complex data filtering, aggregation, and transformation across datasets. The query system supports both immediate execution and prepared statements for parameterized queries.

## Capabilities

### Query Execution

Execute TQL queries with immediate results or asynchronous processing for large datasets.

```python { .api }
def query(query: str, token: Optional[str] = None, creds: Optional[Dict[str, str]] = None) -> DatasetView:
    """
    Execute TQL query and return results.

    Parameters:
    - query: TQL query string
    - token: Activeloop authentication token
    - creds: Storage credentials dictionary

    Returns:
        DatasetView: Query result view
    """

def query_async(query: str, token: Optional[str] = None, creds: Optional[Dict[str, str]] = None) -> Future[DatasetView]:
    """
    Execute TQL query asynchronously.

    Parameters:
    - query: TQL query string
    - token: Activeloop authentication token
    - creds: Storage credentials dictionary

    Returns:
        Future[DatasetView]: Future resolving to query result view
    """
```
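
Both entry points in brief; a minimal sketch, assuming the returned `Future` exposes a blocking `result()` method and that the dataset path and `label` column exist:

```python
import deeplake

# Blocking execution returns a DatasetView directly
view = deeplake.query('SELECT * FROM "s3://my-bucket/dataset" WHERE label == "cat"')

# Asynchronous execution returns a Future; result() is assumed to block
# until the query completes and the view is available
future = deeplake.query_async('SELECT * FROM "s3://my-bucket/dataset" WHERE label == "dog"')
dogs = future.result()
```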

### Prepared Queries

Create prepared statements for efficient execution of parameterized queries with variable substitution.

```python { .api }
def prepare_query(query: str, token: Optional[str] = None, creds: Optional[Dict[str, str]] = None) -> Executor:
    """
    Prepare parameterized query for efficient repeated execution.

    Parameters:
    - query: TQL query string with parameter placeholders
    - token: Activeloop authentication token
    - creds: Storage credentials dictionary

    Returns:
        Executor: Prepared query executor
    """

class Executor:
    """Prepared query executor for parameterized queries."""

    def get_query_string(self) -> str:
        """
        Get the prepared query string.

        Returns:
            str: Original query string with parameter placeholders
        """

    def run_single(self, parameters: Dict[str, Any]) -> DatasetView:
        """
        Execute prepared query with single parameter set.

        Parameters:
        - parameters: Dictionary mapping parameter names to values

        Returns:
            DatasetView: Query result view
        """

    def run_single_async(self, parameters: Dict[str, Any]) -> Future[DatasetView]:
        """
        Execute prepared query asynchronously with single parameter set.

        Parameters:
        - parameters: Dictionary mapping parameter names to values

        Returns:
            Future[DatasetView]: Future resolving to query result view
        """

    def run_batch(self, parameters: List[Dict[str, Any]]) -> List[DatasetView]:
        """
        Execute prepared query with multiple parameter sets.

        Parameters:
        - parameters: List of parameter dictionaries

        Returns:
            List[DatasetView]: List of query result views
        """

    def run_batch_async(self, parameters: List[Dict[str, Any]]) -> Future[List[DatasetView]]:
        """
        Execute prepared query asynchronously with multiple parameter sets.

        Parameters:
        - parameters: List of parameter dictionaries

        Returns:
            Future[List[DatasetView]]: Future resolving to list of query result views
        """
```
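
The `get_query_string()` accessor is not used in the Usage Examples below; a short sketch of the executor round trip, assuming the `$label` placeholder syntax shown there:

```python
import deeplake

# Prepare once, execute many times with different parameters
executor = deeplake.prepare_query(
    'SELECT * FROM "s3://my-bucket/dataset" WHERE label == $label'
)

# The original query text, placeholders intact
print(executor.get_query_string())

cats = executor.run_single({"label": "cat"})
print(f"Matched {len(cats)} rows")
```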
111
112
### Query Analysis
113
114
Analyze and explain query execution plans for optimization and debugging.
115
116
```python { .api }
117
def explain_query(query: str, token: Optional[str] = None, creds: Optional[Dict[str, str]] = None) -> ExplainQueryResult:
118
"""
119
Explain query execution plan.
120
121
Parameters:
122
- query: TQL query string to analyze
123
- token: Activeloop authentication token
124
- creds: Storage credentials dictionary
125
126
Returns:
127
ExplainQueryResult: Query execution plan and statistics
128
"""
129
130
class ExplainQueryResult:
131
"""Query execution plan and analysis."""
132
133
def __str__(self) -> str:
134
"""
135
Get human-readable explanation of query plan.
136
137
Returns:
138
str: Formatted query execution plan
139
"""
140
141
def to_dict(self) -> Dict[str, Any]:
142
"""
143
Get query plan as structured data.
144
145
Returns:
146
Dict[str, Any]: Dictionary containing execution plan details
147
"""
148
```
149
150
### Dataset View Operations
151
152
DatasetView objects provide additional query and analysis capabilities on query results.
153
154
```python { .api }
155
class DatasetView:
156
"""Query result view with additional query capabilities."""
157
158
schema: SchemaView
159
160
def query(self, query: str) -> DatasetView:
161
"""
162
Execute nested query on this view.
163
164
Parameters:
165
- query: TQL query string
166
167
Returns:
168
DatasetView: Nested query result view
169
"""
170
171
def prepare_query(self, query: str) -> Executor:
172
"""
173
Prepare parameterized query on this view.
174
175
Parameters:
176
- query: TQL query string with parameter placeholders
177
178
Returns:
179
Executor: Prepared query executor
180
"""
181
182
def explain_query(self, query: str) -> ExplainQueryResult:
183
"""
184
Explain query execution plan on this view.
185
186
Parameters:
187
- query: TQL query string to analyze
188
189
Returns:
190
ExplainQueryResult: Query execution plan and statistics
191
"""
192
193
def summary(self) -> str:
194
"""
195
Get summary statistics of the dataset view.
196
197
Returns:
198
str: Summary statistics including row count, column info, etc.
199
"""
200
201
def batches(self, batch_size: int = 1) -> Iterator[Dict[str, Any]]:
202
"""
203
Iterate over view data in batches.
204
205
Parameters:
206
- batch_size: Number of rows per batch
207
208
Returns:
209
Iterator[Dict[str, Any]]: Iterator yielding batches as dictionaries
210
"""
211
```
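
`summary()` and `batches()` do not appear in the Usage Examples below; a brief sketch, assuming an `image` column and that each batch maps column names to the values for that slice of rows:

```python
import deeplake

view = deeplake.query('SELECT * FROM "s3://my-bucket/dataset" WHERE label == "cat"')

# Row count, column info, and other summary statistics
print(view.summary())

# Stream the view in fixed-size batches
for batch in view.batches(batch_size=32):
    images = batch["image"]  # assumed: column name -> values for this batch
    print(f"Batch of {len(images)} images")
```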
212
213
### TQL Function Registration
214
215
Register custom Python functions for use in TQL queries with automatic type inference.
216
217
```python { .api }
218
def register_function(function: Callable) -> None:
219
"""
220
Register Python function for use in TQL queries.
221
222
Parameters:
223
- function: Python function to register
224
"""
225
226
def get_max_num_parallel_queries() -> int:
227
"""
228
Get maximum number of parallel queries allowed.
229
230
Returns:
231
int: Maximum parallel query limit
232
"""
233
234
def set_max_num_parallel_queries(num: int) -> None:
235
"""
236
Set maximum number of parallel queries allowed.
237
238
Parameters:
239
- num: Maximum parallel query limit
240
"""
241
```

## Usage Examples

### Basic Queries

```python
import deeplake

# Simple SELECT query
results = deeplake.query('SELECT * FROM "s3://my-bucket/dataset" WHERE label == "cat"')

# Access query results
print(f"Found {len(results)} cat images")
for row in results:
    print(f"Image: {row['image_path']}, Label: {row['label']}")

# Query with aggregation
stats = deeplake.query('SELECT label, COUNT(*) as count FROM "s3://my-bucket/dataset" GROUP BY label')
for row in stats:
    print(f"Label: {row['label']}, Count: {row['count']}")

# Query with filtering and ordering
high_confidence = deeplake.query('''
    SELECT image_path, confidence
    FROM "s3://my-bucket/dataset"
    WHERE confidence > 0.9
    ORDER BY confidence DESC
    LIMIT 10
''')
```
272
273
### Parameterized Queries
274
275
```python
276
# Prepare parameterized query
277
executor = deeplake.prepare_query('''
278
SELECT * FROM "s3://my-bucket/dataset"
279
WHERE label == $label AND confidence > $min_confidence
280
''')
281
282
# Execute with different parameters
283
cats = executor.run_single({"label": "cat", "min_confidence": 0.8})
284
dogs = executor.run_single({"label": "dog", "min_confidence": 0.8})
285
286
# Batch execution
287
params_list = [
288
{"label": "cat", "min_confidence": 0.9},
289
{"label": "dog", "min_confidence": 0.9},
290
{"label": "bird", "min_confidence": 0.9}
291
]
292
results_list = executor.run_batch(params_list)
293
294
for i, results in enumerate(results_list):
295
label = params_list[i]["label"]
296
print(f"High confidence {label} images: {len(results)}")
297
```

### Advanced TQL Features

```python
# Complex filtering with multiple conditions
complex_query = deeplake.query('''
    SELECT image_path, embeddings, metadata
    FROM "s3://my-bucket/dataset"
    WHERE label IN ("cat", "dog")
      AND confidence > 0.85
      AND width > 224
      AND height > 224
''')

# Similarity search using embedding vectors; query() takes no parameters
# argument, so a prepared query supplies $target_embedding
similarity_executor = deeplake.prepare_query('''
    SELECT image_path,
           COSINE_SIMILARITY(embeddings, $target_embedding) as similarity
    FROM "s3://my-bucket/dataset"
    WHERE COSINE_SIMILARITY(embeddings, $target_embedding) > 0.8
    ORDER BY similarity DESC
''')
similar_images = similarity_executor.run_single({"target_embedding": target_vector})

# Text search in descriptions
text_results = deeplake.query('''
    SELECT * FROM "s3://my-bucket/dataset"
    WHERE CONTAINS(description, "outdoor scene")
''')

# Geospatial queries
location_results = deeplake.query('''
    SELECT * FROM "s3://my-bucket/dataset"
    WHERE latitude BETWEEN 40.0 AND 41.0
      AND longitude BETWEEN -74.0 AND -73.0
''')
```
334
335
### Query Analysis and Optimization
336
337
```python
338
# Analyze query performance
339
query_str = 'SELECT * FROM "s3://my-bucket/dataset" WHERE confidence > 0.9'
340
explanation = deeplake.explain_query(query_str)
341
342
print("Query Plan:")
343
print(explanation)
344
345
# Get structured execution plan
346
plan_dict = explanation.to_dict()
347
print(f"Estimated rows: {plan_dict.get('estimated_rows', 'unknown')}")
348
print(f"Index usage: {plan_dict.get('uses_index', 'unknown')}")
349
350
# Query optimization suggestions
351
if not plan_dict.get('uses_index', False):
352
print("Consider creating an index on 'confidence' column for better performance")
353
```

### Nested Queries and Views

```python
# Create initial view
base_view = deeplake.query('SELECT * FROM "s3://my-bucket/dataset" WHERE split == "train"')

# Query on the view
filtered_view = base_view.query('SELECT * WHERE confidence > 0.9')

# Further nested query
final_results = filtered_view.query('SELECT image_path, label ORDER BY confidence DESC LIMIT 100')

print(f"Top 100 high-confidence training images: {len(final_results)}")
```
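
Views also expose `prepare_query()` and `explain_query()`; a brief sketch continuing from `base_view` above, assuming view-scoped queries use the same placeholder syntax as top-level ones:

```python
# Parameterized query scoped to the view
view_executor = base_view.prepare_query('SELECT * WHERE label == $label')
train_cats = view_executor.run_single({"label": "cat"})

# Inspect the execution plan for a query over the view
print(base_view.explain_query('SELECT * WHERE confidence > 0.9'))
```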
369
370
### Custom Function Registration
371
372
```python
373
import numpy as np
374
375
# Register custom function for TQL
376
def normalize_scores(scores):
377
"""Normalize confidence scores to 0-1 range."""
378
scores_array = np.array(scores)
379
return (scores_array - scores_array.min()) / (scores_array.max() - scores_array.min())
380
381
deeplake.tql.register_function(normalize_scores)
382
383
# Use custom function in query
384
normalized_results = deeplake.query('''
385
SELECT image_path,
386
normalize_scores(confidence) as normalized_confidence
387
FROM "s3://my-bucket/dataset"
388
ORDER BY normalized_confidence DESC
389
''')
390
```
391
392
### Async Query Execution
393
394
```python
395
import asyncio
396
397
async def process_multiple_queries():
398
queries = [
399
'SELECT * FROM "s3://my-bucket/dataset" WHERE label == "cat"',
400
'SELECT * FROM "s3://my-bucket/dataset" WHERE label == "dog"',
401
'SELECT * FROM "s3://my-bucket/dataset" WHERE label == "bird"'
402
]
403
404
# Execute queries concurrently
405
tasks = [deeplake.query_async(query) for query in queries]
406
results = await asyncio.gather(*tasks)
407
408
for i, result in enumerate(results):
409
query_type = queries[i].split('"')[3] # Extract label
410
print(f"Query {i+1} returned {len(result)} results")
411
412
return results
413
414
# Run async queries
415
results = asyncio.run(process_multiple_queries())
416
```
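
The executor's async variants (`run_single_async`, `run_batch_async`) are not shown above; a minimal sketch, assuming the returned futures expose a blocking `result()` method:

```python
executor = deeplake.prepare_query(
    'SELECT * FROM "s3://my-bucket/dataset" WHERE label == $label'
)

# Single parameter set, resolved by blocking on the future
cats = executor.run_single_async({"label": "cat"}).result()

# Several parameter sets resolved together as a list of views
dog_view, bird_view = executor.run_batch_async(
    [{"label": "dog"}, {"label": "bird"}]
).result()
print(len(cats), len(dog_view), len(bird_view))
```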

### Performance Tuning

```python
# Set maximum parallel queries for performance tuning
current_max = deeplake.tql.get_max_num_parallel_queries()
print(f"Current max parallel queries: {current_max}")

# Increase for high-performance systems
deeplake.tql.set_max_num_parallel_queries(8)

# Query with performance monitoring
import time

start_time = time.time()
large_results = deeplake.query('''
    SELECT * FROM "s3://my-bucket/large_dataset"
    WHERE embedding_magnitude > 0.5
''')
end_time = time.time()

print(f"Query executed in {end_time - start_time:.2f} seconds")
print(f"Returned {len(large_results)} results")
```