# Query System

TQL (Tensor Query Language) provides SQL-like syntax optimized for tensor operations, enabling complex data filtering, aggregation, and transformation across datasets. The query system supports both immediate execution and prepared statements for parameterized queries.

## Capabilities

### Query Execution

Execute TQL queries with immediate results or asynchronous processing for large datasets.

```python { .api }
def query(query: str, token: Optional[str] = None, creds: Optional[Dict[str, str]] = None) -> DatasetView:
    """
    Execute TQL query and return results.

    Parameters:
    - query: TQL query string
    - token: Activeloop authentication token
    - creds: Storage credentials dictionary

    Returns:
        DatasetView: Query result view
    """

def query_async(query: str, token: Optional[str] = None, creds: Optional[Dict[str, str]] = None) -> Future[DatasetView]:
    """
    Execute TQL query asynchronously.

    Parameters:
    - query: TQL query string
    - token: Activeloop authentication token
    - creds: Storage credentials dictionary

    Returns:
        Future[DatasetView]: Future resolving to query result view
    """
```
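
Both entry points in brief; a minimal sketch, assuming the returned `Future` exposes a blocking `result()` method and that the dataset path and `label` column exist:

```python
import deeplake

# Blocking execution returns a DatasetView directly
view = deeplake.query('SELECT * FROM "s3://my-bucket/dataset" WHERE label == "cat"')

# Asynchronous execution returns a Future; result() is assumed to block
# until the query completes and the view is available
future = deeplake.query_async('SELECT * FROM "s3://my-bucket/dataset" WHERE label == "dog"')
dogs = future.result()
```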

### Prepared Queries

Create prepared statements for efficient execution of parameterized queries with variable substitution.

```python { .api }
def prepare_query(query: str, token: Optional[str] = None, creds: Optional[Dict[str, str]] = None) -> Executor:
    """
    Prepare parameterized query for efficient repeated execution.

    Parameters:
    - query: TQL query string with parameter placeholders
    - token: Activeloop authentication token
    - creds: Storage credentials dictionary

    Returns:
        Executor: Prepared query executor
    """

class Executor:
    """Prepared query executor for parameterized queries."""

    def get_query_string(self) -> str:
        """
        Get the prepared query string.

        Returns:
            str: Original query string with parameter placeholders
        """

    def run_single(self, parameters: Dict[str, Any]) -> DatasetView:
        """
        Execute prepared query with single parameter set.

        Parameters:
        - parameters: Dictionary mapping parameter names to values

        Returns:
            DatasetView: Query result view
        """

    def run_single_async(self, parameters: Dict[str, Any]) -> Future[DatasetView]:
        """
        Execute prepared query asynchronously with single parameter set.

        Parameters:
        - parameters: Dictionary mapping parameter names to values

        Returns:
            Future[DatasetView]: Future resolving to query result view
        """

    def run_batch(self, parameters: List[Dict[str, Any]]) -> List[DatasetView]:
        """
        Execute prepared query with multiple parameter sets.

        Parameters:
        - parameters: List of parameter dictionaries

        Returns:
            List[DatasetView]: List of query result views
        """

    def run_batch_async(self, parameters: List[Dict[str, Any]]) -> Future[List[DatasetView]]:
        """
        Execute prepared query asynchronously with multiple parameter sets.

        Parameters:
        - parameters: List of parameter dictionaries

        Returns:
            Future[List[DatasetView]]: Future resolving to list of query result views
        """
```
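
The `get_query_string()` accessor is not used in the Usage Examples below; a short sketch of the executor round trip, assuming the `$label` placeholder syntax shown there:

```python
import deeplake

# Prepare once, execute many times with different parameters
executor = deeplake.prepare_query(
    'SELECT * FROM "s3://my-bucket/dataset" WHERE label == $label'
)

# The original query text, placeholders intact
print(executor.get_query_string())

cats = executor.run_single({"label": "cat"})
print(f"Matched {len(cats)} rows")
```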
111
112
### Query Analysis
113
114
Analyze and explain query execution plans for optimization and debugging.
115
116
```python { .api }
117
def explain_query(query: str, token: Optional[str] = None, creds: Optional[Dict[str, str]] = None) -> ExplainQueryResult:
118
"""
119
Explain query execution plan.
120
121
Parameters:
122
- query: TQL query string to analyze
123
- token: Activeloop authentication token
124
- creds: Storage credentials dictionary
125
126
Returns:
127
ExplainQueryResult: Query execution plan and statistics
128
"""
129
130
class ExplainQueryResult:
131
"""Query execution plan and analysis."""
132
133
def __str__(self) -> str:
134
"""
135
Get human-readable explanation of query plan.
136
137
Returns:
138
str: Formatted query execution plan
139
"""
140
141
def to_dict(self) -> Dict[str, Any]:
142
"""
143
Get query plan as structured data.
144
145
Returns:
146
Dict[str, Any]: Dictionary containing execution plan details
147
"""
148
```
149
150
### Dataset View Operations
151
152
DatasetView objects provide additional query and analysis capabilities on query results.
153
154
```python { .api }
155
class DatasetView:
156
"""Query result view with additional query capabilities."""
157
158
schema: SchemaView
159
160
def query(self, query: str) -> DatasetView:
161
"""
162
Execute nested query on this view.
163
164
Parameters:
165
- query: TQL query string
166
167
Returns:
168
DatasetView: Nested query result view
169
"""
170
171
def prepare_query(self, query: str) -> Executor:
172
"""
173
Prepare parameterized query on this view.
174
175
Parameters:
176
- query: TQL query string with parameter placeholders
177
178
Returns:
179
Executor: Prepared query executor
180
"""
181
182
def explain_query(self, query: str) -> ExplainQueryResult:
183
"""
184
Explain query execution plan on this view.
185
186
Parameters:
187
- query: TQL query string to analyze
188
189
Returns:
190
ExplainQueryResult: Query execution plan and statistics
191
"""
192
193
def summary(self) -> str:
194
"""
195
Get summary statistics of the dataset view.
196
197
Returns:
198
str: Summary statistics including row count, column info, etc.
199
"""
200
201
def batches(self, batch_size: int = 1) -> Iterator[Dict[str, Any]]:
202
"""
203
Iterate over view data in batches.
204
205
Parameters:
206
- batch_size: Number of rows per batch
207
208
Returns:
209
Iterator[Dict[str, Any]]: Iterator yielding batches as dictionaries
210
"""
211
```
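
`summary()` and `batches()` do not appear in the Usage Examples below; a brief sketch, assuming an `image` column and that each batch maps column names to the values for that slice of rows:

```python
import deeplake

view = deeplake.query('SELECT * FROM "s3://my-bucket/dataset" WHERE label == "cat"')

# Row count, column info, and other summary statistics
print(view.summary())

# Stream the view in fixed-size batches
for batch in view.batches(batch_size=32):
    images = batch["image"]  # assumed: column name -> values for this batch
    print(f"Batch of {len(images)} images")
```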
212
213
### TQL Function Registration
214
215
Register custom Python functions for use in TQL queries with automatic type inference.
216
217
```python { .api }
218
def register_function(function: Callable) -> None:
219
"""
220
Register Python function for use in TQL queries.
221
222
Parameters:
223
- function: Python function to register
224
"""
225
226
def get_max_num_parallel_queries() -> int:
227
"""
228
Get maximum number of parallel queries allowed.
229
230
Returns:
231
int: Maximum parallel query limit
232
"""
233
234
def set_max_num_parallel_queries(num: int) -> None:
235
"""
236
Set maximum number of parallel queries allowed.
237
238
Parameters:
239
- num: Maximum parallel query limit
240
"""
241
```

## Usage Examples

### Basic Queries

```python
import deeplake

# Simple SELECT query
results = deeplake.query('SELECT * FROM "s3://my-bucket/dataset" WHERE label == "cat"')

# Access query results
print(f"Found {len(results)} cat images")
for row in results:
    print(f"Image: {row['image_path']}, Label: {row['label']}")

# Query with aggregation
stats = deeplake.query('SELECT label, COUNT(*) as count FROM "s3://my-bucket/dataset" GROUP BY label')
for row in stats:
    print(f"Label: {row['label']}, Count: {row['count']}")

# Query with filtering and ordering
high_confidence = deeplake.query('''
    SELECT image_path, confidence
    FROM "s3://my-bucket/dataset"
    WHERE confidence > 0.9
    ORDER BY confidence DESC
    LIMIT 10
''')
```
272
273
### Parameterized Queries
274
275
```python
276
# Prepare parameterized query
277
executor = deeplake.prepare_query('''
278
SELECT * FROM "s3://my-bucket/dataset"
279
WHERE label == $label AND confidence > $min_confidence
280
''')
281
282
# Execute with different parameters
283
cats = executor.run_single({"label": "cat", "min_confidence": 0.8})
284
dogs = executor.run_single({"label": "dog", "min_confidence": 0.8})
285
286
# Batch execution
287
params_list = [
288
{"label": "cat", "min_confidence": 0.9},
289
{"label": "dog", "min_confidence": 0.9},
290
{"label": "bird", "min_confidence": 0.9}
291
]
292
results_list = executor.run_batch(params_list)
293
294
for i, results in enumerate(results_list):
295
label = params_list[i]["label"]
296
print(f"High confidence {label} images: {len(results)}")
297
```

### Advanced TQL Features

```python
# Complex filtering with multiple conditions
complex_query = deeplake.query('''
    SELECT image_path, embeddings, metadata
    FROM "s3://my-bucket/dataset"
    WHERE label IN ("cat", "dog")
      AND confidence > 0.85
      AND width > 224
      AND height > 224
''')

# Similarity search using embedding vectors; query() takes no parameters
# argument, so a prepared query supplies $target_embedding
similarity_executor = deeplake.prepare_query('''
    SELECT image_path,
           COSINE_SIMILARITY(embeddings, $target_embedding) as similarity
    FROM "s3://my-bucket/dataset"
    WHERE COSINE_SIMILARITY(embeddings, $target_embedding) > 0.8
    ORDER BY similarity DESC
''')
similar_images = similarity_executor.run_single({"target_embedding": target_vector})

# Text search in descriptions
text_results = deeplake.query('''
    SELECT * FROM "s3://my-bucket/dataset"
    WHERE CONTAINS(description, "outdoor scene")
''')

# Geospatial queries
location_results = deeplake.query('''
    SELECT * FROM "s3://my-bucket/dataset"
    WHERE latitude BETWEEN 40.0 AND 41.0
      AND longitude BETWEEN -74.0 AND -73.0
''')
```
334
335
### Query Analysis and Optimization
336
337
```python
338
# Analyze query performance
339
query_str = 'SELECT * FROM "s3://my-bucket/dataset" WHERE confidence > 0.9'
340
explanation = deeplake.explain_query(query_str)
341
342
print("Query Plan:")
343
print(explanation)
344
345
# Get structured execution plan
346
plan_dict = explanation.to_dict()
347
print(f"Estimated rows: {plan_dict.get('estimated_rows', 'unknown')}")
348
print(f"Index usage: {plan_dict.get('uses_index', 'unknown')}")
349
350
# Query optimization suggestions
351
if not plan_dict.get('uses_index', False):
352
print("Consider creating an index on 'confidence' column for better performance")
353
```

### Nested Queries and Views

```python
# Create initial view
base_view = deeplake.query('SELECT * FROM "s3://my-bucket/dataset" WHERE split == "train"')

# Query on the view
filtered_view = base_view.query('SELECT * WHERE confidence > 0.9')

# Further nested query
final_results = filtered_view.query('SELECT image_path, label ORDER BY confidence DESC LIMIT 100')

print(f"Top 100 high-confidence training images: {len(final_results)}")
```
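
Views also expose `prepare_query()` and `explain_query()`; a brief sketch continuing from `base_view` above, assuming view-scoped queries use the same placeholder syntax as top-level ones:

```python
# Parameterized query scoped to the view
view_executor = base_view.prepare_query('SELECT * WHERE label == $label')
train_cats = view_executor.run_single({"label": "cat"})

# Inspect the execution plan for a query over the view
print(base_view.explain_query('SELECT * WHERE confidence > 0.9'))
```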
369
370
### Custom Function Registration
371
372
```python
373
import numpy as np
374
375
# Register custom function for TQL
376
def normalize_scores(scores):
377
"""Normalize confidence scores to 0-1 range."""
378
scores_array = np.array(scores)
379
return (scores_array - scores_array.min()) / (scores_array.max() - scores_array.min())
380
381
deeplake.tql.register_function(normalize_scores)
382
383
# Use custom function in query
384
normalized_results = deeplake.query('''
385
SELECT image_path,
386
normalize_scores(confidence) as normalized_confidence
387
FROM "s3://my-bucket/dataset"
388
ORDER BY normalized_confidence DESC
389
''')
390
```
391
392
### Async Query Execution
393
394
```python
395
import asyncio
396
397
async def process_multiple_queries():
398
queries = [
399
'SELECT * FROM "s3://my-bucket/dataset" WHERE label == "cat"',
400
'SELECT * FROM "s3://my-bucket/dataset" WHERE label == "dog"',
401
'SELECT * FROM "s3://my-bucket/dataset" WHERE label == "bird"'
402
]
403
404
# Execute queries concurrently
405
tasks = [deeplake.query_async(query) for query in queries]
406
results = await asyncio.gather(*tasks)
407
408
for i, result in enumerate(results):
409
query_type = queries[i].split('"')[3] # Extract label
410
print(f"Query {i+1} returned {len(result)} results")
411
412
return results
413
414
# Run async queries
415
results = asyncio.run(process_multiple_queries())
416
```
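
The executor's async variants (`run_single_async`, `run_batch_async`) are not shown above; a minimal sketch, assuming the returned futures expose a blocking `result()` method:

```python
executor = deeplake.prepare_query(
    'SELECT * FROM "s3://my-bucket/dataset" WHERE label == $label'
)

# Single parameter set, resolved by blocking on the future
cats = executor.run_single_async({"label": "cat"}).result()

# Several parameter sets resolved together as a list of views
dog_view, bird_view = executor.run_batch_async(
    [{"label": "dog"}, {"label": "bird"}]
).result()
print(len(cats), len(dog_view), len(bird_view))
```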

### Performance Tuning

```python
# Set maximum parallel queries for performance tuning
current_max = deeplake.tql.get_max_num_parallel_queries()
print(f"Current max parallel queries: {current_max}")

# Increase for high-performance systems
deeplake.tql.set_max_num_parallel_queries(8)

# Query with performance monitoring
import time

start_time = time.time()
large_results = deeplake.query('''
    SELECT * FROM "s3://my-bucket/large_dataset"
    WHERE embedding_magnitude > 0.5
''')
end_time = time.time()

print(f"Query executed in {end_time - start_time:.2f} seconds")
print(f"Returned {len(large_results)} results")
```