0
# Data Access and Manipulation
1
2
Deep Lake offers comprehensive row- and column-based data access, with support for indexing, slicing, batch operations, and efficient data manipulation. Both mutable and read-only access patterns are provided, and each is optimized for ML workflows.
3
4
## Capabilities
5
6
### Dataset Access Patterns
7
8
Dataset objects provide dictionary-like and array-like access to data with automatic type handling and optimization.
9
10
```python { .api }
11
class Dataset:
12
"""Primary mutable dataset class."""
13
14
def __getitem__(self, key: Union[int, slice, str]) -> Union[Row, RowRange, Column]:
15
"""
16
Access dataset elements by index or name.
17
18
Parameters:
19
- key: Row index (int), row range (slice), or column name (str)
20
21
Returns:
22
- Row: Single row access (when key is int)
23
- RowRange: Multiple row access (when key is slice)
24
- Column: Column access (when key is str)
25
"""
26
27
def __len__(self) -> int:
28
"""Get number of rows in dataset."""
29
30
def append(self, data: Dict[str, Any]) -> None:
31
"""
32
Append new row to dataset.
33
34
Parameters:
35
- data: Dictionary mapping column names to values
36
"""
37
38
def extend(self, data: List[Dict[str, Any]]) -> None:
39
"""
40
Append multiple rows to dataset.
41
42
Parameters:
43
- data: List of dictionaries mapping column names to values
44
"""
45
46
def add_column(self, name: str, dtype: Type) -> None:
47
"""
48
Add new column to dataset.
49
50
Parameters:
51
- name: Column name
52
- dtype: Column data type
53
"""
54
55
def remove_column(self, name: str) -> None:
56
"""
57
Remove column from dataset.
58
59
Parameters:
60
- name: Column name to remove
61
"""
62
63
def rename_column(self, old_name: str, new_name: str) -> None:
64
"""
65
Rename existing column.
66
67
Parameters:
68
- old_name: Current column name
69
- new_name: New column name
70
"""
71
72
class ReadOnlyDataset:
73
"""Read-only dataset access."""
74
75
def __getitem__(self, key: Union[int, slice, str]) -> Union[RowView, RowRangeView, ColumnView]:
76
"""Access dataset elements (read-only)."""
77
78
def __len__(self) -> int:
79
"""Get number of rows in dataset."""
80
81
class DatasetView:
82
"""Query result view of dataset."""
83
84
def __getitem__(self, key: Union[int, slice, str]) -> Union[RowView, RowRangeView, ColumnView]:
85
"""Access query result elements."""
86
87
def __len__(self) -> int:
88
"""Get number of rows in view."""
89
90
def summary(self) -> str:
91
"""Get summary statistics of the dataset view."""
92
```
93
94
### Column Access and Manipulation
95
96
Column objects provide typed access to homogeneous data with support for indexing, slicing, and batch operations.
97
98
```python { .api }
99
class Column:
100
"""Mutable column access."""
101
102
name: str
103
metadata: Metadata
104
indexes: List[str]
105
106
def __getitem__(self, key: Union[int, slice, List[int]]) -> Any:
107
"""
108
Get column values by index.
109
110
Parameters:
111
- key: Row index (int), slice, or list of indices
112
113
Returns:
114
- Any: Single value or list of values
115
"""
116
117
def __setitem__(self, key: Union[int, slice, List[int]], value: Any) -> None:
118
"""
119
Set column values by index.
120
121
Parameters:
122
- key: Row index (int), slice, or list of indices
123
- value: Value(s) to set
124
"""
125
126
def __len__(self) -> int:
127
"""Get number of elements in column."""
128
129
def create_index(self, type: IndexType) -> None:
130
"""
131
Create index on column for query optimization.
132
133
Parameters:
134
- type: Index type specification
135
"""
136
137
def drop_index(self, name: str) -> None:
138
"""
139
Drop existing index.
140
141
Parameters:
142
- name: Index name to drop
143
"""
144
145
def get_async(self, index: int) -> Future[Any]:
146
"""
147
Get column value asynchronously.
148
149
Parameters:
150
- index: Row index
151
152
Returns:
153
- Future[Any]: Future resolving to column value
154
"""
155
156
def set_async(self, index: int, value: Any) -> FutureVoid:
157
"""
158
Set column value asynchronously.
159
160
Parameters:
161
- index: Row index
162
- value: Value to set
163
164
Returns:
165
- FutureVoid: Future completing when set operation is done
166
"""
167
168
def get_bytes(self, index: int) -> bytes:
169
"""
170
Get raw bytes representation of column value.
171
172
Parameters:
173
- index: Row index
174
175
Returns:
176
- bytes: Raw bytes data
177
"""
178
179
def get_bytes_async(self, index: int) -> Future[bytes]:
180
"""
181
Get raw bytes representation asynchronously.
182
183
Parameters:
184
- index: Row index
185
186
Returns:
187
- Future[bytes]: Future resolving to raw bytes data
188
"""
189
190
class ColumnView:
191
"""Read-only column access."""
192
193
name: str
194
metadata: ReadOnlyMetadata
195
indexes: List[str]
196
197
def __getitem__(self, key: Union[int, slice, List[int]]) -> Any:
198
"""Get column values by index (read-only)."""
199
200
def __len__(self) -> int:
201
"""Get number of elements in column."""
202
203
def get_async(self, index: int) -> Future[Any]:
204
"""Get column value asynchronously."""
205
206
def get_bytes(self, index: int) -> bytes:
207
"""Get raw bytes representation of column value."""
208
209
def get_bytes_async(self, index: int) -> Future[bytes]:
210
"""Get raw bytes representation asynchronously."""
211
212
def _links_info(self) -> Dict[str, Any]:
213
"""Get link information for linked columns."""
214
215
class ColumnDefinition:
216
"""Mutable column definition."""
217
218
name: str
219
dtype: Type
220
221
def drop(self) -> None:
222
"""Drop this column from dataset."""
223
224
def rename(self, new_name: str) -> None:
225
"""Rename this column."""
226
227
class ColumnDefinitionView:
228
"""Read-only column definition."""
229
230
name: str
231
dtype: Type
232
```
233
234
### Row Access and Manipulation
235
236
Row objects provide dictionary-like access to individual records with type-aware value handling.
237
238
```python { .api }
239
class Row:
240
"""Mutable single row access."""
241
242
row_id: int
243
244
def __getitem__(self, column_name: str) -> Any:
245
"""
246
Get value from specific column.
247
248
Parameters:
249
- column_name: Column name
250
251
Returns:
252
- Any: Column value for this row
253
"""
254
255
def __setitem__(self, column_name: str, value: Any) -> None:
256
"""
257
Set value in specific column.
258
259
Parameters:
260
- column_name: Column name
261
- value: Value to set
262
"""
263
264
def to_dict(self) -> Dict[str, Any]:
265
"""
266
Convert row to dictionary.
267
268
Returns:
269
- Dict[str, Any]: Dictionary mapping column names to values
270
"""
271
272
def get_async(self, column_name: str) -> Future[Any]:
273
"""Get column value asynchronously."""
274
275
def set_async(self, column_name: str, value: Any) -> FutureVoid:
276
"""Set column value asynchronously."""
277
278
def get_bytes(self, column_name: str) -> bytes:
279
"""Get raw bytes representation of column value."""
280
281
def get_bytes_async(self, column_name: str) -> Future[bytes]:
282
"""Get raw bytes representation asynchronously."""
283
284
class RowView:
285
"""Read-only single row access."""
286
287
row_id: int
288
289
def __getitem__(self, column_name: str) -> Any:
290
"""Get value from specific column (read-only)."""
291
292
def to_dict(self) -> Dict[str, Any]:
293
"""Convert row to dictionary."""
294
295
def get_async(self, column_name: str) -> Future[Any]:
296
"""Get column value asynchronously."""
297
298
def get_bytes(self, column_name: str) -> bytes:
299
"""Get raw bytes representation of column value."""
300
301
def get_bytes_async(self, column_name: str) -> Future[bytes]:
302
"""Get raw bytes representation asynchronously."""
303
304
class RowRange:
305
"""Mutable multiple row access."""
306
307
def __getitem__(self, column_name: str) -> List[Any]:
308
"""Get values from specific column across all rows in range."""
309
310
def __setitem__(self, column_name: str, values: List[Any]) -> None:
311
"""Set values in specific column across all rows in range."""
312
313
def __len__(self) -> int:
314
"""Get number of rows in range."""
315
316
def __iter__(self) -> Iterator[Row]:
317
"""Iterate over rows in range."""
318
319
def summary(self) -> str:
320
"""Get summary statistics of the row range."""
321
322
class RowRangeView:
323
"""Read-only multiple row access."""
324
325
def __getitem__(self, column_name: str) -> List[Any]:
326
"""Get values from specific column across all rows in range."""
327
328
def __len__(self) -> int:
329
"""Get number of rows in range."""
330
331
def __iter__(self) -> Iterator[RowView]:
332
"""Iterate over rows in range."""
333
334
def summary(self) -> str:
335
"""Get summary statistics of the row range."""
336
```
337
338
### Metadata Management
339
340
Metadata objects provide key-value storage for dataset and column metadata with type preservation.
341
342
```python { .api }
343
class Metadata:
344
"""Mutable metadata storage."""
345
346
def __getitem__(self, key: str) -> Any:
347
"""Get metadata value by key."""
348
349
def __setitem__(self, key: str, value: Any) -> None:
350
"""Set metadata value by key."""
351
352
def __contains__(self, key: str) -> bool:
353
"""Check if metadata key exists."""
354
355
def keys(self) -> List[str]:
356
"""Get all metadata keys."""
357
358
class ReadOnlyMetadata:
359
"""Read-only metadata storage."""
360
361
def __getitem__(self, key: str) -> Any:
362
"""Get metadata value by key."""
363
364
def __contains__(self, key: str) -> bool:
365
"""Check if metadata key exists."""
366
367
def keys(self) -> List[str]:
368
"""Get all metadata keys."""
369
```
370
371
## Usage Examples
372
373
### Basic Data Access
374
375
```python
376
import deeplake
377
378
# Open dataset
379
dataset = deeplake.open("./my_dataset")
380
381
# Row access
382
row = dataset[0] # First row
383
print(row["images"]) # Access column value
384
print(row.to_dict()) # Convert to dictionary
385
386
# Row range access
387
rows = dataset[0:10] # First 10 rows
388
for row in rows:
389
print(row["labels"])
390
391
# Column access
392
images_column = dataset["images"]
393
print(len(images_column)) # Number of images
394
first_image = images_column[0] # First image
395
396
# Column slicing
397
batch_images = images_column[0:32] # First 32 images
398
```
399
400
### Data Manipulation
401
402
```python
403
# Add new column
404
dataset.add_column("scores", deeplake.types.Float32())
405
406
# Append single row
407
dataset.append({
408
"images": "new_image.jpg",
409
"labels": "dog",
410
"scores": 0.95
411
})
412
413
# Append multiple rows
414
batch_data = [
415
{"images": f"image_{i}.jpg", "labels": f"label_{i}", "scores": 0.8 + i * 0.01}
416
for i in range(100)
417
]
418
dataset.extend(batch_data)
419
420
# Update specific values
421
dataset[0]["scores"] = 0.99 # Update single value
422
dataset["scores"][0:10] = [0.9] * 10 # Update range
423
424
# Column operations
425
scores = dataset["scores"]
426
scores[100] = 0.85 # Set specific score
427
high_scores = [s for s in scores[:] if s > 0.9] # Filter high scores (Column indexing accepts int, slice, or list of ints, not boolean masks)
428
```
429
430
### Batch Operations
431
432
```python
433
# Access data in batches
434
batch_size = 32
435
for i in range(0, len(dataset), batch_size):
436
batch = dataset[i:i+batch_size]
437
438
# Get batch data as lists
439
images = batch["images"]
440
labels = batch["labels"]
441
442
# Process batch
443
process_batch(images, labels)
444
445
# Column-wise batch operations
446
images_column = dataset["images"]
447
for i in range(0, len(images_column), batch_size):
448
image_batch = images_column[i:i+batch_size]
449
processed_batch = preprocess_images(image_batch)
450
# Save processed results...
451
```
452
453
### Async Operations
454
455
```python
456
import asyncio
457
458
async def process_data_async(dataset):
459
# Get multiple values concurrently
460
tasks = [
461
dataset["images"].get_async(i)
462
for i in range(10)
463
]
464
465
images = await asyncio.gather(*tasks)
466
return images
467
468
# Set values asynchronously
469
async def update_scores_async(dataset, new_scores):
470
tasks = [
471
dataset["scores"].set_async(i, score)
472
for i, score in enumerate(new_scores)
473
]
474
475
await asyncio.gather(*tasks)
476
```
477
478
### Metadata Usage
479
480
```python
481
# Dataset metadata
482
dataset.metadata["version"] = "1.0"
483
dataset.metadata["description"] = "Training dataset for image classification"
484
print(dataset.metadata.keys())
485
486
# Column metadata
487
images_column = dataset["images"]
488
images_column.metadata["preprocessing"] = "normalized"
489
images_column.metadata["source"] = "camera_feed"
490
491
# Access metadata
492
if "version" in dataset.metadata:
493
print(f"Dataset version: {dataset.metadata['version']}")
494
```
495
496
### Indexing for Performance
497
498
```python
499
# Create index on text column for fast queries
500
text_column = dataset["descriptions"]
501
text_column.create_index(deeplake.types.TextIndex(deeplake.types.Inverted))
502
503
# Create embedding index for similarity search
504
embedding_column = dataset["embeddings"]
505
embedding_column.create_index(
506
deeplake.types.EmbeddingIndex(deeplake.types.Clustered)
507
)
508
509
# List all indexes on column
510
print(text_column.indexes)
511
512
# Drop index when no longer needed
513
text_column.drop_index("inverted_index")
514
```