Tessl Tile for pypi/deeplake@4.3.0

or run

npx @tessl/cli init

Version

Tile

Overview

Evals

Files

docs

data-access.md data-import-export.md dataset-management.md error-handling.md framework-integration.md index.md query-system.md schema-templates.md storage-system.md type-system.md version-control.md

docs/data-access.md

0
# Data Access and Manipulation
1

2
Deep Lake offers comprehensive row- and column-based data access patterns, with support for indexing, slicing, batch operations, and efficient data manipulation. It provides both mutable and read-only access patterns optimized for ML workflows.
3

4
## Capabilities
5

6
### Dataset Access Patterns
7

8
Dataset objects provide dictionary-like and array-like access to data with automatic type handling and optimization.
9

10
```python { .api }
11
class Dataset:
12
    """Primary mutable dataset class."""
13
    
14
    def __getitem__(self, key: Union[int, slice, str]) -> Union[Row, RowRange, Column]:
15
        """
16
        Access dataset elements by index or name.
17
        
18
        Parameters:
19
        - key: Row index (int), row range (slice), or column name (str)
20
        
21
        Returns:
22
        - Row: Single row access (when key is int)
23
        - RowRange: Multiple row access (when key is slice) 
24
        - Column: Column access (when key is str)
25
        """
26
    
27
    def __len__(self) -> int:
28
        """Get number of rows in dataset."""
29
    
30
    def append(self, data: Dict[str, Any]) -> None:
31
        """
32
        Append new row to dataset.
33
        
34
        Parameters:
35
        - data: Dictionary mapping column names to values
36
        """
37
    
38
    def extend(self, data: List[Dict[str, Any]]) -> None:
39
        """
40
        Append multiple rows to dataset.
41
        
42
        Parameters:
43
        - data: List of dictionaries mapping column names to values
44
        """
45
    
46
    def add_column(self, name: str, dtype: Type) -> None:
47
        """
48
        Add new column to dataset.
49
        
50
        Parameters:
51
        - name: Column name
52
        - dtype: Column data type
53
        """
54
    
55
    def remove_column(self, name: str) -> None:
56
        """
57
        Remove column from dataset.
58
        
59
        Parameters:
60
        - name: Column name to remove
61
        """
62
    
63
    def rename_column(self, old_name: str, new_name: str) -> None:
64
        """
65
        Rename existing column.
66
        
67
        Parameters:
68
        - old_name: Current column name
69
        - new_name: New column name
70
        """
71

72
class ReadOnlyDataset:
73
    """Read-only dataset access."""
74
    
75
    def __getitem__(self, key: Union[int, slice, str]) -> Union[RowView, RowRangeView, ColumnView]:
76
        """Access dataset elements (read-only)."""
77
    
78
    def __len__(self) -> int:
79
        """Get number of rows in dataset."""
80

81
class DatasetView:
82
    """Query result view of dataset."""
83
    
84
    def __getitem__(self, key: Union[int, slice, str]) -> Union[RowView, RowRangeView, ColumnView]:
85
        """Access query result elements."""
86
    
87
    def __len__(self) -> int:
88
        """Get number of rows in view."""
89
    
90
    def summary(self) -> str:
91
        """Get summary statistics of the dataset view."""
92
```
93

94
### Column Access and Manipulation
95

96
Column objects provide typed access to homogeneous data with support for indexing, slicing, and batch operations.
97

98
```python { .api }
99
class Column:
100
    """Mutable column access."""
101
    
102
    name: str
103
    metadata: Metadata
104
    indexes: List[str]
105
    
106
    def __getitem__(self, key: Union[int, slice, List[int]]) -> Any:
107
        """
108
        Get column values by index.
109
        
110
        Parameters:
111
        - key: Row index (int), slice, or list of indices
112
        
113
        Returns:
114
        - Any: Single value or list of values
115
        """
116
    
117
    def __setitem__(self, key: Union[int, slice, List[int]], value: Any) -> None:
118
        """
119
        Set column values by index.
120
        
121
        Parameters:
122
        - key: Row index (int), slice, or list of indices
123
        - value: Value(s) to set
124
        """
125
    
126
    def __len__(self) -> int:
127
        """Get number of elements in column."""
128
    
129
    def create_index(self, type: IndexType) -> None:
130
        """
131
        Create index on column for query optimization.
132
        
133
        Parameters:
134
        - type: Index type specification
135
        """
136
    
137
    def drop_index(self, name: str) -> None:
138
        """
139
        Drop existing index.
140
        
141
        Parameters:
142
        - name: Index name to drop
143
        """
144
    
145
    def get_async(self, index: int) -> Future[Any]:
146
        """
147
        Get column value asynchronously.
148
        
149
        Parameters:
150
        - index: Row index
151
        
152
        Returns:
153
        Future[Any]: Future resolving to column value
154
        """
155
    
156
    def set_async(self, index: int, value: Any) -> FutureVoid:
157
        """
158
        Set column value asynchronously.
159
        
160
        Parameters:
161
        - index: Row index
162
        - value: Value to set
163
        
164
        Returns:
165
        FutureVoid: Future completing when set operation is done
166
        """
167
    
168
    def get_bytes(self, index: int) -> bytes:
169
        """
170
        Get raw bytes representation of column value.
171
        
172
        Parameters:
173
        - index: Row index
174
        
175
        Returns:
176
        bytes: Raw bytes data
177
        """
178
    
179
    def get_bytes_async(self, index: int) -> Future[bytes]:
180
        """
181
        Get raw bytes representation asynchronously.
182
        
183
        Parameters:
184
        - index: Row index
185
        
186
        Returns:
187
        Future[bytes]: Future resolving to raw bytes data
188
        """
189

190
class ColumnView:
191
    """Read-only column access."""
192
    
193
    name: str
194
    metadata: ReadOnlyMetadata
195
    indexes: List[str]
196
    
197
    def __getitem__(self, key: Union[int, slice, List[int]]) -> Any:
198
        """Get column values by index (read-only)."""
199
    
200
    def __len__(self) -> int:
201
        """Get number of elements in column."""
202
    
203
    def get_async(self, index: int) -> Future[Any]:
204
        """Get column value asynchronously."""
205
    
206
    def get_bytes(self, index: int) -> bytes:
207
        """Get raw bytes representation of column value."""
208
    
209
    def get_bytes_async(self, index: int) -> Future[bytes]:
210
        """Get raw bytes representation asynchronously."""
211
    
212
    def _links_info(self) -> Dict[str, Any]:
213
        """Get link information for linked columns."""
214

215
class ColumnDefinition:
216
    """Mutable column definition."""
217
    
218
    name: str
219
    dtype: Type
220
    
221
    def drop(self) -> None:
222
        """Drop this column from dataset."""
223
    
224
    def rename(self, new_name: str) -> None:
225
        """Rename this column."""
226

227
class ColumnDefinitionView:
228
    """Read-only column definition."""
229
    
230
    name: str
231
    dtype: Type
232
```
233

234
### Row Access and Manipulation
235

236
Row objects provide dictionary-like access to individual records with type-aware value handling.
237

238
```python { .api }
239
class Row:
240
    """Mutable single row access."""
241
    
242
    row_id: int
243
    
244
    def __getitem__(self, column_name: str) -> Any:
245
        """
246
        Get value from specific column.
247
        
248
        Parameters:
249
        - column_name: Column name
250
        
251
        Returns:
252
        Any: Column value for this row
253
        """
254
    
255
    def __setitem__(self, column_name: str, value: Any) -> None:
256
        """
257
        Set value in specific column.
258
        
259
        Parameters:
260
        - column_name: Column name
261
        - value: Value to set
262
        """
263
    
264
    def to_dict(self) -> Dict[str, Any]:
265
        """
266
        Convert row to dictionary.
267
        
268
        Returns:
269
        Dict[str, Any]: Dictionary mapping column names to values
270
        """
271
    
272
    def get_async(self, column_name: str) -> Future[Any]:
273
        """Get column value asynchronously."""
274
    
275
    def set_async(self, column_name: str, value: Any) -> FutureVoid:
276
        """Set column value asynchronously."""
277
    
278
    def get_bytes(self, column_name: str) -> bytes:
279
        """Get raw bytes representation of column value."""
280
    
281
    def get_bytes_async(self, column_name: str) -> Future[bytes]:
282
        """Get raw bytes representation asynchronously."""
283

284
class RowView:
285
    """Read-only single row access."""
286
    
287
    row_id: int
288
    
289
    def __getitem__(self, column_name: str) -> Any:
290
        """Get value from specific column (read-only)."""
291
    
292
    def to_dict(self) -> Dict[str, Any]:
293
        """Convert row to dictionary."""
294
    
295
    def get_async(self, column_name: str) -> Future[Any]:
296
        """Get column value asynchronously."""
297
    
298
    def get_bytes(self, column_name: str) -> bytes:
299
        """Get raw bytes representation of column value."""
300
    
301
    def get_bytes_async(self, column_name: str) -> Future[bytes]:
302
        """Get raw bytes representation asynchronously."""
303

304
class RowRange:
305
    """Mutable multiple row access."""
306
    
307
    def __getitem__(self, column_name: str) -> List[Any]:
308
        """Get values from specific column across all rows in range."""
309
    
310
    def __setitem__(self, column_name: str, values: List[Any]) -> None:
311
        """Set values in specific column across all rows in range."""
312
    
313
    def __len__(self) -> int:
314
        """Get number of rows in range."""
315
    
316
    def __iter__(self) -> Iterator[Row]:
317
        """Iterate over rows in range."""
318
    
319
    def summary(self) -> str:
320
        """Get summary statistics of the row range."""
321

322
class RowRangeView:
323
    """Read-only multiple row access."""
324
    
325
    def __getitem__(self, column_name: str) -> List[Any]:
326
        """Get values from specific column across all rows in range."""
327
    
328
    def __len__(self) -> int:
329
        """Get number of rows in range."""
330
    
331
    def __iter__(self) -> Iterator[RowView]:
332
        """Iterate over rows in range."""
333
    
334
    def summary(self) -> str:
335
        """Get summary statistics of the row range."""
336
```
337

338
### Metadata Management
339

340
Metadata objects provide key-value storage for dataset and column metadata with type preservation.
341

342
```python { .api }
343
class Metadata:
344
    """Mutable metadata storage."""
345
    
346
    def __getitem__(self, key: str) -> Any:
347
        """Get metadata value by key."""
348
    
349
    def __setitem__(self, key: str, value: Any) -> None:
350
        """Set metadata value by key."""
351
    
352
    def __contains__(self, key: str) -> bool:
353
        """Check if metadata key exists."""
354
    
355
    def keys(self) -> List[str]:
356
        """Get all metadata keys."""
357

358
class ReadOnlyMetadata:
359
    """Read-only metadata storage."""
360
    
361
    def __getitem__(self, key: str) -> Any:
362
        """Get metadata value by key."""
363
    
364
    def __contains__(self, key: str) -> bool:
365
        """Check if metadata key exists."""
366
    
367
    def keys(self) -> List[str]:
368
        """Get all metadata keys."""
369
```
370

371
## Usage Examples
372

373
### Basic Data Access
374

375
```python
376
import deeplake
377

378
# Open dataset
379
dataset = deeplake.open("./my_dataset")
380

381
# Row access
382
row = dataset[0]  # First row
383
print(row["image_path"])  # Access column value
384
print(row.to_dict())  # Convert to dictionary
385

386
# Row range access
387
rows = dataset[0:10]  # First 10 rows
388
for row in rows:
389
    print(row["label"])
390

391
# Column access
392
images_column = dataset["images"]
393
print(len(images_column))  # Number of images
394
first_image = images_column[0]  # First image
395

396
# Column slicing
397
batch_images = images_column[0:32]  # First 32 images
398
```
399

400
### Data Manipulation
401

402
```python
403
# Add new column
404
dataset.add_column("scores", deeplake.types.Float32())
405

406
# Append single row
407
dataset.append({
408
    "images": "new_image.jpg",
409
    "labels": "dog",
410
    "scores": 0.95
411
})
412

413
# Append multiple rows
414
batch_data = [
415
    {"images": f"image_{i}.jpg", "labels": f"label_{i}", "scores": 0.8 + i * 0.01}
416
    for i in range(100)
417
]
418
dataset.extend(batch_data)
419

420
# Update specific values
421
dataset[0]["scores"] = 0.99  # Update single value
422
dataset["scores"][0:10] = [0.9] * 10  # Update range
423

424
# Column operations
425
scores = dataset["scores"]
426
scores[100] = 0.85  # Set specific score
427
high_scores = [s for s in scores[:] if s > 0.9]  # Filter high scores (columns accept int, slice, or index-list keys, not boolean masks)
428
```
429

430
### Batch Operations
431

432
```python
433
# Access data in batches
434
batch_size = 32
435
for i in range(0, len(dataset), batch_size):
436
    batch = dataset[i:i+batch_size]
437
    
438
    # Get batch data as lists
439
    images = batch["images"]
440
    labels = batch["labels"]
441
    
442
    # Process batch
443
    process_batch(images, labels)
444

445
# Column-wise batch operations
446
images_column = dataset["images"]
447
for i in range(0, len(images_column), batch_size):
448
    image_batch = images_column[i:i+batch_size]
449
    processed_batch = preprocess_images(image_batch)
450
    # Save processed results...
451
```
452

453
### Async Operations
454

455
```python
456
import asyncio
457

458
async def process_data_async(dataset):
459
    # Get multiple values concurrently
460
    tasks = [
461
        dataset["images"].get_async(i)
462
        for i in range(10)
463
    ]
464
    
465
    images = await asyncio.gather(*tasks)
466
    return images
467

468
# Set values asynchronously
469
async def update_scores_async(dataset, new_scores):
470
    tasks = [
471
        dataset["scores"].set_async(i, score)
472
        for i, score in enumerate(new_scores)
473
    ]
474
    
475
    await asyncio.gather(*tasks)
476
```
477

478
### Metadata Usage
479

480
```python
481
# Dataset metadata
482
dataset.metadata["version"] = "1.0"
483
dataset.metadata["description"] = "Training dataset for image classification"
484
print(dataset.metadata.keys())
485

486
# Column metadata
487
images_column = dataset["images"]
488
images_column.metadata["preprocessing"] = "normalized"
489
images_column.metadata["source"] = "camera_feed"
490

491
# Access metadata
492
if "version" in dataset.metadata:
493
    print(f"Dataset version: {dataset.metadata['version']}")
494
```
495

496
### Indexing for Performance
497

498
```python
499
# Create index on text column for fast queries
500
text_column = dataset["descriptions"]
501
text_column.create_index(deeplake.types.TextIndex(deeplake.types.Inverted))
502

503
# Create embedding index for similarity search
504
embedding_column = dataset["embeddings"]
505
embedding_column.create_index(
506
    deeplake.types.EmbeddingIndex(deeplake.types.Clustered)
507
)
508

509
# List all indexes on column
510
print(text_column.indexes)
511

512
# Drop index when no longer needed
513
text_column.drop_index("inverted_index")
514
```

Version

Tile

Files

docs/data-access.md

docs/data-access.md