# Framework Integration

Deep Lake integrates seamlessly with PyTorch and TensorFlow for training and inference workflows, with optimized data loading, transformation pipelines, and batch processing. Native framework adapters make ML model training efficient.

## Capabilities

### PyTorch Integration

Native PyTorch Dataset integration with support for custom transforms, data loading, and distributed training.

```python { .api }
class DatasetView:
    """PyTorch integration for dataset views."""

    def pytorch(self, transform: Optional[Callable[[Any], Any]] = None) -> Any:
        """
        Create PyTorch Dataset from Deep Lake dataset.

        Parameters:
        - transform: Optional transform function to apply to samples

        Returns:
            TorchDataset: PyTorch-compatible dataset object
        """
```

### TensorFlow Integration

Native TensorFlow Dataset integration with optimized data pipelines and GPU acceleration support.

```python { .api }
class DatasetView:
    """TensorFlow integration for dataset views."""

    def tensorflow(self) -> Any:
        """
        Create TensorFlow Dataset from Deep Lake dataset.

        Returns:
            tf.data.Dataset: TensorFlow-compatible dataset object
        """
```

### Batch Processing

Efficient batch iteration with customizable batch sizes and data loading strategies.

```python { .api }
class DatasetView:
    """Batch processing capabilities."""

    def batches(self, batch_size: int = 1) -> Iterator[Dict[str, Any]]:
        """
        Iterate over dataset in batches.

        Parameters:
        - batch_size: Number of samples per batch

        Returns:
            Iterator[Dict[str, Any]]: Iterator yielding batches as dictionaries
        """
```
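
For quick inspection, a minimal sketch of the iterator (assuming the same `./ml_dataset` path used in the examples below); each yielded batch maps column names to lists of values:

```python
import deeplake

ds = deeplake.open("./ml_dataset")
for batch in ds.batches(batch_size=128):
    # Each batch is a dict: column name -> list of values
    print({column: len(values) for column, values in batch.items()})
    break
```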

## Usage Examples

### Basic PyTorch Integration

```python
import deeplake
import torch
from torch.utils.data import DataLoader

# Open dataset
dataset = deeplake.open("./ml_dataset")

# Create PyTorch dataset
torch_dataset = dataset.pytorch()

# Use with PyTorch DataLoader
train_loader = DataLoader(
    torch_dataset,
    batch_size=32,
    shuffle=True,
    num_workers=4
)

# Training loop
for batch_idx, batch in enumerate(train_loader):
    images = batch["images"]
    labels = batch["labels"]

    # Convert to tensors if needed (the default collate_fn already stacks
    # same-shape NumPy arrays; lists appear for ragged or path-valued columns)
    if isinstance(images, list):
        images = torch.stack([torch.tensor(img) for img in images])
    if isinstance(labels, list):
        labels = torch.tensor(labels)

    print(f"Batch {batch_idx}: images shape {images.shape}, labels shape {labels.shape}")

    if batch_idx >= 2:  # Just show first few batches
        break
```

### PyTorch with Custom Transforms

```python
import torch
import torchvision.transforms as transforms
from PIL import Image
from torch.utils.data import DataLoader

# Define the torchvision pipeline once, not per sample
image_pipeline = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406],
                         std=[0.229, 0.224, 0.225])
])

def custom_transform(sample):
    """Custom transform for image-text pairs."""

    # Load and transform image
    if isinstance(sample["images"], str):
        # Load image from path
        image = Image.open(sample["images"])
    else:
        # Convert numpy array to PIL
        image = Image.fromarray(sample["images"])

    # Apply torchvision transforms
    transformed_image = image_pipeline(image)

    # Process text label
    label = sample["labels"]
    if isinstance(label, str):
        # Convert string label to integer (example mapping)
        label_map = {"cat": 0, "dog": 1, "bird": 2}
        label = label_map.get(label, -1)

    return {
        "image": transformed_image,
        "label": torch.tensor(label, dtype=torch.long)
    }

# Apply custom transform
torch_dataset = dataset.pytorch(transform=custom_transform)

# Use in training
train_loader = DataLoader(torch_dataset, batch_size=16, shuffle=True)

for batch in train_loader:
    images = batch["image"]  # Already a tensor from the transform
    labels = batch["label"]  # Already a tensor from the transform

    print(f"Transformed batch - Images: {images.shape}, Labels: {labels.shape}")
    break
```

### Advanced PyTorch Usage

```python
import numpy as np
import torch
from PIL import Image
from torch.utils.data import DataLoader
from torchvision import transforms

# Multi-modal dataset with complex transforms
def multimodal_transform(sample):
    """Transform for vision-language model training."""

    # Process image
    image = Image.open(sample["image_path"])
    image_transform = transforms.Compose([
        transforms.Resize((224, 224)),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5])
    ])
    image_tensor = image_transform(image)

    # Process text
    text = sample["description"]
    # Tokenize text (example with simple word-level tokenization)
    tokens = text.lower().split()[:50]  # Limit to 50 tokens

    # Convert to embedding indices (simplified per-sample vocab;
    # real pipelines share one vocabulary across the whole dataset)
    vocab = {"<pad>": 0, "<unk>": 1}
    vocab.update({word: i + 2 for i, word in enumerate(set(tokens))})

    token_ids = [vocab.get(token, vocab["<unk>"]) for token in tokens]
    # Pad to fixed length
    token_ids += [vocab["<pad>"]] * (50 - len(token_ids))
    token_ids = token_ids[:50]

    # Process embeddings if available
    embeddings = sample.get("embeddings", np.zeros(768))
    embeddings_tensor = torch.tensor(embeddings, dtype=torch.float32)

    return {
        "image": image_tensor,
        "text_tokens": torch.tensor(token_ids, dtype=torch.long),
        "embeddings": embeddings_tensor,
        "metadata": sample.get("metadata", {})
    }

# Create multimodal dataset
multimodal_dataset = dataset.pytorch(transform=multimodal_transform)

# Custom collate function for variable-length data
def collate_multimodal(batch):
    """Custom collate function for multimodal data."""

    images = torch.stack([item["image"] for item in batch])
    text_tokens = torch.stack([item["text_tokens"] for item in batch])
    embeddings = torch.stack([item["embeddings"] for item in batch])

    # Collect metadata as a plain list (arbitrary Python objects)
    metadata = [item["metadata"] for item in batch]

    return {
        "images": images,
        "text_tokens": text_tokens,
        "embeddings": embeddings,
        "metadata": metadata
    }

# Use with custom collate
multimodal_loader = DataLoader(
    multimodal_dataset,
    batch_size=8,
    shuffle=True,
    collate_fn=collate_multimodal,
    num_workers=2
)

for batch in multimodal_loader:
    print("Multimodal batch:")
    print(f"  Images: {batch['images'].shape}")
    print(f"  Text tokens: {batch['text_tokens'].shape}")
    print(f"  Embeddings: {batch['embeddings'].shape}")
    print(f"  Metadata items: {len(batch['metadata'])}")
    break
```
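
When token counts vary widely, fixed-length padding wastes compute. A sketch of an alternative, assuming the transform returns variable-length 1-D `LongTensor`s instead of the fixed 50-token arrays above: pad per batch at collate time with PyTorch's `pad_sequence` (the `collate_padded_tokens` helper is illustrative, not part of Deep Lake).

```python
from torch.nn.utils.rnn import pad_sequence

def collate_padded_tokens(batch):
    # Pad 1-D LongTensors to the longest sequence in this batch only
    tokens = [item["text_tokens"] for item in batch]
    return pad_sequence(tokens, batch_first=True, padding_value=0)
```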

### TensorFlow Integration

```python
import tensorflow as tf
import deeplake

# Create TensorFlow dataset
dataset = deeplake.open("./ml_dataset")
tf_dataset = dataset.tensorflow()

# Basic usage: inspect the first few (unbatched) samples
for sample in tf_dataset.take(3):
    print(f"TensorFlow sample keys: {list(sample.keys())}")
    for key, value in sample.items():
        print(f"  {key}: {value.shape if hasattr(value, 'shape') else type(value)}")

# Build the label lookup table once, outside the mapped function
# (tf.data map functions should not create stateful resources per call)
label_map = tf.lookup.StaticHashTable(
    tf.lookup.KeyValueTensorInitializer(
        keys=["cat", "dog", "bird"],
        values=[0, 1, 2]
    ),
    default_value=-1
)

# Apply TensorFlow transformations
def preprocess_tf(sample):
    """TensorFlow preprocessing function."""

    # Process image data
    if "images" in sample:
        image = sample["images"]
        # Resize and normalize
        image = tf.image.resize(image, [224, 224])
        image = tf.cast(image, tf.float32) / 255.0
        sample["images"] = image

    # Process labels: convert string labels to integers
    if "labels" in sample:
        sample["labels"] = label_map.lookup(sample["labels"])

    return sample

# Apply preprocessing
processed_tf_dataset = tf_dataset.map(
    preprocess_tf,
    num_parallel_calls=tf.data.AUTOTUNE
)

# Batch and prefetch for training
train_tf_dataset = processed_tf_dataset.batch(32).prefetch(tf.data.AUTOTUNE)

# Use in TensorFlow training
for batch in train_tf_dataset.take(2):
    images = batch["images"]
    labels = batch["labels"]
    print(f"TF Training batch - Images: {images.shape}, Labels: {labels.shape}")
```
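
The batched dataset plugs into Keras directly. A minimal sketch, assuming a compiled `model` (not defined here) whose input matches the image shape above; Keras expects `(features, labels)` tuples rather than a single dict, so the dict batches are mapped accordingly:

```python
# Map dict batches into (features, labels) tuples for model.fit
fit_dataset = train_tf_dataset.map(
    lambda batch: (batch["images"], batch["labels"])
)
# model.fit(fit_dataset, epochs=3)  # `model` is assumed, not defined here
```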

### Advanced TensorFlow Usage

```python
# Complex TensorFlow pipeline with multiple data types
def create_tf_pipeline(dataset_path, batch_size=32):
    """Create optimized TensorFlow data pipeline."""

    dataset = deeplake.open(dataset_path)
    tf_dataset = dataset.tensorflow()

    def process_sample(sample):
        """Process individual sample."""
        processed = {}

        # Handle images
        if "images" in sample:
            image = sample["images"]
            # Decode if the column holds file paths
            if image.dtype == tf.string:
                image = tf.io.read_file(image)
                # expand_animations=False keeps a static rank-3 shape,
                # which tf.image.resize requires
                image = tf.image.decode_image(image, channels=3,
                                              expand_animations=False)

            # Resize and normalize
            image = tf.image.resize(image, [224, 224])
            image = tf.cast(image, tf.float32) / 255.0

            # Data augmentation
            image = tf.image.random_flip_left_right(image)
            image = tf.image.random_brightness(image, 0.1)

            processed["image"] = image

        # Handle embeddings
        if "embeddings" in sample:
            embeddings = tf.cast(sample["embeddings"], tf.float32)
            # L2 normalize embeddings
            embeddings = tf.nn.l2_normalize(embeddings, axis=-1)
            processed["embeddings"] = embeddings

        # Handle text
        if "text" in sample:
            # Simple text passthrough (in practice, use a proper tokenizer)
            processed["text"] = sample["text"]

        # Handle labels
        if "labels" in sample:
            processed["label"] = tf.cast(sample["labels"], tf.int32)

        return processed

    # Apply transformations
    processed_dataset = (tf_dataset
                         .map(process_sample, num_parallel_calls=tf.data.AUTOTUNE)
                         .batch(batch_size)
                         .prefetch(tf.data.AUTOTUNE))

    return processed_dataset

# Create optimized pipeline
tf_pipeline = create_tf_pipeline("./complex_dataset", batch_size=16)

# Use in model training
for epoch in range(2):
    print(f"Epoch {epoch + 1}")
    for step, batch in enumerate(tf_pipeline.take(5)):
        print(f"  Step {step + 1}: {list(batch.keys())}")
        # Here you would pass the batch to your model
        # loss = model.train_step(batch)
```
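
If the decoded samples fit on disk or in memory, caching and shuffling can be added between the map and batch stages. A sketch using standard `tf.data` operators, reusing the `tf_dataset` and `process_sample` names from inside `create_tf_pipeline`:

```python
# Optional pipeline variant: cache decoded samples, then shuffle per epoch
pipeline = (tf_dataset
            .map(process_sample, num_parallel_calls=tf.data.AUTOTUNE)
            .cache()                    # reuse decoded samples across epochs
            .shuffle(buffer_size=1000)  # reshuffles each epoch by default
            .batch(16)
            .prefetch(tf.data.AUTOTUNE))
```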

### Batch Processing Without Framework

```python
# Direct batch processing using Deep Lake's batches method
dataset = deeplake.open("./large_dataset")

# Process data in batches without an ML framework
for batch_data in dataset.batches(batch_size=64):
    # batch_data is a dictionary with column names as keys
    # and lists of values as values

    images = batch_data["images"]
    labels = batch_data["labels"]

    print(f"Processing batch with {len(images)} samples")

    # Custom processing logic
    for i, (image, label) in enumerate(zip(images, labels)):
        # Process individual sample
        # This could be feature extraction, validation, etc.
        pass

    # Break after first batch for demo
    break

# Batch processing with filtering
high_confidence_data = deeplake.query(
    "SELECT * FROM dataset WHERE confidence > 0.9"
)

for batch in high_confidence_data.batches(batch_size=32):
    confidence_scores = batch["confidence"]
    avg_confidence = sum(confidence_scores) / len(confidence_scores)
    print(f"Batch average confidence: {avg_confidence:.3f}")
```
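
Since the query result exposes the same `DatasetView` interface (note the `batches()` call above), a filtered view should feed the framework adapters as well. A brief sketch, assuming `DataLoader` is imported as in the earlier examples:

```python
# Train only on the filtered, high-confidence subset
filtered_torch_dataset = high_confidence_data.pytorch()
filtered_loader = DataLoader(filtered_torch_dataset, batch_size=32, shuffle=True)
```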
402
403
### Distributed Training Setup
404
405
```python
406
# PyTorch distributed training setup
407
import torch.distributed as dist
408
from torch.nn.parallel import DistributedDataParallel as DDP
409
from torch.utils.data.distributed import DistributedSampler
410
411
def setup_distributed_training(dataset_path, world_size, rank):
412
"""Setup distributed training with Deep Lake."""
413
414
# Initialize distributed training
415
dist.init_process_group("nccl", rank=rank, world_size=world_size)
416
417
# Open dataset
418
dataset = deeplake.open(dataset_path)
419
torch_dataset = dataset.pytorch(transform=custom_transform)
420
421
# Create distributed sampler
422
sampler = DistributedSampler(
423
torch_dataset,
424
num_replicas=world_size,
425
rank=rank,
426
shuffle=True
427
)
428
429
# Create distributed data loader
430
train_loader = DataLoader(
431
torch_dataset,
432
batch_size=32,
433
sampler=sampler,
434
num_workers=4,
435
pin_memory=True
436
)
437
438
return train_loader, sampler
439
440
# Usage in distributed training script
441
def train_distributed():
442
"""Distributed training example."""
443
444
world_size = torch.cuda.device_count()
445
rank = int(os.environ.get("LOCAL_RANK", 0))
446
447
train_loader, sampler = setup_distributed_training(
448
"./distributed_dataset", world_size, rank
449
)
450
451
# Training loop with distributed sampler
452
for epoch in range(10):
453
sampler.set_epoch(epoch) # Important for proper shuffling
454
455
for batch_idx, batch in enumerate(train_loader):
456
# Distributed training step
457
# model_output = model(batch)
458
# loss = criterion(model_output, batch["labels"])
459
# loss.backward()
460
# optimizer.step()
461
462
if batch_idx % 100 == 0 and rank == 0:
463
print(f"Epoch {epoch}, Batch {batch_idx}")
464
```
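
To launch, one option is `torchrun`, which sets the `LOCAL_RANK` environment variable read above for each spawned process (the `train.py` script name is hypothetical):

```python
# Entry point for: torchrun --nproc_per_node=<num_gpus> train.py
if __name__ == "__main__":
    train_distributed()
```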

### Performance Optimization

```python
# Optimized data loading for high-performance training
import numpy as np
import torch
from PIL import Image
from torch.utils.data import DataLoader

import deeplake

class OptimizedDataLoader:
    """Optimized data loader for Deep Lake datasets."""

    def __init__(self, dataset_path, batch_size=32, num_workers=4):
        self.dataset = deeplake.open(dataset_path)
        self.batch_size = batch_size
        self.num_workers = num_workers

        # Pre-compute dataset length
        self.length = len(self.dataset)

        # Create optimized transforms
        self.transform = self._create_optimized_transform()

    def _create_optimized_transform(self):
        """Create optimized transform pipeline."""
        def fast_transform(sample):
            # Minimize data copying and conversion
            result = {}

            # Efficient image processing
            if "images" in sample:
                image = sample["images"]
                # Load from disk if the column stores file paths
                if isinstance(image, str):
                    image = Image.open(image)

                # Fast tensor conversion (assumes HWC channel layout)
                image_array = np.array(image)
                result["image"] = torch.from_numpy(image_array).permute(2, 0, 1).float() / 255.0

            # Efficient label processing
            if "labels" in sample:
                result["label"] = torch.tensor(sample["labels"], dtype=torch.long)

            return result

        return fast_transform

    def get_dataloader(self):
        """Get optimized PyTorch DataLoader."""
        torch_dataset = self.dataset.pytorch(transform=self.transform)

        return DataLoader(
            torch_dataset,
            batch_size=self.batch_size,
            shuffle=True,
            num_workers=self.num_workers,
            pin_memory=True,
            prefetch_factor=2,
            persistent_workers=True
        )

# Usage
optimized_loader = OptimizedDataLoader("./high_perf_dataset", batch_size=64, num_workers=8)
train_loader = optimized_loader.get_dataloader()

# Measure performance
import time

start_time = time.time()
total_samples = 0

for batch_idx, batch in enumerate(train_loader):
    total_samples += batch["image"].shape[0]

    if batch_idx >= 100:  # Test the first 100 batches
        break

end_time = time.time()
throughput = total_samples / (end_time - start_time)
print(f"Data loading throughput: {throughput:.1f} samples/second")
```
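
The best worker count is workload- and hardware-dependent, so a short empirical sweep (a sketch reusing the class and timing pattern above) is often the quickest way to pick it:

```python
# Sweep num_workers empirically; higher is not always better
for workers in (2, 4, 8, 16):
    loader = OptimizedDataLoader(
        "./high_perf_dataset", batch_size=64, num_workers=workers
    ).get_dataloader()

    start = time.time()
    samples = 0
    for batch_idx, batch in enumerate(loader):
        samples += batch["image"].shape[0]
        if batch_idx >= 20:  # short timing window per setting
            break
    print(f"num_workers={workers}: {samples / (time.time() - start):.1f} samples/s")
```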

### Integration with Popular Libraries

```python
# Integration with Hugging Face datasets
from datasets import Dataset as HFDataset

def convert_to_huggingface(deeplake_dataset):
    """Convert Deep Lake dataset to Hugging Face format."""

    # Export data column by column
    data_dict = {"text": [], "labels": []}

    for sample in deeplake_dataset:
        data_dict["text"].append(sample["text"])
        data_dict["labels"].append(sample["labels"])

    # Create Hugging Face dataset
    hf_dataset = HFDataset.from_dict(data_dict)
    return hf_dataset

# Integration with PyTorch Lightning
import pytorch_lightning as pl

class DeepLakeDataModule(pl.LightningDataModule):
    """PyTorch Lightning DataModule for Deep Lake."""

    def __init__(self, train_path, val_path, batch_size=32):
        super().__init__()
        self.train_path = train_path
        self.val_path = val_path
        self.batch_size = batch_size

    def setup(self, stage=None):
        """Setup datasets (custom_transform is defined in the earlier example)."""
        if stage == "fit" or stage is None:
            self.train_dataset = deeplake.open(self.train_path).pytorch(
                transform=custom_transform
            )
            self.val_dataset = deeplake.open(self.val_path).pytorch(
                transform=custom_transform
            )

    def train_dataloader(self):
        return DataLoader(self.train_dataset, batch_size=self.batch_size, shuffle=True)

    def val_dataloader(self):
        return DataLoader(self.val_dataset, batch_size=self.batch_size)

# Usage with Lightning
datamodule = DeepLakeDataModule("./train_dataset", "./val_dataset", batch_size=32)

# Use with a Lightning Trainer
# trainer = pl.Trainer()
# trainer.fit(model, datamodule)
```