# Framework Integration

Deep Lake integrates seamlessly with PyTorch and TensorFlow for training and inference workflows, with optimized data loading, transformation pipelines, and batch processing. Native framework adapters make ML model training efficient.

## Capabilities

### PyTorch Integration

Native PyTorch Dataset integration with support for custom transforms, data loading, and distributed training.

```python { .api }
class DatasetView:
    """PyTorch integration for dataset views."""

    def pytorch(self, transform: Optional[Callable[[Any], Any]] = None) -> Any:
        """
        Create PyTorch Dataset from Deep Lake dataset.

        Parameters:
        - transform: Optional transform function to apply to samples

        Returns:
            TorchDataset: PyTorch-compatible dataset object
        """
```

### TensorFlow Integration

Native TensorFlow Dataset integration with optimized data pipelines and GPU acceleration support.

```python { .api }
class DatasetView:
    """TensorFlow integration for dataset views."""

    def tensorflow(self) -> Any:
        """
        Create TensorFlow Dataset from Deep Lake dataset.

        Returns:
            tf.data.Dataset: TensorFlow-compatible dataset object
        """
```

### Batch Processing

Efficient batch iteration with customizable batch sizes and data loading strategies.

```python { .api }
class DatasetView:
    """Batch processing capabilities."""

    def batches(self, batch_size: int = 1) -> Iterator[Dict[str, Any]]:
        """
        Iterate over dataset in batches.

        Parameters:
        - batch_size: Number of samples per batch

        Returns:
            Iterator[Dict[str, Any]]: Iterator yielding batches as dictionaries
        """
```
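
For quick inspection, a minimal sketch of the iterator (assuming the same `./ml_dataset` path used in the examples below); each yielded batch maps column names to lists of values:

```python
import deeplake

ds = deeplake.open("./ml_dataset")
for batch in ds.batches(batch_size=128):
    # Each batch is a dict: column name -> list of values
    print({column: len(values) for column, values in batch.items()})
    break
```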

## Usage Examples

### Basic PyTorch Integration

```python
import deeplake
import torch
from torch.utils.data import DataLoader

# Open dataset
dataset = deeplake.open("./ml_dataset")

# Create PyTorch dataset
torch_dataset = dataset.pytorch()

# Use with PyTorch DataLoader
train_loader = DataLoader(
    torch_dataset,
    batch_size=32,
    shuffle=True,
    num_workers=4
)

# Training loop
for batch_idx, batch in enumerate(train_loader):
    images = batch["images"]
    labels = batch["labels"]

    # Convert to tensors if needed (the default collate_fn already stacks
    # same-shape NumPy arrays; lists appear for ragged or path-valued columns)
    if isinstance(images, list):
        images = torch.stack([torch.tensor(img) for img in images])
    if isinstance(labels, list):
        labels = torch.tensor(labels)

    print(f"Batch {batch_idx}: images shape {images.shape}, labels shape {labels.shape}")

    if batch_idx >= 2:  # Just show first few batches
        break
```

### PyTorch with Custom Transforms

```python
import torch
import torchvision.transforms as transforms
from PIL import Image
from torch.utils.data import DataLoader

# Define the torchvision pipeline once, not per sample
image_pipeline = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406],
                         std=[0.229, 0.224, 0.225])
])

def custom_transform(sample):
    """Custom transform for image-text pairs."""

    # Load and transform image
    if isinstance(sample["images"], str):
        # Load image from path
        image = Image.open(sample["images"])
    else:
        # Convert numpy array to PIL
        image = Image.fromarray(sample["images"])

    # Apply torchvision transforms
    transformed_image = image_pipeline(image)

    # Process text label
    label = sample["labels"]
    if isinstance(label, str):
        # Convert string label to integer (example mapping)
        label_map = {"cat": 0, "dog": 1, "bird": 2}
        label = label_map.get(label, -1)

    return {
        "image": transformed_image,
        "label": torch.tensor(label, dtype=torch.long)
    }

# Apply custom transform
torch_dataset = dataset.pytorch(transform=custom_transform)

# Use in training
train_loader = DataLoader(torch_dataset, batch_size=16, shuffle=True)

for batch in train_loader:
    images = batch["image"]  # Already a tensor from the transform
    labels = batch["label"]  # Already a tensor from the transform

    print(f"Transformed batch - Images: {images.shape}, Labels: {labels.shape}")
    break
```

### Advanced PyTorch Usage

```python
import numpy as np
import torch
from PIL import Image
from torch.utils.data import DataLoader
from torchvision import transforms

# Multi-modal dataset with complex transforms
def multimodal_transform(sample):
    """Transform for vision-language model training."""

    # Process image
    image = Image.open(sample["image_path"])
    image_transform = transforms.Compose([
        transforms.Resize((224, 224)),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5])
    ])
    image_tensor = image_transform(image)

    # Process text
    text = sample["description"]
    # Tokenize text (example with simple word-level tokenization)
    tokens = text.lower().split()[:50]  # Limit to 50 tokens

    # Convert to embedding indices (simplified per-sample vocab;
    # real pipelines share one vocabulary across the whole dataset)
    vocab = {"<pad>": 0, "<unk>": 1}
    vocab.update({word: i + 2 for i, word in enumerate(set(tokens))})

    token_ids = [vocab.get(token, vocab["<unk>"]) for token in tokens]
    # Pad to fixed length
    token_ids += [vocab["<pad>"]] * (50 - len(token_ids))
    token_ids = token_ids[:50]

    # Process embeddings if available
    embeddings = sample.get("embeddings", np.zeros(768))
    embeddings_tensor = torch.tensor(embeddings, dtype=torch.float32)

    return {
        "image": image_tensor,
        "text_tokens": torch.tensor(token_ids, dtype=torch.long),
        "embeddings": embeddings_tensor,
        "metadata": sample.get("metadata", {})
    }

# Create multimodal dataset
multimodal_dataset = dataset.pytorch(transform=multimodal_transform)

# Custom collate function for variable-length data
def collate_multimodal(batch):
    """Custom collate function for multimodal data."""

    images = torch.stack([item["image"] for item in batch])
    text_tokens = torch.stack([item["text_tokens"] for item in batch])
    embeddings = torch.stack([item["embeddings"] for item in batch])

    # Collect metadata as a plain list (arbitrary Python objects)
    metadata = [item["metadata"] for item in batch]

    return {
        "images": images,
        "text_tokens": text_tokens,
        "embeddings": embeddings,
        "metadata": metadata
    }

# Use with custom collate
multimodal_loader = DataLoader(
    multimodal_dataset,
    batch_size=8,
    shuffle=True,
    collate_fn=collate_multimodal,
    num_workers=2
)

for batch in multimodal_loader:
    print("Multimodal batch:")
    print(f"  Images: {batch['images'].shape}")
    print(f"  Text tokens: {batch['text_tokens'].shape}")
    print(f"  Embeddings: {batch['embeddings'].shape}")
    print(f"  Metadata items: {len(batch['metadata'])}")
    break
```
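
When token counts vary widely, fixed-length padding wastes compute. A sketch of an alternative, assuming the transform returns variable-length 1-D `LongTensor`s instead of the fixed 50-token arrays above: pad per batch at collate time with PyTorch's `pad_sequence` (the `collate_padded_tokens` helper is illustrative, not part of Deep Lake).

```python
from torch.nn.utils.rnn import pad_sequence

def collate_padded_tokens(batch):
    # Pad 1-D LongTensors to the longest sequence in this batch only
    tokens = [item["text_tokens"] for item in batch]
    return pad_sequence(tokens, batch_first=True, padding_value=0)
```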

### TensorFlow Integration

```python
import tensorflow as tf
import deeplake

# Create TensorFlow dataset
dataset = deeplake.open("./ml_dataset")
tf_dataset = dataset.tensorflow()

# Basic usage: inspect the first few (unbatched) samples
for sample in tf_dataset.take(3):
    print(f"TensorFlow sample keys: {list(sample.keys())}")
    for key, value in sample.items():
        print(f"  {key}: {value.shape if hasattr(value, 'shape') else type(value)}")

# Build the label lookup table once, outside the mapped function
# (tf.data map functions should not create stateful resources per call)
label_map = tf.lookup.StaticHashTable(
    tf.lookup.KeyValueTensorInitializer(
        keys=["cat", "dog", "bird"],
        values=[0, 1, 2]
    ),
    default_value=-1
)

# Apply TensorFlow transformations
def preprocess_tf(sample):
    """TensorFlow preprocessing function."""

    # Process image data
    if "images" in sample:
        image = sample["images"]
        # Resize and normalize
        image = tf.image.resize(image, [224, 224])
        image = tf.cast(image, tf.float32) / 255.0
        sample["images"] = image

    # Process labels: convert string labels to integers
    if "labels" in sample:
        sample["labels"] = label_map.lookup(sample["labels"])

    return sample

# Apply preprocessing
processed_tf_dataset = tf_dataset.map(
    preprocess_tf,
    num_parallel_calls=tf.data.AUTOTUNE
)

# Batch and prefetch for training
train_tf_dataset = processed_tf_dataset.batch(32).prefetch(tf.data.AUTOTUNE)

# Use in TensorFlow training
for batch in train_tf_dataset.take(2):
    images = batch["images"]
    labels = batch["labels"]
    print(f"TF Training batch - Images: {images.shape}, Labels: {labels.shape}")
```
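
The batched dataset plugs into Keras directly. A minimal sketch, assuming a compiled `model` (not defined here) whose input matches the image shape above; Keras expects `(features, labels)` tuples rather than a single dict, so the dict batches are mapped accordingly:

```python
# Map dict batches into (features, labels) tuples for model.fit
fit_dataset = train_tf_dataset.map(
    lambda batch: (batch["images"], batch["labels"])
)
# model.fit(fit_dataset, epochs=3)  # `model` is assumed, not defined here
```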

### Advanced TensorFlow Usage

```python
# Complex TensorFlow pipeline with multiple data types
def create_tf_pipeline(dataset_path, batch_size=32):
    """Create optimized TensorFlow data pipeline."""

    dataset = deeplake.open(dataset_path)
    tf_dataset = dataset.tensorflow()

    def process_sample(sample):
        """Process individual sample."""
        processed = {}

        # Handle images
        if "images" in sample:
            image = sample["images"]
            # Decode if the column holds file paths
            if image.dtype == tf.string:
                image = tf.io.read_file(image)
                # expand_animations=False keeps a static rank-3 shape,
                # which tf.image.resize requires
                image = tf.image.decode_image(image, channels=3,
                                              expand_animations=False)

            # Resize and normalize
            image = tf.image.resize(image, [224, 224])
            image = tf.cast(image, tf.float32) / 255.0

            # Data augmentation
            image = tf.image.random_flip_left_right(image)
            image = tf.image.random_brightness(image, 0.1)

            processed["image"] = image

        # Handle embeddings
        if "embeddings" in sample:
            embeddings = tf.cast(sample["embeddings"], tf.float32)
            # L2 normalize embeddings
            embeddings = tf.nn.l2_normalize(embeddings, axis=-1)
            processed["embeddings"] = embeddings

        # Handle text
        if "text" in sample:
            # Simple text passthrough (in practice, use a proper tokenizer)
            processed["text"] = sample["text"]

        # Handle labels
        if "labels" in sample:
            processed["label"] = tf.cast(sample["labels"], tf.int32)

        return processed

    # Apply transformations
    processed_dataset = (tf_dataset
                         .map(process_sample, num_parallel_calls=tf.data.AUTOTUNE)
                         .batch(batch_size)
                         .prefetch(tf.data.AUTOTUNE))

    return processed_dataset

# Create optimized pipeline
tf_pipeline = create_tf_pipeline("./complex_dataset", batch_size=16)

# Use in model training
for epoch in range(2):
    print(f"Epoch {epoch + 1}")
    for step, batch in enumerate(tf_pipeline.take(5)):
        print(f"  Step {step + 1}: {list(batch.keys())}")
        # Here you would pass the batch to your model
        # loss = model.train_step(batch)
```
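
If the decoded samples fit on disk or in memory, caching and shuffling can be added between the map and batch stages. A sketch using standard `tf.data` operators, reusing the `tf_dataset` and `process_sample` names from inside `create_tf_pipeline`:

```python
# Optional pipeline variant: cache decoded samples, then shuffle per epoch
pipeline = (tf_dataset
            .map(process_sample, num_parallel_calls=tf.data.AUTOTUNE)
            .cache()                    # reuse decoded samples across epochs
            .shuffle(buffer_size=1000)  # reshuffles each epoch by default
            .batch(16)
            .prefetch(tf.data.AUTOTUNE))
```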

### Batch Processing Without Framework

```python
# Direct batch processing using Deep Lake's batches method
dataset = deeplake.open("./large_dataset")

# Process data in batches without an ML framework
for batch_data in dataset.batches(batch_size=64):
    # batch_data is a dictionary with column names as keys
    # and lists of values as values

    images = batch_data["images"]
    labels = batch_data["labels"]

    print(f"Processing batch with {len(images)} samples")

    # Custom processing logic
    for i, (image, label) in enumerate(zip(images, labels)):
        # Process individual sample
        # This could be feature extraction, validation, etc.
        pass

    # Break after first batch for demo
    break

# Batch processing with filtering
high_confidence_data = deeplake.query(
    "SELECT * FROM dataset WHERE confidence > 0.9"
)

for batch in high_confidence_data.batches(batch_size=32):
    confidence_scores = batch["confidence"]
    avg_confidence = sum(confidence_scores) / len(confidence_scores)
    print(f"Batch average confidence: {avg_confidence:.3f}")
```
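
Since the query result exposes the same `DatasetView` interface (note the `batches()` call above), a filtered view should feed the framework adapters as well. A brief sketch, assuming `DataLoader` is imported as in the earlier examples:

```python
# Train only on the filtered, high-confidence subset
filtered_torch_dataset = high_confidence_data.pytorch()
filtered_loader = DataLoader(filtered_torch_dataset, batch_size=32, shuffle=True)
```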
402
403
### Distributed Training Setup
404
405
```python
406
# PyTorch distributed training setup
407
import torch.distributed as dist
408
from torch.nn.parallel import DistributedDataParallel as DDP
409
from torch.utils.data.distributed import DistributedSampler
410
411
def setup_distributed_training(dataset_path, world_size, rank):
412
"""Setup distributed training with Deep Lake."""
413
414
# Initialize distributed training
415
dist.init_process_group("nccl", rank=rank, world_size=world_size)
416
417
# Open dataset
418
dataset = deeplake.open(dataset_path)
419
torch_dataset = dataset.pytorch(transform=custom_transform)
420
421
# Create distributed sampler
422
sampler = DistributedSampler(
423
torch_dataset,
424
num_replicas=world_size,
425
rank=rank,
426
shuffle=True
427
)
428
429
# Create distributed data loader
430
train_loader = DataLoader(
431
torch_dataset,
432
batch_size=32,
433
sampler=sampler,
434
num_workers=4,
435
pin_memory=True
436
)
437
438
return train_loader, sampler
439
440
# Usage in distributed training script
441
def train_distributed():
442
"""Distributed training example."""
443
444
world_size = torch.cuda.device_count()
445
rank = int(os.environ.get("LOCAL_RANK", 0))
446
447
train_loader, sampler = setup_distributed_training(
448
"./distributed_dataset", world_size, rank
449
)
450
451
# Training loop with distributed sampler
452
for epoch in range(10):
453
sampler.set_epoch(epoch) # Important for proper shuffling
454
455
for batch_idx, batch in enumerate(train_loader):
456
# Distributed training step
457
# model_output = model(batch)
458
# loss = criterion(model_output, batch["labels"])
459
# loss.backward()
460
# optimizer.step()
461
462
if batch_idx % 100 == 0 and rank == 0:
463
print(f"Epoch {epoch}, Batch {batch_idx}")
464
```
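
To launch, one option is `torchrun`, which sets the `LOCAL_RANK` environment variable read above for each spawned process (the `train.py` script name is hypothetical):

```python
# Entry point for: torchrun --nproc_per_node=<num_gpus> train.py
if __name__ == "__main__":
    train_distributed()
```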

### Performance Optimization

```python
# Optimized data loading for high-performance training
import numpy as np
import torch
from PIL import Image
from torch.utils.data import DataLoader

import deeplake

class OptimizedDataLoader:
    """Optimized data loader for Deep Lake datasets."""

    def __init__(self, dataset_path, batch_size=32, num_workers=4):
        self.dataset = deeplake.open(dataset_path)
        self.batch_size = batch_size
        self.num_workers = num_workers

        # Pre-compute dataset length
        self.length = len(self.dataset)

        # Create optimized transforms
        self.transform = self._create_optimized_transform()

    def _create_optimized_transform(self):
        """Create optimized transform pipeline."""
        def fast_transform(sample):
            # Minimize data copying and conversion
            result = {}

            # Efficient image processing
            if "images" in sample:
                image = sample["images"]
                # Load from disk if the column stores file paths
                if isinstance(image, str):
                    image = Image.open(image)

                # Fast tensor conversion (assumes HWC channel layout)
                image_array = np.array(image)
                result["image"] = torch.from_numpy(image_array).permute(2, 0, 1).float() / 255.0

            # Efficient label processing
            if "labels" in sample:
                result["label"] = torch.tensor(sample["labels"], dtype=torch.long)

            return result

        return fast_transform

    def get_dataloader(self):
        """Get optimized PyTorch DataLoader."""
        torch_dataset = self.dataset.pytorch(transform=self.transform)

        return DataLoader(
            torch_dataset,
            batch_size=self.batch_size,
            shuffle=True,
            num_workers=self.num_workers,
            pin_memory=True,
            prefetch_factor=2,
            persistent_workers=True
        )

# Usage
optimized_loader = OptimizedDataLoader("./high_perf_dataset", batch_size=64, num_workers=8)
train_loader = optimized_loader.get_dataloader()

# Measure performance
import time

start_time = time.time()
total_samples = 0

for batch_idx, batch in enumerate(train_loader):
    total_samples += batch["image"].shape[0]

    if batch_idx >= 100:  # Test the first 100 batches
        break

end_time = time.time()
throughput = total_samples / (end_time - start_time)
print(f"Data loading throughput: {throughput:.1f} samples/second")
```
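
The best worker count is workload- and hardware-dependent, so a short empirical sweep (a sketch reusing the class and timing pattern above) is often the quickest way to pick it:

```python
# Sweep num_workers empirically; higher is not always better
for workers in (2, 4, 8, 16):
    loader = OptimizedDataLoader(
        "./high_perf_dataset", batch_size=64, num_workers=workers
    ).get_dataloader()

    start = time.time()
    samples = 0
    for batch_idx, batch in enumerate(loader):
        samples += batch["image"].shape[0]
        if batch_idx >= 20:  # short timing window per setting
            break
    print(f"num_workers={workers}: {samples / (time.time() - start):.1f} samples/s")
```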

### Integration with Popular Libraries

```python
# Integration with Hugging Face datasets
from datasets import Dataset as HFDataset

def convert_to_huggingface(deeplake_dataset):
    """Convert Deep Lake dataset to Hugging Face format."""

    # Export data column by column
    data_dict = {"text": [], "labels": []}

    for sample in deeplake_dataset:
        data_dict["text"].append(sample["text"])
        data_dict["labels"].append(sample["labels"])

    # Create Hugging Face dataset
    hf_dataset = HFDataset.from_dict(data_dict)
    return hf_dataset

# Integration with PyTorch Lightning
import pytorch_lightning as pl

class DeepLakeDataModule(pl.LightningDataModule):
    """PyTorch Lightning DataModule for Deep Lake."""

    def __init__(self, train_path, val_path, batch_size=32):
        super().__init__()
        self.train_path = train_path
        self.val_path = val_path
        self.batch_size = batch_size

    def setup(self, stage=None):
        """Setup datasets (custom_transform is defined in the earlier example)."""
        if stage == "fit" or stage is None:
            self.train_dataset = deeplake.open(self.train_path).pytorch(
                transform=custom_transform
            )
            self.val_dataset = deeplake.open(self.val_path).pytorch(
                transform=custom_transform
            )

    def train_dataloader(self):
        return DataLoader(self.train_dataset, batch_size=self.batch_size, shuffle=True)

    def val_dataloader(self):
        return DataLoader(self.val_dataset, batch_size=self.batch_size)

# Usage with Lightning
datamodule = DeepLakeDataModule("./train_dataset", "./val_dataset", batch_size=32)

# Use with a Lightning Trainer
# trainer = pl.Trainer()
# trainer.fit(model, datamodule)
```