# Framework Integration

Deep Lake integrates with PyTorch and TensorFlow for training and inference workflows, providing optimized data loading, transformation pipelines, and batch processing. Native framework adapters make the same dataset usable for efficient ML model training in either framework.
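
The sections below describe three entry points: `pytorch()`, `tensorflow()`, and `batches()`. As a quick orientation, here is a minimal sketch of all three; the dataset path is a placeholder and assumes a dataset already exists at that location.

```python
import deeplake

ds = deeplake.open("./ml_dataset")  # placeholder path

torch_ds = ds.pytorch()      # PyTorch-compatible dataset for DataLoader
tf_ds = ds.tensorflow()      # tf.data.Dataset for TensorFlow input pipelines

for batch in ds.batches(batch_size=32):  # framework-free dict batches
    print(list(batch.keys()))
    break
```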

## Capabilities

### PyTorch Integration

Native PyTorch Dataset integration with support for custom transforms, data loading, and distributed training.

```python { .api }
class DatasetView:
    """PyTorch integration for dataset views."""

    def pytorch(self, transform: Optional[Callable[[Any], Any]] = None) -> Any:
        """
        Create PyTorch Dataset from Deep Lake dataset.

        Parameters:
        - transform: Optional transform function to apply to samples

        Returns:
            TorchDataset: PyTorch-compatible dataset object
        """
```
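
A minimal access sketch, assuming the object returned by `pytorch()` behaves like a map-style PyTorch dataset (supporting `len()` and integer indexing) and that samples are dictionaries keyed by column name; the path is a placeholder:

```python
import deeplake

dataset = deeplake.open("./ml_dataset")  # placeholder path
torch_dataset = dataset.pytorch()

print(len(torch_dataset))     # number of samples (assumes map-style dataset)
sample = torch_dataset[0]     # a single raw sample
print(list(sample.keys()))    # column names, e.g. "images", "labels"
```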

### TensorFlow Integration

Native TensorFlow Dataset integration with optimized data pipelines and GPU acceleration support.

```python { .api }
class DatasetView:
    """TensorFlow integration for dataset views."""

    def tensorflow(self) -> Any:
        """
        Create TensorFlow Dataset from Deep Lake dataset.

        Returns:
            tf.data.Dataset: TensorFlow-compatible dataset object
        """
```
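
Because `tensorflow()` returns a standard `tf.data.Dataset`, its element structure can be inspected before building a pipeline. A short sketch (the path is a placeholder):

```python
import deeplake
import tensorflow as tf

dataset = deeplake.open("./ml_dataset")  # placeholder path
tf_dataset = dataset.tensorflow()

# element_spec reports the dtype and shape of each column in an element
print(tf_dataset.element_spec)
```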

### Batch Processing

Efficient batch iteration with customizable batch sizes and data loading strategies.

```python { .api }
class DatasetView:
    """Batch processing capabilities."""

    def batches(self, batch_size: int = 1) -> Iterator[Dict[str, Any]]:
        """
        Iterate over dataset in batches.

        Parameters:
        - batch_size: Number of samples per batch

        Returns:
            Iterator[Dict[str, Any]]: Iterator yielding batches as dictionaries
        """
```
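
Since each batch is a plain dictionary of column values, `batches()` also works for lightweight, framework-free processing such as quick statistics. A sketch assuming a `labels` column with hashable values (path and column name are placeholders):

```python
from collections import Counter

import deeplake

dataset = deeplake.open("./ml_dataset")  # placeholder path

label_counts = Counter()
for batch in dataset.batches(batch_size=256):
    label_counts.update(batch["labels"])  # count label frequencies batch by batch

print(label_counts.most_common(5))
```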

## Usage Examples

### Basic PyTorch Integration

```python
import deeplake
import torch
from torch.utils.data import DataLoader

# Open dataset
dataset = deeplake.open("./ml_dataset")

# Create PyTorch dataset
torch_dataset = dataset.pytorch()

# Use with PyTorch DataLoader
train_loader = DataLoader(
    torch_dataset,
    batch_size=32,
    shuffle=True,
    num_workers=4
)

# Training loop
for batch_idx, batch in enumerate(train_loader):
    images = batch["images"]
    labels = batch["labels"]

    # Convert to tensors if needed
    if isinstance(images, list):
        images = torch.stack([torch.tensor(img) for img in images])
    if isinstance(labels, list):
        labels = torch.tensor(labels)

    print(f"Batch {batch_idx}: images shape {images.shape}, labels shape {labels.shape}")

    if batch_idx >= 2:  # Just show first few batches
        break
```
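
To go one step beyond printing shapes, the same loader can drive an actual optimization step. A minimal sketch with a toy model; the column names, label format (integer class indices), and class count are assumptions carried over from the example above:

```python
import torch
import torch.nn as nn

# Toy classifier: flatten each image and apply one linear layer.
# LazyLinear infers its input size from the first batch it sees.
num_classes = 3  # assumed number of classes
model = nn.Sequential(nn.Flatten(), nn.LazyLinear(num_classes))
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
criterion = nn.CrossEntropyLoss()

for batch_idx, batch in enumerate(train_loader):
    images, labels = batch["images"], batch["labels"]

    # Same conversion as above, since samples may arrive as lists
    if isinstance(images, list):
        images = torch.stack([torch.as_tensor(img) for img in images])
    if isinstance(labels, list):
        labels = torch.as_tensor(labels)

    optimizer.zero_grad()
    logits = model(images.float())
    loss = criterion(logits, labels.long())
    loss.backward()
    optimizer.step()

    if batch_idx >= 2:  # a few illustrative steps only
        break
```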

### PyTorch with Custom Transforms

```python
import torchvision.transforms as transforms
from PIL import Image
import numpy as np

# Continues from the basic example above:
# torch, DataLoader, and dataset are already in scope.

# Define custom transform pipeline
def custom_transform(sample):
    """Custom transform for image-text pairs."""

    # Load and transform image
    if isinstance(sample["images"], str):
        # Load image from path
        image = Image.open(sample["images"])
    else:
        # Convert numpy array to PIL
        image = Image.fromarray(sample["images"])

    # Apply torchvision transforms
    transform = transforms.Compose([
        transforms.Resize((224, 224)),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406],
                             std=[0.229, 0.224, 0.225])
    ])

    transformed_image = transform(image)

    # Process text label
    label = sample["labels"]
    if isinstance(label, str):
        # Convert string label to integer (example mapping)
        label_map = {"cat": 0, "dog": 1, "bird": 2}
        label = label_map.get(label, -1)

    return {
        "image": transformed_image,
        "label": torch.tensor(label, dtype=torch.long)
    }

# Apply custom transform
torch_dataset = dataset.pytorch(transform=custom_transform)

# Use in training
train_loader = DataLoader(torch_dataset, batch_size=16, shuffle=True)

for batch in train_loader:
    images = batch["image"]  # Already tensor from transform
    labels = batch["label"]  # Already tensor from transform

    print(f"Transformed batch - Images: {images.shape}, Labels: {labels.shape}")
    break
```

### Advanced PyTorch Usage

```python
# Multi-modal dataset with complex transforms
# (continues from the examples above: torch, torchvision transforms,
#  PIL Image, numpy, DataLoader, and dataset are already in scope)
def multimodal_transform(sample):
    """Transform for vision-language model training."""

    # Process image
    image = Image.open(sample["image_path"])
    image_transform = transforms.Compose([
        transforms.Resize((224, 224)),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5])
    ])
    image_tensor = image_transform(image)

    # Process text
    text = sample["description"]
    # Tokenize text (example with simple word-level tokenization)
    tokens = text.lower().split()[:50]  # Limit to 50 tokens

    # Convert to embedding indices (simplified)
    vocab = {"<pad>": 0, "<unk>": 1}  # Simplified vocab
    vocab.update({word: i + 2 for i, word in enumerate(set(tokens))})

    token_ids = [vocab.get(token, vocab["<unk>"]) for token in tokens]
    # Pad to fixed length
    token_ids += [vocab["<pad>"]] * (50 - len(token_ids))
    token_ids = token_ids[:50]

    # Process embeddings if available
    embeddings = sample.get("embeddings", np.zeros(768))
    embeddings_tensor = torch.tensor(embeddings, dtype=torch.float32)

    return {
        "image": image_tensor,
        "text_tokens": torch.tensor(token_ids, dtype=torch.long),
        "embeddings": embeddings_tensor,
        "metadata": sample.get("metadata", {})
    }

# Create multimodal dataset
multimodal_dataset = dataset.pytorch(transform=multimodal_transform)

# Custom collate function for variable-length data
def collate_multimodal(batch):
    """Custom collate function for multimodal data."""

    images = torch.stack([item["image"] for item in batch])
    text_tokens = torch.stack([item["text_tokens"] for item in batch])
    embeddings = torch.stack([item["embeddings"] for item in batch])

    # Collect metadata
    metadata = [item["metadata"] for item in batch]

    return {
        "images": images,
        "text_tokens": text_tokens,
        "embeddings": embeddings,
        "metadata": metadata
    }

# Use with custom collate
multimodal_loader = DataLoader(
    multimodal_dataset,
    batch_size=8,
    shuffle=True,
    collate_fn=collate_multimodal,
    num_workers=2
)

for batch in multimodal_loader:
    print("Multimodal batch:")
    print(f"  Images: {batch['images'].shape}")
    print(f"  Text tokens: {batch['text_tokens'].shape}")
    print(f"  Embeddings: {batch['embeddings'].shape}")
    print(f"  Metadata items: {len(batch['metadata'])}")
    break
```

### TensorFlow Integration

```python
import tensorflow as tf
import deeplake

# Create TensorFlow dataset
dataset = deeplake.open("./ml_dataset")
tf_dataset = dataset.tensorflow()

# Basic usage with TensorFlow
for sample in tf_dataset.take(3):
    print(f"TensorFlow sample keys: {list(sample.keys())}")
    for key, value in sample.items():
        print(f"  {key}: {value.shape if hasattr(value, 'shape') else type(value)}")

# Lookup table for string labels, created once outside the map function
# (table resources should not be created inside Dataset.map)
label_map = tf.lookup.StaticHashTable(
    tf.lookup.KeyValueTensorInitializer(
        keys=["cat", "dog", "bird"],
        values=[0, 1, 2]
    ),
    default_value=-1
)

# Apply TensorFlow transformations
def preprocess_tf(sample):
    """TensorFlow preprocessing function."""

    # Process image data
    if "images" in sample:
        image = sample["images"]
        # Resize and normalize
        image = tf.image.resize(image, [224, 224])
        image = tf.cast(image, tf.float32) / 255.0
        sample["images"] = image

    # Process labels
    if "labels" in sample:
        # Convert string labels to integers
        sample["labels"] = label_map.lookup(sample["labels"])

    return sample

# Apply preprocessing
processed_tf_dataset = tf_dataset.map(
    preprocess_tf,
    num_parallel_calls=tf.data.AUTOTUNE
)

# Batch and prefetch for training
train_tf_dataset = processed_tf_dataset.batch(32).prefetch(tf.data.AUTOTUNE)

# Use in TensorFlow training
for batch in train_tf_dataset.take(2):
    images = batch["images"]
    labels = batch["labels"]
    print(f"TF Training batch - Images: {images.shape}, Labels: {labels.shape}")
```
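
If the pipeline should feed a Keras model, `model.fit` expects `(features, labels)` tuples rather than dictionaries. A hedged sketch that remaps the batched elements, reusing the `images`/`labels` columns from the example above; the model architecture is purely illustrative:

```python
# Remap dict elements to (features, labels) tuples for Keras
keras_dataset = train_tf_dataset.map(
    lambda batch: (batch["images"], batch["labels"]),
    num_parallel_calls=tf.data.AUTOTUNE
)

# Illustrative model; input size and class count are assumptions
model = tf.keras.Sequential([
    tf.keras.layers.Input(shape=(224, 224, 3)),
    tf.keras.layers.GlobalAveragePooling2D(),
    tf.keras.layers.Dense(3, activation="softmax"),
])
model.compile(optimizer="adam", loss="sparse_categorical_crossentropy")

# model.fit(keras_dataset, epochs=1)
```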

### Advanced TensorFlow Usage

```python
# Complex TensorFlow pipeline with multiple data types
def create_tf_pipeline(dataset_path, batch_size=32):
    """Create optimized TensorFlow data pipeline."""

    dataset = deeplake.open(dataset_path)
    tf_dataset = dataset.tensorflow()

    def process_sample(sample):
        """Process individual sample."""
        processed = {}

        # Handle images
        if "images" in sample:
            image = sample["images"]
            # Decode if string path
            if image.dtype == tf.string:
                image = tf.io.read_file(image)
                # expand_animations=False keeps a rank-3 tensor so resize works
                image = tf.image.decode_image(image, channels=3, expand_animations=False)

            # Resize and normalize
            image = tf.image.resize(image, [224, 224])
            image = tf.cast(image, tf.float32) / 255.0

            # Data augmentation
            image = tf.image.random_flip_left_right(image)
            image = tf.image.random_brightness(image, 0.1)

            processed["image"] = image

        # Handle embeddings
        if "embeddings" in sample:
            embeddings = tf.cast(sample["embeddings"], tf.float32)
            # L2 normalize embeddings
            embeddings = tf.nn.l2_normalize(embeddings, axis=-1)
            processed["embeddings"] = embeddings

        # Handle text
        if "text" in sample:
            # Simple text processing (in practice, use proper tokenizer)
            text = sample["text"]
            processed["text"] = text

        # Handle labels
        if "labels" in sample:
            processed["label"] = tf.cast(sample["labels"], tf.int32)

        return processed

    # Apply transformations
    processed_dataset = (tf_dataset
                         .map(process_sample, num_parallel_calls=tf.data.AUTOTUNE)
                         .batch(batch_size)
                         .prefetch(tf.data.AUTOTUNE))

    return processed_dataset

# Create optimized pipeline
tf_pipeline = create_tf_pipeline("./complex_dataset", batch_size=16)

# Use in model training
for epoch in range(2):
    print(f"Epoch {epoch + 1}")
    for step, batch in enumerate(tf_pipeline.take(5)):
        print(f"  Step {step + 1}: {list(batch.keys())}")
        # Here you would pass batch to your model
        # loss = model.train_step(batch)
```

### Batch Processing Without Framework

```python
# Direct batch processing using Deep Lake's batches method
dataset = deeplake.open("./large_dataset")

# Process data in batches without ML framework
for batch_data in dataset.batches(batch_size=64):
    # batch_data is a dictionary with column names as keys
    # and lists of values as values

    images = batch_data["images"]
    labels = batch_data["labels"]

    print(f"Processing batch with {len(images)} samples")

    # Custom processing logic
    for i, (image_path, label) in enumerate(zip(images, labels)):
        # Process individual sample
        # This could be feature extraction, validation, etc.
        pass

    # Break after first batch for demo
    break

# Batch processing with filtering
high_confidence_data = deeplake.query(
    "SELECT * FROM dataset WHERE confidence > 0.9"
)

for batch in high_confidence_data.batches(batch_size=32):
    confidence_scores = batch["confidence"]
    avg_confidence = sum(confidence_scores) / len(confidence_scores)
    print(f"Batch average confidence: {avg_confidence:.3f}")
```
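
Batches can also feed tools outside the deep learning frameworks, for example by assembling a feature matrix for scikit-learn. A sketch that concatenates an `embeddings` column into one NumPy array; the dataset path and column name are assumptions:

```python
import numpy as np

import deeplake

dataset = deeplake.open("./large_dataset")  # placeholder path

chunks = []
for batch in dataset.batches(batch_size=256):
    # Each batch value is a list of per-sample arrays; stack them per batch
    chunks.append(np.asarray(batch["embeddings"], dtype=np.float32))

features = np.concatenate(chunks, axis=0)
print(features.shape)  # (num_samples, embedding_dim)
```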

### Distributed Training Setup

```python
# PyTorch distributed training setup
import os

import torch.distributed as dist
from torch.nn.parallel import DistributedDataParallel as DDP
from torch.utils.data.distributed import DistributedSampler

def setup_distributed_training(dataset_path, world_size, rank):
    """Setup distributed training with Deep Lake."""

    # Initialize distributed training
    dist.init_process_group("nccl", rank=rank, world_size=world_size)

    # Open dataset
    dataset = deeplake.open(dataset_path)
    torch_dataset = dataset.pytorch(transform=custom_transform)

    # Create distributed sampler
    sampler = DistributedSampler(
        torch_dataset,
        num_replicas=world_size,
        rank=rank,
        shuffle=True
    )

    # Create distributed data loader
    train_loader = DataLoader(
        torch_dataset,
        batch_size=32,
        sampler=sampler,
        num_workers=4,
        pin_memory=True
    )

    return train_loader, sampler

# Usage in distributed training script
def train_distributed():
    """Distributed training example."""

    world_size = torch.cuda.device_count()
    rank = int(os.environ.get("LOCAL_RANK", 0))

    train_loader, sampler = setup_distributed_training(
        "./distributed_dataset", world_size, rank
    )

    # Training loop with distributed sampler
    for epoch in range(10):
        sampler.set_epoch(epoch)  # Important for proper shuffling

        for batch_idx, batch in enumerate(train_loader):
            # Distributed training step
            # model_output = model(batch)
            # loss = criterion(model_output, batch["labels"])
            # loss.backward()
            # optimizer.step()

            if batch_idx % 100 == 0 and rank == 0:
                print(f"Epoch {epoch}, Batch {batch_idx}")
```

### Performance Optimization

```python
# Optimized data loading for high-performance training
class OptimizedDataLoader:
    """Optimized data loader for Deep Lake datasets."""

    def __init__(self, dataset_path, batch_size=32, num_workers=4):
        self.dataset = deeplake.open(dataset_path)
        self.batch_size = batch_size
        self.num_workers = num_workers

        # Pre-compute dataset length
        self.length = len(self.dataset)

        # Create optimized transforms
        self.transform = self._create_optimized_transform()

    def _create_optimized_transform(self):
        """Create optimized transform pipeline."""
        def fast_transform(sample):
            # Minimize data copying and conversion
            result = {}

            # Efficient image processing
            if "images" in sample:
                image = sample["images"]
                # Use optimized image loading
                if isinstance(image, str):
                    # Load image efficiently
                    image = Image.open(image)

                # Fast tensor conversion
                image_array = np.array(image)
                result["image"] = torch.from_numpy(image_array).permute(2, 0, 1).float() / 255.0

            # Efficient label processing
            if "labels" in sample:
                result["label"] = torch.tensor(sample["labels"], dtype=torch.long)

            return result

        return fast_transform

    def get_dataloader(self):
        """Get optimized PyTorch DataLoader."""
        torch_dataset = self.dataset.pytorch(transform=self.transform)

        return DataLoader(
            torch_dataset,
            batch_size=self.batch_size,
            shuffle=True,
            num_workers=self.num_workers,
            pin_memory=True,
            prefetch_factor=2,
            persistent_workers=True
        )

# Usage
optimized_loader = OptimizedDataLoader("./high_perf_dataset", batch_size=64, num_workers=8)
train_loader = optimized_loader.get_dataloader()

# Measure performance
import time

start_time = time.time()
total_samples = 0

for batch_idx, batch in enumerate(train_loader):
    total_samples += batch["image"].shape[0]

    if batch_idx >= 100:  # Test first 100 batches
        break

end_time = time.time()
throughput = total_samples / (end_time - start_time)
print(f"Data loading throughput: {throughput:.1f} samples/second")
```

### Integration with Popular Libraries

```python
# Integration with Hugging Face datasets
from datasets import Dataset as HFDataset

def convert_to_huggingface(deeplake_dataset):
    """Convert Deep Lake dataset to Hugging Face format."""

    # Export data
    data_dict = {"text": [], "labels": []}

    for sample in deeplake_dataset:
        data_dict["text"].append(sample["text"])
        data_dict["labels"].append(sample["labels"])

    # Create Hugging Face dataset
    hf_dataset = HFDataset.from_dict(data_dict)
    return hf_dataset

# Integration with Lightning
import pytorch_lightning as pl

class DeepLakeDataModule(pl.LightningDataModule):
    """PyTorch Lightning DataModule for Deep Lake."""

    def __init__(self, train_path, val_path, batch_size=32):
        super().__init__()
        self.train_path = train_path
        self.val_path = val_path
        self.batch_size = batch_size

    def setup(self, stage=None):
        """Setup datasets."""
        if stage == "fit" or stage is None:
            self.train_dataset = deeplake.open(self.train_path).pytorch(
                transform=custom_transform
            )
            self.val_dataset = deeplake.open(self.val_path).pytorch(
                transform=custom_transform
            )

    def train_dataloader(self):
        return DataLoader(self.train_dataset, batch_size=self.batch_size, shuffle=True)

    def val_dataloader(self):
        return DataLoader(self.val_dataset, batch_size=self.batch_size)

# Usage with Lightning
datamodule = DeepLakeDataModule("./train_dataset", "./val_dataset", batch_size=32)

# Use with Lightning Trainer
# trainer = pl.Trainer()
# trainer.fit(model, datamodule)
```