Database for AI powered by a storage format optimized for deep-learning applications.
Evaluation — 75%
↑ 1.59x Agent success when using this tile
Seamless integration with PyTorch and TensorFlow for training and inference workflows with optimized data loading, transformation pipelines, and batch processing. Deep Lake provides native framework adapters for efficient ML model training.
Native PyTorch Dataset integration with support for custom transforms, data loading, and distributed training.
class DatasetView:
"""PyTorch integration for dataset views."""
def pytorch(self, transform: Optional[Callable[[Any], Any]] = None) -> Any:
"""
Create PyTorch Dataset from Deep Lake dataset.
Parameters:
- transform: Optional transform function to apply to samples
Returns:
TorchDataset: PyTorch-compatible dataset object
"""Native TensorFlow Dataset integration with optimized data pipelines and GPU acceleration support.
class DatasetView:
"""TensorFlow integration for dataset views."""
def tensorflow(self) -> Any:
"""
Create TensorFlow Dataset from Deep Lake dataset.
Returns:
tf.data.Dataset: TensorFlow-compatible dataset object
"""Efficient batch iteration with customizable batch sizes and data loading strategies.
class DatasetView:
    """Batch processing capabilities."""

    def batches(self, batch_size: int = 1) -> Iterator[Dict[str, Any]]:
        """
        Iterate over dataset in batches.

        Parameters:
        - batch_size: Number of samples per batch

        Returns:
            Iterator[Dict[str, Any]]: Iterator yielding batches as dictionaries
        """

import deeplake
import torch
from torch.utils.data import DataLoader

# Open dataset
dataset = deeplake.open("./ml_dataset")

# Create PyTorch dataset
torch_dataset = dataset.pytorch()

# Use with PyTorch DataLoader
train_loader = DataLoader(
    torch_dataset,
    batch_size=32,
    shuffle=True,
    num_workers=4,
)

# Training loop
for batch_idx, batch in enumerate(train_loader):
    images = batch["images"]
    labels = batch["labels"]
    # Convert to tensors if needed
    if isinstance(images, list):
        images = torch.stack([torch.tensor(img) for img in images])
    if isinstance(labels, list):
        labels = torch.tensor(labels)
    print(f"Batch {batch_idx}: images shape {images.shape}, labels shape {labels.shape}")
    if batch_idx >= 2:  # Just show first few batches
        break

import torchvision.transforms as transforms
from PIL import Image
import numpy as np

# Define custom transform pipeline
def custom_transform(sample):
    """Custom transform for image-text pairs.

    Parameters:
    - sample: dict with "images" (file path str or numpy array) and
      "labels" (str class name or numeric label).

    Returns:
        Dict with "image" (normalized float tensor) and "label" (long tensor).
    """
    # Load and transform image
    if isinstance(sample["images"], str):
        # Load image from path
        image = Image.open(sample["images"])
    else:
        # Convert numpy array to PIL
        image = Image.fromarray(sample["images"])
    # Apply torchvision transforms
    # NOTE(review): the Compose pipeline is rebuilt on every call; hoisting it
    # to module level would avoid per-sample construction overhead.
    transform = transforms.Compose([
        transforms.Resize((224, 224)),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406],
                             std=[0.229, 0.224, 0.225]),
    ])
    transformed_image = transform(image)
    # Process text label
    label = sample["labels"]
    if isinstance(label, str):
        # Convert string label to integer (example mapping); unknown labels map to -1
        label_map = {"cat": 0, "dog": 1, "bird": 2}
        label = label_map.get(label, -1)
    return {
        "image": transformed_image,
        "label": torch.tensor(label, dtype=torch.long),
    }

# Apply custom transform
torch_dataset = dataset.pytorch(transform=custom_transform)

# Use in training
train_loader = DataLoader(torch_dataset, batch_size=16, shuffle=True)
for batch in train_loader:
    images = batch["image"]  # Already tensor from transform
    labels = batch["label"]  # Already tensor from transform
    print(f"Transformed batch - Images: {images.shape}, Labels: {labels.shape}")
    break

# Multi-modal dataset with complex transforms
def multimodal_transform(sample):
    """Transform for vision-language model training.

    Parameters:
    - sample: dict with "image_path" (str), "description" (str), and
      optionally "embeddings" and "metadata".

    Returns:
        Dict with "image", "text_tokens" (fixed length 50), "embeddings",
        and pass-through "metadata".
    """
    # Process image
    image = Image.open(sample["image_path"])
    image_transform = transforms.Compose([
        transforms.Resize((224, 224)),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5]),
    ])
    image_tensor = image_transform(image)
    # Process text
    text = sample["description"]
    # Tokenize text (example with simple word-level tokenization)
    tokens = text.lower().split()[:50]  # Limit to 50 tokens
    # Convert to embedding indices (simplified)
    # NOTE(review): the vocab is rebuilt from each sample's own tokens, so the
    # same word gets different ids in different samples — real training code
    # needs a shared tokenizer or a fixed vocabulary.
    vocab = {"<pad>": 0, "<unk>": 1}  # Simplified vocab
    vocab.update({word: i + 2 for i, word in enumerate(set(tokens))})
    token_ids = [vocab.get(token, vocab["<unk>"]) for token in tokens]
    # Pad to fixed length
    token_ids += [vocab["<pad>"]] * (50 - len(token_ids))
    token_ids = token_ids[:50]
    # Process embeddings if available
    embeddings = sample.get("embeddings", np.zeros(768))
    embeddings_tensor = torch.tensor(embeddings, dtype=torch.float32)
    return {
        "image": image_tensor,
        "text_tokens": torch.tensor(token_ids, dtype=torch.long),
        "embeddings": embeddings_tensor,
        "metadata": sample.get("metadata", {}),
    }

# Create multimodal dataset
multimodal_dataset = dataset.pytorch(transform=multimodal_transform)
# Custom collate function for variable-length data
def collate_multimodal(batch):
    """Custom collate function for multimodal data.

    Parameters:
    - batch: list of sample dicts, each with "image", "text_tokens", and
      "embeddings" tensors (matching shapes across the batch) plus "metadata".

    Returns:
        Dict with stacked tensors under "images", "text_tokens", "embeddings",
        and "metadata" collected as a plain list (not tensorized).
    """
    images = torch.stack([item["image"] for item in batch])
    text_tokens = torch.stack([item["text_tokens"] for item in batch])
    embeddings = torch.stack([item["embeddings"] for item in batch])
    # Collect metadata
    metadata = [item["metadata"] for item in batch]
    return {
        "images": images,
        "text_tokens": text_tokens,
        "embeddings": embeddings,
        "metadata": metadata,
    }
# Use with custom collate
multimodal_loader = DataLoader(
    multimodal_dataset,
    batch_size=8,
    shuffle=True,
    collate_fn=collate_multimodal,
    num_workers=2,
)
for batch in multimodal_loader:
    print(f"Multimodal batch:")
    print(f" Images: {batch['images'].shape}")
    print(f" Text tokens: {batch['text_tokens'].shape}")
    print(f" Embeddings: {batch['embeddings'].shape}")
    print(f" Metadata items: {len(batch['metadata'])}")
    break

import tensorflow as tf
import deeplake

# Create TensorFlow dataset
dataset = deeplake.open("./ml_dataset")
tf_dataset = dataset.tensorflow()

# Basic usage with TensorFlow
for batch in tf_dataset.take(3):
    print(f"TensorFlow batch keys: {list(batch.keys())}")
    for key, value in batch.items():
        print(f" {key}: {value.shape if hasattr(value, 'shape') else type(value)}")

# Apply TensorFlow transformations
def preprocess_tf(sample):
    """TensorFlow preprocessing function.

    Resizes/normalizes "images" and maps string "labels" to integer ids
    (unknown labels become -1). Mutates and returns the sample dict.
    """
    # Process image data
    if "images" in sample:
        image = sample["images"]
        # Resize and normalize
        image = tf.image.resize(image, [224, 224])
        image = tf.cast(image, tf.float32) / 255.0
        sample["images"] = image
    # Process labels
    if "labels" in sample:
        # Convert string labels to integers
        # NOTE(review): building the StaticHashTable inside the map function
        # recreates it per element; hoisting it outside would be cheaper —
        # confirm against tf.data tracing semantics.
        label_map = tf.lookup.StaticHashTable(
            tf.lookup.KeyValueTensorInitializer(
                keys=["cat", "dog", "bird"],
                values=[0, 1, 2],
            ),
            default_value=-1,
        )
        sample["labels"] = label_map.lookup(sample["labels"])
    return sample

# Apply preprocessing
processed_tf_dataset = tf_dataset.map(
    preprocess_tf,
    num_parallel_calls=tf.data.AUTOTUNE,
)

# Batch and prefetch for training
train_tf_dataset = processed_tf_dataset.batch(32).prefetch(tf.data.AUTOTUNE)

# Use in TensorFlow training
for batch in train_tf_dataset.take(2):
    images = batch["images"]
    labels = batch["labels"]
    print(f"TF Training batch - Images: {images.shape}, Labels: {labels.shape}")

# Complex TensorFlow pipeline with multiple data types
def create_tf_pipeline(dataset_path, batch_size=32):
    """Create optimized TensorFlow data pipeline.

    Parameters:
    - dataset_path: Path passed to deeplake.open
    - batch_size: Samples per batch in the returned pipeline

    Returns:
        A batched, prefetched tf.data pipeline of processed samples.
    """
    dataset = deeplake.open(dataset_path)
    tf_dataset = dataset.tensorflow()

    def process_sample(sample):
        """Process individual sample: decode/augment images, normalize embeddings."""
        processed = {}
        # Handle images
        if "images" in sample:
            image = sample["images"]
            # Decode if string path
            if tf.dtypes.string == image.dtype:
                image = tf.io.read_file(image)
                image = tf.image.decode_image(image, channels=3)
            # Resize and normalize
            image = tf.image.resize(image, [224, 224])
            image = tf.cast(image, tf.float32) / 255.0
            # Data augmentation
            image = tf.image.random_flip_left_right(image)
            image = tf.image.random_brightness(image, 0.1)
            processed["image"] = image
        # Handle embeddings
        if "embeddings" in sample:
            embeddings = tf.cast(sample["embeddings"], tf.float32)
            # L2 normalize embeddings
            embeddings = tf.nn.l2_normalize(embeddings, axis=-1)
            processed["embeddings"] = embeddings
        # Handle text
        if "text" in sample:
            # Simple text processing (in practice, use proper tokenizer)
            text = sample["text"]
            processed["text"] = text
        # Handle labels
        if "labels" in sample:
            processed["label"] = tf.cast(sample["labels"], tf.int32)
        return processed

    # Apply transformations
    processed_dataset = (
        tf_dataset
        .map(process_sample, num_parallel_calls=tf.data.AUTOTUNE)
        .batch(batch_size)
        .prefetch(tf.data.AUTOTUNE)
    )
    return processed_dataset

# Create optimized pipeline
tf_pipeline = create_tf_pipeline("./complex_dataset", batch_size=16)

# Use in model training
for epoch in range(2):
    print(f"Epoch {epoch + 1}")
    for step, batch in enumerate(tf_pipeline.take(5)):
        print(f" Step {step + 1}: {list(batch.keys())}")
        # Here you would pass batch to your model
        # loss = model.train_step(batch)

# Direct batch processing using Deep Lake's batches method
dataset = deeplake.open("./large_dataset")

# Process data in batches without ML framework
for batch_data in dataset.batches(batch_size=64):
    # batch_data is a dictionary with column names as keys
    # and lists of values as values
    images = batch_data["images"]
    labels = batch_data["labels"]
    print(f"Processing batch with {len(images)} samples")
    # Custom processing logic (the enumerate index in the original was unused)
    for image_path, label in zip(images, labels):
        # Process individual sample
        # This could be feature extraction, validation, etc.
        pass
    # Break after first batch for demo
    break

# Batch processing with filtering
high_confidence_data = deeplake.query(
    "SELECT * FROM dataset WHERE confidence > 0.9"
)
for batch in high_confidence_data.batches(batch_size=32):
    confidence_scores = batch["confidence"]
    avg_confidence = sum(confidence_scores) / len(confidence_scores)
    print(f"Batch average confidence: {avg_confidence:.3f}")

# PyTorch distributed training setup
import os  # fixes NameError: os.environ is read in train_distributed below
import torch.distributed as dist
from torch.nn.parallel import DistributedDataParallel as DDP
from torch.utils.data.distributed import DistributedSampler

def setup_distributed_training(dataset_path, world_size, rank):
    """Setup distributed training with Deep Lake.

    Parameters:
    - dataset_path: Path passed to deeplake.open
    - world_size: Total number of processes
    - rank: Rank of this process

    Returns:
        Tuple of (DataLoader, DistributedSampler) for the given rank.
    """
    # Initialize distributed training
    dist.init_process_group("nccl", rank=rank, world_size=world_size)
    # Open dataset
    dataset = deeplake.open(dataset_path)
    torch_dataset = dataset.pytorch(transform=custom_transform)
    # Create distributed sampler
    sampler = DistributedSampler(
        torch_dataset,
        num_replicas=world_size,
        rank=rank,
        shuffle=True,
    )
    # Create distributed data loader
    train_loader = DataLoader(
        torch_dataset,
        batch_size=32,
        sampler=sampler,
        num_workers=4,
        pin_memory=True,
    )
    return train_loader, sampler

# Usage in distributed training script
def train_distributed():
    """Distributed training example."""
    world_size = torch.cuda.device_count()
    rank = int(os.environ.get("LOCAL_RANK", 0))
    train_loader, sampler = setup_distributed_training(
        "./distributed_dataset", world_size, rank
    )
    # Training loop with distributed sampler
    for epoch in range(10):
        sampler.set_epoch(epoch)  # Important for proper shuffling
        for batch_idx, batch in enumerate(train_loader):
            # Distributed training step
            # model_output = model(batch)
            # loss = criterion(model_output, batch["labels"])
            # loss.backward()
            # optimizer.step()
            if batch_idx % 100 == 0 and rank == 0:
                print(f"Epoch {epoch}, Batch {batch_idx}")

# Optimized data loading for high-performance training
class OptimizedDataLoader:
    """Optimized data loader for Deep Lake datasets."""

    def __init__(self, dataset_path, batch_size=32, num_workers=4):
        self.dataset = deeplake.open(dataset_path)
        self.batch_size = batch_size
        self.num_workers = num_workers
        # Pre-compute dataset length
        self.length = len(self.dataset)
        # Create optimized transforms
        self.transform = self._create_optimized_transform()

    def _create_optimized_transform(self):
        """Create optimized transform pipeline (returns a closure)."""
        def fast_transform(sample):
            # Minimize data copying and conversion
            result = {}
            # Efficient image processing
            if "images" in sample:
                image = sample["images"]
                # Use optimized image loading
                if isinstance(image, str):
                    # Load image efficiently
                    image = Image.open(image)
                # Fast tensor conversion (assumes HWC input — TODO confirm)
                image_array = np.array(image)
                result["image"] = torch.from_numpy(image_array).permute(2, 0, 1).float() / 255.0
            # Efficient label processing
            if "labels" in sample:
                result["label"] = torch.tensor(sample["labels"], dtype=torch.long)
            return result
        return fast_transform

    def get_dataloader(self):
        """Get optimized PyTorch DataLoader."""
        torch_dataset = self.dataset.pytorch(transform=self.transform)
        return DataLoader(
            torch_dataset,
            batch_size=self.batch_size,
            shuffle=True,
            num_workers=self.num_workers,
            pin_memory=True,
            prefetch_factor=2,
            persistent_workers=True,
        )

# Usage
optimized_loader = OptimizedDataLoader("./high_perf_dataset", batch_size=64, num_workers=8)
train_loader = optimized_loader.get_dataloader()

# Measure performance
import time

start_time = time.time()
total_samples = 0
for batch_idx, batch in enumerate(train_loader):
    total_samples += batch["image"].shape[0]
    if batch_idx >= 100:  # Test first 100 batches
        break
end_time = time.time()
throughput = total_samples / (end_time - start_time)
print(f"Data loading throughput: {throughput:.1f} samples/second")

# Integration with Hugging Face datasets
from datasets import Dataset as HFDataset

def convert_to_huggingface(deeplake_dataset):
    """Convert Deep Lake dataset to Hugging Face format.

    Iterates the dataset and copies "text" and "labels" columns into an
    in-memory Hugging Face Dataset.
    """
    # Export data
    data_dict = {"text": [], "labels": []}
    for sample in deeplake_dataset:
        data_dict["text"].append(sample["text"])
        data_dict["labels"].append(sample["labels"])
    # Create Hugging Face dataset
    hf_dataset = HFDataset.from_dict(data_dict)
    return hf_dataset

# Integration with Lightning
import pytorch_lightning as pl

class DeepLakeDataModule(pl.LightningDataModule):
    """PyTorch Lightning DataModule for Deep Lake."""

    def __init__(self, train_path, val_path, batch_size=32):
        super().__init__()
        self.train_path = train_path
        self.val_path = val_path
        self.batch_size = batch_size

    def setup(self, stage=None):
        """Setup datasets for the "fit" stage (or when stage is None)."""
        if stage == "fit" or stage is None:
            self.train_dataset = deeplake.open(self.train_path).pytorch(
                transform=custom_transform
            )
            self.val_dataset = deeplake.open(self.val_path).pytorch(
                transform=custom_transform
            )

    def train_dataloader(self):
        return DataLoader(self.train_dataset, batch_size=self.batch_size, shuffle=True)

    def val_dataloader(self):
        return DataLoader(self.val_dataset, batch_size=self.batch_size)

# Usage with Lightning
datamodule = DeepLakeDataModule("./train_dataset", "./val_dataset", batch_size=32)

# Use with Lightning Trainer
# trainer = pl.Trainer()
# trainer.fit(model, datamodule)

Install with Tessl CLI
npx tessl i tessl/pypi-deeplakedocs
evals
scenario-1
scenario-2
scenario-3
scenario-4
scenario-5
scenario-6
scenario-7
scenario-8
scenario-9
scenario-10