CtrlK
Blog · Docs · Log in · Get started
Tessl Logo

tessl/pypi-deeplake

Database for AI powered by a storage format optimized for deep-learning applications.

75

1.59x

Evaluation: 75%

1.59x

Agent success when using this tile

Overview
Eval results
Files

docs/framework-integration.md

Framework Integration

Seamless integration with PyTorch and TensorFlow for training and inference workflows with optimized data loading, transformation pipelines, and batch processing. Deep Lake provides native framework adapters for efficient ML model training.

Capabilities

PyTorch Integration

Native PyTorch Dataset integration with support for custom transforms, data loading, and distributed training.

class DatasetView:
    """PyTorch integration for dataset views.

    NOTE(review): API stub for documentation — the implementation lives in
    the deeplake package; only the signature and contract are shown here.
    """

    def pytorch(self, transform: Optional[Callable[[Any], Any]] = None) -> Any:
        """
        Create PyTorch Dataset from Deep Lake dataset.

        Parameters:
        - transform: Optional transform function applied to each sample
          (a dict of column values) before it is returned by the dataset

        Returns:
        TorchDataset: PyTorch-compatible dataset object, usable directly
        with torch.utils.data.DataLoader
        """

TensorFlow Integration

Native TensorFlow Dataset integration with optimized data pipelines and GPU acceleration support.

class DatasetView:
    """TensorFlow integration for dataset views.

    NOTE(review): API stub for documentation — the implementation lives in
    the deeplake package; only the signature and contract are shown here.
    """

    def tensorflow(self) -> Any:
        """
        Create TensorFlow Dataset from Deep Lake dataset.

        Returns:
        tf.data.Dataset: TensorFlow-compatible dataset object that can be
        chained with standard tf.data transformations (map/batch/prefetch)
        """

Batch Processing

Efficient batch iteration with customizable batch sizes and data loading strategies.

class DatasetView:
    """Batch processing capabilities.

    NOTE(review): API stub for documentation — the implementation lives in
    the deeplake package; only the signature and contract are shown here.
    """

    def batches(self, batch_size: int = 1) -> Iterator[Dict[str, Any]]:
        """
        Iterate over dataset in batches.

        Parameters:
        - batch_size: Number of samples per batch (default 1)

        Returns:
        Iterator[Dict[str, Any]]: Iterator yielding batches as dictionaries
        keyed by column name, with a list of values per key
        """

Usage Examples

Basic PyTorch Integration

import deeplake
import torch
from torch.utils.data import DataLoader

# Open an existing Deep Lake dataset from local storage
dataset = deeplake.open("./ml_dataset")

# Create PyTorch dataset (no transform: samples arrive as raw column values)
torch_dataset = dataset.pytorch()

# Use with PyTorch DataLoader
train_loader = DataLoader(
    torch_dataset,
    batch_size=32,
    shuffle=True,
    num_workers=4
)

# Training loop
for batch_idx, batch in enumerate(train_loader):
    images = batch["images"]
    labels = batch["labels"]

    # Convert to tensors if needed: without a transform above, the default
    # collate may yield Python lists rather than stacked tensors.
    if isinstance(images, list):
        images = torch.stack([torch.tensor(img) for img in images])
    if isinstance(labels, list):
        labels = torch.tensor(labels)

    print(f"Batch {batch_idx}: images shape {images.shape}, labels shape {labels.shape}")

    if batch_idx >= 2:  # Just show first few batches
        break

PyTorch with Custom Transforms

import torchvision.transforms as transforms
from PIL import Image
import numpy as np

# Define custom transform pipeline
# Hoist the torchvision pipeline out of the per-sample function: Compose
# construction is loop-invariant, and the original rebuilt it on every call.
_IMAGE_PIPELINE = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406],
                         std=[0.229, 0.224, 0.225])
])

# Deterministic class-name -> integer-id mapping shared by all samples.
_LABEL_MAP = {"cat": 0, "dog": 1, "bird": 2}


def custom_transform(sample):
    """Transform an image/label sample into model-ready tensors.

    Parameters:
    - sample: dict with "images" (file path str or numpy array — assumed HWC
      RGB; TODO confirm against the dataset schema) and "labels" (class-name
      str or numeric label)

    Returns:
    dict with:
    - "image": float tensor of shape (3, 224, 224), ImageNet-normalized
    - "label": scalar long tensor; -1 for class names missing from _LABEL_MAP
    """
    raw = sample["images"]
    # Accept either a path on disk or an in-memory array.
    if isinstance(raw, str):
        image = Image.open(raw)
    else:
        image = Image.fromarray(raw)

    transformed_image = _IMAGE_PIPELINE(image)

    label = sample["labels"]
    if isinstance(label, str):
        # Map class-name strings to integer ids; unknown names become -1.
        label = _LABEL_MAP.get(label, -1)

    return {
        "image": transformed_image,
        "label": torch.tensor(label, dtype=torch.long)
    }

# Apply custom transform: each sample is mapped through custom_transform
# before collation, so batches arrive as ready-to-use tensors.
torch_dataset = dataset.pytorch(transform=custom_transform)

# Use in training
train_loader = DataLoader(torch_dataset, batch_size=16, shuffle=True)

for batch in train_loader:
    images = batch["image"]  # Already tensor from transform
    labels = batch["label"]  # Already tensor from transform

    print(f"Transformed batch - Images: {images.shape}, Labels: {labels.shape}")
    break  # demo: inspect a single batch

Advanced PyTorch Usage

# Multi-modal dataset with complex transforms
def multimodal_transform(sample):
    """Transform for vision-language model training."""
    
    # Process image
    image = Image.open(sample["image_path"])
    image_transform = transforms.Compose([
        transforms.Resize((224, 224)),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5])
    ])
    image_tensor = image_transform(image)
    
    # Process text
    text = sample["description"]
    # Tokenize text (example with simple word-level tokenization)
    tokens = text.lower().split()[:50]  # Limit to 50 tokens
    
    # Convert to embedding indices (simplified)
    vocab = {"<pad>": 0, "<unk>": 1}  # Simplified vocab
    vocab.update({word: i+2 for i, word in enumerate(set(tokens))})
    
    token_ids = [vocab.get(token, vocab["<unk>"]) for token in tokens]
    # Pad to fixed length
    token_ids += [vocab["<pad>"]] * (50 - len(token_ids))
    token_ids = token_ids[:50]
    
    # Process embeddings if available
    embeddings = sample.get("embeddings", np.zeros(768))
    embeddings_tensor = torch.tensor(embeddings, dtype=torch.float32)
    
    return {
        "image": image_tensor,
        "text_tokens": torch.tensor(token_ids, dtype=torch.long),
        "embeddings": embeddings_tensor,
        "metadata": sample.get("metadata", {})
    }

# Create multimodal dataset
multimodal_dataset = dataset.pytorch(transform=multimodal_transform)

# Custom collate function for variable-length data
def collate_multimodal(batch):
    """Collate a list of multimodal samples into a single batch dict.

    Tensor fields ("image", "text_tokens", "embeddings") are stacked along a
    new leading batch dimension; "metadata" entries are gathered into a plain
    Python list since they may be variable-length / non-tensor.
    """
    collated = {
        out_key: torch.stack([sample[in_key] for sample in batch])
        for out_key, in_key in (
            ("images", "image"),
            ("text_tokens", "text_tokens"),
            ("embeddings", "embeddings"),
        )
    }
    collated["metadata"] = [sample["metadata"] for sample in batch]
    return collated

# Use with custom collate so metadata stays a Python list while tensor
# fields are stacked per batch.
multimodal_loader = DataLoader(
    multimodal_dataset,
    batch_size=8,
    shuffle=True,
    collate_fn=collate_multimodal,
    num_workers=2
)

for batch in multimodal_loader:
    print(f"Multimodal batch:")
    print(f"  Images: {batch['images'].shape}")
    print(f"  Text tokens: {batch['text_tokens'].shape}")
    print(f"  Embeddings: {batch['embeddings'].shape}")
    print(f"  Metadata items: {len(batch['metadata'])}")
    break  # demo: inspect one batch only

TensorFlow Integration

import tensorflow as tf
import deeplake

# Create TensorFlow dataset
dataset = deeplake.open("./ml_dataset")
tf_dataset = dataset.tensorflow()

# Basic usage with TensorFlow; take(3) limits iteration to three elements
for batch in tf_dataset.take(3):
    print(f"TensorFlow batch keys: {list(batch.keys())}")
    for key, value in batch.items():
        print(f"  {key}: {value.shape if hasattr(value, 'shape') else type(value)}")

# Apply TensorFlow transformations
def preprocess_tf(sample):
    """TensorFlow preprocessing function."""
    
    # Process image data
    if "images" in sample:
        image = sample["images"]
        # Resize and normalize
        image = tf.image.resize(image, [224, 224])
        image = tf.cast(image, tf.float32) / 255.0
        sample["images"] = image
    
    # Process labels
    if "labels" in sample:
        # Convert string labels to integers
        label_map = tf.lookup.StaticHashTable(
            tf.lookup.KeyValueTensorInitializer(
                keys=["cat", "dog", "bird"],
                values=[0, 1, 2]
            ),
            default_value=-1
        )
        sample["labels"] = label_map.lookup(sample["labels"])
    
    return sample

# Apply preprocessing lazily across the pipeline; AUTOTUNE lets tf.data
# choose the parallelism level at runtime.
processed_tf_dataset = tf_dataset.map(
    preprocess_tf,
    num_parallel_calls=tf.data.AUTOTUNE
)

# Batch and prefetch for training (prefetch overlaps data prep with compute)
train_tf_dataset = processed_tf_dataset.batch(32).prefetch(tf.data.AUTOTUNE)

# Use in TensorFlow training
for batch in train_tf_dataset.take(2):
    images = batch["images"]
    labels = batch["labels"]
    print(f"TF Training batch - Images: {images.shape}, Labels: {labels.shape}")

Advanced TensorFlow Usage

# Complex TensorFlow pipeline with multiple data types
def create_tf_pipeline(dataset_path, batch_size=32):
    """Create optimized TensorFlow data pipeline.

    Opens the Deep Lake dataset at *dataset_path*, converts it to a
    tf.data.Dataset, applies per-sample preprocessing (image decode/resize/
    augmentation, embedding L2-normalization, label casting), then batches
    and prefetches for training throughput.

    Parameters:
    - dataset_path: path or URL accepted by deeplake.open
    - batch_size: samples per training batch

    Returns:
    tf.data.Dataset yielding batched dicts of processed tensors.
    """
    source = deeplake.open(dataset_path).tensorflow()

    def _prepare(sample):
        """Map one raw sample dict to model-ready tensors."""
        out = {}

        if "images" in sample:
            image = sample["images"]
            # A string tensor is treated as a file path: read and decode it.
            if tf.dtypes.string == image.dtype:
                image = tf.image.decode_image(tf.io.read_file(image), channels=3)

            # Resize to model resolution and scale pixels to [0, 1].
            image = tf.image.resize(image, [224, 224])
            image = tf.cast(image, tf.float32) / 255.0

            # Lightweight train-time augmentation.
            image = tf.image.random_flip_left_right(image)
            image = tf.image.random_brightness(image, 0.1)

            out["image"] = image

        if "embeddings" in sample:
            # L2-normalize so downstream similarity uses cosine geometry.
            out["embeddings"] = tf.nn.l2_normalize(
                tf.cast(sample["embeddings"], tf.float32), axis=-1
            )

        if "text" in sample:
            # Passed through as-is; a real pipeline would tokenize here.
            out["text"] = sample["text"]

        if "labels" in sample:
            out["label"] = tf.cast(sample["labels"], tf.int32)

        return out

    return (source
            .map(_prepare, num_parallel_calls=tf.data.AUTOTUNE)
            .batch(batch_size)
            .prefetch(tf.data.AUTOTUNE))

# Create optimized pipeline
tf_pipeline = create_tf_pipeline("./complex_dataset", batch_size=16)

# Use in model training (take(5) caps each demo epoch at five batches)
for epoch in range(2):
    print(f"Epoch {epoch + 1}")
    for step, batch in enumerate(tf_pipeline.take(5)):
        print(f"  Step {step + 1}: {list(batch.keys())}")
        # Here you would pass batch to your model
        # loss = model.train_step(batch)

Batch Processing Without Framework

# Direct batch processing using Deep Lake's batches method
dataset = deeplake.open("./large_dataset")

# Process data in batches without ML framework
for batch_data in dataset.batches(batch_size=64):
    # batch_data is a dictionary with column names as keys
    # and lists of values as values

    images = batch_data["images"]
    labels = batch_data["labels"]

    print(f"Processing batch with {len(images)} samples")

    # Custom processing logic
    for i, (image_path, label) in enumerate(zip(images, labels)):
        # Process individual sample
        # This could be feature extraction, validation, etc.
        pass

    # Break after first batch for demo
    break

# Batch processing with filtering: run a query first, then batch the
# resulting view with the same batches() API.
high_confidence_data = deeplake.query(
    "SELECT * FROM dataset WHERE confidence > 0.9"
)

for batch in high_confidence_data.batches(batch_size=32):
    confidence_scores = batch["confidence"]
    avg_confidence = sum(confidence_scores) / len(confidence_scores)
    print(f"Batch average confidence: {avg_confidence:.3f}")

Distributed Training Setup

# PyTorch distributed training setup
import torch.distributed as dist
from torch.nn.parallel import DistributedDataParallel as DDP
from torch.utils.data.distributed import DistributedSampler

def setup_distributed_training(dataset_path, world_size, rank):
    """Setup distributed training with Deep Lake.

    Initializes the NCCL process group, wraps the dataset for PyTorch, and
    builds a DistributedSampler-backed DataLoader for this rank.

    Parameters:
    - dataset_path: path or URL accepted by deeplake.open
    - world_size: total number of participating processes
    - rank: this process's rank in [0, world_size)

    Returns:
    (train_loader, sampler) — the sampler is returned so callers can invoke
    sampler.set_epoch(epoch) each epoch for correct shuffling.
    """
    # Join the process group before any collective communication happens.
    dist.init_process_group("nccl", rank=rank, world_size=world_size)

    # Open dataset and wrap for PyTorch.
    torch_dataset = deeplake.open(dataset_path).pytorch(transform=custom_transform)

    # Each rank draws a disjoint shard of the dataset.
    sampler = DistributedSampler(
        torch_dataset,
        num_replicas=world_size,
        rank=rank,
        shuffle=True
    )

    loader = DataLoader(
        torch_dataset,
        batch_size=32,
        sampler=sampler,
        num_workers=4,
        pin_memory=True  # speeds up host-to-GPU copies
    )

    return loader, sampler

# Usage in distributed training script
def train_distributed():
    """Distributed training example."""
    
    world_size = torch.cuda.device_count()
    rank = int(os.environ.get("LOCAL_RANK", 0))
    
    train_loader, sampler = setup_distributed_training(
        "./distributed_dataset", world_size, rank
    )
    
    # Training loop with distributed sampler
    for epoch in range(10):
        sampler.set_epoch(epoch)  # Important for proper shuffling
        
        for batch_idx, batch in enumerate(train_loader):
            # Distributed training step
            # model_output = model(batch)
            # loss = criterion(model_output, batch["labels"])
            # loss.backward()
            # optimizer.step()
            
            if batch_idx % 100 == 0 and rank == 0:
                print(f"Epoch {epoch}, Batch {batch_idx}")

Performance Optimization

# Optimized data loading for high-performance training
class OptimizedDataLoader:
    """Optimized data loader for Deep Lake datasets.

    Wraps a Deep Lake dataset with a lightweight tensor transform and
    produces a tuned torch DataLoader (pinned memory, prefetching, and
    persistent workers when multiprocessing is enabled).
    """

    def __init__(self, dataset_path, batch_size=32, num_workers=4):
        """
        Parameters:
        - dataset_path: path or URL accepted by deeplake.open
        - batch_size: samples per batch
        - num_workers: DataLoader worker processes (0 = load in main process)
        """
        self.dataset = deeplake.open(dataset_path)
        self.batch_size = batch_size
        self.num_workers = num_workers

        # Pre-compute dataset length once
        self.length = len(self.dataset)

        # Create optimized transform pipeline
        self.transform = self._create_optimized_transform()

    def _create_optimized_transform(self):
        """Create a per-sample transform that minimizes copies/conversions."""
        def fast_transform(sample):
            result = {}

            if "images" in sample:
                image = sample["images"]
                # A string value is treated as an on-disk image path.
                if isinstance(image, str):
                    image = Image.open(image)

                # NOTE(review): permute(2, 0, 1) assumes an HWC image with a
                # channel axis; grayscale (HW) arrays would fail here —
                # confirm the dataset always stores RGB.
                image_array = np.array(image)
                result["image"] = torch.from_numpy(image_array).permute(2, 0, 1).float() / 255.0

            if "labels" in sample:
                result["label"] = torch.tensor(sample["labels"], dtype=torch.long)

            return result

        return fast_transform

    def get_dataloader(self):
        """Get optimized PyTorch DataLoader.

        BUG FIX: prefetch_factor and persistent_workers are only legal when
        num_workers > 0 — passing them with num_workers=0 makes DataLoader
        raise ValueError. They are now supplied conditionally.
        """
        torch_dataset = self.dataset.pytorch(transform=self.transform)

        loader_kwargs = dict(
            batch_size=self.batch_size,
            shuffle=True,
            num_workers=self.num_workers,
            pin_memory=True,
        )
        if self.num_workers > 0:
            loader_kwargs.update(prefetch_factor=2, persistent_workers=True)

        return DataLoader(torch_dataset, **loader_kwargs)

# Usage
optimized_loader = OptimizedDataLoader("./high_perf_dataset", batch_size=64, num_workers=8)
train_loader = optimized_loader.get_dataloader()

# Measure performance
import time

start_time = time.time()
total_samples = 0

# Count samples actually delivered so throughput reflects real batch sizes
# (the last batch may be smaller than batch_size).
for batch_idx, batch in enumerate(train_loader):
    total_samples += batch["image"].shape[0]

    if batch_idx >= 100:  # Test first 100 batches
        break

end_time = time.time()
throughput = total_samples / (end_time - start_time)
print(f"Data loading throughput: {throughput:.1f} samples/second")

Integration with Popular Libraries

# Integration with Hugging Face datasets
from datasets import Dataset as HFDataset

def convert_to_huggingface(deeplake_dataset):
    """Convert Deep Lake dataset to Hugging Face format.

    Materializes the "text" and "labels" columns of every sample into Python
    lists and wraps them in a datasets.Dataset. Note this loads the entire
    dataset into memory.
    """
    texts = []
    labels = []
    for sample in deeplake_dataset:
        texts.append(sample["text"])
        labels.append(sample["labels"])

    # Create Hugging Face dataset from the collected columns.
    return HFDataset.from_dict({"text": texts, "labels": labels})

# Integration with Lightning
import pytorch_lightning as pl

class DeepLakeDataModule(pl.LightningDataModule):
    """PyTorch Lightning DataModule for Deep Lake.

    Wires Deep Lake train/validation splits into Lightning's data hooks.
    """

    def __init__(self, train_path, val_path, batch_size=32):
        """
        Parameters:
        - train_path: Deep Lake path for the training split
        - val_path: Deep Lake path for the validation split
        - batch_size: batch size used by both dataloaders
        """
        super().__init__()
        self.train_path = train_path
        self.val_path = val_path
        self.batch_size = batch_size

    def setup(self, stage=None):
        """Setup datasets for the requested stage.

        BUG FIX: the original only built val_dataset when stage was "fit" or
        None, so trainer.validate(...) (stage == "validate") crashed with
        AttributeError in val_dataloader. The validation dataset is now also
        built for the "validate" stage.
        """
        if stage in ("fit", None):
            self.train_dataset = deeplake.open(self.train_path).pytorch(
                transform=custom_transform
            )
        if stage in ("fit", "validate", None):
            self.val_dataset = deeplake.open(self.val_path).pytorch(
                transform=custom_transform
            )

    def train_dataloader(self):
        """Shuffled loader over the training split."""
        return DataLoader(self.train_dataset, batch_size=self.batch_size, shuffle=True)

    def val_dataloader(self):
        """Deterministic (unshuffled) loader over the validation split."""
        return DataLoader(self.val_dataset, batch_size=self.batch_size)

# Usage with Lightning
datamodule = DeepLakeDataModule("./train_dataset", "./val_dataset", batch_size=32)

# Use with Lightning Trainer — the Trainer calls setup() and the
# *_dataloader() hooks automatically:
# trainer = pl.Trainer()
# trainer.fit(model, datamodule)

Install with Tessl CLI

npx tessl i tessl/pypi-deeplake

docs

data-access.md

data-import-export.md

dataset-management.md

error-handling.md

framework-integration.md

index.md

query-system.md

schema-templates.md

storage-system.md

type-system.md

version-control.md

tile.json