CtrlK
Blog · Docs · Log in · Get started
Tessl Logo

tessl/pypi-deeplake

Database for AI powered by a storage format optimized for deep-learning applications.

75

1.59x

Evaluation: 75%

1.59x

Agent success when using this tile

Overview
Eval results
Files

docs/version-control.md

Version Control

Git-like version control system with branching, tagging, commit history, and merge operations for dataset evolution and collaboration. Deep Lake provides comprehensive versioning capabilities enabling reproducible ML experiments and dataset lineage tracking.

Capabilities

Dataset Versioning

Core version control operations for tracking dataset changes with commit history and rollback capabilities.

class Dataset:
    """Version-control surface of a dataset: commit, rollback, refresh."""

    version: Version     # descriptor of the currently checked-out version
    history: History     # full commit history of the dataset
    current_branch: str  # name of the branch currently checked out

    def commit(self, message: str = "") -> str:
        """
        Record all pending dataset changes as a new version.

        Parameters:
        - message: Human-readable description of the changes

        Returns:
        str: Identifier (hash) of the newly created commit
        """

    def rollback(self, version_id: str) -> None:
        """
        Restore the dataset to an earlier version.

        Parameters:
        - version_id: Identifier of the version to restore
        """

    def refresh(self) -> None:
        """Reload the dataset so it reflects the latest version in storage."""

class Version:
    """Metadata for a single commit in the dataset history."""

    id: str                # unique version identifier
    message: str           # commit message
    timestamp: str         # creation time (presumably server-side — confirm)
    client_timestamp: str  # creation time as recorded by the client

    def open(self) -> ReadOnlyDataset:
        """
        Materialize this version as a read-only dataset.

        Returns:
        ReadOnlyDataset: The dataset frozen at this version
        """

    def open_async(self) -> Future[ReadOnlyDataset]:
        """
        Begin opening this version without blocking.

        Returns:
        Future[ReadOnlyDataset]: Resolves to the dataset at this version
        """

class History:
    """Sequence-like view over a dataset's version history."""

    def __getitem__(self, key: Union[int, str]) -> Version:
        """
        Look up a version by position or by its identifier.

        Parameters:
        - key: Integer index into the history, or a version ID string

        Returns:
        Version: The matching version
        """

    def __iter__(self) -> Iterator[Version]:
        """Yield every version, oldest first (chronological order)."""

    def __len__(self) -> int:
        """Return how many versions exist."""

Branch Management

Create and manage dataset branches for parallel development and experimentation.

class Dataset:
    """Branching operations on a dataset."""

    branches: Branches  # collection of all branches of this dataset

    def branch(self, name: str) -> Branch:
        """
        Fork a new branch off the current state.

        Parameters:
        - name: Name for the new branch

        Returns:
        Branch: Handle to the freshly created branch
        """

    def merge(self, branch_name: str, message: str = "") -> None:
        """
        Fold another branch's changes into the current branch.

        Parameters:
        - branch_name: Name of the branch whose changes are merged in
        - message: Message recorded on the merge commit
        """

class Branch:
    """Writable handle to a single dataset branch."""

    id: str         # unique branch identifier
    name: str       # branch name
    timestamp: str  # branch creation time
    base: str       # version the branch was forked from

    def open(self) -> Dataset:
        """
        Check out this branch with write access.

        Returns:
        Dataset: Mutable dataset positioned on this branch
        """

    def open_async(self) -> Future[Dataset]:
        """
        Check out this branch without blocking.

        Returns:
        Future[Dataset]: Resolves to a mutable dataset on this branch
        """

    def delete(self) -> None:
        """Remove this branch. The main branch cannot be deleted."""

    def rename(self, new_name: str) -> None:
        """
        Give this branch a new name.

        Parameters:
        - new_name: Replacement branch name
        """

class BranchView:
    """Immutable view of a branch's metadata."""

    id: str         # unique branch identifier
    name: str       # branch name
    timestamp: str  # branch creation time
    base: str       # version the branch was forked from

    def open(self) -> ReadOnlyDataset:
        """Check out this branch without write access."""

    def open_async(self) -> Future[ReadOnlyDataset]:
        """Check out this branch without blocking."""

class Branches:
    """Mutable mapping-like collection of a dataset's branches."""

    def names(self) -> List[str]:
        """
        List every branch name.

        Returns:
        List[str]: Names of all branches
        """

    def __len__(self) -> int:
        """Return how many branches exist."""

    def __getitem__(self, name: str) -> Branch:
        """
        Look up a branch by its name.

        Parameters:
        - name: Branch name

        Returns:
        Branch: The matching branch
        """

class BranchesView:
    """Read-only counterpart of Branches."""

    def names(self) -> List[str]:
        """List every branch name."""

    def __len__(self) -> int:
        """Return how many branches exist."""

    def __getitem__(self, name: str) -> BranchView:
        """Look up a branch by its name."""

Tag Management

Create and manage dataset tags for marking important versions and milestones.

class Dataset:
    """Tagging operations on a dataset."""

    tags: Tags  # collection of all tags of this dataset

    def tag(self, name: str, message: str = "") -> Tag:
        """
        Attach a named tag to the current version.

        Parameters:
        - name: Tag name
        - message: Optional description attached to the tag

        Returns:
        Tag: Handle to the newly created tag
        """

class DatasetView:
    """Tagging operations available on a query-result view."""

    def tag(self, name: str, message: str = "") -> TagView:
        """Persist this query-result view under a named tag."""

class Tag:
    """Writable handle to a single dataset tag."""

    id: str         # unique tag identifier
    name: str       # tag name
    message: str    # tag description
    version: str    # version the tag points at
    timestamp: str  # tag creation time

    def open(self) -> ReadOnlyDataset:
        """
        Materialize the dataset as it was at the tagged version.

        Returns:
        ReadOnlyDataset: Dataset frozen at the tagged version
        """

    def open_async(self) -> Future[ReadOnlyDataset]:
        """
        Open the tagged version without blocking.

        Returns:
        Future[ReadOnlyDataset]: Resolves to the dataset at the tagged version
        """

    def delete(self) -> None:
        """Remove this tag."""

    def rename(self, new_name: str) -> None:
        """
        Give this tag a new name.

        Parameters:
        - new_name: Replacement tag name
        """

class TagView:
    """Immutable view of a tag's metadata."""

    id: str         # unique tag identifier
    name: str       # tag name
    message: str    # tag description
    version: str    # version the tag points at
    timestamp: str  # tag creation time

    def open(self) -> ReadOnlyDataset:
        """Materialize the dataset as it was at the tagged version."""

    def open_async(self) -> Future[ReadOnlyDataset]:
        """Open the tagged version without blocking."""

class Tags:
    """Mutable mapping-like collection of a dataset's tags."""

    def names(self) -> List[str]:
        """
        List every tag name.

        Returns:
        List[str]: Names of all tags
        """

    def __len__(self) -> int:
        """Return how many tags exist."""

    def __getitem__(self, name: str) -> Tag:
        """
        Look up a tag by its name.

        Parameters:
        - name: Tag name

        Returns:
        Tag: The matching tag
        """

class TagsView:
    """Read-only counterpart of Tags."""

    def names(self) -> List[str]:
        """List every tag name."""

    def __len__(self) -> int:
        """Return how many tags exist."""

    def __getitem__(self, name: str) -> TagView:
        """Look up a tag by its name."""

Remote Synchronization

Push and pull operations for synchronizing dataset versions with remote storage.

class Dataset:
    """Remote-synchronization operations on a mutable dataset."""

    def push(self) -> None:
        """Upload local changes to the remote copy of the dataset."""

    def pull(self) -> None:
        """Download remote changes into the local copy of the dataset."""

class ReadOnlyDataset:
    """Remote-synchronization operations available on read-only datasets."""

    def push(self) -> None:
        """Push dataset state to the remote; only metadata is transferred."""

    def refresh(self) -> None:
        """Reload the dataset from remote storage."""

Usage Examples

Basic Version Control

import deeplake

# Open dataset and make changes
dataset = deeplake.open("./my_dataset")

# Add some data (one row: an image reference and its label)
dataset.append({"images": "new_image.jpg", "labels": "cat"})

# Commit changes; commit() returns the new version's ID/hash
commit_id = dataset.commit("Added new cat image")
print(f"Committed changes: {commit_id}")

# View commit history (iteration is chronological, oldest first)
for version in dataset.history:
    print(f"Version {version.id}: {version.message} ({version.timestamp})")

# Get current version info from the dataset's version attribute
current_version = dataset.version
print(f"Current version: {current_version.id}")
print(f"Commit message: {current_version.message}")

Branch Operations

# Create new branch for experiments (forked from the current state)
experiment_branch = dataset.branch("feature_experiment")
print(f"Created branch: {experiment_branch.name}")

# List all branches
print("Available branches:")
for branch_name in dataset.branches.names():
    branch = dataset.branches[branch_name]
    print(f"  {branch.name} (created: {branch.timestamp})")

# Switch to experiment branch by opening it as a mutable dataset
experiment_dataset = experiment_branch.open()

# Make experimental changes
experiment_dataset.add_column("confidence", deeplake.types.Float32())
experiment_dataset.append({
    "images": "experiment_image.jpg", 
    "labels": "experimental_label",
    "confidence": 0.95
})

# Commit experimental changes
experiment_dataset.commit("Added confidence scores for experimentation")

# Switch back to main branch
main_dataset = dataset.branches["main"].open()

# Merge experimental branch into main (merge() takes the branch name, not the object)
main_dataset.merge("feature_experiment", "Merged confidence score feature")

# Clean up: delete experimental branch
experiment_branch.delete()

Tag Management

# Create tags for important milestones
v1_tag = dataset.tag("v1.0", "Initial production dataset")
print(f"Created tag: {v1_tag.name}")

# Add more data and create another tag
dataset.extend([
    {"images": f"batch_image_{i}.jpg", "labels": f"label_{i}"} 
    for i in range(100)
])
dataset.commit("Added batch of 100 images")

v1_1_tag = dataset.tag("v1.1", "Added training batch")

# List all tags
print("Available tags:")
for tag_name in dataset.tags.names():
    tag = dataset.tags[tag_name]
    print(f"  {tag.name}: {tag.message} (version: {tag.version})")

# Open dataset at specific tag (a read-only snapshot of the tagged version)
v1_dataset = v1_tag.open()
print(f"Dataset at v1.0 has {len(v1_dataset)} rows")

# Compare with current version
print(f"Current dataset has {len(dataset)} rows")

Version History and Rollback

# Examine version history
print(f"Dataset has {len(dataset.history)} versions")

# Get specific version
latest_version = dataset.history[-1]  # Most recent (assumes negative indexing is supported — TODO confirm)
first_version = dataset.history[0]    # First version

print(f"Latest: {latest_version.message}")
print(f"First: {first_version.message}")

# Open dataset at specific version (read-only snapshot)
historical_dataset = first_version.open()
print(f"First version had {len(historical_dataset)} rows")

# Rollback to previous version if needed; rollback() takes a version ID string
if len(dataset.history) > 1:
    previous_version = dataset.history[-2]
    dataset.rollback(previous_version.id)
    print(f"Rolled back to: {previous_version.message}")

Remote Synchronization

# Push local changes to remote storage
dataset.push()
print("Pushed local changes to remote")

# Pull remote changes (in another location/process)
remote_dataset = deeplake.open("s3://my-bucket/shared_dataset")
remote_dataset.pull()
print("Pulled latest changes from remote")

# Refresh to get latest version without pulling changes
# (refresh() reloads the dataset to the latest version in storage)
remote_dataset.refresh()
print("Refreshed dataset metadata from remote")

Collaborative Workflows

# Typical collaborative workflow: branch -> commit -> push -> pull -> merge -> tag

# Developer A: Create feature branch
dataset_a = deeplake.open("s3://shared-bucket/project_dataset")
feature_branch = dataset_a.branch("add_validation_data")
feature_dataset = feature_branch.open()

# Add validation data
validation_data = [
    {"images": f"val_image_{i}.jpg", "labels": f"val_label_{i}"} 
    for i in range(500)
]
feature_dataset.extend(validation_data)
feature_dataset.commit("Added validation dataset")

# Push branch to remote
feature_dataset.push()

# Developer B: Pull and review changes
dataset_b = deeplake.open("s3://shared-bucket/project_dataset")
dataset_b.pull()

# Review feature branch
feature_branch_b = dataset_b.branches["add_validation_data"]
feature_data_b = feature_branch_b.open()
print(f"Feature branch has {len(feature_data_b)} total rows")

# Merge into main after review
main_dataset = dataset_b.branches["main"].open()
main_dataset.merge("add_validation_data", "Merged validation data from feature branch")
main_dataset.push()

# Tag the release
release_tag = main_dataset.tag("v2.0", "Added validation dataset - ready for training")

Advanced Version Control

# Complex branching scenario: several feature branches merged back into main.
dataset = deeplake.open("./complex_dataset")

# Create multiple feature branches
data_cleaning_branch = dataset.branch("data_cleaning")
augmentation_branch = dataset.branch("data_augmentation")
labeling_branch = dataset.branch("relabeling")

# Work on data cleaning
cleaning_dataset = data_cleaning_branch.open()
# ... perform data cleaning operations
cleaning_dataset.commit("Cleaned corrupted entries")

# Work on augmentation
aug_dataset = augmentation_branch.open()
# ... add augmented data
aug_dataset.commit("Added augmented training examples")

# Merge branches sequentially
main_dataset = dataset.branches["main"].open()

# Merge data cleaning first
main_dataset.merge("data_cleaning", "Merged data cleaning improvements")

# Merge augmentation
main_dataset.merge("data_augmentation", "Merged data augmentation")

# Create milestone tag
milestone_tag = main_dataset.tag("preprocessing_complete", "Completed data preprocessing pipeline")

# Clean up feature branches — including the unused relabeling branch,
# which would otherwise be left dangling on the dataset.
data_cleaning_branch.delete()
augmentation_branch.delete()
labeling_branch.delete()

print(f"Completed preprocessing. Dataset now has {len(main_dataset)} rows")

Version-based Experiment Tracking

# Track ML experiments with versions
dataset = deeplake.open("./experiment_dataset")

# Create experiment tracking
experiment_results = []

for experiment_id in range(5):
    # Create experiment branch
    exp_branch = dataset.branch(f"experiment_{experiment_id}")
    exp_dataset = exp_branch.open()

    # Apply different preprocessing
    # ... experiment-specific data modifications

    exp_dataset.commit(f"Applied preprocessing for experiment {experiment_id}")

    # Tag experiment version (tags reference a version, so presumably
    # they survive deleting the branch below — TODO confirm)
    exp_tag = exp_dataset.tag(f"exp_{experiment_id}_data", f"Data for experiment {experiment_id}")

    # Record experiment info
    experiment_results.append({
        "experiment_id": experiment_id,
        "branch": exp_branch.name,
        "tag": exp_tag.name,
        "data_version": exp_dataset.version.id,
        "num_samples": len(exp_dataset)
    })

    # Clean up branch after tagging
    exp_branch.delete()

# Review all experiments
print("Experiment Summary:")
for result in experiment_results:
    print(f"Experiment {result['experiment_id']}: {result['num_samples']} samples, tagged as {result['tag']}")

Install with Tessl CLI

npx tessl i tessl/pypi-deeplake

docs

data-access.md

data-import-export.md

dataset-management.md

error-handling.md

framework-integration.md

index.md

query-system.md

schema-templates.md

storage-system.md

type-system.md

version-control.md

tile.json