Database for AI powered by a storage format optimized for deep-learning applications.
```bash
npx @tessl/cli install tessl/pypi-deeplake@4.3.0
```

Deep Lake is a database for AI powered by a storage format optimized for deep-learning applications. It provides comprehensive dataset management, querying capabilities, and seamless integration with popular ML frameworks, enabling both data storage/retrieval for LLM applications and dataset management for deep-learning model training.
```bash
pip install deeplake
```

```python
import deeplake
```

Common type imports:
```python
from deeplake import types
from deeplake.types import Image, Text, Embedding, Array
```

Schema template imports:
```python
from deeplake.schemas import TextEmbeddings, COCOImages
```

Basic usage:

```python
import deeplake
# Create a new dataset
dataset = deeplake.create("./my_dataset")
# Add columns with types
dataset.add_column("images", deeplake.types.Image())
dataset.add_column("labels", deeplake.types.Text())
dataset.add_column("embeddings", deeplake.types.Embedding(size=768))
# Append data
dataset.append({
    "images": "path/to/image.jpg",
    "labels": "cat",
    "embeddings": [0.1, 0.2, 0.3, ...]  # 768-dimensional vector
})
# Commit changes
dataset.commit("Added initial data")
# Query data using TQL (Tensor Query Language)
results = deeplake.query("SELECT * FROM dataset WHERE labels == 'cat'")
for row in results:
    print(row["labels"].text())
# Open existing dataset
dataset = deeplake.open("./my_dataset")
print(f"Dataset has {len(dataset)} rows")
# Framework integration
pytorch_dataloader = dataset.pytorch(transform=my_transform)
tensorflow_dataset = dataset.tensorflow()
```

Deep Lake's architecture centers on datasets as the primary abstraction. This design lets Deep Lake handle data of any size in a serverless manner while exposing a single, unified API, and it supports all data types (embeddings, audio, text, video, images, PDFs, annotations) together with data versioning and lineage.
Core functionality for creating, opening, deleting, and copying datasets with support for various storage backends and comprehensive lifecycle management.
```python
def create(url: str, creds: Optional[Dict[str, str]] = None, token: Optional[str] = None, schema: Optional[Schema] = None) -> Dataset: ...
def open(url: str, creds: Optional[Dict[str, str]] = None, token: Optional[str] = None) -> Dataset: ...
def open_read_only(url: str, creds: Optional[Dict[str, str]] = None, token: Optional[str] = None) -> ReadOnlyDataset: ...
def delete(url: str, creds: Optional[Dict[str, str]] = None, token: Optional[str] = None) -> None: ...
def exists(url: str, creds: Optional[Dict[str, str]] = None, token: Optional[str] = None) -> bool: ...
def copy(src: str, dst: str, src_creds: Optional[Dict[str, str]] = None, dst_creds: Optional[Dict[str, str]] = None, token: Optional[str] = None) -> None: ...
```
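A minimal lifecycle sketch built only from the calls above; the local path, bucket URL, and credential keys are illustrative placeholders:

```python
import deeplake

# Open the dataset if it already exists, otherwise create it
if deeplake.exists("./my_dataset"):
    ds = deeplake.open("./my_dataset")
else:
    ds = deeplake.create("./my_dataset")

# Copy it to object storage (bucket URL and credential keys are placeholders)
deeplake.copy(
    "./my_dataset",
    "s3://my-bucket/my_dataset",
    dst_creds={"aws_access_key_id": "KEY", "aws_secret_access_key": "SECRET"},
)

# Remove the local copy once the remote copy is verified
deeplake.delete("./my_dataset")
```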
Row and column-based data access patterns with comprehensive indexing, slicing, and batch operations for efficient data manipulation.

```python
class Dataset:
    def __getitem__(self, key: Union[int, slice, str]) -> Union[Row, RowRange, Column]: ...
    def append(self, data: Dict[str, Any]) -> None: ...
    def add_column(self, name: str, dtype: Type) -> None: ...
    def remove_column(self, name: str) -> None: ...

class Column:
    def __getitem__(self, key: Union[int, slice, List[int]]) -> Any: ...
    def __setitem__(self, key: Union[int, slice, List[int]], value: Any) -> None: ...
```
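For example, assuming the dataset from the quick-start above, rows and columns can be read and updated like this (a sketch, not an exhaustive list of access patterns):

```python
import deeplake

ds = deeplake.open("./my_dataset")

row = ds[0]            # single row by index
view = ds[0:10]        # a range of rows
labels = ds["labels"]  # a Column object by name

first = labels[0]           # single value
subset = labels[[0, 1, 2]]  # fancy indexing with a list of row indices
labels[0] = "dog"           # in-place update of one cell
```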
TQL (Tensor Query Language) for complex data filtering, aggregation, and transformation with SQL-like syntax optimized for tensor operations.

```python
def query(query: str, token: Optional[str] = None, creds: Optional[Dict[str, str]] = None) -> DatasetView: ...
def prepare_query(query: str, token: Optional[str] = None, creds: Optional[Dict[str, str]] = None) -> Executor: ...
def explain_query(query: str, token: Optional[str] = None, creds: Optional[Dict[str, str]] = None) -> ExplainQueryResult: ...

class Executor:
    def run_single(self, parameters: Dict[str, Any]) -> DatasetView: ...
    def run_batch(self, parameters: List[Dict[str, Any]]) -> List[DatasetView]: ...
```
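A sketch of ad-hoc querying; the query string mirrors the quick-start example, and how the FROM clause resolves to a dataset is an assumption here:

```python
import deeplake

tql = "SELECT * FROM dataset WHERE labels == 'cat'"

# Inspect the execution plan before running the query
print(deeplake.explain_query(tql))

# Run it and iterate over the resulting DatasetView
results = deeplake.query(tql)
for row in results:
    print(row["labels"])
```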
Rich type hierarchy supporting all ML data types including images, embeddings, audio, video, geometric data, and custom structures with compression and indexing options.

```python
class Image:
    def __init__(self, dtype: str = "uint8", sample_compression: str = "png"): ...

class Embedding:
    def __init__(self, size: Optional[int] = None, dtype: str = "float32", index_type: Optional[IndexType] = None): ...

class Text:
    def __init__(self, index_type: Optional[TextIndexType] = None): ...

class Array:
    def __init__(self, dtype: DataType, dimensions: Optional[int] = None, shape: Optional[List[int]] = None): ...
```
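A sketch of declaring a typed schema with these constructors; whether `Array` accepts a plain dtype string (rather than a `DataType` object) is an assumption:

```python
import deeplake

ds = deeplake.create("./typed_dataset")

ds.add_column("images", deeplake.types.Image())
ds.add_column("captions", deeplake.types.Text())
ds.add_column("embeddings", deeplake.types.Embedding(size=768))
ds.add_column("boxes", deeplake.types.Array(dtype="float32", shape=[4]))  # dtype string assumed
ds.commit("Defined schema")
```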
Git-like version control with branching, tagging, commit history, and merge operations for dataset evolution and collaboration.

```python
class Dataset:
    def commit(self, message: str = "") -> str: ...
    def branch(self, name: str) -> Branch: ...
    def tag(self, name: str, message: str = "") -> Tag: ...
    def push(self) -> None: ...
    def pull(self) -> None: ...

class Branch:
    def open(self) -> Dataset: ...
    def delete(self) -> None: ...
    def rename(self, new_name: str) -> None: ...
```
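A sketch of the commit/branch/tag flow, assuming the quick-start dataset; the branch name and messages are illustrative:

```python
import deeplake

ds = deeplake.open("./my_dataset")

commit_id = ds.commit("Cleaned labels")
ds.tag("v1.0", "First curated release")

# Branch off for experimental changes, then work on the branch copy
experiments = ds.branch("experiments")
exp_ds = experiments.open()
# ... modify exp_ds ...
exp_ds.commit("Experimental relabelling")
```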
Multi-cloud storage abstraction supporting local filesystem, S3, GCS, and Azure with built-in compression, encryption, and performance optimization.

```python
class Reader:
    def get(self, path: str) -> bytes: ...
    def list(self, path: str = "") -> List[str]: ...
    def subdir(self, path: str) -> Reader: ...

class Writer:
    def set(self, path: str, data: bytes) -> None: ...
    def remove(self, path: str) -> None: ...
    def subdir(self, path: str) -> Writer: ...
```
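A hedged sketch of consuming this abstraction: the helper below only assumes a `Reader` instance obtained elsewhere (how one is constructed is not shown in this section, so no type annotation is placed on the parameter):

```python
from typing import List

def collect_json_blobs(reader) -> List[bytes]:
    """Fetch every .json object visible to a Deep Lake storage Reader."""
    blobs = []
    for path in reader.list():
        if path.endswith(".json"):
            blobs.append(reader.get(path))
    return blobs
```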
Comprehensive data import/export capabilities supporting various formats including Parquet, CSV, COCO datasets, and custom data ingestion pipelines.

```python
def from_parquet(url_or_bytes: Union[str, bytes]) -> ReadOnlyDataset: ...
def from_csv(url_or_bytes: Union[str, bytes]) -> ReadOnlyDataset: ...
def from_coco(images_directory: str, annotation_files: List[str], dest: str, dest_creds: Optional[Dict[str, str]] = None) -> Dataset: ...

class DatasetView:
    def to_csv(self, path: str) -> None: ...
```
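Illustrative ingestion calls built from the stubs above; all file paths are placeholders:

```python
import deeplake

# Load tabular data as a read-only dataset
table = deeplake.from_parquet("./samples.parquet")
print(len(table))

# Convert a COCO-style dataset into a Deep Lake dataset
coco_ds = deeplake.from_coco(
    images_directory="./coco/images",
    annotation_files=["./coco/annotations/instances_train.json"],
    dest="./coco_deeplake",
)
```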
Seamless integration with PyTorch and TensorFlow for training and inference workflows with optimized data loading and transformation pipelines.

```python
class DatasetView:
    def pytorch(self, transform: Optional[Callable[[Any], Any]] = None) -> Any: ...
    def tensorflow(self) -> Any: ...
    def batches(self, batch_size: int = 1) -> Iterator[Dict[str, Any]]: ...
```
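A sketch of moving data into training loops; `identity` is a placeholder transform, and treating each batch as a dict keyed by column name follows the `batches` signature above:

```python
import deeplake

def identity(sample):
    # Placeholder per-sample transform; replace with real preprocessing
    return sample

ds = deeplake.open("./my_dataset")

# Framework-agnostic batching
for batch in ds.batches(batch_size=32):
    print(len(batch["labels"]))

# Hand-off to PyTorch / TensorFlow pipelines
torch_data = ds.pytorch(transform=identity)
tf_data = ds.tensorflow()
```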
Comprehensive exception handling for various failure scenarios including authentication, authorization, storage, dataset operations, and data validation with detailed error information for debugging and recovery.

```python
class AuthenticationError:
    """Authentication failed or credentials invalid."""

class AuthorizationError:
    """User lacks permissions for requested operation."""

class NotFoundError:
    """Requested dataset or resource not found."""

class StorageAccessDenied:
    """Access denied to storage location."""

class BranchExistsError:
    """Branch with given name already exists."""

class ColumnAlreadyExistsError:
    """Column with given name already exists."""
```
Pre-defined schema templates for common ML use cases including text embeddings, COCO datasets, and custom schema creation patterns.

```python
class TextEmbeddings:
    def __init__(self, embedding_size: int, quantize: bool = False): ...

class COCOImages:
    def __init__(self, embedding_size: int, quantize: bool = False, objects: bool = True, keypoints: bool = False, stuffs: bool = False): ...
```
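A sketch of bootstrapping a dataset from a template, assuming the template object is accepted by the `schema` parameter of `deeplake.create`; the path is a placeholder:

```python
import deeplake
from deeplake.schemas import TextEmbeddings

# Pre-configured columns for a text + embedding (RAG-style) store
ds = deeplake.create("./rag_store", schema=TextEmbeddings(embedding_size=768, quantize=True))
```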
Client management, telemetry, and configuration utilities for Deep Lake integration and monitoring.

```python
class Client:
    """Deep Lake client for dataset operations and authentication."""

class TelemetryClient:
    """Telemetry client for usage tracking and analytics."""

def client() -> Client:
    """Get current Deep Lake client instance."""

def telemetry_client() -> TelemetryClient:
    """Get current telemetry client instance."""

def disconnect() -> None:
    """Disconnect from Deep Lake services."""
```
Utility functions and helper classes for data generation, caching, and system optimization.

```python
class Random:
    """Random data generation utilities."""

def random() -> Random:
    """Get random data generator instance."""

def _create_global_cache() -> None:
    """Create global cache for performance optimization."""

def __prepare_atfork() -> None:
    """Prepare Deep Lake for fork-based multiprocessing."""
```
"""Primary mutable dataset class for read-write operations."""
name: str
description: str
metadata: Metadata
schema: Schema
version: Version
history: History
branches: Branches
tags: Tags
class ReadOnlyDataset:
"""Read-only dataset access."""
name: str
description: str
metadata: ReadOnlyMetadata
schema: SchemaView
version: Version
history: History
branches: BranchesView
tags: TagsView
class DatasetView:
"""Query result view of dataset."""
schema: SchemaViewclass Schema:
"""Dataset schema management."""
columns: List[ColumnDefinition]
class ColumnDefinition:
"""Column schema information."""
name: str
dtype: Typeclass Version:
"""Single version information."""
id: str
message: str
timestamp: str
client_timestamp: str
class Branch:
"""Dataset branch management."""
id: str
name: str
timestamp: str
base: str
class Tag:
"""Dataset tag management."""
id: str
name: str
message: str
version: str
timestamp: strclass Future[T]:
"""Asynchronous operation result."""
def result(self) -> T: ...
def is_completed(self) -> bool: ...
def cancel(self) -> bool: ...
class FutureVoid:
"""Asynchronous void operation."""
def wait(self) -> None: ...
def is_completed(self) -> bool: ...
def cancel(self) -> bool: ...
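For example, the attributes listed above can be used to inspect a dataset after opening it (a sketch; output formatting is illustrative):

```python
import deeplake

ds = deeplake.open("./my_dataset")

print(ds.name, ds.description)

# Walk the schema column definitions
for col in ds.schema.columns:
    print(col.name, col.dtype)

# Current version information
print(ds.version.id, ds.version.message, ds.version.timestamp)
```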