Database for AI powered by a storage format optimized for deep-learning applications.
Evaluation — 75%
↑ 1.59x agent success when using this tile
Core functionality for creating, opening, deleting, and copying datasets with support for various storage backends including local filesystem, S3, GCS, and Azure. Deep Lake provides comprehensive lifecycle management for datasets with automatic optimization and multi-cloud capabilities.
Creates new datasets with optional schema specification and credential configuration for various storage backends.
def create(url: str, creds: Optional[Dict[str, str]] = None, token: Optional[str] = None, schema: Optional[Schema] = None) -> Dataset:
"""
Create a new dataset.
Parameters:
- url: Dataset location (local path, S3, GCS, Azure, etc.)
- creds: Storage credentials dictionary
- token: Activeloop authentication token
- schema: Pre-defined schema for the dataset
Returns:
Dataset: New mutable dataset instance
"""
def create_async(url: str, creds: Optional[Dict[str, str]] = None, token: Optional[str] = None, schema: Optional[Schema] = None) -> Future[Dataset]:
"""
Create a new dataset asynchronously.
Parameters:
- url: Dataset location
- creds: Storage credentials dictionary
- token: Activeloop authentication token
- schema: Pre-defined schema for the dataset
Returns:
Future[Dataset]: Future resolving to new dataset instance
"""Opens existing datasets for read-write or read-only access with automatic format detection and optimization.
def open(url: str, creds: Optional[Dict[str, str]] = None, token: Optional[str] = None) -> Dataset:
"""
Open existing dataset for modification.
Parameters:
- url: Dataset location
- creds: Storage credentials dictionary
- token: Activeloop authentication token
Returns:
Dataset: Mutable dataset instance
"""
def open_async(url: str, creds: Optional[Dict[str, str]] = None, token: Optional[str] = None) -> Future[Dataset]:
"""
Open existing dataset asynchronously.
Parameters:
- url: Dataset location
- creds: Storage credentials dictionary
- token: Activeloop authentication token
Returns:
Future[Dataset]: Future resolving to mutable dataset instance
"""
def open_read_only(url: str, creds: Optional[Dict[str, str]] = None, token: Optional[str] = None) -> ReadOnlyDataset:
"""
Open dataset in read-only mode.
Parameters:
- url: Dataset location
- creds: Storage credentials dictionary
- token: Activeloop authentication token
Returns:
ReadOnlyDataset: Read-only dataset instance
"""
def open_read_only_async(url: str, creds: Optional[Dict[str, str]] = None, token: Optional[str] = None) -> Future[ReadOnlyDataset]:
"""
Open dataset in read-only mode asynchronously.
Parameters:
- url: Dataset location
- creds: Storage credentials dictionary
- token: Activeloop authentication token
Returns:
Future[ReadOnlyDataset]: Future resolving to read-only dataset instance
"""Utility functions for dataset existence checking, deletion, copying, and structure replication.
Utility functions for dataset existence checking, deletion, copying, and structure replication.

def exists(url: str, creds: Optional[Dict[str, str]] = None, token: Optional[str] = None) -> bool:
"""
Check if dataset exists at the given location.
Parameters:
- url: Dataset location to check
- creds: Storage credentials dictionary
- token: Activeloop authentication token
Returns:
bool: True if dataset exists, False otherwise
"""
def delete(url: str, creds: Optional[Dict[str, str]] = None, token: Optional[str] = None) -> None:
"""
Delete a dataset permanently.
Parameters:
- url: Dataset location to delete
- creds: Storage credentials dictionary
- token: Activeloop authentication token
"""
def delete_async(url: str, creds: Optional[Dict[str, str]] = None, token: Optional[str] = None) -> FutureVoid:
"""
Delete a dataset permanently (asynchronous).
Parameters:
- url: Dataset location to delete
- creds: Storage credentials dictionary
- token: Activeloop authentication token
Returns:
FutureVoid: Future completing when deletion is done
"""
def copy(src: str, dst: str, src_creds: Optional[Dict[str, str]] = None, dst_creds: Optional[Dict[str, str]] = None, token: Optional[str] = None) -> None:
"""
Copy dataset from source to destination.
Parameters:
- src: Source dataset location
- dst: Destination dataset location
- src_creds: Source storage credentials
- dst_creds: Destination storage credentials
- token: Activeloop authentication token
"""
def like(src: DatasetView, dest: str, creds: Optional[Dict[str, str]] = None, token: Optional[str] = None) -> Dataset:
"""
Create new dataset with same structure as source.
Parameters:
- src: Source dataset view (typically from query results)
- dest: Destination path for new dataset
- creds: Storage credentials dictionary
- token: Activeloop authentication token
Returns:
Dataset: New dataset with same schema as source
"""Functions for connecting datasets to Activeloop cloud services and managing cloud-based dataset operations.
Functions for connecting datasets to Activeloop cloud services and managing cloud-based dataset operations.

def connect(src: str, dest: Optional[str] = None, org_id: Optional[str] = None, creds_key: Optional[str] = None, token: Optional[str] = None) -> Dataset:
"""
Connect dataset to Activeloop cloud services.
Parameters:
- src: Source dataset path
- dest: Destination path (optional)
- org_id: Organization ID
- creds_key: Credentials key for cloud storage
- token: Activeloop authentication token
Returns:
Dataset: Connected dataset instance
"""
def disconnect(url: str, token: Optional[str] = None) -> None:
"""
Disconnect dataset from Activeloop cloud services.
Parameters:
- url: Dataset URL to disconnect
- token: Activeloop authentication token
"""Tools for migrating datasets from Deep Lake v3 to v4 format with data preservation and automatic conversion.
Tools for migrating datasets from Deep Lake v3 to v4 format with data preservation and automatic conversion.

def convert(src: str, dst: str, dst_creds: Optional[Dict[str, str]] = None, token: Optional[str] = None) -> None:
"""
Convert v3 dataset to v4 format.
Parameters:
- src: Source v3 dataset path
- dst: Destination v4 dataset path
- dst_creds: Destination storage credentials
- token: Activeloop authentication token
"""import deeplake
import deeplake

# Create a new dataset locally
dataset = deeplake.create("./my_dataset")
# Add some columns
dataset.add_column("images", deeplake.types.Image())
dataset.add_column("labels", deeplake.types.Text())
# Append data and commit
dataset.append({"images": "image1.jpg", "labels": "cat"})
dataset.commit("Initial data")
# Check if dataset exists
if deeplake.exists("./my_dataset"):
    print("Dataset exists!")
# Open existing dataset
dataset = deeplake.open("./my_dataset")
print(f"Dataset has {len(dataset)} rows")
# Copy dataset to cloud storage
deeplake.copy("./my_dataset", "s3://my-bucket/my_dataset",
dst_creds={"aws_access_key_id": "...", "aws_secret_access_key": "..."})# Create dataset on S3
s3_creds = {
    "aws_access_key_id": "your_access_key",
    "aws_secret_access_key": "your_secret_key"
}
dataset = deeplake.create("s3://my-bucket/my_dataset", creds=s3_creds)
# Create dataset on GCS
gcs_creds = {
    "google_application_credentials": "path/to/credentials.json"
}
dataset = deeplake.create("gcs://my-bucket/my_dataset", creds=gcs_creds)
# Create dataset on Azure
azure_creds = {
    "azure_storage_account": "myaccount",
    "azure_storage_key": "mykey"
}
dataset = deeplake.create("azure://my-container/my_dataset", creds=azure_creds)import asyncio
async def create_multiple_datasets():
    # Create multiple datasets concurrently
    tasks = [
        deeplake.create_async(f"./dataset_{i}")
        for i in range(5)
    ]
    datasets = await asyncio.gather(*tasks)
    return datasets
# Run async operation
datasets = asyncio.run(create_multiple_datasets())

from deeplake.schemas import TextEmbeddings
# Create dataset with predefined schema
schema = TextEmbeddings(embedding_size=768)
dataset = deeplake.create("./embeddings_dataset", schema=schema)
# Schema is automatically applied
print(dataset.schema.columns)  # Shows text and embedding columns

Install with Tessl CLI
npx tessl i tessl/pypi-deeplakedocs