CtrlK
Blog | Docs | Log in | Get started
Tessl Logo

tessl/pypi-deeplake

Database for AI powered by a storage format optimized for deep-learning applications.

75

1.59x

Evaluation: 75%

1.59x

Agent success when using this tile

Overview
Eval results
Files

docs/data-access.md

Data Access and Manipulation

Comprehensive row- and column-based data access patterns with support for indexing, slicing, batch operations, and efficient data manipulation. Deep Lake provides both mutable and read-only access patterns optimized for ML workflows.

Capabilities

Dataset Access Patterns

Dataset objects provide dictionary-like and array-like access to data with automatic type handling and optimization.

class Dataset:
    """
    Primary mutable dataset class.

    Supports row access by integer index, row-range access by slice and
    column access by name, plus appending rows and adding, removing and
    renaming columns.
    """
    
    def __getitem__(self, key: Union[int, slice, str]) -> Union[Row, RowRange, Column]:
        """
        Access dataset elements by index or name.
        
        Parameters:
        - key: Row index (int), row range (slice), or column name (str)
        
        Returns:
        - Row: Single row access (when key is int)
        - RowRange: Multiple row access (when key is slice)
        - Column: Column access (when key is str)
        """
    
    def __len__(self) -> int:
        """
        Get number of rows in dataset.

        Returns:
        - int: Row count
        """
    
    def append(self, data: Dict[str, Any]) -> None:
        """
        Append new row to dataset.
        
        Parameters:
        - data: Dictionary mapping column names to values
        """
    
    def extend(self, data: List[Dict[str, Any]]) -> None:
        """
        Append multiple rows to dataset.
        
        Parameters:
        - data: List of dictionaries mapping column names to values
          (one dictionary per row)
        """
    
    def add_column(self, name: str, dtype: Type) -> None:
        """
        Add new column to dataset.
        
        Parameters:
        - name: Column name
        - dtype: Column data type (the usage examples pass
          deeplake.types.Float32())
        """
    
    def remove_column(self, name: str) -> None:
        """
        Remove column from dataset.
        
        Parameters:
        - name: Column name to remove
        """
    
    def rename_column(self, old_name: str, new_name: str) -> None:
        """
        Rename existing column.
        
        Parameters:
        - old_name: Current column name
        - new_name: New column name
        """

class ReadOnlyDataset:
    """
    Read-only dataset access.

    Exposes only element access and length; no mutating methods are
    declared on this class.
    """
    
    def __getitem__(self, key: Union[int, slice, str]) -> Union[RowView, RowRangeView, ColumnView]:
        """
        Access dataset elements (read-only).

        Parameters:
        - key: int, slice, or str (presumably row index, row range, or
          column name, as for Dataset.__getitem__ -- confirm)

        Returns:
        - RowView, RowRangeView, or ColumnView: Read-only view of the
          selected element(s)
        """
    
    def __len__(self) -> int:
        """
        Get number of rows in dataset.

        Returns:
        - int: Row count
        """

class DatasetView:
    """
    Query result view of dataset.

    Declares element access, length and a textual summary; no mutating
    methods are declared on this class.
    """
    
    def __getitem__(self, key: Union[int, slice, str]) -> Union[RowView, RowRangeView, ColumnView]:
        """
        Access query result elements.

        Parameters:
        - key: int, slice, or str (presumably row index, row range, or
          column name, as for Dataset.__getitem__ -- confirm)

        Returns:
        - RowView, RowRangeView, or ColumnView: Read-only view of the
          selected element(s)
        """
    
    def __len__(self) -> int:
        """
        Get number of rows in view.

        Returns:
        - int: Row count
        """
    
    def summary(self) -> str:
        """
        Get summary statistics of the dataset view.

        Returns:
        - str: Summary text
        """

Column Access and Manipulation

Column objects provide typed access to homogeneous data with support for indexing, slicing, and batch operations.

class Column:
    """
    Mutable column access.

    Values are read and written by row index, slice, or list of indices;
    single values can also be read or written asynchronously or fetched
    as raw bytes.
    """
    
    # Column name
    name: str
    # Mutable key-value metadata attached to this column
    metadata: Metadata
    # Indexes currently defined on this column
    indexes: List[str]
    
    def __getitem__(self, key: Union[int, slice, List[int]]) -> Any:
        """
        Get column values by index.
        
        Parameters:
        - key: Row index (int), slice, or list of indices
        
        Returns:
        - Any: Single value or list of values
        """
    
    def __setitem__(self, key: Union[int, slice, List[int]], value: Any) -> None:
        """
        Set column values by index.
        
        Parameters:
        - key: Row index (int), slice, or list of indices
        - value: Value(s) to set
        """
    
    def __len__(self) -> int:
        """
        Get number of elements in column.

        Returns:
        - int: Element count
        """
    
    def create_index(self, type: IndexType) -> None:
        """
        Create index on column for query optimization.
        
        Parameters:
        - type: Index type specification (the usage examples pass
          deeplake.types.TextIndex(...) and deeplake.types.EmbeddingIndex(...))
        """
    
    def drop_index(self, name: str) -> None:
        """
        Drop existing index.
        
        Parameters:
        - name: Index name to drop
        """
    
    def get_async(self, index: int) -> Future[Any]:
        """
        Get column value asynchronously.
        
        Parameters:
        - index: Row index
        
        Returns:
        - Future[Any]: Future resolving to column value
        """
    
    def set_async(self, index: int, value: Any) -> FutureVoid:
        """
        Set column value asynchronously.
        
        Parameters:
        - index: Row index
        - value: Value to set
        
        Returns:
        - FutureVoid: Future completing when set operation is done
        """
    
    def get_bytes(self, index: int) -> bytes:
        """
        Get raw bytes representation of column value.
        
        Parameters:
        - index: Row index
        
        Returns:
        - bytes: Raw bytes data
        """
    
    def get_bytes_async(self, index: int) -> Future[bytes]:
        """
        Get raw bytes representation asynchronously.
        
        Parameters:
        - index: Row index
        
        Returns:
        - Future[bytes]: Future resolving to raw bytes data
        """

class ColumnView:
    """
    Read-only column access.

    Declares the read-side methods of Column (indexing, length, async
    and raw-bytes reads); no setters or index management are declared.
    """
    
    # Column name
    name: str
    # Read-only key-value metadata attached to this column
    metadata: ReadOnlyMetadata
    # Indexes currently defined on this column
    indexes: List[str]
    
    def __getitem__(self, key: Union[int, slice, List[int]]) -> Any:
        """
        Get column values by index (read-only).

        Parameters:
        - key: Row index (int), slice, or list of indices

        Returns:
        - Any: Selected value(s)
        """
    
    def __len__(self) -> int:
        """
        Get number of elements in column.

        Returns:
        - int: Element count
        """
    
    def get_async(self, index: int) -> Future[Any]:
        """
        Get column value asynchronously.

        Parameters:
        - index: Row index

        Returns:
        - Future[Any]: Future resolving to column value
        """
    
    def get_bytes(self, index: int) -> bytes:
        """
        Get raw bytes representation of column value.

        Parameters:
        - index: Row index

        Returns:
        - bytes: Raw bytes data
        """
    
    def get_bytes_async(self, index: int) -> Future[bytes]:
        """
        Get raw bytes representation asynchronously.

        Parameters:
        - index: Row index

        Returns:
        - Future[bytes]: Future resolving to raw bytes data
        """
    
    def _links_info(self) -> Dict[str, Any]:
        """
        Get link information for linked columns.

        Non-public by naming convention (leading underscore).

        Returns:
        - Dict[str, Any]: Link information; the keys are not specified here
        """

class ColumnDefinition:
    """
    Mutable column definition.

    Describes a column by name and data type and allows dropping or
    renaming it.
    """
    
    # Column name
    name: str
    # Column data type
    dtype: Type
    
    def drop(self) -> None:
        """Drop this column from dataset."""
    
    def rename(self, new_name: str) -> None:
        """
        Rename this column.

        Parameters:
        - new_name: New column name
        """

class ColumnDefinitionView:
    """
    Read-only column definition.

    Carries the column name and data type only; no methods are declared.
    """
    
    # Column name
    name: str
    # Column data type
    dtype: Type

Row Access and Manipulation

Row objects provide dictionary-like access to individual records with type-aware value handling.

class Row:
    """
    Mutable single row access.

    Values are read and written by column name; a row can also be
    converted to a dictionary, and single values can be accessed
    asynchronously or as raw bytes.
    """
    
    # Integer identifier of this row
    row_id: int
    
    def __getitem__(self, column_name: str) -> Any:
        """
        Get value from specific column.
        
        Parameters:
        - column_name: Column name
        
        Returns:
        - Any: Column value for this row
        """
    
    def __setitem__(self, column_name: str, value: Any) -> None:
        """
        Set value in specific column.
        
        Parameters:
        - column_name: Column name
        - value: Value to set
        """
    
    def to_dict(self) -> Dict[str, Any]:
        """
        Convert row to dictionary.
        
        Returns:
        - Dict[str, Any]: Dictionary mapping column names to values
        """
    
    def get_async(self, column_name: str) -> Future[Any]:
        """
        Get column value asynchronously.

        Parameters:
        - column_name: Column name

        Returns:
        - Future[Any]: Future resolving to column value
        """
    
    def set_async(self, column_name: str, value: Any) -> FutureVoid:
        """
        Set column value asynchronously.

        Parameters:
        - column_name: Column name
        - value: Value to set

        Returns:
        - FutureVoid: Future for the set operation
        """
    
    def get_bytes(self, column_name: str) -> bytes:
        """
        Get raw bytes representation of column value.

        Parameters:
        - column_name: Column name

        Returns:
        - bytes: Raw bytes data
        """
    
    def get_bytes_async(self, column_name: str) -> Future[bytes]:
        """
        Get raw bytes representation asynchronously.

        Parameters:
        - column_name: Column name

        Returns:
        - Future[bytes]: Future resolving to raw bytes data
        """

class RowView:
    """
    Read-only single row access.

    Declares the read-side methods of Row; no setters are declared.
    """
    
    # Integer identifier of this row
    row_id: int
    
    def __getitem__(self, column_name: str) -> Any:
        """
        Get value from specific column (read-only).

        Parameters:
        - column_name: Column name

        Returns:
        - Any: Column value for this row
        """
    
    def to_dict(self) -> Dict[str, Any]:
        """
        Convert row to dictionary.

        Returns:
        - Dict[str, Any]: Dictionary with string keys (presumably column
          names, as for Row.to_dict -- confirm)
        """
    
    def get_async(self, column_name: str) -> Future[Any]:
        """
        Get column value asynchronously.

        Parameters:
        - column_name: Column name

        Returns:
        - Future[Any]: Future resolving to column value
        """
    
    def get_bytes(self, column_name: str) -> bytes:
        """
        Get raw bytes representation of column value.

        Parameters:
        - column_name: Column name

        Returns:
        - bytes: Raw bytes data
        """
    
    def get_bytes_async(self, column_name: str) -> Future[bytes]:
        """
        Get raw bytes representation asynchronously.

        Parameters:
        - column_name: Column name

        Returns:
        - Future[bytes]: Future resolving to raw bytes data
        """

class RowRange:
    """
    Mutable multiple row access.

    Indexed by column name to read or write that column across every
    row in the range; also sized and iterable row by row.
    """
    
    def __getitem__(self, column_name: str) -> List[Any]:
        """
        Get values from specific column across all rows in range.

        Parameters:
        - column_name: Column name

        Returns:
        - List[Any]: Values of that column for the rows in the range
        """
    
    def __setitem__(self, column_name: str, values: List[Any]) -> None:
        """
        Set values in specific column across all rows in range.

        Parameters:
        - column_name: Column name
        - values: Values to set
        """
    
    def __len__(self) -> int:
        """
        Get number of rows in range.

        Returns:
        - int: Row count
        """
    
    def __iter__(self) -> Iterator[Row]:
        """
        Iterate over rows in range.

        Returns:
        - Iterator[Row]: Iterator yielding mutable rows
        """
    
    def summary(self) -> str:
        """
        Get summary statistics of the row range.

        Returns:
        - str: Summary text
        """

class RowRangeView:
    """
    Read-only multiple row access.

    Declares the read-side methods of RowRange; no setter is declared.
    """
    
    def __getitem__(self, column_name: str) -> List[Any]:
        """
        Get values from specific column across all rows in range.

        Parameters:
        - column_name: Column name

        Returns:
        - List[Any]: Values of that column for the rows in the range
        """
    
    def __len__(self) -> int:
        """
        Get number of rows in range.

        Returns:
        - int: Row count
        """
    
    def __iter__(self) -> Iterator[RowView]:
        """
        Iterate over rows in range.

        Returns:
        - Iterator[RowView]: Iterator yielding read-only rows
        """
    
    def summary(self) -> str:
        """
        Get summary statistics of the row range.

        Returns:
        - str: Summary text
        """

Metadata Management

Metadata objects provide key-value storage for dataset and column metadata with type preservation.

class Metadata:
    """
    Mutable metadata storage.

    Key-value store with string keys supporting lookup, assignment,
    membership tests and key listing.
    """
    
    def __getitem__(self, key: str) -> Any:
        """
        Get metadata value by key.

        Parameters:
        - key: Metadata key

        Returns:
        - Any: Value stored under key
        """
    
    def __setitem__(self, key: str, value: Any) -> None:
        """
        Set metadata value by key.

        Parameters:
        - key: Metadata key
        - value: Value to store
        """
    
    def __contains__(self, key: str) -> bool:
        """
        Check if metadata key exists.

        Parameters:
        - key: Metadata key

        Returns:
        - bool: Whether the key exists
        """
    
    def keys(self) -> List[str]:
        """
        Get all metadata keys.

        Returns:
        - List[str]: Metadata keys
        """

class ReadOnlyMetadata:
    """
    Read-only metadata storage.

    Declares lookup, membership tests and key listing; no setter is
    declared.
    """
    
    def __getitem__(self, key: str) -> Any:
        """
        Get metadata value by key.

        Parameters:
        - key: Metadata key

        Returns:
        - Any: Value stored under key
        """
    
    def __contains__(self, key: str) -> bool:
        """
        Check if metadata key exists.

        Parameters:
        - key: Metadata key

        Returns:
        - bool: Whether the key exists
        """
    
    def keys(self) -> List[str]:
        """
        Get all metadata keys.

        Returns:
        - List[str]: Metadata keys
        """

Usage Examples

Basic Data Access

import deeplake

# Open dataset (the column names used below are illustrative; substitute
# the columns that exist in your own dataset)
dataset = deeplake.open("./my_dataset")

# Row access: an integer key selects a single row
row = dataset[0]  # First row
print(row["image_path"])  # Access column value
print(row.to_dict())  # Convert to dictionary

# Row range access: a slice key selects multiple rows
rows = dataset[0:10]  # First 10 rows
for row in rows:
    print(row["label"])

# Column access: a string key selects a column by name
images_column = dataset["images"]
print(len(images_column))  # Number of images
first_image = images_column[0]  # First image

# Column slicing: a slice on a column selects several values at once
batch_images = images_column[0:32]  # First 32 images

Data Manipulation

# Add new column
dataset.add_column("scores", deeplake.types.Float32())

# Append single row (dictionary mapping column names to values)
dataset.append({
    "images": "new_image.jpg",
    "labels": "dog",
    "scores": 0.95
})

# Append multiple rows (list with one dictionary per row)
batch_data = [
    {"images": f"image_{i}.jpg", "labels": f"label_{i}", "scores": 0.8 + i * 0.01}
    for i in range(100)
]
dataset.extend(batch_data)

# Update specific values
dataset[0]["scores"] = 0.99  # Update single value
dataset["scores"][0:10] = [0.9] * 10  # Update range

# Column operations
scores = dataset["scores"]
scores[100] = 0.85  # Set specific score
# Column indexing accepts an int, a slice, or a list of indices -- not a
# boolean mask -- so read the values with a slice and filter them in Python.
high_scores = [score for score in scores[:] if score > 0.9]  # Filter high scores

Batch Operations

# Access data in batches
batch_size = 32
for i in range(0, len(dataset), batch_size):
    # Slicing the dataset selects a range of rows
    batch = dataset[i:i+batch_size]
    
    # Get batch data as lists (one column's values across the rows in the batch)
    images = batch["images"]
    labels = batch["labels"]
    
    # Process batch (process_batch is a placeholder not defined in this
    # snippet -- supply your own function)
    process_batch(images, labels)

# Column-wise batch operations
images_column = dataset["images"]
for i in range(0, len(images_column), batch_size):
    image_batch = images_column[i:i+batch_size]
    # preprocess_images is likewise a placeholder not defined in this snippet
    processed_batch = preprocess_images(image_batch)
    # Save processed results...

Async Operations

import asyncio

async def process_data_async(dataset):
    """
    Fetch rows 0-9 of the "images" column concurrently.

    Parameters:
    - dataset: Object whose "images" item provides get_async(index)

    Returns:
    - The results collected by asyncio.gather, in row-index order
    """
    # Start every read up front so the fetches can overlap
    pending = []
    for row_index in range(10):
        pending.append(dataset["images"].get_async(row_index))

    # Wait for all reads and hand back their results together
    return await asyncio.gather(*pending)

# Set values asynchronously
async def update_scores_async(dataset, new_scores):
    """
    Write new_scores into the "scores" column concurrently.

    Parameters:
    - dataset: Object whose "scores" item provides set_async(index, value)
    - new_scores: Iterable of scores; the n-th score is written to row n
    """
    # Start one write per score, pairing each score with its position
    pending = []
    for row_index, score in enumerate(new_scores):
        pending.append(dataset["scores"].set_async(row_index, score))

    # Wait until every write has completed
    await asyncio.gather(*pending)

Metadata Usage

# Dataset metadata: key-value pairs assigned by string key
dataset.metadata["version"] = "1.0"
dataset.metadata["description"] = "Training dataset for image classification"
print(dataset.metadata.keys())  # All metadata keys

# Column metadata: each column carries its own metadata store
images_column = dataset["images"]
images_column.metadata["preprocessing"] = "normalized"
images_column.metadata["source"] = "camera_feed"

# Access metadata: test for the key before reading it
if "version" in dataset.metadata:
    print(f"Dataset version: {dataset.metadata['version']}")

Indexing for Performance

# Create index on text column for fast queries
text_column = dataset["descriptions"]
text_column.create_index(deeplake.types.TextIndex(deeplake.types.Inverted))

# Create embedding index for similarity search
embedding_column = dataset["embeddings"]
embedding_column.create_index(
    deeplake.types.EmbeddingIndex(deeplake.types.Clustered)
)

# List all indexes on column
print(text_column.indexes)

# Drop index when no longer needed
# NOTE(review): "inverted_index" is presumably one of the names listed in
# text_column.indexes -- confirm the actual name before dropping.
text_column.drop_index("inverted_index")

Install with Tessl CLI

npx tessl i tessl/pypi-deeplake

docs

data-access.md

data-import-export.md

dataset-management.md

error-handling.md

framework-integration.md

index.md

query-system.md

schema-templates.md

storage-system.md

type-system.md

version-control.md

tile.json