CtrlK
Blog | Docs | Log in | Get started
Tessl Logo

tessl/pypi-deeplake

Database for AI powered by a storage format optimized for deep-learning applications.

75

1.59x

Evaluation: 75%

1.59x

Agent success when using this tile

Overview
Eval results
Files

docs/type-system.md

Type System

Deep Lake's rich type hierarchy supports all ML data types including images, embeddings, audio, video, geometric data, and custom structures. The type system provides automatic compression, indexing capabilities, and seamless integration with NumPy and ML frameworks.

Capabilities

Base Type Classes

Foundation classes for the type system providing common functionality and extensibility.

class DataType:
    """Base class for all data types."""
    pass

class Type:
    """Base class for complex data types."""
    pass

class TypeKind:
    """Enumeration of type categories."""
    pass

Primitive Types

Basic data types for fundamental values with automatic optimization and compression.

class Bool:
    """Boolean values type."""
    
    def __init__(self): ...

class Int8:
    """8-bit signed integer type."""
    
    def __init__(self): ...

class Int16:
    """16-bit signed integer type."""
    
    def __init__(self): ...

class Int32:
    """32-bit signed integer type."""
    
    def __init__(self): ...

class Int64:
    """64-bit signed integer type."""
    
    def __init__(self): ...

class UInt8:
    """8-bit unsigned integer type."""
    
    def __init__(self): ...

class UInt16:
    """16-bit unsigned integer type."""
    
    def __init__(self): ...

class UInt32:
    """32-bit unsigned integer type."""
    
    def __init__(self): ...

class UInt64:
    """64-bit unsigned integer type."""
    
    def __init__(self): ...

class Float16:
    """16-bit floating point type."""
    
    def __init__(self): ...

class Float32:
    """32-bit floating point type."""
    
    def __init__(self): ...

class Float64:
    """64-bit floating point type."""
    
    def __init__(self): ...

class Bytes:
    """Byte array type."""
    
    def __init__(self): ...

class Text:
    """Text string type with optional indexing."""
    
    def __init__(self, index_type: Optional[TextIndexType] = None):
        """
        Initialize text type.
        
        Parameters:
        - index_type: Optional text index for search optimization
        """

Container Types

Composite types for structured data with nested type support and flexible schemas.

class Array:
    """N-dimensional array type."""
    
    def __init__(self, dtype: DataType, dimensions: Optional[int] = None, shape: Optional[List[int]] = None):
        """
        Initialize array type.
        
        Parameters:
        - dtype: Element data type
        - dimensions: Number of dimensions (optional)
        - shape: Fixed shape specification (optional)
        """

class Dict:
    """Key-value dictionary type."""
    
    def __init__(self): ...

class Struct:
    """Structured data type with defined fields."""
    
    # NOTE: the annotation uses the built-in generic ``dict`` (PEP 585,
    # Python 3.9+). In this listing the name ``Dict`` refers to the Deep Lake
    # ``Dict`` type class, not ``typing.Dict``, so ``Dict[str, DataType]``
    # would not describe a plain mapping.
    def __init__(self, fields: dict[str, DataType]):
        """
        Initialize struct type.
        
        Parameters:
        - fields: Dictionary mapping field names (str) to the type of each field
        """

class Sequence:
    """Ordered sequence type."""
    
    def __init__(self, nested_type: DataType):
        """
        Initialize sequence type.
        
        Parameters:
        - nested_type: Type of sequence elements
        """

ML-Specific Types

Specialized types for machine learning data with built-in compression and framework integration.

class Embedding:
    """Vector embedding type with optional indexing."""
    
    def __init__(self, size: Optional[int] = None, dtype: str = "float32", index_type: Optional[EmbeddingIndexType] = None):
        """
        Initialize embedding type.
        
        Parameters:
        - size: Embedding dimension (inferred if not specified)
        - dtype: Element data type ("float32", "float16", etc.)
        - index_type: Optional embedding index for similarity search
        """

class Image:
    """Image data type with compression options."""
    
    def __init__(self, dtype: str = "uint8", sample_compression: str = "png"):
        """
        Initialize image type.
        
        Parameters:
        - dtype: Image data type ("uint8", "uint16", "float32")
        - sample_compression: Compression format ("png", "jpeg", "tiff", "bmp")
        """

class Audio:
    """Audio data type with compression options."""
    
    def __init__(self, dtype: str = "uint8", sample_compression: str = "mp3"):
        """
        Initialize audio type.
        
        Parameters:
        - dtype: Audio data type
        - sample_compression: Compression format ("mp3", "wav", "flac")
        """

class Video:
    """Video data type with compression options."""
    
    def __init__(self, compression: str = "mp4"):
        """
        Initialize video type.
        
        Parameters:
        - compression: Video compression format ("mp4", "avi", "mkv")
        """

class Medical:
    """Medical image type (DICOM, NIfTI)."""
    
    def __init__(self, compression: str):
        """
        Initialize medical image type.
        
        Parameters:
        - compression: Medical format ("dicom", "nifti")
        """

Geometric Types

Types for geometric and spatial data with specialized processing and indexing.

class BoundingBox:
    """Bounding box coordinates type."""
    
    def __init__(self, dtype: str = "float32", format: Optional[str] = None, bbox_type: Optional[str] = None):
        """
        Initialize bounding box type.
        
        Parameters:
        - dtype: Coordinate data type
        - format: Coordinate format ("xyxy", "xywh", "cxcywh")
        - bbox_type: Bounding box type specification
        """

class Point:
    """Point coordinates type."""
    
    def __init__(self, dimensions: int = 2):
        """
        Initialize point type.
        
        Parameters:
        - dimensions: Number of spatial dimensions (2D, 3D, etc.)
        """

class Polygon:
    """Polygon shape type."""
    
    def __init__(self): ...

class BinaryMask:
    """Binary mask type with compression options."""
    
    def __init__(self, sample_compression: Optional[str] = None, chunk_compression: Optional[str] = None):
        """
        Initialize binary mask type.
        
        Parameters:
        - sample_compression: Per-sample compression
        - chunk_compression: Chunk-level compression
        """

class SegmentMask:
    """Segmentation mask type with compression options."""
    
    def __init__(self, dtype: str = "uint8", sample_compression: Optional[str] = None, chunk_compression: Optional[str] = None):
        """
        Initialize segmentation mask type.
        
        Parameters:
        - dtype: Mask data type
        - sample_compression: Per-sample compression
        - chunk_compression: Chunk-level compression
        """

Classification Types

Types for classification tasks and external data references.

class ClassLabel:
    """Classification label type."""
    
    def __init__(self, dtype: DataType):
        """
        Initialize class label type.
        
        Parameters:
        - dtype: Label data type (Text, Int32, etc.)
        """

class Link:
    """External resource link type."""
    
    def __init__(self, type: DataType):
        """
        Initialize link type.
        
        Parameters:
        - type: Type of linked data
        """

Index Types

Indexing system for query optimization and similarity search across different data types.

class TextIndexType:
    """Text index configuration."""
    
    def __init__(self, type: TextIndexEnumType): ...

class TextIndex:
    """Text index creation."""
    
    def __init__(self, type: TextIndexEnumType): ...

class TextIndexEnumType:
    """Text index type enumeration."""
    Inverted: str
    BM25: str
    Exact: str

class EmbeddingIndexType:
    """Embedding index configuration."""
    
    def __init__(self, type: EmbeddingIndexEnumType): ...

class EmbeddingIndex:
    """Embedding index creation."""
    
    def __init__(self, type: Optional[EmbeddingIndexEnumType] = None): ...

class EmbeddingIndexEnumType:
    """Embedding index type enumeration."""
    Clustered: str
    ClusteredQuantized: str

class EmbeddingsMatrixIndex:
    """Matrix index for embeddings."""
    
    def __init__(self): ...

class EmbeddingsMatrixIndexType:
    """Matrix index type for embeddings."""
    
    def __init__(self): ...

class NumericIndexType:
    """Numeric index configuration."""
    
    def __init__(self, type: NumericIndexEnumType): ...

class NumericIndex:
    """Numeric index creation."""
    
    def __init__(self, type: NumericIndexEnumType): ...

class NumericIndexEnumType:
    """Numeric index type enumeration."""
    Inverted: str

class IndexType:
    """Universal index type wrapper."""
    
    def __init__(self, index_type: Any): ...

Quantization and Constants

Quantization options and type system constants for optimization and performance.

class QuantizationType:
    """Quantization type enumeration."""
    Binary: str

# Module-level quantization and index type constants.
# The names match members of the enumeration classes listed in this document:
#   Binary                        -> QuantizationType
#   Inverted, BM25, Exact         -> TextIndexEnumType (Inverted also appears
#                                    in NumericIndexEnumType)
#   Clustered, ClusteredQuantized -> EmbeddingIndexEnumType
Binary: str
Inverted: str
BM25: str
Exact: str
Clustered: str
ClusteredQuantized: str

Usage Examples

Basic Type Usage

import deeplake
from deeplake import types

# Create dataset with typed columns
dataset = deeplake.create("./typed_dataset")

# Add columns with different types
dataset.add_column("id", types.Int64())
dataset.add_column("name", types.Text())
dataset.add_column("score", types.Float32())
dataset.add_column("active", types.Bool())
dataset.add_column("data", types.Bytes())

# Append typed data
dataset.append({
    "id": 1,
    "name": "sample_1",
    "score": 0.95,
    "active": True,
    "data": b"binary_data"
})

Image and Media Types

# Image column with PNG compression
dataset.add_column("images", types.Image(dtype="uint8", sample_compression="png"))

# High dynamic range images
dataset.add_column("hdr_images", types.Image(dtype="float32", sample_compression="tiff"))

# Audio column
dataset.add_column("audio", types.Audio(sample_compression="wav"))

# Video column
dataset.add_column("videos", types.Video(compression="mp4"))

# Medical images
dataset.add_column("scans", types.Medical(compression="dicom"))

# Append media data
dataset.append({
    "images": "path/to/image.png", 
    "audio": "path/to/audio.wav",
    "videos": "path/to/video.mp4",
    "scans": "path/to/scan.dcm"
})

Embedding Types with Indexing

# Text embeddings with similarity search index
dataset.add_column("text_embeddings", 
                   types.Embedding(size=768, dtype="float32", 
                                 index_type=types.EmbeddingIndex(types.Clustered)))

# Image embeddings with quantized index for memory efficiency
dataset.add_column("image_embeddings", 
                   types.Embedding(size=2048, dtype="float16",
                                 index_type=types.EmbeddingIndex(types.ClusteredQuantized)))

# Append embedding data
import numpy as np

dataset.append({
    "text_embeddings": np.random.random(768).astype(np.float32),
    "image_embeddings": np.random.random(2048).astype(np.float16)
})

Geometric Types

# Bounding boxes in different formats
dataset.add_column("bbox_xyxy", types.BoundingBox(format="xyxy"))
dataset.add_column("bbox_xywh", types.BoundingBox(format="xywh"))

# 2D and 3D points
dataset.add_column("points_2d", types.Point(dimensions=2))
dataset.add_column("points_3d", types.Point(dimensions=3))

# Segmentation masks
dataset.add_column("binary_masks", types.BinaryMask(sample_compression="png"))
dataset.add_column("segment_masks", types.SegmentMask(dtype="uint8"))

# Polygons for complex shapes
dataset.add_column("polygons", types.Polygon())

# Append geometric data
dataset.append({
    "bbox_xyxy": [10, 10, 100, 100],  # x1, y1, x2, y2
    "bbox_xywh": [10, 10, 90, 90],    # x, y, width, height
    "points_2d": [50, 50],
    "points_3d": [50, 50, 25],
    "binary_masks": "path/to/mask.png",
    "segment_masks": "path/to/segments.png",
    "polygons": [[10, 10], [100, 10], [100, 100], [10, 100]]
})

Container Types

# Array types for structured data
dataset.add_column("features", types.Array(types.Float32(), dimensions=2, shape=[224, 224]))
dataset.add_column("rgb_channels", types.Array(types.UInt8(), shape=[3]))

# Struct type for complex objects
person_struct = types.Struct({
    "name": types.Text(),
    "age": types.Int32(),
    "email": types.Text()
})
dataset.add_column("person_info", person_struct)

# Sequence type for variable-length data
dataset.add_column("token_ids", types.Sequence(types.Int32()))

# Append structured data
dataset.append({
    "features": np.random.random((224, 224)).astype(np.float32),
    "rgb_channels": [255, 128, 64],
    "person_info": {"name": "Alice", "age": 30, "email": "alice@example.com"},
    "token_ids": [101, 2048, 1045, 2342, 102]
})

Text Types with Indexing

# Text with different index types for search optimization
dataset.add_column("descriptions", 
                   types.Text(index_type=types.TextIndex(types.Inverted)))

dataset.add_column("content", 
                   types.Text(index_type=types.TextIndex(types.BM25)))

dataset.add_column("exact_matches", 
                   types.Text(index_type=types.TextIndex(types.Exact)))

# Classification labels
dataset.add_column("categories", types.ClassLabel(types.Text()))
dataset.add_column("class_ids", types.ClassLabel(types.Int32()))

# Append text data
dataset.append({
    "descriptions": "A beautiful sunset over the mountains",
    "content": "Full text content for search indexing",
    "exact_matches": "EXACT_IDENTIFIER_123",
    "categories": "landscape",
    "class_ids": 42
})

Numeric Types with Indexing

# Numeric column; the inverted index (for range queries) is not declared here,
# it is attached in a separate step below
dataset.add_column("scores", 
                   types.Float32())

# Look up the existing column and attach an inverted numeric index to it
scores_column = dataset["scores"]
scores_column.create_index(types.NumericIndex(types.Inverted))

# Different precision numeric types
dataset.add_column("timestamps", types.Int64())
dataset.add_column("small_values", types.Float16())
dataset.add_column("precise_values", types.Float64())

# Unsigned integers for IDs and counters
dataset.add_column("user_id", types.UInt32())
dataset.add_column("counter", types.UInt64())

Link Types for External Data

# Link to external image files
dataset.add_column("external_images", types.Link(types.Image()))

# Link to external embeddings
dataset.add_column("external_embeddings", types.Link(types.Embedding(size=512)))

# Append link data (references to external files)
dataset.append({
    "external_images": "s3://my-bucket/images/photo001.jpg",
    "external_embeddings": "s3://my-bucket/embeddings/embed001.npy"
})

Advanced Type Combinations

# Complex nested structure
annotation_struct = types.Struct({
    "bbox": types.BoundingBox(),
    "label": types.ClassLabel(types.Text()),
    "confidence": types.Float32(),
    "attributes": types.Dict()
})

# Sequence of annotations for object detection
dataset.add_column("annotations", types.Sequence(annotation_struct))

# Append complex nested data
dataset.append({
    "annotations": [
        {
            "bbox": [10, 10, 50, 50],
            "label": "person",
            "confidence": 0.95,
            "attributes": {"age": "adult", "gender": "unknown"}
        },
        {
            "bbox": [60, 60, 100, 100],
            "label": "car",
            "confidence": 0.88,
            "attributes": {"color": "red", "type": "sedan"}
        }
    ]
})

Type Introspection

# Get column type information
images_column = dataset["images"]
print(f"Column type: {type(images_column.dtype)}")
print(f"Image compression: {images_column.dtype.sample_compression}")

# Check if column has index
if images_column.indexes:
    print(f"Column has indexes: {images_column.indexes}")

# Schema introspection
schema = dataset.schema
for col_def in schema.columns:
    print(f"Column: {col_def.name}, Type: {col_def.dtype}")

Install with Tessl CLI

npx tessl i tessl/pypi-deeplake

docs

data-access.md

data-import-export.md

dataset-management.md

error-handling.md

framework-integration.md

index.md

query-system.md

schema-templates.md

storage-system.md

type-system.md

version-control.md

tile.json