0
# Type System
1
2
Deep Lake's type hierarchy covers the data types commonly used in ML workloads, including images, embeddings, audio, video, geometric data, and custom structures. The type system provides automatic compression, indexing capabilities, and integration with NumPy and ML frameworks.
3
4
## Capabilities
5
6
### Base Type Classes
7
8
Foundation classes for the type system providing common functionality and extensibility.
9
10
```python { .api }
11
class DataType:
12
"""Base class for all data types."""
13
pass
14
15
class Type:
16
"""Base class for complex data types."""
17
pass
18
19
class TypeKind:
20
"""Enumeration of type categories."""
21
pass
22
```
23
24
### Primitive Types
25
26
Basic data types for fundamental values with automatic optimization and compression.
27
28
```python { .api }
29
class Bool:
30
"""Boolean values type."""
31
32
def __init__(self): ...
33
34
class Int8:
35
"""8-bit signed integer type."""
36
37
def __init__(self): ...
38
39
class Int16:
40
"""16-bit signed integer type."""
41
42
def __init__(self): ...
43
44
class Int32:
45
"""32-bit signed integer type."""
46
47
def __init__(self): ...
48
49
class Int64:
50
"""64-bit signed integer type."""
51
52
def __init__(self): ...
53
54
class UInt8:
55
"""8-bit unsigned integer type."""
56
57
def __init__(self): ...
58
59
class UInt16:
60
"""16-bit unsigned integer type."""
61
62
def __init__(self): ...
63
64
class UInt32:
65
"""32-bit unsigned integer type."""
66
67
def __init__(self): ...
68
69
class UInt64:
70
"""64-bit unsigned integer type."""
71
72
def __init__(self): ...
73
74
class Float16:
75
"""16-bit floating point type."""
76
77
def __init__(self): ...
78
79
class Float32:
80
"""32-bit floating point type."""
81
82
def __init__(self): ...
83
84
class Float64:
85
"""64-bit floating point type."""
86
87
def __init__(self): ...
88
89
class Bytes:
90
"""Byte array type."""
91
92
def __init__(self): ...
93
94
class Text:
95
"""Text string type with optional indexing."""
96
97
def __init__(self, index_type: Optional[TextIndexType] = None):
98
"""
99
Initialize text type.
100
101
Parameters:
102
- index_type: Optional text index for search optimization
103
"""
104
```
105
106
### Container Types
107
108
Composite types for structured data with nested type support and flexible schemas.
109
110
```python { .api }
111
class Array:
112
"""N-dimensional array type."""
113
114
def __init__(self, dtype: DataType, dimensions: Optional[int] = None, shape: Optional[List[int]] = None):
115
"""
116
Initialize array type.
117
118
Parameters:
119
- dtype: Element data type
120
- dimensions: Number of dimensions (optional)
121
- shape: Fixed shape specification (optional)
122
"""
123
124
class Dict:
125
"""Key-value dictionary type."""
126
127
def __init__(self): ...
128
129
class Struct:
130
"""Structured data type with defined fields."""
131
132
def __init__(self, fields: Dict[str, DataType]):
133
"""
134
Initialize struct type.
135
136
Parameters:
137
- fields: Dictionary mapping field names to types
138
"""
139
140
class Sequence:
141
"""Ordered sequence type."""
142
143
def __init__(self, nested_type: DataType):
144
"""
145
Initialize sequence type.
146
147
Parameters:
148
- nested_type: Type of sequence elements
149
"""
150
```
151
152
### ML-Specific Types
153
154
Specialized types for machine learning data with built-in compression and framework integration.
155
156
```python { .api }
157
class Embedding:
158
"""Vector embedding type with optional indexing."""
159
160
def __init__(self, size: Optional[int] = None, dtype: str = "float32", index_type: Optional[EmbeddingIndexType] = None):
161
"""
162
Initialize embedding type.
163
164
Parameters:
165
- size: Embedding dimension (inferred if not specified)
166
- dtype: Element data type ("float32", "float16", etc.)
167
- index_type: Optional embedding index for similarity search
168
"""
169
170
class Image:
171
"""Image data type with compression options."""
172
173
def __init__(self, dtype: str = "uint8", sample_compression: str = "png"):
174
"""
175
Initialize image type.
176
177
Parameters:
178
- dtype: Image data type ("uint8", "uint16", "float32")
179
- sample_compression: Compression format ("png", "jpeg", "tiff", "bmp")
180
"""
181
182
class Audio:
183
"""Audio data type with compression options."""
184
185
def __init__(self, dtype: str = "uint8", sample_compression: str = "mp3"):
186
"""
187
Initialize audio type.
188
189
Parameters:
190
- dtype: Audio data type
191
- sample_compression: Compression format ("mp3", "wav", "flac")
192
"""
193
194
class Video:
195
"""Video data type with compression options."""
196
197
def __init__(self, compression: str = "mp4"):
198
"""
199
Initialize video type.
200
201
Parameters:
202
- compression: Video compression format ("mp4", "avi", "mkv")
203
"""
204
205
class Medical:
206
"""Medical image type (DICOM, NIfTI)."""
207
208
def __init__(self, compression: str):
209
"""
210
Initialize medical image type.
211
212
Parameters:
213
- compression: Medical format ("dicom", "nifti")
214
"""
215
```
216
217
### Geometric Types
218
219
Types for geometric and spatial data with specialized processing and indexing.
220
221
```python { .api }
222
class BoundingBox:
223
"""Bounding box coordinates type."""
224
225
def __init__(self, dtype: str = "float32", format: Optional[str] = None, bbox_type: Optional[str] = None):
226
"""
227
Initialize bounding box type.
228
229
Parameters:
230
- dtype: Coordinate data type
231
- format: Coordinate format ("xyxy", "xywh", "cxcywh")
232
- bbox_type: Bounding box type specification
233
"""
234
235
class Point:
236
"""Point coordinates type."""
237
238
def __init__(self, dimensions: int = 2):
239
"""
240
Initialize point type.
241
242
Parameters:
243
- dimensions: Number of spatial dimensions (2 for 2D points, 3 for 3D points, etc.)
244
"""
245
246
class Polygon:
247
"""Polygon shape type."""
248
249
def __init__(self): ...
250
251
class BinaryMask:
252
"""Binary mask type with compression options."""
253
254
def __init__(self, sample_compression: Optional[str] = None, chunk_compression: Optional[str] = None):
255
"""
256
Initialize binary mask type.
257
258
Parameters:
259
- sample_compression: Per-sample compression
260
- chunk_compression: Chunk-level compression
261
"""
262
263
class SegmentMask:
264
"""Segmentation mask type with compression options."""
265
266
def __init__(self, dtype: str = "uint8", sample_compression: Optional[str] = None, chunk_compression: Optional[str] = None):
267
"""
268
Initialize segmentation mask type.
269
270
Parameters:
271
- dtype: Mask data type
272
- sample_compression: Per-sample compression
273
- chunk_compression: Chunk-level compression
274
"""
275
```
276
277
### Classification Types
278
279
Types for classification tasks and external data references.
280
281
```python { .api }
282
class ClassLabel:
283
"""Classification label type."""
284
285
def __init__(self, dtype: DataType):
286
"""
287
Initialize class label type.
288
289
Parameters:
290
- dtype: Label data type (Text, Int32, etc.)
291
"""
292
293
class Link:
294
"""External resource link type."""
295
296
def __init__(self, type: DataType):
297
"""
298
Initialize link type.
299
300
Parameters:
301
- type: Type of linked data
302
"""
303
```
304
305
### Index Types
306
307
Indexing system for query optimization and similarity search across different data types.
308
309
```python { .api }
310
class TextIndexType:
311
"""Text index configuration."""
312
313
def __init__(self, type: TextIndexEnumType): ...
314
315
class TextIndex:
316
"""Text index creation."""
317
318
def __init__(self, type: TextIndexEnumType): ...
319
320
class TextIndexEnumType:
321
"""Text index type enumeration."""
322
Inverted: str
323
BM25: str
324
Exact: str
325
326
class EmbeddingIndexType:
327
"""Embedding index configuration."""
328
329
def __init__(self, type: EmbeddingIndexEnumType): ...
330
331
class EmbeddingIndex:
332
"""Embedding index creation."""
333
334
def __init__(self, type: Optional[EmbeddingIndexEnumType] = None): ...
335
336
class EmbeddingIndexEnumType:
337
"""Embedding index type enumeration."""
338
Clustered: str
339
ClusteredQuantized: str
340
341
class EmbeddingsMatrixIndex:
342
"""Matrix index for embeddings."""
343
344
def __init__(self): ...
345
346
class EmbeddingsMatrixIndexType:
347
"""Matrix index type for embeddings."""
348
349
def __init__(self): ...
350
351
class NumericIndexType:
352
"""Numeric index configuration."""
353
354
def __init__(self, type: NumericIndexEnumType): ...
355
356
class NumericIndex:
357
"""Numeric index creation."""
358
359
def __init__(self, type: NumericIndexEnumType): ...
360
361
class NumericIndexEnumType:
362
"""Numeric index type enumeration."""
363
Inverted: str
364
365
class IndexType:
366
"""Universal index type wrapper."""
367
368
def __init__(self, index_type: Any): ...
369
```
370
371
### Quantization and Constants
372
373
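Quantization options and type system constants for optimization and performance. The index constants (for example `types.Clustered` or `types.BM25`) are the values passed to index constructors in the usage examples below.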
374
375
```python { .api }
376
class QuantizationType:
377
"""Quantization type enumeration."""
378
Binary: str
379
380
# Module-level constants: quantization type (Binary) and index types
381
Binary: str
382
Inverted: str
383
BM25: str
384
Exact: str
385
Clustered: str
386
ClusteredQuantized: str
387
```
388
389
## Usage Examples
390
391
### Basic Type Usage
392
393
```python
394
import deeplake
395
from deeplake import types
396
397
# Create dataset with typed columns
398
dataset = deeplake.create("./typed_dataset")
399
400
# Add columns with different types
401
dataset.add_column("id", types.Int64())
402
dataset.add_column("name", types.Text())
403
dataset.add_column("score", types.Float32())
404
dataset.add_column("active", types.Bool())
405
dataset.add_column("data", types.Bytes())
406
407
# Append typed data
408
dataset.append({
409
"id": 1,
410
"name": "sample_1",
411
"score": 0.95,
412
"active": True,
413
"data": b"binary_data"
414
})
415
```
416
417
### Image and Media Types
418
419
```python
420
# Image column with PNG compression
421
dataset.add_column("images", types.Image(dtype="uint8", sample_compression="png"))
422
423
# High dynamic range images
424
dataset.add_column("hdr_images", types.Image(dtype="float32", sample_compression="tiff"))
425
426
# Audio column
427
dataset.add_column("audio", types.Audio(sample_compression="wav"))
428
429
# Video column
430
dataset.add_column("videos", types.Video(compression="mp4"))
431
432
# Medical images
433
dataset.add_column("scans", types.Medical(compression="dicom"))
434
435
# Append media data
436
dataset.append({
437
"images": "path/to/image.png",
438
"audio": "path/to/audio.wav",
439
"videos": "path/to/video.mp4",
440
"scans": "path/to/scan.dcm"
441
})
442
```
443
444
### Embedding Types with Indexing
445
446
```python
447
# Text embeddings with similarity search index
448
dataset.add_column("text_embeddings",
449
types.Embedding(size=768, dtype="float32",
450
index_type=types.EmbeddingIndex(types.Clustered)))
451
452
# Image embeddings with quantized index for memory efficiency
453
dataset.add_column("image_embeddings",
454
types.Embedding(size=2048, dtype="float16",
455
index_type=types.EmbeddingIndex(types.ClusteredQuantized)))
456
457
# Append embedding data
458
import numpy as np
459
460
dataset.append({
461
"text_embeddings": np.random.random(768).astype(np.float32),
462
"image_embeddings": np.random.random(2048).astype(np.float16)
463
})
464
```
465
466
### Geometric Types
467
468
```python
469
# Bounding boxes in different formats
470
dataset.add_column("bbox_xyxy", types.BoundingBox(format="xyxy"))
471
dataset.add_column("bbox_xywh", types.BoundingBox(format="xywh"))
472
473
# 2D and 3D points
474
dataset.add_column("points_2d", types.Point(dimensions=2))
475
dataset.add_column("points_3d", types.Point(dimensions=3))
476
477
# Binary and segmentation masks
478
dataset.add_column("binary_masks", types.BinaryMask(sample_compression="png"))
479
dataset.add_column("segment_masks", types.SegmentMask(dtype="uint8"))
480
481
# Polygons for complex shapes
482
dataset.add_column("polygons", types.Polygon())
483
484
# Append geometric data
485
dataset.append({
486
"bbox_xyxy": [10, 10, 100, 100], # x1, y1, x2, y2
487
"bbox_xywh": [10, 10, 90, 90], # x, y, width, height
488
"points_2d": [50, 50],
489
"points_3d": [50, 50, 25],
490
"binary_masks": "path/to/mask.png",
491
"segment_masks": "path/to/segments.png",
492
"polygons": [[10, 10], [100, 10], [100, 100], [10, 100]]
493
})
494
```
495
496
### Container Types
497
498
```python
499
# Array types for structured data
500
dataset.add_column("features", types.Array(types.Float32(), dimensions=2, shape=[224, 224]))
501
dataset.add_column("rgb_channels", types.Array(types.UInt8(), shape=[3]))
502
503
# Struct type for complex objects
504
person_struct = types.Struct({
505
"name": types.Text(),
506
"age": types.Int32(),
507
"email": types.Text()
508
})
509
dataset.add_column("person_info", person_struct)
510
511
# Sequence type for variable-length data
512
dataset.add_column("token_ids", types.Sequence(types.Int32()))
513
514
# Append structured data
515
dataset.append({
516
"features": np.random.random((224, 224)).astype(np.float32),
517
"rgb_channels": [255, 128, 64],
518
"person_info": {"name": "Alice", "age": 30, "email": "alice@example.com"},
519
"token_ids": [101, 2048, 1045, 2342, 102]
520
})
521
```
522
523
### Text Types with Indexing
524
525
```python
526
# Text with different index types for search optimization
527
dataset.add_column("descriptions",
528
types.Text(index_type=types.TextIndex(types.Inverted)))
529
530
dataset.add_column("content",
531
types.Text(index_type=types.TextIndex(types.BM25)))
532
533
dataset.add_column("exact_matches",
534
types.Text(index_type=types.TextIndex(types.Exact)))
535
536
# Classification labels
537
dataset.add_column("categories", types.ClassLabel(types.Text()))
538
dataset.add_column("class_ids", types.ClassLabel(types.Int32()))
539
540
# Append text data
541
dataset.append({
542
"descriptions": "A beautiful sunset over the mountains",
543
"content": "Full text content for search indexing",
544
"exact_matches": "EXACT_IDENTIFIER_123",
545
"categories": "landscape",
546
"class_ids": 42
547
})
548
```
549
550
### Numeric Types with Indexing
551
552
```python
553
# Numeric columns with inverted index for range queries
554
dataset.add_column("scores",
555
types.Float32())
556
557
# Create the index on the existing column (this can also be done after data has been added)
558
scores_column = dataset["scores"]
559
scores_column.create_index(types.NumericIndex(types.Inverted))
560
561
# Different precision numeric types
562
dataset.add_column("timestamps", types.Int64())
563
dataset.add_column("small_values", types.Float16())
564
dataset.add_column("precise_values", types.Float64())
565
566
# Unsigned integers for IDs and counters
567
dataset.add_column("user_id", types.UInt32())
568
dataset.add_column("counter", types.UInt64())
569
```
570
571
### Link Types for External Data
572
573
```python
574
# Link to external image files
575
dataset.add_column("external_images", types.Link(types.Image()))
576
577
# Link to external embeddings
578
dataset.add_column("external_embeddings", types.Link(types.Embedding(size=512)))
579
580
# Append link data (references to external files)
581
dataset.append({
582
"external_images": "s3://my-bucket/images/photo001.jpg",
583
"external_embeddings": "s3://my-bucket/embeddings/embed001.npy"
584
})
585
```
586
587
### Advanced Type Combinations
588
589
```python
590
# Complex nested structure
591
annotation_struct = types.Struct({
592
"bbox": types.BoundingBox(),
593
"label": types.ClassLabel(types.Text()),
594
"confidence": types.Float32(),
595
"attributes": types.Dict()
596
})
597
598
# Sequence of annotations for object detection
599
dataset.add_column("annotations", types.Sequence(annotation_struct))
600
601
# Append complex nested data
602
dataset.append({
603
"annotations": [
604
{
605
"bbox": [10, 10, 50, 50],
606
"label": "person",
607
"confidence": 0.95,
608
"attributes": {"age": "adult", "gender": "unknown"}
609
},
610
{
611
"bbox": [60, 60, 100, 100],
612
"label": "car",
613
"confidence": 0.88,
614
"attributes": {"color": "red", "type": "sedan"}
615
}
616
]
617
})
618
```
619
620
### Type Introspection
621
622
```python
623
# Get column type information
624
images_column = dataset["images"]
625
print(f"Column type: {type(images_column.dtype)}")
626
print(f"Image compression: {images_column.dtype.sample_compression}")
627
628
# Check whether the column has any indexes
629
if images_column.indexes:
630
print(f"Column has indexes: {images_column.indexes}")
631
632
# Schema introspection
633
schema = dataset.schema
634
for col_def in schema.columns:
635
print(f"Column: {col_def.name}, Type: {col_def.dtype}")
636
```