# Schema Templates

Pre-defined schema templates for common ML use cases, including text embeddings, COCO datasets, and custom schema creation patterns. Schema templates provide standardized dataset structures for specific domains and applications.

## Capabilities

### Pre-defined Templates

Ready-to-use schema templates for common machine learning scenarios with optimized column types and indexing.

```python { .api }
class TextEmbeddings:
    """Schema template for text embeddings datasets."""

    def __init__(self, embedding_size: int, quantize: bool = False):
        """
        Initialize the text embeddings schema.

        Parameters:
        - embedding_size: Dimension of the embedding vectors
        - quantize: Whether to use quantized embeddings for memory efficiency
        """

class COCOImages:
    """COCO dataset schema template."""

    def __init__(
        self,
        embedding_size: int,
        quantize: bool = False,
        objects: bool = True,
        keypoints: bool = False,
        stuffs: bool = False,
    ):
        """
        Initialize the COCO images schema.

        Parameters:
        - embedding_size: Dimension of the embedding vectors for images
        - quantize: Whether to use quantized embeddings
        - objects: Include object detection annotations
        - keypoints: Include keypoint detection annotations
        - stuffs: Include stuff segmentation annotations
        """

class SchemaTemplate:
    """Base class for schema templates."""
```

## Usage Examples

### Text Embeddings Schema

```python
import deeplake
import numpy as np
from deeplake.schemas import TextEmbeddings

# Create a dataset with the text embeddings schema
schema = TextEmbeddings(embedding_size=768)
dataset = deeplake.create("./text_embeddings_dataset", schema=schema)

# Examine the generated schema
print("Text Embeddings Schema:")
for col in dataset.schema.columns:
    print(f"  {col.name}: {type(col.dtype).__name__}")

# Add text data with embeddings
dataset.append({
    "text": "This is a sample text for embedding.",
    "embeddings": np.random.random(768).astype(np.float32)
})

dataset.append({
    "text": "Another example text with semantic meaning.",
    "embeddings": np.random.random(768).astype(np.float32)
})

dataset.commit("Added text embeddings data")

# Query similar texts by embedding similarity. The query vector is
# interpolated into TQL as a comma-separated ARRAY literal, and the
# query runs against the dataset directly.
target_embedding = np.random.random(768).astype(np.float32)
embedding_str = ",".join(str(x) for x in target_embedding)

similar_texts = dataset.query(f"""
    SELECT text, COSINE_SIMILARITY(embeddings, ARRAY[{embedding_str}]) AS similarity
    WHERE COSINE_SIMILARITY(embeddings, ARRAY[{embedding_str}]) > 0.5
    ORDER BY similarity DESC
""")

print(f"Found {len(similar_texts)} similar texts")
```
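
The random vectors above are placeholders; in practice the embeddings come from a model. A minimal sketch using sentence-transformers (an assumption — any encoder works, as long as `embedding_size` matches the model's output dimension; `all-MiniLM-L6-v2` emits 384-dimensional vectors):

```python
import numpy as np
import deeplake
from deeplake.schemas import TextEmbeddings
from sentence_transformers import SentenceTransformer  # assumed installed

# all-MiniLM-L6-v2 produces 384-dim embeddings, so the template
# must be created with a matching embedding_size.
model = SentenceTransformer("all-MiniLM-L6-v2")
real_dataset = deeplake.create("./real_text_embeddings",
                               schema=TextEmbeddings(embedding_size=384))

texts = [
    "This is a sample text for embedding.",
    "Another example text with semantic meaning.",
]
vectors = model.encode(texts)  # ndarray of shape (2, 384)

for text, vector in zip(texts, vectors):
    real_dataset.append({
        "text": text,
        "embeddings": np.asarray(vector, dtype=np.float32),
    })

real_dataset.commit("Added model-generated embeddings")
```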

### Text Embeddings with Quantization

```python
# Create a quantized embeddings dataset for memory efficiency
quantized_schema = TextEmbeddings(embedding_size=1024, quantize=True)
quantized_dataset = deeplake.create("./quantized_embeddings", schema=quantized_schema)

print("Quantized Embeddings Schema:")
for col in quantized_dataset.schema.columns:
    print(f"  {col.name}: {type(col.dtype).__name__}")
    if hasattr(col.dtype, "quantization"):
        print(f"    Quantization: {col.dtype.quantization}")

# Add quantized embedding data in a batch
large_embeddings = [
    np.random.random(1024).astype(np.float32) for _ in range(1000)
]
texts = [f"Document {i} content..." for i in range(1000)]

batch_data = [
    {"text": text, "embeddings": embedding}
    for text, embedding in zip(texts, large_embeddings)
]

quantized_dataset.extend(batch_data)
quantized_dataset.commit("Added quantized embeddings batch")

print(f"Quantized dataset size: {len(quantized_dataset)} documents")
```
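
The saving from quantization is easy to estimate: a float32 embedding costs 4 bytes per dimension, while an 8-bit quantized one costs 1 byte, roughly a 4x reduction before index overhead. A back-of-the-envelope check (assuming 8-bit quantization; the exact scheme is internal to the library):

```python
# Storage estimate for 1,000 documents of 1024-dim embeddings.
num_docs, dim = 1000, 1024
float32_bytes = num_docs * dim * 4   # 4 bytes per float32 value
int8_bytes = num_docs * dim * 1      # 1 byte per quantized value

print(f"float32:   {float32_bytes / 1024**2:.1f} MiB")   # ~3.9 MiB
print(f"int8:      {int8_bytes / 1024**2:.1f} MiB")      # ~1.0 MiB
print(f"reduction: {float32_bytes / int8_bytes:.0f}x")   # 4x
```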

### COCO Images Schema

```python
from deeplake.schemas import COCOImages

# Create a COCO dataset with object detection annotations
coco_schema = COCOImages(embedding_size=512, objects=True, keypoints=False)
coco_dataset = deeplake.create("./coco_dataset", schema=coco_schema)

print("COCO Images Schema:")
for col in coco_dataset.schema.columns:
    print(f"  {col.name}: {type(col.dtype).__name__}")

# Add COCO-style data
coco_sample = {
    "images": "./images/sample_image.jpg",
    "embeddings": np.random.random(512).astype(np.float32),
    # Add other COCO-specific fields based on the schema
}

# The schema defines the expected structure for COCO data
coco_dataset.append(coco_sample)
coco_dataset.commit("Added COCO sample")
```
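
The exact columns generated depend on the `objects`/`keypoints`/`stuffs` flags, so rather than guessing field names it can be safer to read them off the generated schema and validate the sample against it (a small sketch using only calls shown above):

```python
# Check the sample against the columns the template actually created.
schema_columns = {col.name for col in coco_dataset.schema.columns}
print(f"Schema defines: {sorted(schema_columns)}")

unknown_fields = set(coco_sample) - schema_columns
assert not unknown_fields, f"Fields not in schema: {unknown_fields}"
```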

### COCO with Keypoints

```python
# COCO schema with keypoint detection enabled
coco_keypoints_schema = COCOImages(
    embedding_size=256,
    objects=True,
    keypoints=True,
    stuffs=False
)

coco_keypoints_dataset = deeplake.create("./coco_keypoints", schema=coco_keypoints_schema)

print("COCO Keypoints Schema:")
for col in coco_keypoints_dataset.schema.columns:
    print(f"  {col.name}: {type(col.dtype).__name__}")

# Add keypoint data
keypoint_sample = {
    "images": "./images/person_image.jpg",
    "embeddings": np.random.random(256).astype(np.float32),
    # Keypoint-specific fields are defined by the schema
}

coco_keypoints_dataset.append(keypoint_sample)
coco_keypoints_dataset.commit("Added keypoint sample")
```
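
For reference, the COCO convention stores person keypoints as 17 named joints, each an (x, y, visibility) triple, where visibility is 0 (not labeled), 1 (labeled but occluded), or 2 (visible). A hedged sketch of assembling such an array; which schema column it belongs in depends on what `COCOImages(keypoints=True)` generated:

```python
# COCO person keypoint convention: 17 joints x (x, y, visibility).
num_persons, num_joints = 2, 17
keypoints = np.zeros((num_persons, num_joints, 3), dtype=np.float32)

# e.g. person 0, joint 0 ("nose") at pixel (412, 130), fully visible
keypoints[0, 0] = [412.0, 130.0, 2.0]

# Inspect the generated columns to see where this array belongs:
print([col.name for col in coco_keypoints_dataset.schema.columns])
```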

### Custom Schema Templates

```python
# Create custom schema templates for specific domains.
# The classes below sketch the template pattern; the function that
# follows is a concrete implementation for video analysis.

class VideoAnalysisSchema:
    """Custom schema for video analysis datasets."""

    def __init__(self, frame_embedding_size=512, audio_embedding_size=128):
        self.frame_embedding_size = frame_embedding_size
        self.audio_embedding_size = audio_embedding_size

    def create_schema(self):
        """Create the actual schema definition."""
        # This would return a schema specification; in practice it
        # might create the columns directly, as the function below does.
        pass

class MedicalImagingSchema:
    """Custom schema for medical imaging datasets."""

    def __init__(self, include_dicom_metadata=True, embedding_size=1024):
        self.include_dicom_metadata = include_dicom_metadata
        self.embedding_size = embedding_size

    def create_schema(self):
        """Create the medical imaging schema."""
        pass

# Concrete custom template: a dataset for video analysis
def create_video_analysis_dataset(path, frame_emb_size=512, audio_emb_size=128):
    """Create a dataset optimized for video analysis."""
    dataset = deeplake.create(path)

    # Video-specific columns
    dataset.add_column("video_path", deeplake.types.Text())
    dataset.add_column("video_metadata", deeplake.types.Dict())
    dataset.add_column("duration", deeplake.types.Float32())
    dataset.add_column("fps", deeplake.types.Float32())

    # Frame analysis
    dataset.add_column("frame_embeddings",
                       deeplake.types.Sequence(
                           deeplake.types.Embedding(size=frame_emb_size)
                       ))
    dataset.add_column("frame_timestamps",
                       deeplake.types.Sequence(deeplake.types.Float32()))

    # Audio analysis
    dataset.add_column("audio_embeddings",
                       deeplake.types.Sequence(
                           deeplake.types.Embedding(size=audio_emb_size)
                       ))
    dataset.add_column("audio_segments",
                       deeplake.types.Sequence(deeplake.types.Float32()))

    # Analysis results
    dataset.add_column("scene_labels", deeplake.types.Sequence(deeplake.types.Text()))
    dataset.add_column("object_detections", deeplake.types.Sequence(deeplake.types.Dict()))
    dataset.add_column("transcript", deeplake.types.Text())

    return dataset

# Use the custom schema
video_dataset = create_video_analysis_dataset("./video_analysis")

# Add video analysis data
video_sample = {
    "video_path": "./videos/sample_video.mp4",
    "video_metadata": {"resolution": "1920x1080", "codec": "h264"},
    "duration": 120.5,
    "fps": 30.0,
    "frame_embeddings": [np.random.random(512).astype(np.float32) for _ in range(10)],
    "frame_timestamps": [i / 30.0 for i in range(10)],  # seconds, at 30 fps
    "audio_embeddings": [np.random.random(128).astype(np.float32) for _ in range(5)],
    "audio_segments": [i * 24.1 for i in range(5)],  # 5 segments over 120.5 s
    "scene_labels": ["indoor", "person", "conversation"],
    "object_detections": [
        {"bbox": [100, 100, 200, 200], "class": "person", "confidence": 0.95},
        {"bbox": [300, 150, 400, 250], "class": "chair", "confidence": 0.87}
    ],
    "transcript": "This is a sample video transcript..."
}

video_dataset.append(video_sample)
video_dataset.commit("Added video analysis sample")
```
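
One payoff of storing per-frame embeddings as a sequence is that clip-level features can be derived on read. A minimal sketch that mean-pools the first row's frame embeddings into a single video embedding (assuming rows support the `video_dataset[0]["frame_embeddings"]` access pattern; adjust if your version differs):

```python
# Derive a clip-level embedding by mean-pooling the frame embeddings.
frames = np.asarray(video_dataset[0]["frame_embeddings"])  # (10, 512)
clip_embedding = frames.mean(axis=0)                       # (512,)
print(f"clip embedding shape: {clip_embedding.shape}")
```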

### Domain-Specific Schema Patterns

```python
# E-commerce product schema
def create_ecommerce_schema(path):
    """Schema for e-commerce product datasets."""
    dataset = deeplake.create(path)

    # Product information
    dataset.add_column("product_id", deeplake.types.Text())
    dataset.add_column("title", deeplake.types.Text(
        index_type=deeplake.types.TextIndex(deeplake.types.Inverted)
    ))
    dataset.add_column("description", deeplake.types.Text(
        index_type=deeplake.types.TextIndex(deeplake.types.BM25)
    ))

    # Visual content
    dataset.add_column("product_images", deeplake.types.Sequence(deeplake.types.Image()))
    dataset.add_column("image_embeddings", deeplake.types.Sequence(
        deeplake.types.Embedding(
            size=512,
            index_type=deeplake.types.EmbeddingIndex(deeplake.types.Clustered)
        )
    ))

    # Categorical data
    dataset.add_column("category", deeplake.types.Text())
    dataset.add_column("subcategory", deeplake.types.Text())
    dataset.add_column("brand", deeplake.types.Text())

    # Numerical attributes
    dataset.add_column("price", deeplake.types.Float32())
    dataset.add_column("rating", deeplake.types.Float32())
    dataset.add_column("review_count", deeplake.types.Int32())

    # Rich attributes
    dataset.add_column("attributes", deeplake.types.Dict())  # color, size, material, etc.
    dataset.add_column("tags", deeplake.types.Sequence(deeplake.types.Text()))

    return dataset

# Genomics data schema
def create_genomics_schema(path):
    """Schema for genomics datasets."""
    dataset = deeplake.create(path)

    # Sample identification
    dataset.add_column("sample_id", deeplake.types.Text())
    dataset.add_column("patient_id", deeplake.types.Text())
    dataset.add_column("tissue_type", deeplake.types.Text())

    # Sequence data
    dataset.add_column("sequence", deeplake.types.Text())
    dataset.add_column("quality_scores", deeplake.types.Sequence(deeplake.types.Int8()))

    # Genomic coordinates
    dataset.add_column("chromosome", deeplake.types.Text())
    dataset.add_column("start_position", deeplake.types.Int64())
    dataset.add_column("end_position", deeplake.types.Int64())

    # Variant information
    dataset.add_column("variants", deeplake.types.Sequence(deeplake.types.Dict()))
    dataset.add_column("annotations", deeplake.types.Dict())

    # Expression data
    dataset.add_column("expression_values", deeplake.types.Array(
        deeplake.types.Float32(), shape=[20000]  # ~20k genes
    ))

    # Embeddings for ML
    dataset.add_column("sequence_embeddings", deeplake.types.Embedding(size=256))

    return dataset

# Time series schema
def create_timeseries_schema(path, num_features=10):
    """Schema for time series datasets."""
    dataset = deeplake.create(path)

    # Time series identification
    dataset.add_column("series_id", deeplake.types.Text())
    dataset.add_column("start_time", deeplake.types.Int64())  # Unix timestamp
    dataset.add_column("end_time", deeplake.types.Int64())
    dataset.add_column("frequency", deeplake.types.Text())  # 'daily', 'hourly', etc.

    # Time series data
    dataset.add_column("timestamps", deeplake.types.Sequence(deeplake.types.Int64()))
    dataset.add_column("values", deeplake.types.Sequence(
        deeplake.types.Array(deeplake.types.Float32(), shape=[num_features])
    ))

    # Metadata
    dataset.add_column("source", deeplake.types.Text())
    dataset.add_column("tags", deeplake.types.Sequence(deeplake.types.Text()))
    dataset.add_column("metadata", deeplake.types.Dict())

    # Derived features
    dataset.add_column("statistical_features", deeplake.types.Array(
        deeplake.types.Float32(), shape=[50]  # pre-computed statistics
    ))
    dataset.add_column("embeddings", deeplake.types.Embedding(size=128))

    return dataset

# Use the domain-specific schemas
ecommerce_dataset = create_ecommerce_schema("./ecommerce_products")
genomics_dataset = create_genomics_schema("./genomics_samples")
timeseries_dataset = create_timeseries_schema("./time_series_data", num_features=15)

print("Created domain-specific datasets:")
print(f"E-commerce columns: {len(ecommerce_dataset.schema.columns)}")
print(f"Genomics columns: {len(genomics_dataset.schema.columns)}")
print(f"Time series columns: {len(timeseries_dataset.schema.columns)}")
```
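
To make the time-series layout concrete, here is one hypothetical row matching `create_timeseries_schema(num_features=15)`: per-row sequences for timestamps and per-step feature vectors, plus the fixed-size derived columns (all values are synthetic):

```python
# One hypothetical daily series with 30 observations of 15 features each.
num_steps, num_features = 30, 15
timeseries_dataset.append({
    "series_id": "sensor-001",
    "start_time": 1_700_000_000,                      # Unix seconds
    "end_time": 1_700_000_000 + num_steps * 86_400,
    "frequency": "daily",
    "timestamps": [1_700_000_000 + i * 86_400 for i in range(num_steps)],
    "values": [np.random.random(num_features).astype(np.float32)
               for _ in range(num_steps)],
    "source": "example",
    "tags": ["demo", "synthetic"],
    "metadata": {"site": "A"},
    "statistical_features": np.random.random(50).astype(np.float32),
    "embeddings": np.random.random(128).astype(np.float32),
})
timeseries_dataset.commit("Added example time series row")
```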

### Schema Template Best Practices

```python
# Best practices for creating reusable schema templates

class FlexibleImageDatasetSchema:
    """Flexible schema template for image datasets."""

    def __init__(self,
                 include_embeddings=True,
                 embedding_size=512,
                 include_annotations=True,
                 include_metadata=True,
                 enable_text_search=False,
                 enable_similarity_search=True):
        self.include_embeddings = include_embeddings
        self.embedding_size = embedding_size
        self.include_annotations = include_annotations
        self.include_metadata = include_metadata
        self.enable_text_search = enable_text_search
        self.enable_similarity_search = enable_similarity_search

    def create_dataset(self, path):
        """Create a dataset with the flexible schema."""
        dataset = deeplake.create(path)

        # Core image columns (always present)
        dataset.add_column("image_id", deeplake.types.Text())
        dataset.add_column("image", deeplake.types.Image())
        dataset.add_column("width", deeplake.types.Int32())
        dataset.add_column("height", deeplake.types.Int32())

        # Optional embeddings
        if self.include_embeddings:
            index_type = None
            if self.enable_similarity_search:
                index_type = deeplake.types.EmbeddingIndex(deeplake.types.Clustered)
            dataset.add_column("embeddings",
                               deeplake.types.Embedding(size=self.embedding_size,
                                                        index_type=index_type))

        # Optional annotations
        if self.include_annotations:
            dataset.add_column("labels", deeplake.types.Sequence(deeplake.types.Text()))
            dataset.add_column("bboxes", deeplake.types.Sequence(deeplake.types.BoundingBox()))
            dataset.add_column("masks", deeplake.types.Sequence(deeplake.types.SegmentMask()))

        # Optional text fields with search indexes
        if self.enable_text_search:
            dataset.add_column("caption",
                               deeplake.types.Text(
                                   index_type=deeplake.types.TextIndex(deeplake.types.BM25)
                               ))
            dataset.add_column("description",
                               deeplake.types.Text(
                                   index_type=deeplake.types.TextIndex(deeplake.types.Inverted)
                               ))

        # Optional metadata
        if self.include_metadata:
            dataset.add_column("metadata", deeplake.types.Dict())
            dataset.add_column("source", deeplake.types.Text())
            dataset.add_column("created_at", deeplake.types.Int64())

        return dataset

# Usage examples of the flexible schema

# Minimal image dataset
minimal_schema = FlexibleImageDatasetSchema(
    include_embeddings=False,
    include_annotations=False,
    include_metadata=False
)
minimal_dataset = minimal_schema.create_dataset("./minimal_images")

# Full-featured image dataset
full_schema = FlexibleImageDatasetSchema(
    include_embeddings=True,
    embedding_size=768,
    include_annotations=True,
    include_metadata=True,
    enable_text_search=True,
    enable_similarity_search=True
)
full_dataset = full_schema.create_dataset("./full_featured_images")

print(f"Minimal schema columns: {len(minimal_dataset.schema.columns)}")
print(f"Full schema columns: {len(full_dataset.schema.columns)}")

# Demonstrate schema validation
def validate_schema_compatibility(dataset1, dataset2):
    """Check whether two datasets have compatible schemas."""
    schema1_cols = {col.name: type(col.dtype) for col in dataset1.schema.columns}
    schema2_cols = {col.name: type(col.dtype) for col in dataset2.schema.columns}

    common_cols = set(schema1_cols) & set(schema2_cols)
    compatible_cols = [
        col for col in common_cols
        if schema1_cols[col] == schema2_cols[col]
    ]

    return {
        "compatible": len(compatible_cols) == len(common_cols),
        "common_columns": list(common_cols),
        "compatible_columns": compatible_cols,
        "schema1_only": list(set(schema1_cols) - set(schema2_cols)),
        "schema2_only": list(set(schema2_cols) - set(schema1_cols)),
    }

# Test schema compatibility
compatibility = validate_schema_compatibility(minimal_dataset, full_dataset)
print(f"Schema compatibility: {compatibility}")
```
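
The compatibility report can also drive a conservative merge: copy only the columns that exist with identical types in both schemas. A sketch, assuming rows are iterable and support `row["column"]` access as in the examples above:

```python
# Copy rows between datasets, restricted to columns that exist with
# identical types in both schemas.
def copy_compatible_rows(src, dst):
    report = validate_schema_compatibility(src, dst)
    cols = report["compatible_columns"]
    for row in src:
        dst.append({name: row[name] for name in cols})
    dst.commit(f"Copied {len(src)} rows over columns {cols}")

# e.g. minimal -> full would copy only the shared core image columns
# copy_compatible_rows(minimal_dataset, full_dataset)
```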