
# Schema Templates

Pre-defined schema templates for common ML use cases, including text embeddings, COCO datasets, and custom schema creation patterns. Schema templates provide standardized dataset structures for specific domains and applications.

## Capabilities

### Pre-defined Templates

Ready-to-use schema templates for common machine learning scenarios with optimized column types and indexing.

```python { .api }
class TextEmbeddings:
    """Schema template for text embeddings datasets."""

    def __init__(self, embedding_size: int, quantize: bool = False):
        """
        Initialize text embeddings schema.

        Parameters:
        - embedding_size: Dimension of embedding vectors
        - quantize: Whether to use quantized embeddings for memory efficiency
        """


class COCOImages:
    """COCO dataset schema template."""

    def __init__(
        self,
        embedding_size: int,
        quantize: bool = False,
        objects: bool = True,
        keypoints: bool = False,
        stuffs: bool = False,
    ):
        """
        Initialize COCO images schema.

        Parameters:
        - embedding_size: Dimension of embedding vectors for images
        - quantize: Whether to use quantized embeddings
        - objects: Include object detection annotations
        - keypoints: Include keypoint detection annotations
        - stuffs: Include stuff segmentation annotations
        """


class SchemaTemplate:
    """Base class for schema templates."""
    pass
```
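
Both templates are plain classes whose constructors take only the parameters documented above, so instantiating one is a single call (the same calls appear in the usage examples below):

```python
from deeplake.schemas import TextEmbeddings, COCOImages

text_schema = TextEmbeddings(embedding_size=768, quantize=False)
coco_schema = COCOImages(embedding_size=512, objects=True, keypoints=True, stuffs=False)
```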


## Usage Examples


### Text Embeddings Schema


```python
import deeplake
import numpy as np

from deeplake.schemas import TextEmbeddings

# Create dataset with text embeddings schema
schema = TextEmbeddings(embedding_size=768)
dataset = deeplake.create("./text_embeddings_dataset", schema=schema)

# Examine the generated schema
print("Text Embeddings Schema:")
for col in dataset.schema.columns:
    print(f"  {col.name}: {type(col.dtype).__name__}")

# Add text data with embeddings
dataset.append({
    "text": "This is a sample text for embedding.",
    "embeddings": np.random.random(768).astype(np.float32)
})

dataset.append({
    "text": "Another example text with semantic meaning.",
    "embeddings": np.random.random(768).astype(np.float32)
})

dataset.commit("Added text embeddings data")

# Query similar texts (using embedding similarity)
target_embedding = np.random.random(768).astype(np.float32)
similar_texts = deeplake.query(f"""
    SELECT text, COSINE_SIMILARITY(embeddings, {target_embedding.tolist()}) as similarity
    FROM "./text_embeddings_dataset"
    WHERE COSINE_SIMILARITY(embeddings, {target_embedding.tolist()}) > 0.5
    ORDER BY similarity DESC
""")

print(f"Found {len(similar_texts)} similar texts")
```
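
If you run the same similarity lookup repeatedly, the query can be wrapped in a small helper. A minimal sketch reusing the exact query shape above, with the threshold lifted into a parameter:

```python
def find_similar(embedding, threshold=0.5):
    """Return texts whose embedding similarity to `embedding` exceeds `threshold`."""
    emb = embedding.tolist()
    return deeplake.query(f"""
        SELECT text, COSINE_SIMILARITY(embeddings, {emb}) as similarity
        FROM "./text_embeddings_dataset"
        WHERE COSINE_SIMILARITY(embeddings, {emb}) > {threshold}
        ORDER BY similarity DESC
    """)

results = find_similar(target_embedding, threshold=0.7)
```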

### Text Embeddings with Quantization

```python
# Create quantized embeddings dataset for memory efficiency
quantized_schema = TextEmbeddings(embedding_size=1024, quantize=True)
quantized_dataset = deeplake.create("./quantized_embeddings", schema=quantized_schema)

print("Quantized Embeddings Schema:")
for col in quantized_dataset.schema.columns:
    print(f"  {col.name}: {type(col.dtype).__name__}")
    if hasattr(col.dtype, 'quantization'):
        print(f"    Quantization: {col.dtype.quantization}")

# Add quantized embedding data
large_embeddings = [
    np.random.random(1024).astype(np.float32) for _ in range(1000)
]

texts = [f"Document {i} content..." for i in range(1000)]

batch_data = [
    {"text": text, "embeddings": embedding}
    for text, embedding in zip(texts, large_embeddings)
]

quantized_dataset.extend(batch_data)
quantized_dataset.commit("Added quantized embeddings batch")

print(f"Quantized dataset size: {len(quantized_dataset)} documents")
```
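
For a rough sense of what quantization saves, the raw batch above is easy to size by hand. This back-of-envelope sketch ignores Deep Lake's own storage-format overheads and compression:

```python
# 1000 vectors x 1024 dims x 4 bytes per float32
vectors, dims, bytes_per_float32 = 1000, 1024, 4
raw_mib = vectors * dims * bytes_per_float32 / 1024**2
print(f"Raw float32 batch: {raw_mib:.1f} MiB")  # ~3.9 MiB before quantization
```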

### COCO Images Schema

```python
from deeplake.schemas import COCOImages

# Create COCO dataset with object detection
coco_schema = COCOImages(embedding_size=512, objects=True, keypoints=False)
coco_dataset = deeplake.create("./coco_dataset", schema=coco_schema)

print("COCO Images Schema:")
for col in coco_dataset.schema.columns:
    print(f"  {col.name}: {type(col.dtype).__name__}")

# Add COCO-style data
coco_sample = {
    "images": "./images/sample_image.jpg",
    "embeddings": np.random.random(512).astype(np.float32),
    # Add other COCO-specific fields based on schema
}

# The schema defines the expected structure for COCO data
coco_dataset.append(coco_sample)
coco_dataset.commit("Added COCO sample")
```
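
Batch ingestion works the same way as in the quantization example. A sketch using `extend`, assuming (as with the single append above) that the annotation fields can be filled in later; the image paths are placeholders:

```python
# Hypothetical batch of image paths paired with random embeddings
coco_batch = [
    {
        "images": f"./images/sample_{i}.jpg",
        "embeddings": np.random.random(512).astype(np.float32),
    }
    for i in range(100)
]

coco_dataset.extend(coco_batch)
coco_dataset.commit("Added COCO batch")
```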

### COCO with Keypoints

```python
# COCO schema with keypoint detection
coco_keypoints_schema = COCOImages(
    embedding_size=256,
    objects=True,
    keypoints=True,
    stuffs=False
)

coco_keypoints_dataset = deeplake.create("./coco_keypoints", schema=coco_keypoints_schema)

print("COCO Keypoints Schema:")
for col in coco_keypoints_dataset.schema.columns:
    print(f"  {col.name}: {type(col.dtype).__name__}")

# Add keypoint data
keypoint_sample = {
    "images": "./images/person_image.jpg",
    "embeddings": np.random.random(256).astype(np.float32),
    # Keypoint-specific fields would be defined by the schema
}

coco_keypoints_dataset.append(keypoint_sample)
coco_keypoints_dataset.commit("Added keypoint sample")
```

### Custom Schema Templates

```python
# Create custom schema templates for specific domains

class VideoAnalysisSchema:
    """Custom schema for video analysis datasets."""

    def __init__(self, frame_embedding_size=512, audio_embedding_size=128):
        self.frame_embedding_size = frame_embedding_size
        self.audio_embedding_size = audio_embedding_size

    def create_schema(self):
        """Create the actual schema definition."""
        # This would return a schema specification
        # In practice, this might create the columns directly
        pass


class MedicalImagingSchema:
    """Custom schema for medical imaging datasets."""

    def __init__(self, include_dicom_metadata=True, embedding_size=1024):
        self.include_dicom_metadata = include_dicom_metadata
        self.embedding_size = embedding_size

    def create_schema(self):
        """Create medical imaging schema."""
        pass


# Implement custom video analysis dataset
def create_video_analysis_dataset(path, frame_emb_size=512, audio_emb_size=128):
    """Create dataset optimized for video analysis."""

    dataset = deeplake.create(path)

    # Video-specific columns
    dataset.add_column("video_path", deeplake.types.Text())
    dataset.add_column("video_metadata", deeplake.types.Dict())
    dataset.add_column("duration", deeplake.types.Float32())
    dataset.add_column("fps", deeplake.types.Float32())

    # Frame analysis
    dataset.add_column("frame_embeddings",
                       deeplake.types.Sequence(
                           deeplake.types.Embedding(size=frame_emb_size)
                       ))
    dataset.add_column("frame_timestamps",
                       deeplake.types.Sequence(deeplake.types.Float32()))

    # Audio analysis
    dataset.add_column("audio_embeddings",
                       deeplake.types.Sequence(
                           deeplake.types.Embedding(size=audio_emb_size)
                       ))
    dataset.add_column("audio_segments",
                       deeplake.types.Sequence(deeplake.types.Float32()))

    # Analysis results
    dataset.add_column("scene_labels", deeplake.types.Sequence(deeplake.types.Text()))
    dataset.add_column("object_detections", deeplake.types.Sequence(deeplake.types.Dict()))
    dataset.add_column("transcript", deeplake.types.Text())

    return dataset


# Use custom schema
video_dataset = create_video_analysis_dataset("./video_analysis")

# Add video analysis data
video_sample = {
    "video_path": "./videos/sample_video.mp4",
    "video_metadata": {"resolution": "1920x1080", "codec": "h264"},
    "duration": 120.5,
    "fps": 30.0,
    "frame_embeddings": [np.random.random(512).astype(np.float32) for _ in range(10)],
    "frame_timestamps": [i * 0.033 for i in range(10)],  # 30fps intervals
    "audio_embeddings": [np.random.random(128).astype(np.float32) for _ in range(5)],
    "audio_segments": [i * 24.1 for i in range(5)],  # 5 audio segments
    "scene_labels": ["indoor", "person", "conversation"],
    "object_detections": [
        {"bbox": [100, 100, 200, 200], "class": "person", "confidence": 0.95},
        {"bbox": [300, 150, 400, 250], "class": "chair", "confidence": 0.87}
    ],
    "transcript": "This is a sample video transcript..."
}

video_dataset.append(video_sample)
video_dataset.commit("Added video analysis sample")
```
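
Columns added this way are queryable like any template-generated column. A sketch, assuming the same TQL entry point used in the text-embeddings example:

```python
# Find videos longer than a minute by filtering on the custom `duration` column
long_videos = deeplake.query('SELECT * FROM "./video_analysis" WHERE duration > 60.0')
print(f"Videos longer than a minute: {len(long_videos)}")
```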

### Domain-Specific Schema Patterns

```python
# E-commerce product schema
def create_ecommerce_schema(path):
    """Schema for e-commerce product datasets."""

    dataset = deeplake.create(path)

    # Product information
    dataset.add_column("product_id", deeplake.types.Text())
    dataset.add_column("title", deeplake.types.Text(
        index_type=deeplake.types.TextIndex(deeplake.types.Inverted)
    ))
    dataset.add_column("description", deeplake.types.Text(
        index_type=deeplake.types.TextIndex(deeplake.types.BM25)
    ))

    # Visual content
    dataset.add_column("product_images", deeplake.types.Sequence(deeplake.types.Image()))
    dataset.add_column("image_embeddings", deeplake.types.Sequence(
        deeplake.types.Embedding(size=512,
                                 index_type=deeplake.types.EmbeddingIndex(deeplake.types.Clustered))
    ))

    # Categorical data
    dataset.add_column("category", deeplake.types.Text())
    dataset.add_column("subcategory", deeplake.types.Text())
    dataset.add_column("brand", deeplake.types.Text())

    # Numerical attributes
    dataset.add_column("price", deeplake.types.Float32())
    dataset.add_column("rating", deeplake.types.Float32())
    dataset.add_column("review_count", deeplake.types.Int32())

    # Rich attributes
    dataset.add_column("attributes", deeplake.types.Dict())  # Color, size, material, etc.
    dataset.add_column("tags", deeplake.types.Sequence(deeplake.types.Text()))

    return dataset


# Genomics data schema
def create_genomics_schema(path):
    """Schema for genomics datasets."""

    dataset = deeplake.create(path)

    # Sample identification
    dataset.add_column("sample_id", deeplake.types.Text())
    dataset.add_column("patient_id", deeplake.types.Text())
    dataset.add_column("tissue_type", deeplake.types.Text())

    # Sequence data
    dataset.add_column("sequence", deeplake.types.Text())
    dataset.add_column("quality_scores", deeplake.types.Sequence(deeplake.types.Int8()))

    # Genomic coordinates
    dataset.add_column("chromosome", deeplake.types.Text())
    dataset.add_column("start_position", deeplake.types.Int64())
    dataset.add_column("end_position", deeplake.types.Int64())

    # Variant information
    dataset.add_column("variants", deeplake.types.Sequence(deeplake.types.Dict()))
    dataset.add_column("annotations", deeplake.types.Dict())

    # Expression data
    dataset.add_column("expression_values", deeplake.types.Array(
        deeplake.types.Float32(), shape=[20000]  # ~20k genes
    ))

    # Embeddings for ML
    dataset.add_column("sequence_embeddings", deeplake.types.Embedding(size=256))

    return dataset


# Time series schema
def create_timeseries_schema(path, num_features=10):
    """Schema for time series datasets."""

    dataset = deeplake.create(path)

    # Time series identification
    dataset.add_column("series_id", deeplake.types.Text())
    dataset.add_column("start_time", deeplake.types.Int64())  # Unix timestamp
    dataset.add_column("end_time", deeplake.types.Int64())
    dataset.add_column("frequency", deeplake.types.Text())  # 'daily', 'hourly', etc.

    # Time series data
    dataset.add_column("timestamps", deeplake.types.Sequence(deeplake.types.Int64()))
    dataset.add_column("values", deeplake.types.Sequence(
        deeplake.types.Array(deeplake.types.Float32(), shape=[num_features])
    ))

    # Metadata
    dataset.add_column("source", deeplake.types.Text())
    dataset.add_column("tags", deeplake.types.Sequence(deeplake.types.Text()))
    dataset.add_column("metadata", deeplake.types.Dict())

    # Derived features
    dataset.add_column("statistical_features", deeplake.types.Array(
        deeplake.types.Float32(), shape=[50]  # Pre-computed stats
    ))
    dataset.add_column("embeddings", deeplake.types.Embedding(size=128))

    return dataset


# Use domain-specific schemas
ecommerce_dataset = create_ecommerce_schema("./ecommerce_products")
genomics_dataset = create_genomics_schema("./genomics_samples")
timeseries_dataset = create_timeseries_schema("./time_series_data", num_features=15)

print("Created domain-specific datasets:")
print(f"E-commerce columns: {len(ecommerce_dataset.schema.columns)}")
print(f"Genomics columns: {len(genomics_dataset.schema.columns)}")
print(f"Time series columns: {len(timeseries_dataset.schema.columns)}")
```
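
The schema-inspection loop repeated throughout these examples can be factored into a small helper:

```python
def describe(dataset, title):
    """Print each column's name and dtype, matching the loops used above."""
    print(title)
    for col in dataset.schema.columns:
        print(f"  {col.name}: {type(col.dtype).__name__}")

describe(ecommerce_dataset, "E-commerce Schema:")
describe(timeseries_dataset, "Time Series Schema:")
```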

### Schema Template Best Practices

```python
# Best practices for creating reusable schema templates

class FlexibleImageDatasetSchema:
    """Flexible schema template for image datasets."""

    def __init__(self,
                 include_embeddings=True,
                 embedding_size=512,
                 include_annotations=True,
                 include_metadata=True,
                 enable_text_search=False,
                 enable_similarity_search=True):
        self.include_embeddings = include_embeddings
        self.embedding_size = embedding_size
        self.include_annotations = include_annotations
        self.include_metadata = include_metadata
        self.enable_text_search = enable_text_search
        self.enable_similarity_search = enable_similarity_search

    def create_dataset(self, path):
        """Create dataset with flexible schema."""

        dataset = deeplake.create(path)

        # Core image columns (always present)
        dataset.add_column("image_id", deeplake.types.Text())
        dataset.add_column("image", deeplake.types.Image())
        dataset.add_column("width", deeplake.types.Int32())
        dataset.add_column("height", deeplake.types.Int32())

        # Optional embeddings
        if self.include_embeddings:
            index_type = None
            if self.enable_similarity_search:
                index_type = deeplake.types.EmbeddingIndex(deeplake.types.Clustered)

            dataset.add_column("embeddings",
                               deeplake.types.Embedding(size=self.embedding_size,
                                                        index_type=index_type))

        # Optional annotations
        if self.include_annotations:
            dataset.add_column("labels", deeplake.types.Sequence(deeplake.types.Text()))
            dataset.add_column("bboxes", deeplake.types.Sequence(deeplake.types.BoundingBox()))
            dataset.add_column("masks", deeplake.types.Sequence(deeplake.types.SegmentMask()))

        # Optional text fields with search
        if self.enable_text_search:
            dataset.add_column("caption",
                               deeplake.types.Text(
                                   index_type=deeplake.types.TextIndex(deeplake.types.BM25)
                               ))
            dataset.add_column("description",
                               deeplake.types.Text(
                                   index_type=deeplake.types.TextIndex(deeplake.types.Inverted)
                               ))

        # Optional metadata
        if self.include_metadata:
            dataset.add_column("metadata", deeplake.types.Dict())
            dataset.add_column("source", deeplake.types.Text())
            dataset.add_column("created_at", deeplake.types.Int64())

        return dataset


# Usage examples of flexible schema

# Minimal image dataset
minimal_schema = FlexibleImageDatasetSchema(
    include_embeddings=False,
    include_annotations=False,
    include_metadata=False
)
minimal_dataset = minimal_schema.create_dataset("./minimal_images")

# Full-featured image dataset
full_schema = FlexibleImageDatasetSchema(
    include_embeddings=True,
    embedding_size=768,
    include_annotations=True,
    include_metadata=True,
    enable_text_search=True,
    enable_similarity_search=True
)
full_dataset = full_schema.create_dataset("./full_featured_images")

print(f"Minimal schema columns: {len(minimal_dataset.schema.columns)}")
print(f"Full schema columns: {len(full_dataset.schema.columns)}")


# Demonstrate schema validation
def validate_schema_compatibility(dataset1, dataset2):
    """Check if two datasets have compatible schemas."""

    schema1_cols = {col.name: type(col.dtype) for col in dataset1.schema.columns}
    schema2_cols = {col.name: type(col.dtype) for col in dataset2.schema.columns}

    common_cols = set(schema1_cols.keys()) & set(schema2_cols.keys())
    compatible_cols = [
        col for col in common_cols
        if schema1_cols[col] == schema2_cols[col]
    ]

    return {
        "compatible": len(compatible_cols) == len(common_cols),
        "common_columns": list(common_cols),
        "compatible_columns": compatible_cols,
        "schema1_only": list(set(schema1_cols.keys()) - set(schema2_cols.keys())),
        "schema2_only": list(set(schema2_cols.keys()) - set(schema1_cols.keys()))
    }


# Test schema compatibility
compatibility = validate_schema_compatibility(minimal_dataset, full_dataset)
print(f"Schema compatibility: {compatibility}")
```
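
The compatibility report can gate merge or append operations before any data moves; for example:

```python
report = validate_schema_compatibility(minimal_dataset, full_dataset)
if report["compatible"] and not report["schema2_only"]:
    # Every column in the second dataset exists in the first with a matching dtype
    print("Rows can move between the datasets column-for-column.")
else:
    print(f"Columns present in only one schema: {report['schema1_only'] + report['schema2_only']}")
```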