# Version Control

Deep Lake provides a Git-like version control system for datasets, with branching, tagging, commit history, and merge operations to support dataset evolution and collaboration. These versioning capabilities enable reproducible ML experiments and dataset lineage tracking.

## Capabilities

### Dataset Versioning

Core version control operations for tracking dataset changes, with commit history and rollback capabilities.
```python { .api }
class Dataset:
    """Dataset version control operations."""

    version: Version
    history: History
    current_branch: str

    def commit(self, message: str = "") -> str:
        """
        Commit current dataset changes.

        Parameters:
        - message: Commit message describing changes

        Returns:
        str: Commit ID/hash
        """

    def rollback(self, version_id: str) -> None:
        """
        Rollback dataset to specific version.

        Parameters:
        - version_id: Version ID to rollback to
        """

    def refresh(self) -> None:
        """Refresh dataset to latest version from storage."""

class Version:
    """Single version information."""

    id: str
    message: str
    timestamp: str
    client_timestamp: str

    def open(self) -> ReadOnlyDataset:
        """
        Open this version as read-only dataset.

        Returns:
        ReadOnlyDataset: Dataset at this version
        """

    def open_async(self) -> Future[ReadOnlyDataset]:
        """
        Open this version asynchronously.

        Returns:
        Future[ReadOnlyDataset]: Future resolving to dataset at this version
        """

class History:
    """Version history access."""

    def __getitem__(self, key: Union[int, str]) -> Version:
        """
        Access version by index or ID.

        Parameters:
        - key: Version index (int) or version ID (str)

        Returns:
        Version: Version object
        """

    def __iter__(self) -> Iterator[Version]:
        """Iterate over all versions in chronological order."""

    def __len__(self) -> int:
        """Get total number of versions."""
```
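For example, the ID returned by `commit()` can be used to look up that version in `history` by ID rather than by index, and `open_async()` gives non-blocking access to a pinned version. A minimal sketch, assuming the dataset path is a placeholder, that the ID returned by `commit()` is the same identifier exposed as `Version.id`, and that the returned future exposes a blocking `result()` accessor:

```python
import deeplake

ds = deeplake.open("./my_dataset")  # placeholder path
commit_id = ds.commit("Checkpoint before preprocessing")

# History supports lookup by commit ID (str) as well as by position (int)
checkpoint = ds.history[commit_id]
print(f"{checkpoint.id}: {checkpoint.message} ({checkpoint.timestamp})")

# Open the pinned version without blocking; result() is assumed to block
# until the read-only snapshot is ready
snapshot = checkpoint.open_async().result()
print(f"Snapshot at {checkpoint.id} has {len(snapshot)} rows")
```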
### Branch Management

Create and manage dataset branches for parallel development and experimentation.
```python { .api }
90
class Dataset:
91
"""Dataset branch operations."""
92
93
branches: Branches
94
95
def branch(self, name: str) -> Branch:
96
"""
97
Create new branch from current state.
98
99
Parameters:
100
- name: Branch name
101
102
Returns:
103
Branch: New branch object
104
"""
105
106
def merge(self, branch_name: str, message: str = "") -> None:
107
"""
108
Merge branch into current branch.
109
110
Parameters:
111
- branch_name: Name of branch to merge
112
- message: Merge commit message
113
"""
114
115
class Branch:
116
"""Dataset branch management."""
117
118
id: str
119
name: str
120
timestamp: str
121
base: str
122
123
def open(self) -> Dataset:
124
"""
125
Open this branch for modification.
126
127
Returns:
128
Dataset: Mutable dataset on this branch
129
"""
130
131
def open_async(self) -> Future[Dataset]:
132
"""
133
Open this branch asynchronously.
134
135
Returns:
136
Future[Dataset]: Future resolving to mutable dataset
137
"""
138
139
def delete(self) -> None:
140
"""Delete this branch (cannot delete main branch)."""
141
142
def rename(self, new_name: str) -> None:
143
"""
144
Rename this branch.
145
146
Parameters:
147
- new_name: New branch name
148
"""
149
150
class BranchView:
151
"""Read-only branch information."""
152
153
id: str
154
name: str
155
timestamp: str
156
base: str
157
158
def open(self) -> ReadOnlyDataset:
159
"""Open this branch as read-only dataset."""
160
161
def open_async(self) -> Future[ReadOnlyDataset]:
162
"""Open this branch asynchronously."""
163
164
class Branches:
165
"""Collection of branches (mutable)."""
166
167
def names(self) -> List[str]:
168
"""
169
Get all branch names.
170
171
Returns:
172
List[str]: List of branch names
173
"""
174
175
def __len__(self) -> int:
176
"""Get number of branches."""
177
178
def __getitem__(self, name: str) -> Branch:
179
"""
180
Get branch by name.
181
182
Parameters:
183
- name: Branch name
184
185
Returns:
186
Branch: Branch object
187
"""
188
189
class BranchesView:
190
"""Collection of branches (read-only)."""
191
192
def names(self) -> List[str]:
193
"""Get all branch names."""
194
195
def __len__(self) -> int:
196
"""Get number of branches."""
197
198
def __getitem__(self, name: str) -> BranchView:
199
"""Get branch by name."""
200
```
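Beyond creating and merging, branches can be renamed in place and their metadata inspected without opening them for writing. A minimal sketch; the dataset path and branch names are placeholders:

```python
import deeplake

ds = deeplake.open("./my_dataset")  # placeholder path

# Create a working branch, then rename it once its purpose is settled
branch = ds.branch("staging")
branch.rename("qa_review")

# Inspect branch metadata without modifying anything
for name in ds.branches.names():
    info = ds.branches[name]
    print(f"{info.name} (id={info.id}, base={info.base}, created={info.timestamp})")

# Delete the branch when it is no longer needed (main cannot be deleted)
ds.branches["qa_review"].delete()
```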
### Tag Management

Create and manage dataset tags for marking important versions and milestones.
```python { .api }
class Dataset:
    """Dataset tag operations."""

    tags: Tags

    def tag(self, name: str, message: str = "") -> Tag:
        """
        Create tag at current version.

        Parameters:
        - name: Tag name
        - message: Tag message/description

        Returns:
        Tag: New tag object
        """

class DatasetView:
    """Query result tag operations."""

    def tag(self, name: str, message: str = "") -> TagView:
        """Create tag from query result view."""

class Tag:
    """Dataset tag management."""

    id: str
    name: str
    message: str
    version: str
    timestamp: str

    def open(self) -> ReadOnlyDataset:
        """
        Open dataset at this tag version.

        Returns:
        ReadOnlyDataset: Dataset at tag version
        """

    def open_async(self) -> Future[ReadOnlyDataset]:
        """
        Open dataset at this tag asynchronously.

        Returns:
        Future[ReadOnlyDataset]: Future resolving to dataset
        """

    def delete(self) -> None:
        """Delete this tag."""

    def rename(self, new_name: str) -> None:
        """
        Rename this tag.

        Parameters:
        - new_name: New tag name
        """

class TagView:
    """Read-only tag information."""

    id: str
    name: str
    message: str
    version: str
    timestamp: str

    def open(self) -> ReadOnlyDataset:
        """Open dataset at this tag version."""

    def open_async(self) -> Future[ReadOnlyDataset]:
        """Open dataset at this tag asynchronously."""

class Tags:
    """Collection of tags (mutable)."""

    def names(self) -> List[str]:
        """
        Get all tag names.

        Returns:
        List[str]: List of tag names
        """

    def __len__(self) -> int:
        """Get number of tags."""

    def __getitem__(self, name: str) -> Tag:
        """
        Get tag by name.

        Parameters:
        - name: Tag name

        Returns:
        Tag: Tag object
        """

class TagsView:
    """Collection of tags (read-only)."""

    def names(self) -> List[str]:
        """Get all tag names."""

    def __len__(self) -> int:
        """Get number of tags."""

    def __getitem__(self, name: str) -> TagView:
        """Get tag by name."""
```
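Tags are lightweight references to specific versions, so renaming or deleting a tag does not affect the underlying commit. A minimal sketch; the dataset path and tag names are placeholders:

```python
import deeplake

ds = deeplake.open("./my_dataset")  # placeholder path

# Tag the current version, then adjust the name as release conventions change
tag = ds.tag("release_candidate", "Candidate data for the next training run")
tag.rename("v2.0-rc1")

# A tag resolves to a read-only snapshot of the tagged version
snapshot = ds.tags["v2.0-rc1"].open()
print(f"Tag v2.0-rc1 points at version {ds.tags['v2.0-rc1'].version} "
      f"with {len(snapshot)} rows")

# Remove the tag once it is superseded; the tagged commit itself remains
ds.tags["v2.0-rc1"].delete()
```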
### Remote Synchronization

Push and pull operations for synchronizing dataset versions with remote storage.
```python { .api }
class Dataset:
    """Remote synchronization operations."""

    def push(self) -> None:
        """Push local changes to remote storage."""

    def pull(self) -> None:
        """Pull remote changes to local dataset."""

class ReadOnlyDataset:
    """Remote operations for read-only datasets."""

    def push(self) -> None:
        """Push dataset state to remote (metadata only)."""

    def refresh(self) -> None:
        """Refresh dataset from remote storage."""
```
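A typical pattern is a writer that commits and pushes while a reader in another process calls `pull()` or `refresh()` to pick up new versions without reopening the dataset. A minimal sketch, assuming both handles point at the same remote URL (shown here as a placeholder):

```python
import deeplake

# Writer process: commit locally, then publish to the shared location
writer = deeplake.open("s3://my-bucket/shared_dataset")  # placeholder URL
writer.append({"images": "new_image.jpg", "labels": "dog"})
writer.commit("Added one labeled image")
writer.push()

# Reader process: fetch versions created elsewhere, then move to the latest one
reader = deeplake.open("s3://my-bucket/shared_dataset")
reader.pull()
reader.refresh()
print(f"Reader now sees version {reader.version.id}")
```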
## Usage Examples

### Basic Version Control
```python
import deeplake

# Open dataset and make changes
dataset = deeplake.open("./my_dataset")

# Add some data
dataset.append({"images": "new_image.jpg", "labels": "cat"})

# Commit changes
commit_id = dataset.commit("Added new cat image")
print(f"Committed changes: {commit_id}")

# View commit history
for version in dataset.history:
    print(f"Version {version.id}: {version.message} ({version.timestamp})")

# Get current version info
current_version = dataset.version
print(f"Current version: {current_version.id}")
print(f"Commit message: {current_version.message}")
```
### Branch Operations
```python
# Create new branch for experiments
experiment_branch = dataset.branch("feature_experiment")
print(f"Created branch: {experiment_branch.name}")

# List all branches
print("Available branches:")
for branch_name in dataset.branches.names():
    branch = dataset.branches[branch_name]
    print(f"  {branch.name} (created: {branch.timestamp})")

# Switch to experiment branch
experiment_dataset = experiment_branch.open()

# Make experimental changes
experiment_dataset.add_column("confidence", deeplake.types.Float32())
experiment_dataset.append({
    "images": "experiment_image.jpg",
    "labels": "experimental_label",
    "confidence": 0.95
})

# Commit experimental changes
experiment_dataset.commit("Added confidence scores for experimentation")

# Switch back to main branch
main_dataset = dataset.branches["main"].open()

# Merge experimental branch into main
main_dataset.merge("feature_experiment", "Merged confidence score feature")

# Clean up: delete experimental branch
experiment_branch.delete()
```
### Tag Management
```python
# Create tags for important milestones
v1_tag = dataset.tag("v1.0", "Initial production dataset")
print(f"Created tag: {v1_tag.name}")

# Add more data and create another tag
dataset.extend([
    {"images": f"batch_image_{i}.jpg", "labels": f"label_{i}"}
    for i in range(100)
])
dataset.commit("Added batch of 100 images")

v1_1_tag = dataset.tag("v1.1", "Added training batch")

# List all tags
print("Available tags:")
for tag_name in dataset.tags.names():
    tag = dataset.tags[tag_name]
    print(f"  {tag.name}: {tag.message} (version: {tag.version})")

# Open dataset at specific tag
v1_dataset = v1_tag.open()
print(f"Dataset at v1.0 has {len(v1_dataset)} rows")

# Compare with current version
print(f"Current dataset has {len(dataset)} rows")
```
### Version History and Rollback
```python
# Examine version history
print(f"Dataset has {len(dataset.history)} versions")

# Get specific version
latest_version = dataset.history[-1]  # Most recent
first_version = dataset.history[0]  # First version

print(f"Latest: {latest_version.message}")
print(f"First: {first_version.message}")

# Open dataset at specific version
historical_dataset = first_version.open()
print(f"First version had {len(historical_dataset)} rows")

# Rollback to previous version if needed
if len(dataset.history) > 1:
    previous_version = dataset.history[-2]
    dataset.rollback(previous_version.id)
    print(f"Rolled back to: {previous_version.message}")
```
### Remote Synchronization
```python
# Push local changes to remote storage
dataset.push()
print("Pushed local changes to remote")

# Pull remote changes (in another location/process)
remote_dataset = deeplake.open("s3://my-bucket/shared_dataset")
remote_dataset.pull()
print("Pulled latest changes from remote")

# Refresh to pick up the latest version without an explicit pull
remote_dataset.refresh()
print("Refreshed dataset metadata from remote")
```
### Collaborative Workflows
```python
# Typical collaborative workflow

# Developer A: Create feature branch
dataset_a = deeplake.open("s3://shared-bucket/project_dataset")
feature_branch = dataset_a.branch("add_validation_data")
feature_dataset = feature_branch.open()

# Add validation data
validation_data = [
    {"images": f"val_image_{i}.jpg", "labels": f"val_label_{i}"}
    for i in range(500)
]
feature_dataset.extend(validation_data)
feature_dataset.commit("Added validation dataset")

# Push branch to remote
feature_dataset.push()

# Developer B: Pull and review changes
dataset_b = deeplake.open("s3://shared-bucket/project_dataset")
dataset_b.pull()

# Review feature branch
feature_branch_b = dataset_b.branches["add_validation_data"]
feature_data_b = feature_branch_b.open()
print(f"Feature branch has {len(feature_data_b)} total rows")

# Merge into main after review
main_dataset = dataset_b.branches["main"].open()
main_dataset.merge("add_validation_data", "Merged validation data from feature branch")
main_dataset.push()

# Tag the release
release_tag = main_dataset.tag("v2.0", "Added validation dataset - ready for training")
```
### Advanced Version Control
```python
# Complex branching scenario
dataset = deeplake.open("./complex_dataset")

# Create multiple feature branches
data_cleaning_branch = dataset.branch("data_cleaning")
augmentation_branch = dataset.branch("data_augmentation")
labeling_branch = dataset.branch("relabeling")

# Work on data cleaning
cleaning_dataset = data_cleaning_branch.open()
# ... perform data cleaning operations
cleaning_dataset.commit("Cleaned corrupted entries")

# Work on augmentation
aug_dataset = augmentation_branch.open()
# ... add augmented data
aug_dataset.commit("Added augmented training examples")

# Merge branches sequentially
main_dataset = dataset.branches["main"].open()

# Merge data cleaning first
main_dataset.merge("data_cleaning", "Merged data cleaning improvements")

# Merge augmentation
main_dataset.merge("data_augmentation", "Merged data augmentation")

# Create milestone tag
milestone_tag = main_dataset.tag("preprocessing_complete", "Completed data preprocessing pipeline")

# Clean up feature branches
data_cleaning_branch.delete()
augmentation_branch.delete()

print(f"Completed preprocessing. Dataset now has {len(main_dataset)} rows")
```
### Version-based Experiment Tracking
```python
# Track ML experiments with versions
dataset = deeplake.open("./experiment_dataset")

# Create experiment tracking
experiment_results = []

for experiment_id in range(5):
    # Create experiment branch
    exp_branch = dataset.branch(f"experiment_{experiment_id}")
    exp_dataset = exp_branch.open()

    # Apply different preprocessing
    # ... experiment-specific data modifications

    exp_dataset.commit(f"Applied preprocessing for experiment {experiment_id}")

    # Tag experiment version
    exp_tag = exp_dataset.tag(f"exp_{experiment_id}_data", f"Data for experiment {experiment_id}")

    # Record experiment info
    experiment_results.append({
        "experiment_id": experiment_id,
        "branch": exp_branch.name,
        "tag": exp_tag.name,
        "data_version": exp_dataset.version.id,
        "num_samples": len(exp_dataset)
    })

    # Clean up branch after tagging
    exp_branch.delete()

# Review all experiments
print("Experiment Summary:")
for result in experiment_results:
    print(f"Experiment {result['experiment_id']}: {result['num_samples']} samples, tagged as {result['tag']}")
```