# Version Control

Deep Lake provides a Git-like version control system for datasets, with branching, tagging, commit history, and merge operations to support dataset evolution and collaboration. These versioning capabilities enable reproducible ML experiments and dataset lineage tracking.

## Capabilities

### Dataset Versioning

Core version control operations for tracking dataset changes, with commit history and rollback capabilities.
```python { .api }
class Dataset:
    """Dataset version control operations."""

    version: Version
    history: History
    current_branch: str

    def commit(self, message: str = "") -> str:
        """
        Commit current dataset changes.

        Parameters:
        - message: Commit message describing changes

        Returns:
        str: Commit ID/hash
        """

    def rollback(self, version_id: str) -> None:
        """
        Rollback dataset to specific version.

        Parameters:
        - version_id: Version ID to rollback to
        """

    def refresh(self) -> None:
        """Refresh dataset to latest version from storage."""

class Version:
    """Single version information."""

    id: str
    message: str
    timestamp: str
    client_timestamp: str

    def open(self) -> ReadOnlyDataset:
        """
        Open this version as read-only dataset.

        Returns:
        ReadOnlyDataset: Dataset at this version
        """

    def open_async(self) -> Future[ReadOnlyDataset]:
        """
        Open this version asynchronously.

        Returns:
        Future[ReadOnlyDataset]: Future resolving to dataset at this version
        """

class History:
    """Version history access."""

    def __getitem__(self, key: Union[int, str]) -> Version:
        """
        Access version by index or ID.

        Parameters:
        - key: Version index (int) or version ID (str)

        Returns:
        Version: Version object
        """

    def __iter__(self) -> Iterator[Version]:
        """Iterate over all versions in chronological order."""

    def __len__(self) -> int:
        """Get total number of versions."""
```
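For example, the ID returned by `commit()` can be used to look up that version in `history` by ID rather than by index, and `open_async()` gives non-blocking access to a pinned version. A minimal sketch, assuming the dataset path is a placeholder, that the ID returned by `commit()` is the same identifier exposed as `Version.id`, and that the returned future exposes a blocking `result()` accessor:

```python
import deeplake

ds = deeplake.open("./my_dataset")  # placeholder path
commit_id = ds.commit("Checkpoint before preprocessing")

# History supports lookup by commit ID (str) as well as by position (int)
checkpoint = ds.history[commit_id]
print(f"{checkpoint.id}: {checkpoint.message} ({checkpoint.timestamp})")

# Open the pinned version without blocking; result() is assumed to block
# until the read-only snapshot is ready
snapshot = checkpoint.open_async().result()
print(f"Snapshot at {checkpoint.id} has {len(snapshot)} rows")
```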
### Branch Management

Create and manage dataset branches for parallel development and experimentation.
```python { .api }
90
class Dataset:
91
"""Dataset branch operations."""
92
93
branches: Branches
94
95
def branch(self, name: str) -> Branch:
96
"""
97
Create new branch from current state.
98
99
Parameters:
100
- name: Branch name
101
102
Returns:
103
Branch: New branch object
104
"""
105
106
def merge(self, branch_name: str, message: str = "") -> None:
107
"""
108
Merge branch into current branch.
109
110
Parameters:
111
- branch_name: Name of branch to merge
112
- message: Merge commit message
113
"""
114
115
class Branch:
116
"""Dataset branch management."""
117
118
id: str
119
name: str
120
timestamp: str
121
base: str
122
123
def open(self) -> Dataset:
124
"""
125
Open this branch for modification.
126
127
Returns:
128
Dataset: Mutable dataset on this branch
129
"""
130
131
def open_async(self) -> Future[Dataset]:
132
"""
133
Open this branch asynchronously.
134
135
Returns:
136
Future[Dataset]: Future resolving to mutable dataset
137
"""
138
139
def delete(self) -> None:
140
"""Delete this branch (cannot delete main branch)."""
141
142
def rename(self, new_name: str) -> None:
143
"""
144
Rename this branch.
145
146
Parameters:
147
- new_name: New branch name
148
"""
149
150
class BranchView:
151
"""Read-only branch information."""
152
153
id: str
154
name: str
155
timestamp: str
156
base: str
157
158
def open(self) -> ReadOnlyDataset:
159
"""Open this branch as read-only dataset."""
160
161
def open_async(self) -> Future[ReadOnlyDataset]:
162
"""Open this branch asynchronously."""
163
164
class Branches:
165
"""Collection of branches (mutable)."""
166
167
def names(self) -> List[str]:
168
"""
169
Get all branch names.
170
171
Returns:
172
List[str]: List of branch names
173
"""
174
175
def __len__(self) -> int:
176
"""Get number of branches."""
177
178
def __getitem__(self, name: str) -> Branch:
179
"""
180
Get branch by name.
181
182
Parameters:
183
- name: Branch name
184
185
Returns:
186
Branch: Branch object
187
"""
188
189
class BranchesView:
190
"""Collection of branches (read-only)."""
191
192
def names(self) -> List[str]:
193
"""Get all branch names."""
194
195
def __len__(self) -> int:
196
"""Get number of branches."""
197
198
def __getitem__(self, name: str) -> BranchView:
199
"""Get branch by name."""
200
```
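Beyond creating and merging, branches can be renamed in place and their metadata inspected without opening them for writing. A minimal sketch; the dataset path and branch names are placeholders:

```python
import deeplake

ds = deeplake.open("./my_dataset")  # placeholder path

# Create a working branch, then rename it once its purpose is settled
branch = ds.branch("staging")
branch.rename("qa_review")

# Inspect branch metadata without modifying anything
for name in ds.branches.names():
    info = ds.branches[name]
    print(f"{info.name} (id={info.id}, base={info.base}, created={info.timestamp})")

# Delete the branch when it is no longer needed (main cannot be deleted)
ds.branches["qa_review"].delete()
```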
### Tag Management

Create and manage dataset tags for marking important versions and milestones.
```python { .api }
class Dataset:
    """Dataset tag operations."""

    tags: Tags

    def tag(self, name: str, message: str = "") -> Tag:
        """
        Create tag at current version.

        Parameters:
        - name: Tag name
        - message: Tag message/description

        Returns:
        Tag: New tag object
        """

class DatasetView:
    """Query result tag operations."""

    def tag(self, name: str, message: str = "") -> TagView:
        """Create tag from query result view."""

class Tag:
    """Dataset tag management."""

    id: str
    name: str
    message: str
    version: str
    timestamp: str

    def open(self) -> ReadOnlyDataset:
        """
        Open dataset at this tag version.

        Returns:
        ReadOnlyDataset: Dataset at tag version
        """

    def open_async(self) -> Future[ReadOnlyDataset]:
        """
        Open dataset at this tag asynchronously.

        Returns:
        Future[ReadOnlyDataset]: Future resolving to dataset
        """

    def delete(self) -> None:
        """Delete this tag."""

    def rename(self, new_name: str) -> None:
        """
        Rename this tag.

        Parameters:
        - new_name: New tag name
        """

class TagView:
    """Read-only tag information."""

    id: str
    name: str
    message: str
    version: str
    timestamp: str

    def open(self) -> ReadOnlyDataset:
        """Open dataset at this tag version."""

    def open_async(self) -> Future[ReadOnlyDataset]:
        """Open dataset at this tag asynchronously."""

class Tags:
    """Collection of tags (mutable)."""

    def names(self) -> List[str]:
        """
        Get all tag names.

        Returns:
        List[str]: List of tag names
        """

    def __len__(self) -> int:
        """Get number of tags."""

    def __getitem__(self, name: str) -> Tag:
        """
        Get tag by name.

        Parameters:
        - name: Tag name

        Returns:
        Tag: Tag object
        """

class TagsView:
    """Collection of tags (read-only)."""

    def names(self) -> List[str]:
        """Get all tag names."""

    def __len__(self) -> int:
        """Get number of tags."""

    def __getitem__(self, name: str) -> TagView:
        """Get tag by name."""
```
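Tags are lightweight references to specific versions, so renaming or deleting a tag does not affect the underlying commit. A minimal sketch; the dataset path and tag names are placeholders:

```python
import deeplake

ds = deeplake.open("./my_dataset")  # placeholder path

# Tag the current version, then adjust the name as release conventions change
tag = ds.tag("release_candidate", "Candidate data for the next training run")
tag.rename("v2.0-rc1")

# A tag resolves to a read-only snapshot of the tagged version
snapshot = ds.tags["v2.0-rc1"].open()
print(f"Tag v2.0-rc1 points at version {ds.tags['v2.0-rc1'].version} "
      f"with {len(snapshot)} rows")

# Remove the tag once it is superseded; the tagged commit itself remains
ds.tags["v2.0-rc1"].delete()
```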
### Remote Synchronization

Push and pull operations for synchronizing dataset versions with remote storage.
```python { .api }
class Dataset:
    """Remote synchronization operations."""

    def push(self) -> None:
        """Push local changes to remote storage."""

    def pull(self) -> None:
        """Pull remote changes to local dataset."""

class ReadOnlyDataset:
    """Remote operations for read-only datasets."""

    def push(self) -> None:
        """Push dataset state to remote (metadata only)."""

    def refresh(self) -> None:
        """Refresh dataset from remote storage."""
```
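A typical pattern is a writer that commits and pushes while a reader in another process calls `pull()` or `refresh()` to pick up new versions without reopening the dataset. A minimal sketch, assuming both handles point at the same remote URL (shown here as a placeholder):

```python
import deeplake

# Writer process: commit locally, then publish to the shared location
writer = deeplake.open("s3://my-bucket/shared_dataset")  # placeholder URL
writer.append({"images": "new_image.jpg", "labels": "dog"})
writer.commit("Added one labeled image")
writer.push()

# Reader process: fetch versions created elsewhere, then move to the latest one
reader = deeplake.open("s3://my-bucket/shared_dataset")
reader.pull()
reader.refresh()
print(f"Reader now sees version {reader.version.id}")
```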
## Usage Examples

### Basic Version Control
```python
import deeplake

# Open dataset and make changes
dataset = deeplake.open("./my_dataset")

# Add some data
dataset.append({"images": "new_image.jpg", "labels": "cat"})

# Commit changes
commit_id = dataset.commit("Added new cat image")
print(f"Committed changes: {commit_id}")

# View commit history
for version in dataset.history:
    print(f"Version {version.id}: {version.message} ({version.timestamp})")

# Get current version info
current_version = dataset.version
print(f"Current version: {current_version.id}")
print(f"Commit message: {current_version.message}")
```
### Branch Operations
```python
# Create new branch for experiments
experiment_branch = dataset.branch("feature_experiment")
print(f"Created branch: {experiment_branch.name}")

# List all branches
print("Available branches:")
for branch_name in dataset.branches.names():
    branch = dataset.branches[branch_name]
    print(f"  {branch.name} (created: {branch.timestamp})")

# Switch to experiment branch
experiment_dataset = experiment_branch.open()

# Make experimental changes
experiment_dataset.add_column("confidence", deeplake.types.Float32())
experiment_dataset.append({
    "images": "experiment_image.jpg",
    "labels": "experimental_label",
    "confidence": 0.95
})

# Commit experimental changes
experiment_dataset.commit("Added confidence scores for experimentation")

# Switch back to main branch
main_dataset = dataset.branches["main"].open()

# Merge experimental branch into main
main_dataset.merge("feature_experiment", "Merged confidence score feature")

# Clean up: delete experimental branch
experiment_branch.delete()
```
### Tag Management
```python
# Create tags for important milestones
v1_tag = dataset.tag("v1.0", "Initial production dataset")
print(f"Created tag: {v1_tag.name}")

# Add more data and create another tag
dataset.extend([
    {"images": f"batch_image_{i}.jpg", "labels": f"label_{i}"}
    for i in range(100)
])
dataset.commit("Added batch of 100 images")

v1_1_tag = dataset.tag("v1.1", "Added training batch")

# List all tags
print("Available tags:")
for tag_name in dataset.tags.names():
    tag = dataset.tags[tag_name]
    print(f"  {tag.name}: {tag.message} (version: {tag.version})")

# Open dataset at specific tag
v1_dataset = v1_tag.open()
print(f"Dataset at v1.0 has {len(v1_dataset)} rows")

# Compare with current version
print(f"Current dataset has {len(dataset)} rows")
```
### Version History and Rollback
```python
# Examine version history
print(f"Dataset has {len(dataset.history)} versions")

# Get specific version
latest_version = dataset.history[-1]  # Most recent
first_version = dataset.history[0]  # First version

print(f"Latest: {latest_version.message}")
print(f"First: {first_version.message}")

# Open dataset at specific version
historical_dataset = first_version.open()
print(f"First version had {len(historical_dataset)} rows")

# Rollback to previous version if needed
if len(dataset.history) > 1:
    previous_version = dataset.history[-2]
    dataset.rollback(previous_version.id)
    print(f"Rolled back to: {previous_version.message}")
```
### Remote Synchronization
```python
# Push local changes to remote storage
dataset.push()
print("Pushed local changes to remote")

# Pull remote changes (in another location/process)
remote_dataset = deeplake.open("s3://my-bucket/shared_dataset")
remote_dataset.pull()
print("Pulled latest changes from remote")

# Refresh to pick up the latest version without an explicit pull
remote_dataset.refresh()
print("Refreshed dataset metadata from remote")
```
### Collaborative Workflows
```python
# Typical collaborative workflow

# Developer A: Create feature branch
dataset_a = deeplake.open("s3://shared-bucket/project_dataset")
feature_branch = dataset_a.branch("add_validation_data")
feature_dataset = feature_branch.open()

# Add validation data
validation_data = [
    {"images": f"val_image_{i}.jpg", "labels": f"val_label_{i}"}
    for i in range(500)
]
feature_dataset.extend(validation_data)
feature_dataset.commit("Added validation dataset")

# Push branch to remote
feature_dataset.push()

# Developer B: Pull and review changes
dataset_b = deeplake.open("s3://shared-bucket/project_dataset")
dataset_b.pull()

# Review feature branch
feature_branch_b = dataset_b.branches["add_validation_data"]
feature_data_b = feature_branch_b.open()
print(f"Feature branch has {len(feature_data_b)} total rows")

# Merge into main after review
main_dataset = dataset_b.branches["main"].open()
main_dataset.merge("add_validation_data", "Merged validation data from feature branch")
main_dataset.push()

# Tag the release
release_tag = main_dataset.tag("v2.0", "Added validation dataset - ready for training")
```
### Advanced Version Control
```python
# Complex branching scenario
dataset = deeplake.open("./complex_dataset")

# Create multiple feature branches
data_cleaning_branch = dataset.branch("data_cleaning")
augmentation_branch = dataset.branch("data_augmentation")
labeling_branch = dataset.branch("relabeling")

# Work on data cleaning
cleaning_dataset = data_cleaning_branch.open()
# ... perform data cleaning operations
cleaning_dataset.commit("Cleaned corrupted entries")

# Work on augmentation
aug_dataset = augmentation_branch.open()
# ... add augmented data
aug_dataset.commit("Added augmented training examples")

# Merge branches sequentially
main_dataset = dataset.branches["main"].open()

# Merge data cleaning first
main_dataset.merge("data_cleaning", "Merged data cleaning improvements")

# Merge augmentation
main_dataset.merge("data_augmentation", "Merged data augmentation")

# Create milestone tag
milestone_tag = main_dataset.tag("preprocessing_complete", "Completed data preprocessing pipeline")

# Clean up feature branches
data_cleaning_branch.delete()
augmentation_branch.delete()

print(f"Completed preprocessing. Dataset now has {len(main_dataset)} rows")
```
### Version-based Experiment Tracking
```python
# Track ML experiments with versions
dataset = deeplake.open("./experiment_dataset")

# Create experiment tracking
experiment_results = []

for experiment_id in range(5):
    # Create experiment branch
    exp_branch = dataset.branch(f"experiment_{experiment_id}")
    exp_dataset = exp_branch.open()

    # Apply different preprocessing
    # ... experiment-specific data modifications

    exp_dataset.commit(f"Applied preprocessing for experiment {experiment_id}")

    # Tag experiment version
    exp_tag = exp_dataset.tag(f"exp_{experiment_id}_data", f"Data for experiment {experiment_id}")

    # Record experiment info
    experiment_results.append({
        "experiment_id": experiment_id,
        "branch": exp_branch.name,
        "tag": exp_tag.name,
        "data_version": exp_dataset.version.id,
        "num_samples": len(exp_dataset)
    })

    # Clean up branch after tagging
    exp_branch.delete()

# Review all experiments
print("Experiment Summary:")
for result in experiment_results:
    print(f"Experiment {result['experiment_id']}: {result['num_samples']} samples, tagged as {result['tag']}")
```