# Version Control

Git-like version control system with branching, tagging, commit history, and merge operations for dataset evolution and collaboration. Deep Lake provides comprehensive versioning capabilities enabling reproducible ML experiments and dataset lineage tracking.

## Capabilities

### Dataset Versioning

Core version control operations for tracking dataset changes with commit history and rollback capabilities.

```python { .api }
class Dataset:
    """Dataset version control operations."""

    version: Version
    history: History
    current_branch: str

    def commit(self, message: str = "") -> str:
        """
        Commit current dataset changes.

        Parameters:
        - message: Commit message describing changes

        Returns:
        str: Commit ID/hash
        """

    def rollback(self, version_id: str) -> None:
        """
        Rollback dataset to specific version.

        Parameters:
        - version_id: Version ID to rollback to
        """

    def refresh(self) -> None:
        """Refresh dataset to latest version from storage."""

class Version:
    """Single version information."""

    id: str
    message: str
    timestamp: str
    client_timestamp: str

    def open(self) -> ReadOnlyDataset:
        """
        Open this version as read-only dataset.

        Returns:
        ReadOnlyDataset: Dataset at this version
        """

    def open_async(self) -> Future[ReadOnlyDataset]:
        """
        Open this version asynchronously.

        Returns:
        Future[ReadOnlyDataset]: Future resolving to dataset at this version
        """

class History:
    """Version history access."""

    def __getitem__(self, key: Union[int, str]) -> Version:
        """
        Access version by index or ID.

        Parameters:
        - key: Version index (int) or version ID (str)

        Returns:
        Version: Version object
        """

    def __iter__(self) -> Iterator[Version]:
        """Iterate over all versions in chronological order."""

    def __len__(self) -> int:
        """Get total number of versions."""
```
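
The `open_async()` variant is useful when loading a historical snapshot should not block other work. A minimal sketch, assuming the returned `Future` exposes a blocking `result()` accessor and that `./my_dataset` (a hypothetical path) has at least one commit:

```python
import deeplake

dataset = deeplake.open("./my_dataset")  # hypothetical local path

# Start a non-blocking open of the first recorded version.
future = dataset.history[0].open_async()

# ... do other work while the snapshot loads ...

# Block only when the read-only snapshot is actually needed
# (assumes the Future exposes a blocking result() accessor).
historical = future.result()
print(f"First version has {len(historical)} rows")
```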

### Branch Management

Create and manage dataset branches for parallel development and experimentation.

```python { .api }
class Dataset:
    """Dataset branch operations."""

    branches: Branches

    def branch(self, name: str) -> Branch:
        """
        Create new branch from current state.

        Parameters:
        - name: Branch name

        Returns:
        Branch: New branch object
        """

    def merge(self, branch_name: str, message: str = "") -> None:
        """
        Merge branch into current branch.

        Parameters:
        - branch_name: Name of branch to merge
        - message: Merge commit message
        """

class Branch:
    """Dataset branch management."""

    id: str
    name: str
    timestamp: str
    base: str

    def open(self) -> Dataset:
        """
        Open this branch for modification.

        Returns:
        Dataset: Mutable dataset on this branch
        """

    def open_async(self) -> Future[Dataset]:
        """
        Open this branch asynchronously.

        Returns:
        Future[Dataset]: Future resolving to mutable dataset
        """

    def delete(self) -> None:
        """Delete this branch (cannot delete main branch)."""

    def rename(self, new_name: str) -> None:
        """
        Rename this branch.

        Parameters:
        - new_name: New branch name
        """

class BranchView:
    """Read-only branch information."""

    id: str
    name: str
    timestamp: str
    base: str

    def open(self) -> ReadOnlyDataset:
        """Open this branch as read-only dataset."""

    def open_async(self) -> Future[ReadOnlyDataset]:
        """Open this branch asynchronously."""

class Branches:
    """Collection of branches (mutable)."""

    def names(self) -> List[str]:
        """
        Get all branch names.

        Returns:
        List[str]: List of branch names
        """

    def __len__(self) -> int:
        """Get number of branches."""

    def __getitem__(self, name: str) -> Branch:
        """
        Get branch by name.

        Parameters:
        - name: Branch name

        Returns:
        Branch: Branch object
        """

class BranchesView:
    """Collection of branches (read-only)."""

    def names(self) -> List[str]:
        """Get all branch names."""

    def __len__(self) -> int:
        """Get number of branches."""

    def __getitem__(self, name: str) -> BranchView:
        """Get branch by name."""
```
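
Branch inspection and `rename()` are not exercised in the usage examples below. A minimal sketch, assuming a branch named `staging` (hypothetical) already exists on the dataset:

```python
import deeplake

dataset = deeplake.open("./my_dataset")  # hypothetical local path

# Look up an existing branch by name (assumes a "staging" branch exists).
branch = dataset.branches["staging"]
print(f"{branch.name} created at {branch.timestamp}, based on version {branch.base}")

# Rename the branch; subsequent lookups must use the new name.
branch.rename("staging_v2")
print(dataset.branches.names())
```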

### Tag Management

Create and manage dataset tags for marking important versions and milestones.

```python { .api }
class Dataset:
    """Dataset tag operations."""

    tags: Tags

    def tag(self, name: str, message: str = "") -> Tag:
        """
        Create tag at current version.

        Parameters:
        - name: Tag name
        - message: Tag message/description

        Returns:
        Tag: New tag object
        """

class DatasetView:
    """Query result tag operations."""

    def tag(self, name: str, message: str = "") -> TagView:
        """Create tag from query result view."""

class Tag:
    """Dataset tag management."""

    id: str
    name: str
    message: str
    version: str
    timestamp: str

    def open(self) -> ReadOnlyDataset:
        """
        Open dataset at this tag version.

        Returns:
        ReadOnlyDataset: Dataset at tag version
        """

    def open_async(self) -> Future[ReadOnlyDataset]:
        """
        Open dataset at this tag asynchronously.

        Returns:
        Future[ReadOnlyDataset]: Future resolving to dataset
        """

    def delete(self) -> None:
        """Delete this tag."""

    def rename(self, new_name: str) -> None:
        """
        Rename this tag.

        Parameters:
        - new_name: New tag name
        """

class TagView:
    """Read-only tag information."""

    id: str
    name: str
    message: str
    version: str
    timestamp: str

    def open(self) -> ReadOnlyDataset:
        """Open dataset at this tag version."""

    def open_async(self) -> Future[ReadOnlyDataset]:
        """Open dataset at this tag asynchronously."""

class Tags:
    """Collection of tags (mutable)."""

    def names(self) -> List[str]:
        """
        Get all tag names.

        Returns:
        List[str]: List of tag names
        """

    def __len__(self) -> int:
        """Get number of tags."""

    def __getitem__(self, name: str) -> Tag:
        """
        Get tag by name.

        Parameters:
        - name: Tag name

        Returns:
        Tag: Tag object
        """

class TagsView:
    """Collection of tags (read-only)."""

    def names(self) -> List[str]:
        """Get all tag names."""

    def __len__(self) -> int:
        """Get number of tags."""

    def __getitem__(self, name: str) -> TagView:
        """Get tag by name."""
```
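
`DatasetView.tag()` lets a query result be tagged directly, which the usage examples below do not cover. A minimal sketch, assuming a `query()` method (see query-system.md) returns a `DatasetView` and that the dataset has a `labels` column:

```python
import deeplake

dataset = deeplake.open("./my_dataset")  # hypothetical local path

# Assumption: query() returns a DatasetView (documented in query-system.md).
cats = dataset.query("SELECT * WHERE labels == 'cat'")

# Tag the filtered view so this subset can be referenced later by name.
cats_tag = cats.tag("cats_only", "All cat samples at the current version")
print(f"Tagged view as {cats_tag.name}")
```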

### Remote Synchronization

Push and pull operations for synchronizing dataset versions with remote storage.

```python { .api }
class Dataset:
    """Remote synchronization operations."""

    def push(self) -> None:
        """Push local changes to remote storage."""

    def pull(self) -> None:
        """Pull remote changes to local dataset."""

class ReadOnlyDataset:
    """Remote operations for read-only datasets."""

    def push(self) -> None:
        """Push dataset state to remote (metadata only)."""

    def refresh(self) -> None:
        """Refresh dataset from remote storage."""
```
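
Long-running readers (for example a training job streaming from shared storage) can poll `refresh()` to pick up versions committed elsewhere. A minimal sketch, assuming another process commits and pushes to the hypothetical `s3://my-bucket/shared_dataset` path:

```python
import time

import deeplake

# Reader process: open the shared dataset (hypothetical path).
dataset = deeplake.open("s3://my-bucket/shared_dataset")

for _ in range(3):
    # Pick up any versions committed by other writers since the last check.
    dataset.refresh()
    print(f"Now at version {dataset.version.id} with {len(dataset)} rows")
    time.sleep(60)  # poll interval; tune for the workload
```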

## Usage Examples

### Basic Version Control

```python
import deeplake

# Open dataset and make changes
dataset = deeplake.open("./my_dataset")

# Add some data
dataset.append({"images": "new_image.jpg", "labels": "cat"})

# Commit changes
commit_id = dataset.commit("Added new cat image")
print(f"Committed changes: {commit_id}")

# View commit history
for version in dataset.history:
    print(f"Version {version.id}: {version.message} ({version.timestamp})")

# Get current version info
current_version = dataset.version
print(f"Current version: {current_version.id}")
print(f"Commit message: {current_version.message}")
```

### Branch Operations

```python
# Create new branch for experiments
experiment_branch = dataset.branch("feature_experiment")
print(f"Created branch: {experiment_branch.name}")

# List all branches
print("Available branches:")
for branch_name in dataset.branches.names():
    branch = dataset.branches[branch_name]
    print(f" {branch.name} (created: {branch.timestamp})")

# Switch to experiment branch
experiment_dataset = experiment_branch.open()

# Make experimental changes
experiment_dataset.add_column("confidence", deeplake.types.Float32())
experiment_dataset.append({
    "images": "experiment_image.jpg",
    "labels": "experimental_label",
    "confidence": 0.95
})

# Commit experimental changes
experiment_dataset.commit("Added confidence scores for experimentation")

# Switch back to main branch
main_dataset = dataset.branches["main"].open()

# Merge experimental branch into main
main_dataset.merge("feature_experiment", "Merged confidence score feature")

# Clean up: delete experimental branch
experiment_branch.delete()
```

### Tag Management

```python
# Create tags for important milestones
v1_tag = dataset.tag("v1.0", "Initial production dataset")
print(f"Created tag: {v1_tag.name}")

# Add more data and create another tag
dataset.extend([
    {"images": f"batch_image_{i}.jpg", "labels": f"label_{i}"}
    for i in range(100)
])
dataset.commit("Added batch of 100 images")

v1_1_tag = dataset.tag("v1.1", "Added training batch")

# List all tags
print("Available tags:")
for tag_name in dataset.tags.names():
    tag = dataset.tags[tag_name]
    print(f" {tag.name}: {tag.message} (version: {tag.version})")

# Open dataset at specific tag
v1_dataset = v1_tag.open()
print(f"Dataset at v1.0 has {len(v1_dataset)} rows")

# Compare with current version
print(f"Current dataset has {len(dataset)} rows")
```

### Version History and Rollback

```python
# Examine version history
print(f"Dataset has {len(dataset.history)} versions")

# Get specific version
latest_version = dataset.history[-1]  # Most recent
first_version = dataset.history[0]    # First version

print(f"Latest: {latest_version.message}")
print(f"First: {first_version.message}")

# Open dataset at specific version
historical_dataset = first_version.open()
print(f"First version had {len(historical_dataset)} rows")

# Rollback to previous version if needed
if len(dataset.history) > 1:
    previous_version = dataset.history[-2]
    dataset.rollback(previous_version.id)
    print(f"Rolled back to: {previous_version.message}")
```

### Remote Synchronization

```python
# Push local changes to remote storage
dataset.push()
print("Pushed local changes to remote")

# Pull remote changes (in another location/process)
remote_dataset = deeplake.open("s3://my-bucket/shared_dataset")
remote_dataset.pull()
print("Pulled latest changes from remote")

# Refresh to get latest version without pulling changes
remote_dataset.refresh()
print("Refreshed dataset metadata from remote")
```

### Collaborative Workflows

```python
# Typical collaborative workflow

# Developer A: Create feature branch
dataset_a = deeplake.open("s3://shared-bucket/project_dataset")
feature_branch = dataset_a.branch("add_validation_data")
feature_dataset = feature_branch.open()

# Add validation data
validation_data = [
    {"images": f"val_image_{i}.jpg", "labels": f"val_label_{i}"}
    for i in range(500)
]
feature_dataset.extend(validation_data)
feature_dataset.commit("Added validation dataset")

# Push branch to remote
feature_dataset.push()

# Developer B: Pull and review changes
dataset_b = deeplake.open("s3://shared-bucket/project_dataset")
dataset_b.pull()

# Review feature branch
feature_branch_b = dataset_b.branches["add_validation_data"]
feature_data_b = feature_branch_b.open()
print(f"Feature branch has {len(feature_data_b)} total rows")

# Merge into main after review
main_dataset = dataset_b.branches["main"].open()
main_dataset.merge("add_validation_data", "Merged validation data from feature branch")
main_dataset.push()

# Tag the release
release_tag = main_dataset.tag("v2.0", "Added validation dataset - ready for training")
```

### Advanced Version Control

```python
# Complex branching scenario
dataset = deeplake.open("./complex_dataset")

# Create multiple feature branches
data_cleaning_branch = dataset.branch("data_cleaning")
augmentation_branch = dataset.branch("data_augmentation")
labeling_branch = dataset.branch("relabeling")

# Work on data cleaning
cleaning_dataset = data_cleaning_branch.open()
# ... perform data cleaning operations
cleaning_dataset.commit("Cleaned corrupted entries")

# Work on augmentation
aug_dataset = augmentation_branch.open()
# ... add augmented data
aug_dataset.commit("Added augmented training examples")

# Merge branches sequentially
main_dataset = dataset.branches["main"].open()

# Merge data cleaning first
main_dataset.merge("data_cleaning", "Merged data cleaning improvements")

# Merge augmentation
main_dataset.merge("data_augmentation", "Merged data augmentation")

# Create milestone tag
milestone_tag = main_dataset.tag("preprocessing_complete", "Completed data preprocessing pipeline")

# Clean up feature branches
data_cleaning_branch.delete()
augmentation_branch.delete()

print(f"Completed preprocessing. Dataset now has {len(main_dataset)} rows")
```

### Version-based Experiment Tracking

```python
# Track ML experiments with versions
dataset = deeplake.open("./experiment_dataset")

# Create experiment tracking
experiment_results = []

for experiment_id in range(5):
    # Create experiment branch
    exp_branch = dataset.branch(f"experiment_{experiment_id}")
    exp_dataset = exp_branch.open()

    # Apply different preprocessing
    # ... experiment-specific data modifications

    exp_dataset.commit(f"Applied preprocessing for experiment {experiment_id}")

    # Tag experiment version
    exp_tag = exp_dataset.tag(f"exp_{experiment_id}_data", f"Data for experiment {experiment_id}")

    # Record experiment info
    experiment_results.append({
        "experiment_id": experiment_id,
        "branch": exp_branch.name,
        "tag": exp_tag.name,
        "data_version": exp_dataset.version.id,
        "num_samples": len(exp_dataset)
    })

    # Clean up branch after tagging
    exp_branch.delete()

# Review all experiments
print("Experiment Summary:")
for result in experiment_results:
    print(f"Experiment {result['experiment_id']}: {result['num_samples']} samples, tagged as {result['tag']}")
```