# Dataset Management

Core functionality for creating, opening, deleting, and copying datasets with support for various storage backends including local filesystem, S3, GCS, and Azure. Deep Lake provides comprehensive lifecycle management for datasets with automatic optimization and multi-cloud capabilities.

## Capabilities

### Dataset Creation

Creates new datasets with optional schema specification and credential configuration for various storage backends.

```python { .api }
def create(url: str, creds: Optional[Dict[str, str]] = None, token: Optional[str] = None, schema: Optional[Schema] = None) -> Dataset:
    """
    Create a new dataset.

    Parameters:
    - url: Dataset location (local path, S3, GCS, Azure, etc.)
    - creds: Storage credentials dictionary
    - token: Activeloop authentication token
    - schema: Pre-defined schema for the dataset

    Returns:
    Dataset: New mutable dataset instance
    """

def create_async(url: str, creds: Optional[Dict[str, str]] = None, token: Optional[str] = None, schema: Optional[Schema] = None) -> Future[Dataset]:
    """
    Create a new dataset asynchronously.

    Parameters:
    - url: Dataset location
    - creds: Storage credentials dictionary
    - token: Activeloop authentication token
    - schema: Pre-defined schema for the dataset

    Returns:
    Future[Dataset]: Future resolving to new dataset instance
    """
```
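
A minimal creation sketch; the local path is a placeholder, and the guard uses `exists` and `open` from the sections below.

```python
import deeplake

# Placeholder local path; any supported URL (s3://, gcs://, azure://) works the same way.
path = "./my_dataset"

# Create only when nothing exists at the target yet; otherwise reuse the existing dataset.
dataset = deeplake.open(path) if deeplake.exists(path) else deeplake.create(path)
```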

### Dataset Opening

Opens existing datasets for read-write or read-only access with automatic format detection and optimization.

```python { .api }
def open(url: str, creds: Optional[Dict[str, str]] = None, token: Optional[str] = None) -> Dataset:
    """
    Open existing dataset for modification.

    Parameters:
    - url: Dataset location
    - creds: Storage credentials dictionary
    - token: Activeloop authentication token

    Returns:
    Dataset: Mutable dataset instance
    """

def open_async(url: str, creds: Optional[Dict[str, str]] = None, token: Optional[str] = None) -> Future[Dataset]:
    """
    Open existing dataset asynchronously.

    Parameters:
    - url: Dataset location
    - creds: Storage credentials dictionary
    - token: Activeloop authentication token

    Returns:
    Future[Dataset]: Future resolving to mutable dataset instance
    """

def open_read_only(url: str, creds: Optional[Dict[str, str]] = None, token: Optional[str] = None) -> ReadOnlyDataset:
    """
    Open dataset in read-only mode.

    Parameters:
    - url: Dataset location
    - creds: Storage credentials dictionary
    - token: Activeloop authentication token

    Returns:
    ReadOnlyDataset: Read-only dataset instance
    """

def open_read_only_async(url: str, creds: Optional[Dict[str, str]] = None, token: Optional[str] = None) -> Future[ReadOnlyDataset]:
    """
    Open dataset in read-only mode asynchronously.

    Parameters:
    - url: Dataset location
    - creds: Storage credentials dictionary
    - token: Activeloop authentication token

    Returns:
    Future[ReadOnlyDataset]: Future resolving to read-only dataset instance
    """
```
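
A short sketch of the two access modes, assuming the dataset from the creation example already exists at the placeholder path.

```python
import deeplake

# Read-only handle: safe for analysis or serving, since writes are rejected.
ro = deeplake.open_read_only("./my_dataset")
print(len(ro))

# Read-write handle for when modifications are actually needed.
rw = deeplake.open("./my_dataset")
```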

### Dataset Utilities

Utility functions for dataset existence checking, deletion, copying, and structure replication.

```python { .api }
def exists(url: str, creds: Optional[Dict[str, str]] = None, token: Optional[str] = None) -> bool:
    """
    Check if dataset exists at the given location.

    Parameters:
    - url: Dataset location to check
    - creds: Storage credentials dictionary
    - token: Activeloop authentication token

    Returns:
    bool: True if dataset exists, False otherwise
    """

def delete(url: str, creds: Optional[Dict[str, str]] = None, token: Optional[str] = None) -> None:
    """
    Delete a dataset permanently.

    Parameters:
    - url: Dataset location to delete
    - creds: Storage credentials dictionary
    - token: Activeloop authentication token
    """

def delete_async(url: str, creds: Optional[Dict[str, str]] = None, token: Optional[str] = None) -> FutureVoid:
    """
    Delete a dataset permanently (asynchronous).

    Parameters:
    - url: Dataset location to delete
    - creds: Storage credentials dictionary
    - token: Activeloop authentication token

    Returns:
    FutureVoid: Future completing when deletion is done
    """

def copy(src: str, dst: str, src_creds: Optional[Dict[str, str]] = None, dst_creds: Optional[Dict[str, str]] = None, token: Optional[str] = None) -> None:
    """
    Copy dataset from source to destination.

    Parameters:
    - src: Source dataset location
    - dst: Destination dataset location
    - src_creds: Source storage credentials
    - dst_creds: Destination storage credentials
    - token: Activeloop authentication token
    """

def like(src: DatasetView, dest: str, creds: Optional[Dict[str, str]] = None, token: Optional[str] = None) -> Dataset:
    """
    Create new dataset with same structure as source.

    Parameters:
    - src: Source dataset view (typically from query results)
    - dest: Destination path for new dataset
    - creds: Storage credentials dictionary
    - token: Activeloop authentication token

    Returns:
    Dataset: New dataset with same schema as source
    """
```
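
A sketch combining the utilities on a local dataset. The paths are placeholders, and it assumes an opened read-only dataset is an acceptable `DatasetView` source for `like`.

```python
import deeplake

src = "./my_dataset"

if deeplake.exists(src):
    # Full copy, rows included.
    deeplake.copy(src, "./my_dataset_backup")

    # Structure-only replica: same columns, no rows.
    view = deeplake.open_read_only(src)
    empty_clone = deeplake.like(view, "./my_dataset_schema_only")

    # Permanently remove the backup once it is no longer needed.
    deeplake.delete("./my_dataset_backup")
```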

### Cloud Integration

Functions for connecting datasets to Activeloop cloud services and managing cloud-based dataset operations.

```python { .api }
def connect(src: str, dest: Optional[str] = None, org_id: Optional[str] = None, creds_key: Optional[str] = None, token: Optional[str] = None) -> Dataset:
    """
    Connect dataset to Activeloop cloud services.

    Parameters:
    - src: Source dataset path
    - dest: Destination path (optional)
    - org_id: Organization ID
    - creds_key: Credentials key for cloud storage
    - token: Activeloop authentication token

    Returns:
    Dataset: Connected dataset instance
    """

def disconnect(url: str, token: Optional[str] = None) -> None:
    """
    Disconnect dataset from Activeloop cloud services.

    Parameters:
    - url: Dataset URL to disconnect
    - token: Activeloop authentication token
    """
```
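
A hedged sketch of registering an S3 dataset with Activeloop. The organization ID, managed-credentials key, and the cloud URL used for disconnecting are placeholders that depend on your Activeloop workspace configuration.

```python
import deeplake

# Attach an existing S3 dataset to an Activeloop organization.
# "my_org" and "my_managed_creds" are placeholders configured in Activeloop.
dataset = deeplake.connect(
    src="s3://my-bucket/my_dataset",
    org_id="my_org",
    creds_key="my_managed_creds",
)

# Detach it again; the cloud URL form used here is an assumption.
deeplake.disconnect("al://my_org/my_dataset")
```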

### Legacy Migration

Tools for migrating datasets from Deep Lake v3 to v4 format with data preservation and automatic conversion.

```python { .api }
def convert(src: str, dst: str, dst_creds: Optional[Dict[str, str]] = None, token: Optional[str] = None) -> None:
    """
    Convert v3 dataset to v4 format.

    Parameters:
    - src: Source v3 dataset path
    - dst: Destination v4 dataset path
    - dst_creds: Destination storage credentials
    - token: Activeloop authentication token
    """
```
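
A one-call migration sketch with placeholder paths.

```python
import deeplake

# Read the legacy v3 dataset and write an equivalent v4 dataset at the destination.
deeplake.convert("./legacy_v3_dataset", "./migrated_v4_dataset")

# The migrated copy opens like any other v4 dataset.
dataset = deeplake.open_read_only("./migrated_v4_dataset")
```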

## Usage Examples

### Basic Dataset Lifecycle

```python
import deeplake

# Create a new dataset locally
dataset = deeplake.create("./my_dataset")

# Add some columns
dataset.add_column("images", deeplake.types.Image())
dataset.add_column("labels", deeplake.types.Text())

# Append data and commit
dataset.append({"images": "image1.jpg", "labels": "cat"})
dataset.commit("Initial data")

# Check if dataset exists
if deeplake.exists("./my_dataset"):
    print("Dataset exists!")

# Open existing dataset
dataset = deeplake.open("./my_dataset")
print(f"Dataset has {len(dataset)} rows")

# Copy dataset to cloud storage
deeplake.copy("./my_dataset", "s3://my-bucket/my_dataset",
              dst_creds={"aws_access_key_id": "...", "aws_secret_access_key": "..."})
```

### Cloud Storage Integration

```python
# Create dataset on S3
s3_creds = {
    "aws_access_key_id": "your_access_key",
    "aws_secret_access_key": "your_secret_key"
}

dataset = deeplake.create("s3://my-bucket/my_dataset", creds=s3_creds)

# Create dataset on GCS
gcs_creds = {
    "google_application_credentials": "path/to/credentials.json"
}

dataset = deeplake.create("gcs://my-bucket/my_dataset", creds=gcs_creds)

# Create dataset on Azure
azure_creds = {
    "azure_storage_account": "myaccount",
    "azure_storage_key": "mykey"
}

dataset = deeplake.create("azure://my-container/my_dataset", creds=azure_creds)
```

### Async Operations

```python
import asyncio

async def create_multiple_datasets():
    # Create multiple datasets concurrently
    tasks = [
        deeplake.create_async(f"./dataset_{i}")
        for i in range(5)
    ]

    datasets = await asyncio.gather(*tasks)
    return datasets

# Run async operation
datasets = asyncio.run(create_multiple_datasets())
```

### Schema-based Creation

```python
from deeplake.schemas import TextEmbeddings

# Create dataset with predefined schema
schema = TextEmbeddings(embedding_size=768)
dataset = deeplake.create("./embeddings_dataset", schema=schema)

# Schema is automatically applied
print(dataset.schema.columns)  # Shows text and embedding columns
```