or run

npx @tessl/cli init
Log in

Version

Tile

Overview

Evals

Files

Files

docs

data-access.md data-import-export.md dataset-management.md error-handling.md framework-integration.md index.md query-system.md schema-templates.md storage-system.md type-system.md version-control.md

docs/data-access.md

0

# Data Access and Manipulation

1

2

Deep Lake offers comprehensive row- and column-based data access patterns, with support for indexing, slicing, batch operations, and efficient data manipulation. It provides both mutable and read-only access patterns optimized for ML workflows.

3

4

## Capabilities

5

6

### Dataset Access Patterns

7

8

Dataset objects provide dictionary-like and array-like access to data with automatic type handling and optimization.

9

10

```python { .api }

11

class Dataset:

12

"""Primary mutable dataset class."""

13

14

def __getitem__(self, key: Union[int, slice, str]) -> Union[Row, RowRange, Column]:

15

"""

16

Access dataset elements by index or name.

17

18

Parameters:

19

- key: Row index (int), row range (slice), or column name (str)

20

21

Returns:

22

- Row: Single row access (when key is int)

23

- RowRange: Multiple row access (when key is slice)

24

- Column: Column access (when key is str)

25

"""

26

27

def __len__(self) -> int:

28

"""Get number of rows in dataset."""

29

30

def append(self, data: Dict[str, Any]) -> None:

31

"""

32

Append new row to dataset.

33

34

Parameters:

35

- data: Dictionary mapping column names to values

36

"""

37

38

def extend(self, data: List[Dict[str, Any]]) -> None:

39

"""

40

Append multiple rows to dataset.

41

42

Parameters:

43

- data: List of dictionaries mapping column names to values

44

"""

45

46

def add_column(self, name: str, dtype: Type) -> None:

47

"""

48

Add new column to dataset.

49

50

Parameters:

51

- name: Column name

52

- dtype: Column data type

53

"""

54

55

def remove_column(self, name: str) -> None:

56

"""

57

Remove column from dataset.

58

59

Parameters:

60

- name: Column name to remove

61

"""

62

63

def rename_column(self, old_name: str, new_name: str) -> None:

64

"""

65

Rename existing column.

66

67

Parameters:

68

- old_name: Current column name

69

- new_name: New column name

70

"""

71

72

class ReadOnlyDataset:

73

"""Read-only dataset access."""

74

75

def __getitem__(self, key: Union[int, slice, str]) -> Union[RowView, RowRangeView, ColumnView]:

76

"""Access dataset elements (read-only)."""

77

78

def __len__(self) -> int:

79

"""Get number of rows in dataset."""

80

81

class DatasetView:

82

"""Query result view of dataset."""

83

84

def __getitem__(self, key: Union[int, slice, str]) -> Union[RowView, RowRangeView, ColumnView]:

85

"""Access query result elements."""

86

87

def __len__(self) -> int:

88

"""Get number of rows in view."""

89

90

def summary(self) -> str:

91

"""Get summary statistics of the dataset view."""

92

```

93

94

### Column Access and Manipulation

95

96

Column objects provide typed access to homogeneous data with support for indexing, slicing, and batch operations.

97

98

```python { .api }

99

class Column:

100

"""Mutable column access."""

101

102

name: str

103

metadata: Metadata

104

indexes: List[str]

105

106

def __getitem__(self, key: Union[int, slice, List[int]]) -> Any:

107

"""

108

Get column values by index.

109

110

Parameters:

111

- key: Row index (int), slice, or list of indices

112

113

Returns:

114

- Any: Single value or list of values

115

"""

116

117

def __setitem__(self, key: Union[int, slice, List[int]], value: Any) -> None:

118

"""

119

Set column values by index.

120

121

Parameters:

122

- key: Row index (int), slice, or list of indices

123

- value: Value(s) to set

124

"""

125

126

def __len__(self) -> int:

127

"""Get number of elements in column."""

128

129

def create_index(self, type: IndexType) -> None:

130

"""

131

Create index on column for query optimization.

132

133

Parameters:

134

- type: Index type specification

135

"""

136

137

def drop_index(self, name: str) -> None:

138

"""

139

Drop existing index.

140

141

Parameters:

142

- name: Index name to drop

143

"""

144

145

def get_async(self, index: int) -> Future[Any]:

146

"""

147

Get column value asynchronously.

148

149

Parameters:

150

- index: Row index

151

152

Returns:

153

Future[Any]: Future resolving to column value

154

"""

155

156

def set_async(self, index: int, value: Any) -> FutureVoid:

157

"""

158

Set column value asynchronously.

159

160

Parameters:

161

- index: Row index

162

- value: Value to set

163

164

Returns:

165

FutureVoid: Future completing when set operation is done

166

"""

167

168

def get_bytes(self, index: int) -> bytes:

169

"""

170

Get raw bytes representation of column value.

171

172

Parameters:

173

- index: Row index

174

175

Returns:

176

bytes: Raw bytes data

177

"""

178

179

def get_bytes_async(self, index: int) -> Future[bytes]:

180

"""

181

Get raw bytes representation asynchronously.

182

183

Parameters:

184

- index: Row index

185

186

Returns:

187

Future[bytes]: Future resolving to raw bytes data

188

"""

189

190

class ColumnView:

191

"""Read-only column access."""

192

193

name: str

194

metadata: ReadOnlyMetadata

195

indexes: List[str]

196

197

def __getitem__(self, key: Union[int, slice, List[int]]) -> Any:

198

"""Get column values by index (read-only)."""

199

200

def __len__(self) -> int:

201

"""Get number of elements in column."""

202

203

def get_async(self, index: int) -> Future[Any]:

204

"""Get column value asynchronously."""

205

206

def get_bytes(self, index: int) -> bytes:

207

"""Get raw bytes representation of column value."""

208

209

def get_bytes_async(self, index: int) -> Future[bytes]:

210

"""Get raw bytes representation asynchronously."""

211

212

def _links_info(self) -> Dict[str, Any]:

213

"""Get link information for linked columns."""

214

215

class ColumnDefinition:

216

"""Mutable column definition."""

217

218

name: str

219

dtype: Type

220

221

def drop(self) -> None:

222

"""Drop this column from dataset."""

223

224

def rename(self, new_name: str) -> None:

225

"""Rename this column."""

226

227

class ColumnDefinitionView:

228

"""Read-only column definition."""

229

230

name: str

231

dtype: Type

232

```

233

234

### Row Access and Manipulation

235

236

Row objects provide dictionary-like access to individual records with type-aware value handling.

237

238

```python { .api }

239

class Row:

240

"""Mutable single row access."""

241

242

row_id: int

243

244

def __getitem__(self, column_name: str) -> Any:

245

"""

246

Get value from specific column.

247

248

Parameters:

249

- column_name: Column name

250

251

Returns:

252

Any: Column value for this row

253

"""

254

255

def __setitem__(self, column_name: str, value: Any) -> None:

256

"""

257

Set value in specific column.

258

259

Parameters:

260

- column_name: Column name

261

- value: Value to set

262

"""

263

264

def to_dict(self) -> Dict[str, Any]:

265

"""

266

Convert row to dictionary.

267

268

Returns:

269

Dict[str, Any]: Dictionary mapping column names to values

270

"""

271

272

def get_async(self, column_name: str) -> Future[Any]:

273

"""Get column value asynchronously."""

274

275

def set_async(self, column_name: str, value: Any) -> FutureVoid:

276

"""Set column value asynchronously."""

277

278

def get_bytes(self, column_name: str) -> bytes:

279

"""Get raw bytes representation of column value."""

280

281

def get_bytes_async(self, column_name: str) -> Future[bytes]:

282

"""Get raw bytes representation asynchronously."""

283

284

class RowView:

285

"""Read-only single row access."""

286

287

row_id: int

288

289

def __getitem__(self, column_name: str) -> Any:

290

"""Get value from specific column (read-only)."""

291

292

def to_dict(self) -> Dict[str, Any]:

293

"""Convert row to dictionary."""

294

295

def get_async(self, column_name: str) -> Future[Any]:

296

"""Get column value asynchronously."""

297

298

def get_bytes(self, column_name: str) -> bytes:

299

"""Get raw bytes representation of column value."""

300

301

def get_bytes_async(self, column_name: str) -> Future[bytes]:

302

"""Get raw bytes representation asynchronously."""

303

304

class RowRange:

305

"""Mutable multiple row access."""

306

307

def __getitem__(self, column_name: str) -> List[Any]:

308

"""Get values from specific column across all rows in range."""

309

310

def __setitem__(self, column_name: str, values: List[Any]) -> None:

311

"""Set values in specific column across all rows in range."""

312

313

def __len__(self) -> int:

314

"""Get number of rows in range."""

315

316

def __iter__(self) -> Iterator[Row]:

317

"""Iterate over rows in range."""

318

319

def summary(self) -> str:

320

"""Get summary statistics of the row range."""

321

322

class RowRangeView:

323

"""Read-only multiple row access."""

324

325

def __getitem__(self, column_name: str) -> List[Any]:

326

"""Get values from specific column across all rows in range."""

327

328

def __len__(self) -> int:

329

"""Get number of rows in range."""

330

331

def __iter__(self) -> Iterator[RowView]:

332

"""Iterate over rows in range."""

333

334

def summary(self) -> str:

335

"""Get summary statistics of the row range."""

336

```

337

338

### Metadata Management

339

340

Metadata objects provide key-value storage for dataset and column metadata with type preservation.

341

342

```python { .api }

343

class Metadata:

344

"""Mutable metadata storage."""

345

346

def __getitem__(self, key: str) -> Any:

347

"""Get metadata value by key."""

348

349

def __setitem__(self, key: str, value: Any) -> None:

350

"""Set metadata value by key."""

351

352

def __contains__(self, key: str) -> bool:

353

"""Check if metadata key exists."""

354

355

def keys(self) -> List[str]:

356

"""Get all metadata keys."""

357

358

class ReadOnlyMetadata:

359

"""Read-only metadata storage."""

360

361

def __getitem__(self, key: str) -> Any:

362

"""Get metadata value by key."""

363

364

def __contains__(self, key: str) -> bool:

365

"""Check if metadata key exists."""

366

367

def keys(self) -> List[str]:

368

"""Get all metadata keys."""

369

```

370

371

## Usage Examples

372

373

### Basic Data Access

374

375

```python

376

import deeplake

377

378

# Open dataset

379

dataset = deeplake.open("./my_dataset")

380

381

# Row access

382

row = dataset[0] # First row

383

print(row["image_path"]) # Access column value

384

print(row.to_dict()) # Convert to dictionary

385

386

# Row range access

387

rows = dataset[0:10] # First 10 rows

388

for row in rows:

389

print(row["label"])

390

391

# Column access

392

images_column = dataset["images"]

393

print(len(images_column)) # Number of images

394

first_image = images_column[0] # First image

395

396

# Column slicing

397

batch_images = images_column[0:32] # First 32 images

398

```

399

400

### Data Manipulation

401

402

```python

403

# Add new column

404

dataset.add_column("scores", deeplake.types.Float32())

405

406

# Append single row

407

dataset.append({

408

"images": "new_image.jpg",

409

"labels": "dog",

410

"scores": 0.95

411

})

412

413

# Append multiple rows

414

batch_data = [

415

{"images": f"image_{i}.jpg", "labels": f"label_{i}", "scores": 0.8 + i * 0.01}

416

for i in range(100)

417

]

418

dataset.extend(batch_data)

419

420

# Update specific values

421

dataset[0]["scores"] = 0.99 # Update single value

422

dataset["scores"][0:10] = [0.9] * 10 # Update range

423

424

# Column operations

425

scores = dataset["scores"]

426

scores[100] = 0.85 # Set specific score

427

high_scores = scores[scores > 0.9] # Filter high scores

428

```

429

430

### Batch Operations

431

432

```python

433

# Access data in batches

434

batch_size = 32

435

for i in range(0, len(dataset), batch_size):

436

batch = dataset[i:i+batch_size]

437

438

# Get batch data as lists

439

images = batch["images"]

440

labels = batch["labels"]

441

442

# Process batch

443

process_batch(images, labels)

444

445

# Column-wise batch operations

446

images_column = dataset["images"]

447

for i in range(0, len(images_column), batch_size):

448

image_batch = images_column[i:i+batch_size]

449

processed_batch = preprocess_images(image_batch)

450

# Save processed results...

451

```

452

453

### Async Operations

454

455

```python

456

import asyncio

457

458

async def process_data_async(dataset):

459

# Get multiple values concurrently

460

tasks = [

461

dataset["images"].get_async(i)

462

for i in range(10)

463

]

464

465

images = await asyncio.gather(*tasks)

466

return images

467

468

# Set values asynchronously

469

async def update_scores_async(dataset, new_scores):

470

tasks = [

471

dataset["scores"].set_async(i, score)

472

for i, score in enumerate(new_scores)

473

]

474

475

await asyncio.gather(*tasks)

476

```

477

478

### Metadata Usage

479

480

```python

481

# Dataset metadata

482

dataset.metadata["version"] = "1.0"

483

dataset.metadata["description"] = "Training dataset for image classification"

484

print(dataset.metadata.keys())

485

486

# Column metadata

487

images_column = dataset["images"]

488

images_column.metadata["preprocessing"] = "normalized"

489

images_column.metadata["source"] = "camera_feed"

490

491

# Access metadata

492

if "version" in dataset.metadata:

493

print(f"Dataset version: {dataset.metadata['version']}")

494

```

495

496

### Indexing for Performance

497

498

```python

499

# Create index on text column for fast queries

500

text_column = dataset["descriptions"]

501

text_column.create_index(deeplake.types.TextIndex(deeplake.types.Inverted))

502

503

# Create embedding index for similarity search

504

embedding_column = dataset["embeddings"]

505

embedding_column.create_index(

506

deeplake.types.EmbeddingIndex(deeplake.types.Clustered)

507

)

508

509

# List all indexes on column

510

print(text_column.indexes)

511

512

# Drop index when no longer needed

513

text_column.drop_index("inverted_index")

514

```