# Package Data Access

Methods for accessing, deserializing, and working with data files within packages. Supports various data formats with caching and optimization features.

## Capabilities

### File Access and Retrieval

Access files within packages and retrieve their physical locations.

```python { .api }
class PackageEntry:
    def get(self) -> str:
        """
        Returns the physical key of this PackageEntry.

        Returns:
            Physical path or S3 URI to the file
        """

    def get_cached_path(self) -> str:
        """
        Returns a locally cached physical key, if available.

        Returns:
            Local file path if cached, otherwise None
        """

    def fetch(self, dest: str = None):
        """
        Gets objects from entry and saves them to dest.

        Parameters:
        - dest: Destination path for the downloaded file

        Returns:
            Path to the fetched file
        """
```

### Binary Data Access

Retrieve raw bytes from package entries with caching support.

```python { .api }
class PackageEntry:
    def get_bytes(self, use_cache_if_available: bool = True) -> bytes:
        """
        Returns the bytes of the object this entry corresponds to.

        Parameters:
        - use_cache_if_available: Whether to use cached version if available

        Returns:
            Raw bytes of the file contents
        """
```

### Structured Data Access

Access structured data formats like JSON and text files.

```python { .api }
class PackageEntry:
    def get_as_json(self, use_cache_if_available: bool = True) -> dict:
        """
        Returns a JSON file as a dict. Assumes that the file is encoded using utf-8.

        Parameters:
        - use_cache_if_available: Whether to use cached version if available

        Returns:
            Parsed JSON data as a dictionary

        Raises:
            JSONDecodeError if file is not valid JSON
        """

    def get_as_string(self, use_cache_if_available: bool = True) -> str:
        """
        Return the object as a string. Assumes that the file is encoded using utf-8.

        Parameters:
        - use_cache_if_available: Whether to use cached version if available

        Returns:
            File contents as a UTF-8 decoded string
        """
```

### Data Deserialization

Deserialize files using format-specific handlers and custom functions.

```python { .api }
class PackageEntry:
    def deserialize(self, func=None, **format_opts):
        """
        Returns the object this entry corresponds to.

        Parameters:
        - func: Custom deserialization function
        - **format_opts: Format-specific options

        Returns:
            Deserialized data object (format depends on file type and func)

        Supported formats:
        - CSV: Returns pandas DataFrame (requires pandas)
        - Parquet: Returns pandas DataFrame (requires pandas, pyarrow)
        - JSON: Returns parsed JSON object
        - Custom: Uses provided func parameter
        """

    def __call__(self, func=None, **kwargs):
        """
        Shorthand for self.deserialize()

        Parameters:
        - func: Custom deserialization function
        - **kwargs: Passed to deserialize method

        Returns:
            Deserialized data object
        """
```

### Entry Metadata and Properties

Access and modify entry metadata and properties.

```python { .api }
class PackageEntry:
    @property
    def meta(self) -> dict:
        """
        Get user metadata for this entry.

        Returns:
            Dictionary of user metadata
        """

    def set_meta(self, meta: dict):
        """
        Sets the user_meta for this PackageEntry.

        Parameters:
        - meta: Dictionary of metadata to set
        """

    def set(self, path: str = None, meta: dict = None):
        """
        Returns self with the physical key set to path.

        Parameters:
        - path: New physical path for the entry
        - meta: New metadata for the entry

        Returns:
            New PackageEntry with updated properties
        """

    @property
    def size(self) -> int:
        """Size of the entry in bytes."""

    @property
    def hash(self) -> dict:
        """Hash information for the entry."""

    def as_dict(self) -> dict:
        """
        Returns dict representation of entry.

        Returns:
            Dictionary containing entry metadata and properties
        """

    def with_physical_key(self, key):
        """
        Returns a new PackageEntry with a different physical key.

        Parameters:
        - key: New PhysicalKey for the entry

        Returns:
            New PackageEntry with updated physical key
        """
```

### Entry Representation and Equality

String representation and equality comparison for entries.

```python { .api }
class PackageEntry:
    def __repr__(self) -> str:
        """String representation of the PackageEntry."""

    def __eq__(self, other) -> bool:
        """
        Equality comparison between PackageEntry objects.

        Parameters:
        - other: Another PackageEntry to compare with

        Returns:
            True if entries are equivalent (same size and hash)
        """
```

## Usage Examples

### Basic File Access

```python
import quilt3

# Browse a package
pkg = quilt3.Package.browse("my-username/my-dataset")

# Get a specific file entry
data_file = pkg["data/measurements.csv"]

# Get the physical location
file_path = data_file.get()
print(f"File location: {file_path}")

# Download file locally
local_path = data_file.fetch("./downloaded_measurements.csv")
print(f"Downloaded to: {local_path}")
```

### Working with Different Data Formats

```python
# JSON data access
config_entry = pkg["config/settings.json"]
config_data = config_entry.get_as_json()
print(f"Configuration: {config_data}")

# Text file access
readme_entry = pkg["README.txt"]
readme_content = readme_entry.get_as_string()
print(readme_content)

# Binary data access
image_entry = pkg["images/photo.jpg"]
image_bytes = image_entry.get_bytes()
print(f"Image size: {len(image_bytes)} bytes")
```

### Data Deserialization with pandas

```python
# Deserialize CSV to pandas DataFrame (requires pandas)
csv_entry = pkg["data/measurements.csv"]
df = csv_entry.deserialize()  # Automatically detects CSV format
print(df.head())

# Deserialize Parquet file (requires pandas and pyarrow)
parquet_entry = pkg["data/results.parquet"]
df = parquet_entry.deserialize()
print(f"DataFrame shape: {df.shape}")

# Custom deserialization function
def load_custom_format(file_path):
    # Custom loading logic
    return {"loaded_from": file_path}

custom_entry = pkg["data/custom.dat"]
custom_data = custom_entry.deserialize(func=load_custom_format)
print(custom_data)
```

### Entry Metadata Management

```python
# Access entry metadata
data_entry = pkg["data/experiment_1.csv"]
metadata = data_entry.meta
print(f"Entry metadata: {metadata}")

# Create new entry with metadata
new_entry = data_entry.set(meta={
    "experiment": "exp_001",
    "date": "2024-01-15",
    "researcher": "Dr. Smith"
})

# Get entry properties
print(f"File size: {data_entry.size} bytes")
print(f"Hash info: {data_entry.hash}")
print(f"Entry dict: {data_entry.as_dict()}")
```

### Cached Access

```python
# First access - downloads and caches
data = csv_entry.get_bytes(use_cache_if_available=True)

# Second access - uses cached version
data_cached = csv_entry.get_bytes(use_cache_if_available=True)  # Faster

# Force fresh download
data_fresh = csv_entry.get_bytes(use_cache_if_available=False)

# Check if cached version exists
cached_path = csv_entry.get_cached_path()
if cached_path:
    print(f"Cached at: {cached_path}")
else:
    print("No cached version available")
```

### Working with Large Files

```python
# Stream large files without loading entirely into memory
large_file = pkg["data/large_dataset.csv"]

# Get file handle for streaming
file_path = large_file.get()

# Use with context manager for efficient access
with open(file_path, 'r') as f:
    for line_num, line in enumerate(f):
        if line_num > 100:  # Process first 100 lines
            break
        process_line(line)

# Or deserialize with chunking (for pandas)
for chunk in large_file.deserialize(chunksize=1000):
    process_chunk(chunk)
```