or run

npx @tessl/cli init
Log in

Version

Tile

Overview

Evals

Files

Files

docs

archives.md · cloud-storage.md · configuration.md · data-formats.md · directory-management.md · file-operations.md · index.md · module-class.md · nltk-integration.md · web-scraping.md

docs/cloud-storage.md

# Cloud Storage Integration

PyStow provides built-in support for downloading files from major cloud storage services, including AWS S3 and Google Drive. This enables seamless integration with cloud-hosted datasets and files.

## AWS S3 Support

### S3 File Download

```python { .api }
def ensure_from_s3(key: str, *subkeys: str, s3_bucket: str, s3_key: str | Sequence[str], name: str | None = None, force: bool = False, **kwargs: Any) -> Path:
    """Ensure a file is downloaded from AWS S3.

    Args:
        key: The name of the module. No funny characters. The envvar <key>_HOME where
            key is uppercased is checked first before using the default home directory.
        subkeys: A sequence of additional strings to join. If none are given, returns
            the directory for this module.
        s3_bucket: The S3 bucket name
        s3_key: The S3 key name
        name: Overrides the name of the file at the end of the S3 key, if given.
        force: Should the download be done again, even if the path already exists?
            Defaults to false.
        kwargs: Remaining kwargs to forward to Module.ensure_from_s3.

    Returns:
        The path of the file that has been downloaded (or already exists)
    """
```

## Google Drive Support

### Google Drive File Download

```python { .api }
def ensure_from_google(key: str, *subkeys: str, name: str, file_id: str, force: bool = False) -> Path:
    """Ensure a file is downloaded from Google Drive.

    Args:
        key: The name of the module. No funny characters. The envvar <key>_HOME where
            key is uppercased is checked first before using the default home directory.
        subkeys: A sequence of additional strings to join. If none are given, returns
            the directory for this module.
        name: The name of the file
        file_id: The file identifier of the Google file. If your share link is
            https://drive.google.com/file/d/1AsPPU4ka1Rc9u-XYMGWtvV65hF3egi0z/view, then
            your file ID is 1AsPPU4ka1Rc9u-XYMGWtvV65hF3egi0z.
        force: Should the download be done again, even if the path already exists?
            Defaults to false.

    Returns:
        The path of the file that has been downloaded (or already exists)
    """
```

## Usage Examples

### AWS S3 Downloads

```python
import pystow

# Download file from S3 bucket
path = pystow.ensure_from_s3(
    "myapp", "datasets",
    s3_bucket="my-data-bucket",
    s3_key="datasets/v1/train.csv",
    name="training_data.csv"
)

# Download with nested S3 key
path = pystow.ensure_from_s3(
    "myapp", "models",
    s3_bucket="ml-models",
    s3_key=["experiments", "model_v2", "checkpoint.pkl"],
    name="model_checkpoint.pkl"
)

# Use custom name
path = pystow.ensure_from_s3(
    "myapp", "resources",
    s3_bucket="public-datasets",
    s3_key="data/raw/file_with_complex_name.csv",
    name="simple_name.csv"  # Rename for local storage
)
```

### Google Drive Downloads

```python
import pystow

# Download from Google Drive using file ID
path = pystow.ensure_from_google(
    "myapp", "datasets",
    name="dataset.zip",
    file_id="1AsPPU4ka1Rc9u-XYMGWtvV65hF3egi0z"
)

# Force re-download
path = pystow.ensure_from_google(
    "myapp", "models",
    name="pretrained_model.pkl",
    file_id="1BcDfG2hIjKlMnOpQrStUvWxYz3456789",
    force=True
)
```

### Module-Based Cloud Downloads

```python
import pystow

# Create module for project
module = pystow.module("myproject")

# Download from S3 using module
s3_path = module.ensure_from_s3(
    "data", "raw",
    s3_bucket="research-data",
    s3_key="experiments/dataset_v3.csv"
)

# Download from Google Drive using module
gdrive_path = module.ensure_from_google(
    "models", "pretrained",
    name="bert_model.tar.gz",
    file_id="1ExAmPlE_fIlE_iD_123456789"
)
```

### AWS S3 Configuration

```python
import pystow
import boto3

# Download with custom boto3 client configuration
path = pystow.ensure_from_s3(
    "myapp", "secure_data",
    s3_bucket="private-bucket",
    s3_key="sensitive/data.json",
    client_kwargs={
        "region_name": "us-west-2",
        "aws_access_key_id": "your_access_key",
        "aws_secret_access_key": "your_secret_key"
    }
)

# Using existing boto3 client
s3_client = boto3.client('s3', region_name='eu-west-1')
path = pystow.ensure_from_s3(
    "myapp", "eu_data",
    s3_bucket="eu-data-bucket",
    s3_key="regional/dataset.csv",
    client=s3_client
)
```

### Advanced S3 Downloads

```python
import pystow

# Download with additional S3 transfer options
path = pystow.ensure_from_s3(
    "myapp", "large_files",
    s3_bucket="big-data-bucket",
    s3_key="large_dataset/data.parquet",
    download_file_kwargs={
        "Config": {
            "multipart_threshold": 1024 * 25,  # 25MB
            "max_concurrency": 10,
            "multipart_chunksize": 1024 * 25,
            "use_threads": True
        }
    }
)

# Download and force refresh
path = pystow.ensure_from_s3(
    "myapp", "live_data",
    s3_bucket="streaming-data",
    s3_key="current/metrics.json",
    force=True  # Always fetch latest version
)
```

### Error Handling and Authentication

```python
import pystow
from botocore.exceptions import NoCredentialsError, ClientError

try:
    # Download from S3
    path = pystow.ensure_from_s3(
        "myapp", "datasets",
        s3_bucket="secure-bucket",
        s3_key="protected/data.csv"
    )
    print(f"Downloaded to: {path}")

except NoCredentialsError:
    print("AWS credentials not found. Please configure AWS CLI or set environment variables.")

except ClientError as e:
    error_code = e.response['Error']['Code']
    if error_code == 'NoSuchBucket':
        print("S3 bucket does not exist")
    elif error_code == 'NoSuchKey':
        print("S3 key does not exist")
    elif error_code == 'AccessDenied':
        print("Access denied to S3 resource")
    else:
        print(f"S3 error: {e}")
```

### Cloud-Based Data Processing Workflows

```python
import pystow
import pandas as pd

def process_s3_dataset(bucket, key, output_name):
    """Download S3 dataset, process it, and save locally"""

    # Download raw data from S3
    raw_path = pystow.ensure_from_s3(
        "myapp", "raw_data",
        s3_bucket=bucket,
        s3_key=key
    )

    # Load and process data
    df = pd.read_csv(raw_path)
    processed_df = df.groupby('category').agg({
        'value': 'mean',
        'count': 'sum'
    }).reset_index()

    # Save processed data locally
    pystow.dump_df(
        "myapp", "processed",
        name=output_name,
        obj=processed_df
    )

    return processed_df

# Use the function
result = process_s3_dataset(
    bucket="analytics-data",
    key="daily_reports/2023/report_2023_12_01.csv",
    output_name="daily_summary.csv"
)
```

### Multi-Source Data Integration

```python
import pystow
import pandas as pd

def integrate_cloud_datasets():
    """Integrate datasets from multiple cloud sources"""

    # Download from S3
    s3_data_path = pystow.ensure_from_s3(
        "myapp", "sources", "s3",
        s3_bucket="primary-data",
        s3_key="exports/dataset_a.csv"
    )

    # Download from Google Drive
    gdrive_data_path = pystow.ensure_from_google(
        "myapp", "sources", "gdrive",
        name="dataset_b.csv",
        file_id="1ExAmPlE_gDrIvE_fIlE_iD"
    )

    # Load both datasets
    df_a = pd.read_csv(s3_data_path)
    df_b = pd.read_csv(gdrive_data_path)

    # Merge datasets
    merged_df = pd.merge(df_a, df_b, on='id', how='inner')

    # Save integrated dataset
    pystow.dump_df(
        "myapp", "integrated",
        name="combined_dataset.csv",
        obj=merged_df
    )

    return merged_df

# Integrate data from multiple sources
combined_data = integrate_cloud_datasets()
```

## Authentication Setup

### AWS S3 Authentication

PyStow uses boto3 for S3 access, which supports multiple authentication methods:

1. **AWS CLI Configuration**:
   ```bash
   aws configure
   ```

2. **Environment Variables**:
   ```bash
   export AWS_ACCESS_KEY_ID=your_access_key
   export AWS_SECRET_ACCESS_KEY=your_secret_key
   export AWS_DEFAULT_REGION=us-east-1
   ```

3. **IAM Roles** (when running on AWS infrastructure)

4. **Programmatic Configuration**:
   ```python
   path = pystow.ensure_from_s3(
       "myapp", "data",
       s3_bucket="my-bucket",
       s3_key="data.csv",
       client_kwargs={
           "aws_access_key_id": "your_key",
           "aws_secret_access_key": "your_secret",
           "region_name": "us-west-2"
       }
   )
   ```

### Google Drive Authentication

Google Drive downloads work with publicly shared files using the file ID from the share URL. For private files, additional authentication setup may be required through the Google API.