# Cloud Storage Integration

PyStow provides built-in support for downloading files from major cloud storage services, including AWS S3 and Google Drive. This enables seamless integration with cloud-hosted datasets and files.

## AWS S3 Support

### S3 File Download

```python { .api }
def ensure_from_s3(key: str, *subkeys: str, s3_bucket: str, s3_key: str | Sequence[str], name: str | None = None, force: bool = False, **kwargs: Any) -> Path:
    """Ensure a file is downloaded from AWS S3.

    Args:
        key: The name of the module. No funny characters. The envvar <key>_HOME where
            key is uppercased is checked first before using the default home directory.
        subkeys: A sequence of additional strings to join. If none are given, returns
            the directory for this module.
        s3_bucket: The S3 bucket name.
        s3_key: The S3 key name, given either as a full string or as a sequence of
            parts to join.
        name: Overrides the name of the file at the end of the S3 key, if given.
        force: Should the download be done again, even if the path already exists?
            Defaults to false.
        kwargs: Remaining kwargs to forward to Module.ensure_from_s3.

    Returns:
        The path of the file that has been downloaded (or already exists)
    """
```

## Google Drive Support

### Google Drive File Download

```python { .api }
def ensure_from_google(key: str, *subkeys: str, name: str, file_id: str, force: bool = False) -> Path:
    """Ensure a file is downloaded from Google Drive.

    Args:
        key: The name of the module. No funny characters. The envvar <key>_HOME where
            key is uppercased is checked first before using the default home directory.
        subkeys: A sequence of additional strings to join. If none are given, returns
            the directory for this module.
        name: The name of the file.
        file_id: The file identifier of the Google file. If your share link is
            https://drive.google.com/file/d/1AsPPU4ka1Rc9u-XYMGWtvV65hF3egi0z/view, then
            your file ID is 1AsPPU4ka1Rc9u-XYMGWtvV65hF3egi0z.
        force: Should the download be done again, even if the path already exists?
            Defaults to false.

    Returns:
        The path of the file that has been downloaded (or already exists)
    """
```

## Usage Examples

### AWS S3 Downloads

```python
import pystow

# Download file from S3 bucket
path = pystow.ensure_from_s3(
    "myapp", "datasets",
    s3_bucket="my-data-bucket",
    s3_key="datasets/v1/train.csv",
    name="training_data.csv"
)

# Download with a nested S3 key given as a sequence of parts
path = pystow.ensure_from_s3(
    "myapp", "models",
    s3_bucket="ml-models",
    s3_key=["experiments", "model_v2", "checkpoint.pkl"],
    name="model_checkpoint.pkl"
)

# Use a custom local name
path = pystow.ensure_from_s3(
    "myapp", "resources",
    s3_bucket="public-datasets",
    s3_key="data/raw/file_with_complex_name.csv",
    name="simple_name.csv"  # Rename for local storage
)
```

### Google Drive Downloads

```python
import pystow

# Download from Google Drive using the file ID
path = pystow.ensure_from_google(
    "myapp", "datasets",
    name="dataset.zip",
    file_id="1AsPPU4ka1Rc9u-XYMGWtvV65hF3egi0z"
)

# Force re-download
path = pystow.ensure_from_google(
    "myapp", "models",
    name="pretrained_model.pkl",
    file_id="1BcDfG2hIjKlMnOpQrStUvWxYz3456789",
    force=True
)
```

### Module-Based Cloud Downloads

```python
import pystow

# Create a module for the project
module = pystow.module("myproject")

# Download from S3 using the module
s3_path = module.ensure_from_s3(
    "data", "raw",
    s3_bucket="research-data",
    s3_key="experiments/dataset_v3.csv"
)

# Download from Google Drive using the module
gdrive_path = module.ensure_from_google(
    "models", "pretrained",
    name="bert_model.tar.gz",
    file_id="1ExAmPlE_fIlE_iD_123456789"
)
```

### AWS S3 Configuration

```python
import pystow
import boto3

# Download with custom boto3 client configuration
path = pystow.ensure_from_s3(
    "myapp", "secure_data",
    s3_bucket="private-bucket",
    s3_key="sensitive/data.json",
    client_kwargs={
        "region_name": "us-west-2",
        "aws_access_key_id": "your_access_key",
        "aws_secret_access_key": "your_secret_key"
    }
)

# Using an existing boto3 client
s3_client = boto3.client('s3', region_name='eu-west-1')
path = pystow.ensure_from_s3(
    "myapp", "eu_data",
    s3_bucket="eu-data-bucket",
    s3_key="regional/dataset.csv",
    client=s3_client
)
```

### Advanced S3 Downloads

```python
import pystow
from boto3.s3.transfer import TransferConfig

# Download with additional S3 transfer options. Note that the ``Config``
# argument of boto3's ``download_file`` must be a ``TransferConfig``
# instance, not a plain dictionary.
path = pystow.ensure_from_s3(
    "myapp", "large_files",
    s3_bucket="big-data-bucket",
    s3_key="large_dataset/data.parquet",
    download_file_kwargs={
        "Config": TransferConfig(
            multipart_threshold=25 * 1024 * 1024,  # 25MB
            max_concurrency=10,
            multipart_chunksize=25 * 1024 * 1024,
            use_threads=True
        )
    }
)

# Download and force refresh
path = pystow.ensure_from_s3(
    "myapp", "live_data",
    s3_bucket="streaming-data",
    s3_key="current/metrics.json",
    force=True  # Always fetch the latest version
)
```

### Error Handling and Authentication

```python
import pystow
from botocore.exceptions import NoCredentialsError, ClientError

try:
    # Download from S3
    path = pystow.ensure_from_s3(
        "myapp", "datasets",
        s3_bucket="secure-bucket",
        s3_key="protected/data.csv"
    )
    print(f"Downloaded to: {path}")

except NoCredentialsError:
    print("AWS credentials not found. Please configure AWS CLI or set environment variables.")

except ClientError as e:
    error_code = e.response['Error']['Code']
    if error_code == 'NoSuchBucket':
        print("S3 bucket does not exist")
    elif error_code == 'NoSuchKey':
        print("S3 key does not exist")
    elif error_code == 'AccessDenied':
        print("Access denied to S3 resource")
    else:
        print(f"S3 error: {e}")
```

### Cloud-Based Data Processing Workflows

```python
import pystow
import pandas as pd

def process_s3_dataset(bucket, key, output_name):
    """Download an S3 dataset, process it, and save the result locally."""
    # Download raw data from S3
    raw_path = pystow.ensure_from_s3(
        "myapp", "raw_data",
        s3_bucket=bucket,
        s3_key=key
    )

    # Load and process data
    df = pd.read_csv(raw_path)
    processed_df = df.groupby('category').agg({
        'value': 'mean',
        'count': 'sum'
    }).reset_index()

    # Save processed data locally
    pystow.dump_df(
        "myapp", "processed",
        name=output_name,
        obj=processed_df
    )

    return processed_df

# Use the function
result = process_s3_dataset(
    bucket="analytics-data",
    key="daily_reports/2023/report_2023_12_01.csv",
    output_name="daily_summary.csv"
)
```

### Multi-Source Data Integration

```python
import pystow
import pandas as pd

def integrate_cloud_datasets():
    """Integrate datasets from multiple cloud sources."""
    # Download from S3
    s3_data_path = pystow.ensure_from_s3(
        "myapp", "sources", "s3",
        s3_bucket="primary-data",
        s3_key="exports/dataset_a.csv"
    )

    # Download from Google Drive
    gdrive_data_path = pystow.ensure_from_google(
        "myapp", "sources", "gdrive",
        name="dataset_b.csv",
        file_id="1ExAmPlE_gDrIvE_fIlE_iD"
    )

    # Load both datasets
    df_a = pd.read_csv(s3_data_path)
    df_b = pd.read_csv(gdrive_data_path)

    # Merge datasets
    merged_df = pd.merge(df_a, df_b, on='id', how='inner')

    # Save the integrated dataset
    pystow.dump_df(
        "myapp", "integrated",
        name="combined_dataset.csv",
        obj=merged_df
    )

    return merged_df

# Integrate data from multiple sources
combined_data = integrate_cloud_datasets()
```

## Authentication Setup

### AWS S3 Authentication

PyStow uses boto3 for S3 access, which supports multiple authentication methods:

1. **AWS CLI Configuration**:
   ```bash
   aws configure
   ```

2. **Environment Variables**:
   ```bash
   export AWS_ACCESS_KEY_ID=your_access_key
   export AWS_SECRET_ACCESS_KEY=your_secret_key
   export AWS_DEFAULT_REGION=us-east-1
   ```

3. **IAM Roles** (when running on AWS infrastructure)

4. **Programmatic Configuration**:
   ```python
   path = pystow.ensure_from_s3(
       "myapp", "data",
       s3_bucket="my-bucket",
       s3_key="data.csv",
       client_kwargs={
           "aws_access_key_id": "your_key",
           "aws_secret_access_key": "your_secret",
           "region_name": "us-west-2"
       }
   )
   ```

### Google Drive Authentication

Google Drive downloads work with publicly shared files using the file ID from the share URL. For private files, additional authentication setup may be required through the Google API.