or run

npx @tessl/cli init
Log in

Version

Tile

Overview

Evals

Files

Files

docs

archives.md · cloud-storage.md · configuration.md · data-formats.md · directory-management.md · file-operations.md · index.md · module-class.md · nltk-integration.md · web-scraping.md

docs/cloud-storage.md

# Cloud Storage Integration

PyStow provides built-in support for downloading files from major cloud storage services, including AWS S3 and Google Drive. This enables seamless integration with cloud-hosted datasets and files.

## AWS S3 Support

### S3 File Download

```python { .api }
def ensure_from_s3(key: str, *subkeys: str, s3_bucket: str, s3_key: str | Sequence[str], name: str | None = None, force: bool = False, **kwargs: Any) -> Path:
    """Ensure a file is downloaded from AWS S3.

    Args:
        key: The name of the module. No funny characters. The envvar <key>_HOME where
            key is uppercased is checked first before using the default home directory.
        subkeys: A sequence of additional strings to join. If none are given, returns
            the directory for this module.
        s3_bucket: The S3 bucket name
        s3_key: The S3 key name
        name: Overrides the name of the file at the end of the S3 key, if given.
        force: Should the download be done again, even if the path already exists?
            Defaults to false.
        kwargs: Remaining kwargs to forward to Module.ensure_from_s3.

    Returns:
        The path of the file that has been downloaded (or already exists)
    """
```

## Google Drive Support

### Google Drive File Download

```python { .api }
def ensure_from_google(key: str, *subkeys: str, name: str, file_id: str, force: bool = False) -> Path:
    """Ensure a file is downloaded from Google Drive.

    Args:
        key: The name of the module. No funny characters. The envvar <key>_HOME where
            key is uppercased is checked first before using the default home directory.
        subkeys: A sequence of additional strings to join. If none are given, returns
            the directory for this module.
        name: The name of the file
        file_id: The file identifier of the Google file. If your share link is
            https://drive.google.com/file/d/1AsPPU4ka1Rc9u-XYMGWtvV65hF3egi0z/view, then
            your file ID is 1AsPPU4ka1Rc9u-XYMGWtvV65hF3egi0z.
        force: Should the download be done again, even if the path already exists?
            Defaults to false.

    Returns:
        The path of the file that has been downloaded (or already exists)
    """
```

## Usage Examples

### AWS S3 Downloads

```python
import pystow

# Download file from S3 bucket
path = pystow.ensure_from_s3(
    "myapp", "datasets",
    s3_bucket="my-data-bucket",
    s3_key="datasets/v1/train.csv",
    name="training_data.csv"
)

# Download with nested S3 key
path = pystow.ensure_from_s3(
    "myapp", "models",
    s3_bucket="ml-models",
    s3_key=["experiments", "model_v2", "checkpoint.pkl"],
    name="model_checkpoint.pkl"
)

# Use custom name
path = pystow.ensure_from_s3(
    "myapp", "resources",
    s3_bucket="public-datasets",
    s3_key="data/raw/file_with_complex_name.csv",
    name="simple_name.csv"  # Rename for local storage
)
```

### Google Drive Downloads

```python
import pystow

# Download from Google Drive using file ID
path = pystow.ensure_from_google(
    "myapp", "datasets",
    name="dataset.zip",
    file_id="1AsPPU4ka1Rc9u-XYMGWtvV65hF3egi0z"
)

# Force re-download
path = pystow.ensure_from_google(
    "myapp", "models",
    name="pretrained_model.pkl",
    file_id="1BcDfG2hIjKlMnOpQrStUvWxYz3456789",
    force=True
)
```

### Module-Based Cloud Downloads

```python
import pystow

# Create module for project
module = pystow.module("myproject")

# Download from S3 using module
s3_path = module.ensure_from_s3(
    "data", "raw",
    s3_bucket="research-data",
    s3_key="experiments/dataset_v3.csv"
)

# Download from Google Drive using module
gdrive_path = module.ensure_from_google(
    "models", "pretrained",
    name="bert_model.tar.gz",
    file_id="1ExAmPlE_fIlE_iD_123456789"
)
```

### AWS S3 Configuration

```python
import pystow
import boto3

# Download with custom boto3 client configuration
path = pystow.ensure_from_s3(
    "myapp", "secure_data",
    s3_bucket="private-bucket",
    s3_key="sensitive/data.json",
    client_kwargs={
        "region_name": "us-west-2",
        "aws_access_key_id": "your_access_key",
        "aws_secret_access_key": "your_secret_key"
    }
)

# Using existing boto3 client
s3_client = boto3.client('s3', region_name='eu-west-1')
path = pystow.ensure_from_s3(
    "myapp", "eu_data",
    s3_bucket="eu-data-bucket",
    s3_key="regional/dataset.csv",
    client=s3_client
)
```

### Advanced S3 Downloads

```python
import pystow

# Download with additional S3 transfer options
path = pystow.ensure_from_s3(
    "myapp", "large_files",
    s3_bucket="big-data-bucket",
    s3_key="large_dataset/data.parquet",
    download_file_kwargs={
        "Config": {
            "multipart_threshold": 1024 * 25,  # 25MB
            "max_concurrency": 10,
            "multipart_chunksize": 1024 * 25,
            "use_threads": True
        }
    }
)

# Download and force refresh
path = pystow.ensure_from_s3(
    "myapp", "live_data",
    s3_bucket="streaming-data",
    s3_key="current/metrics.json",
    force=True  # Always fetch latest version
)
```

### Error Handling and Authentication

```python
import pystow
from botocore.exceptions import NoCredentialsError, ClientError

try:
    # Download from S3
    path = pystow.ensure_from_s3(
        "myapp", "datasets",
        s3_bucket="secure-bucket",
        s3_key="protected/data.csv"
    )
    print(f"Downloaded to: {path}")

except NoCredentialsError:
    print("AWS credentials not found. Please configure AWS CLI or set environment variables.")

except ClientError as e:
    error_code = e.response['Error']['Code']
    if error_code == 'NoSuchBucket':
        print("S3 bucket does not exist")
    elif error_code == 'NoSuchKey':
        print("S3 key does not exist")
    elif error_code == 'AccessDenied':
        print("Access denied to S3 resource")
    else:
        print(f"S3 error: {e}")
```

### Cloud-Based Data Processing Workflows

```python
import pystow
import pandas as pd

def process_s3_dataset(bucket, key, output_name):
    """Download S3 dataset, process it, and save locally"""

    # Download raw data from S3
    raw_path = pystow.ensure_from_s3(
        "myapp", "raw_data",
        s3_bucket=bucket,
        s3_key=key
    )

    # Load and process data
    df = pd.read_csv(raw_path)
    processed_df = df.groupby('category').agg({
        'value': 'mean',
        'count': 'sum'
    }).reset_index()

    # Save processed data locally
    pystow.dump_df(
        "myapp", "processed",
        name=output_name,
        obj=processed_df
    )

    return processed_df

# Use the function
result = process_s3_dataset(
    bucket="analytics-data",
    key="daily_reports/2023/report_2023_12_01.csv",
    output_name="daily_summary.csv"
)
```

### Multi-Source Data Integration

```python
import pystow
import pandas as pd

def integrate_cloud_datasets():
    """Integrate datasets from multiple cloud sources"""

    # Download from S3
    s3_data_path = pystow.ensure_from_s3(
        "myapp", "sources", "s3",
        s3_bucket="primary-data",
        s3_key="exports/dataset_a.csv"
    )

    # Download from Google Drive
    gdrive_data_path = pystow.ensure_from_google(
        "myapp", "sources", "gdrive",
        name="dataset_b.csv",
        file_id="1ExAmPlE_gDrIvE_fIlE_iD"
    )

    # Load both datasets
    df_a = pd.read_csv(s3_data_path)
    df_b = pd.read_csv(gdrive_data_path)

    # Merge datasets
    merged_df = pd.merge(df_a, df_b, on='id', how='inner')

    # Save integrated dataset
    pystow.dump_df(
        "myapp", "integrated",
        name="combined_dataset.csv",
        obj=merged_df
    )

    return merged_df

# Integrate data from multiple sources
combined_data = integrate_cloud_datasets()
```

## Authentication Setup

### AWS S3 Authentication

PyStow uses boto3 for S3 access, which supports multiple authentication methods:

1. **AWS CLI Configuration**:
   ```bash
   aws configure
   ```

2. **Environment Variables**:
   ```bash
   export AWS_ACCESS_KEY_ID=your_access_key
   export AWS_SECRET_ACCESS_KEY=your_secret_key
   export AWS_DEFAULT_REGION=us-east-1
   ```

3. **IAM Roles** (when running on AWS infrastructure)

4. **Programmatic Configuration**:
   ```python
   path = pystow.ensure_from_s3(
       "myapp", "data",
       s3_bucket="my-bucket",
       s3_key="data.csv",
       client_kwargs={
           "aws_access_key_id": "your_key",
           "aws_secret_access_key": "your_secret",
           "region_name": "us-west-2"
       }
   )
   ```

### Google Drive Authentication

Google Drive downloads work with publicly shared files using the file ID from the share URL. For private files, additional authentication setup may be required through the Google API.